3.3.5 Sample Rows

The following OREdplyr functions sample rows from an ore.frame object.

Table 3-6 Sampling Row Functions

Function Description

sample_frac

Samples an ore.frame object by a fraction.

sample_n

Samples an ore.frame object by a fixed number of rows.

Example 3-75 Sampling Rows

These examples use the ore.frame object MTCARS, which is created by calling the ore.push function on the mtcars data.frame object. They demonstrate the sampling functions sample_n and sample_frac, applied both to the whole data set and to data grouped by the cyl column. They also use the OREdplyr functions group_by, arrange, and summarise.

# Create the ore.frame MTCARS from the local mtcars data.frame, and a
# second object, by_cyl, that groups the same rows by the cyl column.
MTCARS <- ore.push(mtcars)
by_cyl <- group_by(MTCARS, cyl)

# Sample a fixed number of rows from the entire (ungrouped) dataset
sample_n(MTCARS, 10)
# With replace = TRUE the requested sample size (50) may exceed the row count
nrow(sample_n(MTCARS, 50, replace = TRUE))
# Weighted sampling: the weight is given either as a bare column name ...
sample_n(MTCARS, 10, weight = mpg)
# ... or as a column extracted explicitly from the ore.frame
sample_n(MTCARS, 10, weight = MTCARS[["mpg"]])

# Sample a fixed number of rows per cyl group, optionally with replacement
# and weights; summarise(n = n()) counts the rows drawn from each group
arrange(sample_n(by_cyl, 3), cyl, mpg)
arrange(summarise(sample_n(by_cyl, 10, replace = TRUE), n = n()), cyl)
arrange(summarise(sample_n(by_cyl, 3, weight = mpg/mean(mpg)), n = n()), cyl)
arrange(summarise(sample_n(by_cyl, 3, 
                           weight = by_cyl[["mpg"]]/mean(by_cyl[["mpg"]])), n = n()), cyl) 

# Sample a fixed fraction of the rows from the entire (ungrouped) dataset;
# a fraction greater than 1 is requested together with replace = TRUE
nrow(sample_frac(MTCARS, 0.1))
nrow(sample_frac(MTCARS, 1.5, replace = TRUE))
nrow(sample_frac(MTCARS, 0.1, weight = 1/mpg))

Listing for This Example

R> MTCARS <- ore.push(mtcars)
R> by_cyl <- group_by(MTCARS, cyl)
R> 
R> # Sample a fixed number of rows from the entire dataset
R> sample_n(MTCARS, 10)
                  mpg cyl  disp  hp drat    wt  qsec vs am gear carb
Datsun 710|4     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
Ford Pantera L|2 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
Honda Civic|10   30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
Lotus Europa|6   30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
Maserati Bora|3  15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
Mazda RX4|5      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag|9  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
Merc 280|8       19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
Toyota Corolla|7 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
Toyota Corona|1  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
R> nrow(sample_n(MTCARS, 50, replace = TRUE))
[1] 50
R>
R> # Sample fixed number of rows per group with replacement and weight 
R> arrange(sample_n(by_cyl, 3), cyl, mpg)
  cyl  mpg  disp  hp drat    wt  qsec vs am gear carb
1   4 22.8 108.0  93 3.85 2.320 18.61  1  1    4    1
2   4 24.4 146.7  62 3.69 3.190 20.00  1  0    4    2
3   4 30.4  95.1 113 3.77 1.513 16.90  1  1    5    2
4   6 19.2 167.6 123 3.92 3.440 18.30  1  0    4    4
5   6 19.7 145.0 175 3.62 2.770 15.50  0  1    5    6
6   6 21.4 258.0 110 3.08 3.215 19.44  1  0    3    1
7   8 10.4 460.0 215 3.00 5.424 17.82  0  0    3    4
8   8 15.2 304.0 150 3.15 3.435 17.30  0  0    3    2
9   8 15.2 275.8 180 3.07 3.780 18.00  0  0    3    3
R> arrange(summarise(sample_n(by_cyl, 10, replace = TRUE), n = n()), cyl)
  cyl  n
1   4 10
2   6 10
3   8 10
R> arrange(summarise(sample_n(by_cyl, 3, weight = mpg/mean(mpg)), n = n()), cyl)
  cyl n
1   4 3
2   6 3
3   8 3
R> arrange(summarise(sample_n(by_cyl, 3, weight = by_cyl[["mpg"]]/mean(by_cyl[["mpg"]])), n = n()), cyl)
  cyl n
1   4 3
2   6 3
3   8 3
R> 
R> nrow(sample_frac(MTCARS, 0.1))
[1] 3
R> nrow(sample_frac(MTCARS, 1.5, replace = TRUE))
[1] 48
R> nrow(sample_frac(MTCARS, 0.1, weight = 1/mpg))
[1] 3