stratified <- function(df, id, group, size, seed = "NULL", ...) {
  # Draw a stratified random sample of rows from a data frame.
  #
  # Arguments:
  #   df     Data frame to sample from.
  #   id     ID variable, given as a column number (a column name also works).
  #   group  Grouping (stratum) variable, given the same way as `id`.
  #   size   Sample size per group. A value below 1 is treated as a
  #          proportion of each group (rounded to the nearest whole number);
  #          a value of 1 or more is the number of IDs drawn from every group.
  #   seed   Optional seed passed to set.seed(). Leave blank, or pass NULL or
  #          "NULL" (with quotes), to sample without setting a seed.
  #   ...    Further arguments forwarded to sample.int(), e.g. `replace`.
  #
  # Returns:
  #   The rows of `df` whose ID was drawn, as produced by merge() (so the ID
  #   column comes first), ordered by the grouping variable.
  #
  # Example 1: To sample 10% of each group from a data frame named "z", where
  #            the ID variable is the first variable, the grouping variable
  #            is the fourth variable, and the desired seed is 1, use:
  #
  #              > stratified(z, 1, 4, .1, 1)
  #
  # Example 2: To run the same sample as above but without a seed, use:
  #
  #              > stratified(z, 1, 4, .1)
  #
  # Example 3: To sample 5 from each group from a data frame named "z", where
  #            the ID variable is the first variable, the grouping variable
  #            is the third variable, and the desired seed is 2, use:
  #
  #              > stratified(z, 1, 3, 5, 2)
  #
  # NOTE: Not tested on datasets with LOTS of groups or with HUGE
  #       differences in group sizes.

  if (!is.numeric(size) || length(size) != 1L || is.na(size) || size < 0) {
    stop("'size' must be a single non-negative number", call. = FALSE)
  }

  # Resolve column names up front: merge() moves the key column to the front,
  # so positional indices that are valid for `df` may point at a different
  # column in the merged result.
  id_name <- names(df[id])
  group_name <- names(df[group])

  # Seed once for the whole draw. Re-seeding inside the loop would restart the
  # same random stream for every group and pick identical positions in groups
  # of equal size.
  if (!(is.null(seed) || identical(seed, "NULL"))) {
    set.seed(seed)
  }

  # Row positions of each group; `drop = TRUE` skips unused factor levels.
  rows_by_group <- split(seq_len(nrow(df)), df[[group]], drop = TRUE)

  picked <- vector("list", length(rows_by_group))
  for (i in seq_along(rows_by_group)) {
    rows <- rows_by_group[[i]]
    n <- if (size < 1) round(length(rows) * size) else size
    # Sample positions rather than ID values: sample(x, n) treats a single
    # numeric x as the range 1:x, which breaks groups with exactly one ID.
    picked[[i]] <- rows[sample.int(length(rows), n, ...)]
  }

  # One-column data frame holding the drawn IDs, named like the ID column.
  z <- df[unlist(picked, use.names = FALSE), id, drop = FALSE]

  w <- merge(df, z, by = id_name)
  w[order(w[[group_name]]), , drop = FALSE]
}