我有一个包含属性 A、B、C 的数据集。C 是一个因子,有 zz 和 z 两个水平,
其中 z 的数量大于 zz 的数量(number of z > number of zz)。
我想对数据集进行欠采样(undersample),使新数据中 zz 和 z 的数量相同。不能为此使用任何外部包;
如果可以用 sample 函数来完成就最好了。
--------------------------------------------------
| Attribute A | Attribute B | Attribute C |
--------------------------------------------------
| xx | y1 | zz |
--------------------------------------------------
| mm | r1 | z |
--------------------------------------------------
| ab | 1r | z |
--------------------------------------------------
| ry | cm | zz |
--------------------------------------------------
| ca | rx | z |
--------------------------------------------------
| mm | zr | z |
--------------------------------------------------
结果应该是:
| Attribute A | Attribute B | Attribute C |
--------------------------------------------------
| xx | y1 | zz |
--------------------------------------------------
| mm | r1 | z |
--------------------------------------------------
| ab | 1r | z |
--------------------------------------------------
| ry | cm | zz |
--------------------------------------------------
这里 zz 所占的比例 = z 所占的比例 = 0.5
假设您的数据位于名为 `data` 的数据框中,其中包含 `A`、`B` 和 `C` 三列,您可以执行以下操作:
## Undersample so that classes "z" and "zz" of column C end up with the same
## number of rows in `new_data`.
## Row indices of each class; which() drops rows where the comparison is NA.
z_ind <- which(data$C == "z")
zz_ind <- which(data$C == "zz")
## Rows to keep per class: the size of the smaller class, so the minority
## class is kept whole and we never ask for more rows than a class has
## (a fixed count such as 10 fails as soon as one class is smaller than that).
## Assign a smaller number here to keep fewer rows per class.
nsamp <- min(length(z_ind), length(zz_ind))
## Draw `nsamp` rows per class without replacement. Positions are drawn with
## sample.int() and mapped back to row indices, because sample(x, n) treats a
## length-one numeric `x` as 1:x and would then pick unrelated rows.
pick_z <- z_ind[sample.int(length(z_ind), nsamp)]
pick_zz <- zz_ind[sample.int(length(zz_ind), nsamp)]
## Balanced data: the sampled "z" rows followed by the sampled "zz" rows.
new_data <- data[c(pick_z, pick_zz), ]
这是我用来做欠采样/过采样和自助抽样(bootstrap)的代码,其中还包含一个为模型创建公式的函数。
# Split the data into a training set (70% of rows) and a test set (the rest).
n_rows <- nrow(name_df)
mark <- sample.int(n_rows, size = floor(n_rows * 0.7), replace = FALSE)
train_set <- name_df[mark, ]
# setdiff() instead of `-mark`: negative indexing with an empty `mark` would
# select zero rows rather than all of them.
test_set <- name_df[setdiff(seq_len(n_rows), mark), ]

# Row positions (within train_set) of each class of the 0/1 response.
good_ones <- which(train_set$dependent_var == 0)
bad_ones <- which(train_set$dependent_var == 1)

# Undersample: keep as many "good" rows as there are "bad" rows. The size is
# capped at the number of good rows so sampling without replacement cannot
# fail when the good class is not actually the larger one.
n_keep <- min(length(good_ones), length(bad_ones))
good_ones_undersampled <- good_ones[
  sample.int(length(good_ones), size = n_keep, replace = FALSE)
]
new_train_set <- rbind(train_set[good_ones_undersampled, ], train_set[bad_ones, ])
summary(new_train_set$dependent_var) # check the class balance

# Shuffle the rows so the two classes are interleaved.
new_train_set <- new_train_set[sample.int(nrow(new_train_set)), ]

# Alternative, under/oversample: bring both classes to twice the number of
# "bad" rows. Good rows are drawn without replacement, so the target is capped
# at the number of good rows; bad rows are drawn with replacement.
n_each <- min(length(good_ones), 2L * length(bad_ones))
good_ones_undersampled <- good_ones[
  sample.int(length(good_ones), size = n_each, replace = FALSE)
]
bad_ones_oversampled <- bad_ones[
  sample.int(length(bad_ones), size = n_each, replace = TRUE)
]
new_train_set <- rbind(
  train_set[good_ones_undersampled, ],
  train_set[bad_ones_oversampled, ]
)
new_train_set <- new_train_set[sample.int(nrow(new_train_set)), ]

# Alternative, weighting: each class receives a total weight of 0.5, spread
# evenly over its rows. Rows in neither class (e.g. NA response) keep weight 0.
obs_weights <- numeric(nrow(train_set))
obs_weights[bad_ones] <- 0.5 / length(bad_ones)
obs_weights[good_ones] <- 0.5 / length(good_ones)
train_set$weights <- obs_weights

# Model formula written out by hand.
formula <- dependent_var ~ x1 + x2 + x3 + x4 + x5 + x6
#' Build a model formula from variable names
#'
#' Produces `y ~ 1 + x[1] + x[2] + ...`. With no predictors the result is the
#' intercept-only formula `y ~ 1`.
#'
#' @param y Name of the response variable, as a single string.
#' @param x Character vector of predictor names. `NULL` or a zero-length
#'   vector gives the intercept-only formula. When omitted, the default reads
#'   a variable called `independent_vars`, which must then be visible from
#'   where this function is defined.
#' @return A formula object whose environment is the caller's environment.
create_formula <- function(y = "dependent_var", x = independent_vars) {
  # Capture the caller's environment up front so the formula does not carry
  # this function's own locals (`x`, `y`), which could shadow model variables
  # of the same name when the formula is later evaluated.
  caller_env <- parent.frame()
  # length() is 0 for both NULL and character(0). A `1:length(x)` loop would
  # run for i = 1, 0 on a zero-length vector and build an unparsable string.
  if (length(x) == 0L) {
    return(as.formula(paste0(y, "~1"), env = caller_env))
  }
  # Vectorized "+x1+x2+..." instead of growing the string term by term.
  rhs <- paste0("+", x, collapse = "")
  as.formula(paste0(y, "~1", rhs), env = caller_env)
}
# Build the model formula from columns 2-4 and 7-10 of train_set (selected by
# position), with dependent_var as the response.
formula <- create_formula(
  y = "dependent_var",
  x = names(train_set)[c(2:4, 7:10)]
)