在多个数据框(data.frame)之间统一因子水平的最佳方法

问题描述 投票:2回答:2

我有许多 data.frame,每个都包含一个因子(factor)。我想确保它们都使用相同的水平(levels)。这样做的正确方法是什么?

在下面的代码中,您会看到我借助一个小的辅助函数 changeFactor,用全部水平的集合为每个数据框重新设置因子。我希望有更好的方法来做到这一点。

# Three reproducible pools of values: uniform draws rounded to two decimals,
# each generated under its own seed.
set.seed(1234)
b <- round(runif(n = 100, min = 1, max = 10), digits = 2)
set.seed(2345)
b2 <- round(runif(n = 100, min = 11, max = 20), digits = 2)
set.seed(3456)
b3 <- round(runif(n = 50, min = 15, max = 18), digits = 2)

# Master set of levels: every value that occurs in any of the three pools.
bt <- factor(sort(c(b, b2, b3)))
lvls <- levels(bt)

# Draw a sample from one of the pools and tabulate it. Each result is a data
# frame with a factor column Var1 (levels = the values found in that sample)
# and a count column Freq.
t1<-as.data.frame(table(sample(b,5)))
t2<-as.data.frame(table(sample(b,1)))
t3<-as.data.frame(table(sample(b,1)))
t4<-as.data.frame(table(sample(b,8)))
t5<-as.data.frame(table(sample(b2,20)))
t6<-as.data.frame(table(sample(b3,18)))

# Tag every table with a group label p ("A" to "F").
t1<-cbind(t1,p="A")
t2<-cbind(t2,p="B")
t3<-cbind(t3,p="C")
t4<-cbind(t4,p="D")
t5<-cbind(t5,p="E")
t6<-cbind(t6,p="F")

# Stack the tables into one data frame; note the order t2, t3, t6, t4, t5, t1.
d<-data.frame()
d<-rbind(d,t2,t3,t6,t4,t5,t1)

#.. out of order bins
# ggplot() is called below, so ggplot2 has to be attached first.
library(ggplot2)
# Bar chart of the counts (bars weighted by Freq), faceted into one row per
# group p. margins is spelled TRUE because T is an ordinary variable that can
# be reassigned.
ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
  geom_bar(aes(weight = Freq)) +
  facet_grid(p ~ ., margins = TRUE) +
  ggtitle("out of order bins")

# Re-encode a factor of numeric labels against a shared set of levels.
#   t:    factor whose labels are numbers stored as text
#   lvls: character vector holding the complete set of levels to use
# Returns a factor with levels `lvls`; values with no matching level become NA.
changeFactor <- function(t, lvls) {
  factor(as.numeric(as.character(t)), levels = lvls)
}

# Give Var1 in every table the same, complete set of levels.
t1$Var1 <- changeFactor(t1$Var1, lvls)
t2$Var1 <- changeFactor(t2$Var1, lvls)
t3$Var1 <- changeFactor(t3$Var1, lvls)
t4$Var1 <- changeFactor(t4$Var1, lvls)
t5$Var1 <- changeFactor(t5$Var1, lvls)
t6$Var1 <- changeFactor(t6$Var1, lvls)

# Rebuild the combined data frame, stacking in the order t2, t3, t6, t4, t5, t1.
d <- rbind(data.frame(), t2, t3, t6, t4, t5, t1)

#.. in order bins
# Same chart as before, drawn after Var1 has been re-encoded on the shared
# levels. margins is spelled TRUE because T is an ordinary variable that can
# be reassigned.
ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
  geom_bar(aes(weight = Freq)) +
  facet_grid(p ~ ., margins = TRUE) +
  ggtitle("in order bins")
r r-factor
2个回答
2
投票

简短回答:把数据框保存在一个列表中,并学会使用 *apply 系列函数

# Three reproducible pools of values: uniform draws rounded to two decimals,
# each generated under its own seed.
set.seed(1234)
b <- round(runif(n = 100, min = 1, max = 10), digits = 2)
set.seed(2345)
b2 <- round(runif(n = 100, min = 11, max = 20), digits = 2)
set.seed(3456)
b3 <- round(runif(n = 50, min = 15, max = 18), digits = 2)

# Master set of levels: every value that occurs in any of the three pools.
bt <- factor(sort(c(b, b2, b3)))
lvls <- levels(bt)

# Keep character columns (such as the label p below) as character vectors
# instead of converting them to factors.
options(stringsAsFactors = FALSE)

# Sample `y` values from `x`, tabulate them, and tag every row with label `z`.
# Returns a data frame with the columns Var1, Freq and p.
f <- function(x, y, z) {
  counts <- data.frame(table(sample(x, y)))
  cbind(counts, p = z)
}

# Call f once per (pool, sample size, label) triple; the result is a list of
# six data frames labelled "A" to "F".
datl <- Map(f, list(b,b,b,b,b2,b3), c(5,1,1,8,20,18), LETTERS[1:6])

# Re-encode a factor of numeric labels against a shared set of levels.
#   t:    factor whose labels are numbers stored as text
#   lvls: character vector holding the complete set of levels to use
# Returns a factor with levels `lvls`; values with no matching level become NA.
changeFactor <- function(t, lvls) {
  factor(as.numeric(as.character(t)), levels = lvls)
}

# Walk every element of datl and replace each factor column with a version
# re-encoded on the shared levels (only objects of class 'factor' are touched),
# then pass each element through data.frame().
datl <- lapply(rapply(datl, f = function(x) changeFactor(x, lvls), 
                     classes = 'factor', how = 'replace'),
              data.frame)

# Stack the tables in the order B, C, F, D, E, A.
d <- do.call(rbind, datl[c(2, 3, 6, 4, 5, 1)])

#.. in order bins
# ggplot() is called below, so ggplot2 has to be attached first.
library(ggplot2)
# Bar chart of the counts (bars weighted by Freq), faceted into one row per
# group p. margins is spelled TRUE because T is an ordinary variable that can
# be reassigned.
ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
  geom_bar(aes(weight = Freq)) +
  facet_grid(p ~ ., margins = TRUE) +
  ggtitle("in order bins")

详细回答:

# Three reproducible pools of values: uniform draws rounded to two decimals,
# each generated under its own seed.
set.seed(1234)
b <- round(runif(n = 100, min = 1, max = 10), digits = 2)
set.seed(2345)
b2 <- round(runif(n = 100, min = 11, max = 20), digits = 2)
set.seed(3456)
b3 <- round(runif(n = 50, min = 15, max = 18), digits = 2)

# Master set of levels: every value that occurs in any of the three pools.
bt <- factor(sort(c(b, b2, b3)))
lvls <- levels(bt)

首先,我不希望字符串被意外转换成因子,所以设置 stringsAsFactors = FALSE;然后编写一个函数 f 来完成你想做的事情,并检查它是否有效

# Keep character columns (such as the label p below) as character vectors
# instead of converting them to factors.
options(stringsAsFactors = FALSE)

# Sample `y` values from `x`, tabulate them, and tag every row with label `z`.
# Returns a data frame with the columns Var1, Freq and p.
f <- function(x, y, z) {
  counts <- data.frame(table(sample(x, y)))
  cbind(counts, p = z)
}

# Try f once: five values sampled from b, labelled 'A'.
f(b, 5, 'A')

# Printed result:
#   Var1 Freq p
# 1 1.13    1 A
# 2 1.46    1 A
# 3 2.09    1 A
# 4  2.5    1 A
# 5 7.02    1 A

看起来可行,所以只需用 Map 把它应用到参数列表上,并检查输出

# Call f once per (pool, sample size, label) triple, giving a list of six
# data frames.
datl <- Map(f, list(b,b,b,b,b2,b3), c(5,1,1,8,20,18), LETTERS[1:6])

# Structure of the result as displayed by str(datl); only the first two of the
# six elements are shown. Each Var1 has only the levels found in its own sample.
# List of 6
# $ :'data.frame':  5 obs. of  3 variables:
#   ..$ Var1: Factor w/ 5 levels "2.02","3.09",..: 1 2 3 4 5
#   ..$ Freq: int [1:5] 1 1 1 1 1
#   ..$ p   : chr [1:5] "A" "A" "A" "A" ...
# $ :'data.frame':  1 obs. of  3 variables:
#   ..$ Var1: Factor w/ 1 level "1.63": 1
#   ..$ Freq: int 1
#   ..$ p   : chr "B"

然后把所有数据合并起来,并用 ggplot 绘图

# Stack the tables in the order B, C, F, D, E, A.
d <- do.call(rbind, datl[c(2, 3, 6, 4, 5, 1)])

library(ggplot2)
#.. out of order bins
# Bar chart of the counts (bars weighted by Freq), faceted into one row per
# group p. margins is spelled TRUE because T is an ordinary variable that can
# be reassigned.
ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
  geom_bar(aes(weight = Freq)) +
  facet_grid(p ~ ., margins = TRUE) +
  ggtitle("out of order bins")

# Re-encode a factor of numeric labels against a shared set of levels.
#   t:    factor whose labels are numbers stored as text
#   lvls: character vector holding the complete set of levels to use
# Returns a factor with levels `lvls`; values with no matching level become NA.
changeFactor <- function(t, lvls) {
  factor(as.numeric(as.character(t)), levels = lvls)
}

再次确保该函数在一个数据框上执行它应该执行的操作

# Re-encode Var1 of the first table on the shared levels.
changeFactor(datl[[1]]$Var1, lvls)

# Printed result: the same five values, now carried by a factor with 234 levels.
# [1] 2.02 3.09 3.79 3.89 8.3 
# 234 Levels: 1.09 1.12 1.13 1.24 1.36 1.38 1.41 1.46 1.63 1.66 1.81 1.95 ... 19.86

然后一次性把它应用到所有数据框上,并检查输出

# Walk every element of datl and replace each factor column with a version
# re-encoded on the shared levels (only objects of class 'factor' are touched),
# then pass each element through data.frame().
datl <- lapply(rapply(datl, f = function(x) changeFactor(x, lvls), 
                     classes = 'factor', how = 'replace'),
              data.frame)
# Inspect the result: every Var1 now has the full set of 234 levels.
str(datl)
# List of 6
# $ :'data.frame':  5 obs. of  3 variables:
#   ..$ Var1: Factor w/ 234 levels "1.09","1.12",..: 13 28 41 45 81
#   ..$ Freq: int [1:5] 1 1 1 1 1
#   ..$ p   : chr [1:5] "A" "A" "A" "A" ...
# $ :'data.frame':  1 obs. of  3 variables:
#   ..$ Var1: Factor w/ 234 levels "1.09","1.12",..: 9
#   ..$ Freq: int 1
#   ..$ p   : chr "B"
# ...

再次结合并绘制

# Stack the re-encoded tables in the order B, C, F, D, E, A.
d <- do.call(rbind, datl[c(2, 3, 6, 4, 5, 1)])

#.. in order bins
# Bar chart of the counts (bars weighted by Freq), faceted into one row per
# group p. margins is spelled TRUE because T is an ordinary variable that can
# be reassigned.
ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
  geom_bar(aes(weight = Freq)) +
  facet_grid(p ~ ., margins = TRUE) +
  ggtitle("in order bins")


2
投票

我认为,在你不知道所有“真实”水平的情况下,用 as.character 来“读取”因子的做法是最好的方式。

但既然你已经知道所有水平(它们都存储在 lvls 中),为什么不在构建 ti$Var1 向量时直接使用它们呢?也就是说,不要这样写:

# Tabulate the sample: ti$Var1 is automatically created as a factor whose
# levels are just the values found in the sample.
ti <- as.data.frame(table(sample(b, 5)))
# Replace it with a new factor, built by reading each value of the old one as
# text and assigning it a level from lvls.
ti$Var1 <- factor(as.character(ti$Var1), levels = lvls)

(这其实就是你现在的做法),而是直接这样写:

tab <- table(sample(b, 5))
# Build the factor directly on the shared levels from the table's names; the
# counts are stored as plain numbers.
ti <- data.frame(
  myVar = factor(names(tab), lvls),
  myFreq = as.numeric(tab)
)

(这才是你最终想要的结果,而且还能让你更好地控制 ti 各列的名称)

或者,如果你希望结果中也包含未出现的水平(即计数为 0 的行):

# Associate each drawn value with a level from lvls before tabulating ...
factoredSample <- factor(sample(b, 5), lvls)
# ... so that table() also counts the levels not represented in the sample.
ti <- as.data.frame(table(factoredSample))

(顺便说一句,我不知道这是否只是为了提问而举的例子,但如果你的脚本里真的要处理这么多几乎相同的 data.frame,那你可能选错了数据结构。)

© www.soinside.com 2019 - 2024. All rights reserved.