# Example input: three character columns whose values are matched by the
# SQL criteria further down. Empty strings stand for blank cells.
df <- data.frame(
  var1 = c("x", "1", "X", "", "X"),
  var2 = c("y", "3", "", "X", ""),
  var3 = c("y", "7", "", "X", "X")
)
library(sqldf)
# One CASE expression per test case. Several expressions intentionally share
# the same alias (b1, c1): that repetition is what produces the duplicated
# result columns discussed below.
testcases_sql <- paste0(
  "\n",
  paste(
    c(
      "CASE WHEN var1 = 1 THEN 1 ELSE 0 END AS a1,",
      "CASE WHEN var1 = 1 AND var2 = 'y' THEN 1 ELSE 0 END AS b1,",
      "CASE WHEN var1= 1 AND var2= 3 THEN 1 ELSE 0 END AS b1,",
      "CASE WHEN var1= 1 AND var2= 3 THEN 1 ELSE 0 END AS b1,",
      "CASE WHEN var1= 1 AND var2= 'X' THEN 1 ELSE 0 END AS b1,",
      "CASE WHEN var1= 1 AND var2= 'X' AND var3=7 THEN 1 ELSE 0 END AS c1,",
      "CASE WHEN var1= 'X' AND var3='X' THEN 1 ELSE 0 END AS c1"
    ),
    collapse = "\n"
  )
)

# Full statement: every original column plus the indicator columns, read from
# the data frame named df.
sql_string <- paste0("SELECT * , ", testcases_sql, "  FROM  df")
# Evaluate the criteria: run the generated statement through sqldf and
# preview the first rows of the result.
data <- sqldf(x = sql_string)
head(data)
sqldf 会为每个条件各创建一个新列:
head(data)
# Printed result: every CASE expression becomes its own column, so the
# aliases b1 and c1 each appear more than once.
#   var1 var2 var3 a1 b1 b1 b1 b1 c1 c1
# 1    x    y    y  0  0  0  0  0  0  0
# 2    1    3    7  1  0  1  1  0  0  0
# 3    X            0  0  0  0  0  0  0
# 4         X    X  0  0  0  0  0  0  0
# 5    X         X  0  0  0  0  0  0  1
但是我需要把同名的条件合并到同一个变量中,即所有 b1 合并为一列,所有 c1 合并为一列,依此类推。某一行满足条件的次数无关紧要:只要至少满足一次,该行的值就应为 "1"。在我原始的 df 中,同一个条件重复出现的次数没有任何规律,完全是随机的。我的预期结果是:
# Desired result: the original columns plus exactly one indicator column per
# criterion (a1, b1, c1). The indicator values are kept as character strings.
# The indicator columns are named plain "b1" and "c1" so that they match the
# printed header below; a name ending in "=" is not a syntactic R name and
# data.frame() would rewrite it.
wished_df <- data.frame(
  "var1" = c("x", "1", "X", "", "X"),
  "var2" = c("y", "3", "", "X", ""),
  "var3" = c("y", "7", "", "X", "X"),
  "a1" = c("0", "1", "0", "0", "0"),
  "b1" = c("0", "1", "0", "0", "0"),
  "c1" = c("0", "0", "0", "0", "1")
)
head(wished_df)
#   var1 var2 var3 a1 b1 c1
# 1    x    y    y  0  0  0
# 2    1    3    7  1  1  0
# 3    X            0  0  0
# 4         X    X  0  0  0
# 5    X         X  0  0  1
可能 sqldf 并不是最适合这项任务的工具。我目前想到的最好办法,是事后把这些同名变量合并起来,以此修改 df:
# Attempted workaround (reported as not working): collapse the repeated
# indicator columns after the query has run.
# NOTE(review): all repeated columns share one name, so each `data$b1` below
# presumably addresses the same column rather than four different ones.

# sum the variable
data$newb1 <- data$b1 + data$b1 + data$b1 + data$b1
# error in `$<-.data.frame`(`*tmp*`, newb1, value = numeric(0)) :
#   replacement has 0 rows, data has 5

# delete the old variable
data$b1 <- data$b1 <- data$b1 <- data$b1 <- NULL
# rename the variable
data$b1 <- data$newb1
# delete old variable
data$newb1 <- NULL

# repeat for c1, d1, e1 and so on...
data$newc1 <- data$c1 + data$c1
data$c1 <- data$c1 <- NULL
data$c1 <- data$newc1
data$newc1 <- NULL
这不起作用,而且非常难看,还需要大量代码(我有 80 个测试用例)。有更简单的方法吗?
非常感谢
我对 R 很陌生。我有一个数据框 df,并使用 sqldf 在其中创建了一些条件列(a1、b1、c1、d1……依此类推;在本示例中只展示 a1 至 c1)。
# Base R alternative: derive each indicator column straight from vectorised
# comparisons, so every criterion ends up in exactly one column.
df <- data.frame(
  var1 = c("x", "1", "X", "", "X"),
  var2 = c("y", "3", "", "X", ""),
  var3 = c("y", "7", "", "X", "X")
)

# a1: var1 is "1".
df[["a1"]] <- as.numeric(df[["var1"]] == "1")

# b1: var1 is "1" and var2 is any of "y", "3" or "X".
df[["b1"]] <- as.numeric(
  with(df, var1 == "1" & var2 %in% c("y", "3", "X"))
)

# c1: either of the two c1 criteria holds.
df[["c1"]] <- as.numeric(
  with(
    df,
    (var1 == "1" & var2 == "X" & var3 == "7") |
      (var1 == "X" & var3 == "X")
  )
)

df
#>   var1 var2 var3 a1 b1 c1
#> 1    x    y    y  0  0  0
#> 2    1    3    7  1  1  0
#> 3    X            0  0  0
#> 4         X    X  0  0  0
#> 5    X         X  0  0  1
这里把变量名改为 testcasesSQL,因为 fn$ 的 $ 字符串插值要求变量名只使用单词字符。如果测试用例存在某种规律,也许可以用 R 代码生成 testcasesSQL 字符串;但目前看不出是否存在这样的规律,因此我们只修正问题中的代码,并将其转换为更紧凑的 SQL。
library(sqldf)
# Compact SQL: one boolean expression per indicator, OR-ing together every
# test case that shares an alias, so each alias yields a single column.
# The (var1 = 1 AND var2 = 'y') test case belongs to b1 (it is aliased b1 in
# the question's CASE list), so it is OR-ed into b1 rather than a1; the
# repeated var2 = 3 test case is listed once because OR-ing it twice adds
# nothing.
testcasesSQL <- "(var1 = 1) AS a1,
  (var1 = 1 AND var2 = 'y') OR (var1 = 1 AND var2 = 3) OR (var1 = 1 AND var2 = 'X') AS b1,
  (var1 = 1 AND var2 = 'X' AND var3 = 7) OR (var1 = 'X' AND var3 = 'X') AS c1"

# Name of the data frame to query, substituted into the statement below.
mydf <- "df"

# The fn$ prefix replaces $testcasesSQL and $mydf with the values of those
# R variables before sqldf runs the statement.
fn$sqldf("select *, $testcasesSQL from $mydf")
结果如下: