我有一个data.table如下:
panelID = c(1:50)
year= c(2001:2010)
country = c("NLD", "GRC", "GBR")
n <- 2
library(data.table)
set.seed(123)
DT <- data.table(panelID = rep(sample(panelID), each = n),
country = rep(sample(country, length(panelID), replace = T), each = n),
year = c(replicate(length(panelID), sample(year, n))),
some_NA = sample(0:5, 6),
some_NA_factor = sample(0:5, 6),
norm = round(runif(100)/10,2),
Income = round(rnorm(10,-5,5),2),
Happiness = sample(10,10),
Sex = round(rnorm(10,0.75,0.3),2),
Age = sample(100,100),
Educ = round(rnorm(10,0.75,0.3),2))
DT [, uniqueID := .I] # Creates a unique ID
DT[DT == 0] <- NA
我一直在编写一个函数,以自动使用二级最小二乘法。作为功能的一部分,我需要对所有具有NA的行进行子集化(但仅适用于2SLS中使用的变量)。但是由于某种原因,它似乎并没有在函数中子集化。
subsetfun <- function (dataset, depvar, indepvars) {
sub_set <- subset(dataset, !is.na(depvar))
sub_set <- subset(sub_set, !is.na(indepvars))
return(sub_set)
}
sub_set <- subsetfun(DT, depvar="some_NA", indepvars=c("panelID", "some_NA_factor"))
子设置在函数外部起作用,但由于某种原因不在函数内部起作用。
编辑:函数中的第一行没有给出错误。第二行可能有语法问题。它给出了错误:
Error in `[.data.table`(x, r, vars, with = FALSE) :
i evaluates to a logical vector length 2 but there are 100 rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.
所需的输出:
> dput(sub_set)
structure(list(panelID = c(31L, 15L, 14L, 14L, 3L, 42L, 43L,
43L, 37L, 48L, 25L, 25L, 26L, 27L, 5L, 5L, 40L, 28L, 9L, 9L,
29L, 8L, 41L, 41L, 7L, 10L, 36L, 36L, 19L, 4L, 45L, 45L, 17L,
11L, 32L, 32L, 21L, 12L, 49L, 49L, 50L, 13L, 24L, 24L, 30L, 33L,
20L, 20L, 18L, 46L, 22L, 22L, 39L, 38L, 35L, 35L, 47L, 6L, 1L,
1L, 2L, 23L, 44L, 44L, 16L, 34L), country = c("GBR", "NLD", "GRC",
"GRC", "NLD", "NLD", "GBR", "GBR", "NLD", "GRC", "NLD", "NLD",
"GBR", "NLD", "GBR", "GBR", "GRC", "GBR", "GRC", "GRC", "GRC",
"GBR", "GRC", "GRC", "GRC", "GBR", "GBR", "GBR", "NLD", "GRC",
"GRC", "GRC", "NLD", "GRC", "NLD", "NLD", "NLD", "GRC", "GBR",
"GBR", "GBR", "NLD", "GRC", "GRC", "NLD", "GRC", "NLD", "NLD",
"GBR", "GBR", "GRC", "GRC", "GBR", "NLD", "GRC", "GRC", "GRC",
"GBR", "GRC", "GRC", "NLD", "GBR", "GBR", "GBR", "GBR", "GRC"
), year = c(2010L, 2008L, 2003L, 2002L, 2010L, 2006L, 2004L,
2001L, 2006L, 2003L, 2008L, 2001L, 2007L, 2006L, 2007L, 2005L,
2006L, 2007L, 2004L, 2003L, 2009L, 2009L, 2007L, 2002L, 2003L,
2007L, 2004L, 2001L, 2008L, 2008L, 2006L, 2004L, 2008L, 2010L,
2006L, 2001L, 2010L, 2007L, 2008L, 2005L, 2002L, 2008L, 2010L,
2004L, 2005L, 2008L, 2008L, 2009L, 2008L, 2009L, 2007L, 2010L,
2010L, 2007L, 2010L, 2001L, 2009L, 2005L, 2006L, 2009L, 2004L,
2007L, 2010L, 2004L, 2008L, 2010L), some_NA = c(4L, 1L, 5L, 3L,
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L,
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L,
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L,
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L), some_NA_factor = c(1L,
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L,
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L,
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L,
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L,
5L), norm = c(0.05, 0.07, 0.01, 0.04, 0.08, 0.04, 0.09, 0.09,
0.03, 0.01, NA, 0.1, NA, 0.06, 0.03, 0.07, 0.08, 0.07, 0.06,
0.06, 0.1, 0.05, 0.02, 0.05, 0.04, 0.02, 0.06, 0.01, 0.1, 0.06,
0.1, 0.07, 0.04, 0.01, 0.02, 0.09, 0.03, 0.02, NA, 0.04, 0.05,
0.09, 0.05, 0.05, 0.1, 0.03, 0.1, 0.06, 0.08, 0.05, 0.09, 0.02,
0.03, 0.1, 0.03, 0.07, 0.08, 0.03, 0.01, 0.01, 0.1, 0.09, 0.06,
0.04, 0.04, 0.03), Income = c(-2.65, -9.35, -7.31, -14.56, -3.15,
-6.42, -2.65, -0.2, -5.03, -14.56, -3.15, -7.31, -3.39, -0.2,
-5.03, -9.35, -7.31, -7.31, -3.39, -6.42, -2.65, -9.35, -7.31,
-14.56, -3.15, -6.42, -2.65, -0.2, -5.03, -14.56, -3.15, -7.31,
-3.39, -0.2, -5.03, -9.35, -7.31, -7.31, -3.39, -6.42, -2.65,
-9.35, -7.31, -14.56, -3.15, -6.42, -2.65, -0.2, -5.03, -14.56,
-3.15, -7.31, -3.39, -0.2, -5.03, -9.35, -7.31, -7.31, -3.39,
-6.42, -2.65, -9.35, -7.31, -14.56, -3.15, -6.42), Happiness = c(1L,
10L, 2L, 9L, 5L, 4L, 1L, 3L, 6L, 9L, 5L, 7L, 8L, 3L, 6L, 10L,
2L, 7L, 8L, 4L, 1L, 10L, 2L, 9L, 5L, 4L, 1L, 3L, 6L, 9L, 5L,
7L, 8L, 3L, 6L, 10L, 2L, 7L, 8L, 4L, 1L, 10L, 2L, 9L, 5L, 4L,
1L, 3L, 6L, 9L, 5L, 7L, 8L, 3L, 6L, 10L, 2L, 7L, 8L, 4L, 1L,
10L, 2L, 9L, 5L, 4L), Sex = c(1.08, 0.77, 0.54, 0.53, 1.02, 0.72,
1.08, 0.96, 0.64, 0.53, 1.02, 0.45, 1.34, 0.96, 0.64, 0.77, 0.54,
0.45, 1.34, 0.72, 1.08, 0.77, 0.54, 0.53, 1.02, 0.72, 1.08, 0.96,
0.64, 0.53, 1.02, 0.45, 1.34, 0.96, 0.64, 0.77, 0.54, 0.45, 1.34,
0.72, 1.08, 0.77, 0.54, 0.53, 1.02, 0.72, 1.08, 0.96, 0.64, 0.53,
1.02, 0.45, 1.34, 0.96, 0.64, 0.77, 0.54, 0.45, 1.34, 0.72, 1.08,
0.77, 0.54, 0.53, 1.02, 0.72), Age = c(63L, 2L, 19L, 77L, 13L,
22L, 74L, 16L, 69L, 33L, 80L, 28L, 49L, 83L, 66L, 86L, 76L, 6L,
15L, 96L, 59L, 48L, 45L, 10L, 42L, 39L, 18L, 17L, 20L, 98L, 44L,
8L, 95L, 73L, 65L, 26L, 57L, 32L, 81L, 71L, 27L, 92L, 4L, 100L,
62L, 56L, 54L, 87L, 23L, 35L, 75L, 58L, 38L, 52L, 46L, 34L, 72L,
90L, 1L, 93L, 7L, 78L, 68L, 97L, 64L, 25L), Educ = c(0.54, 0.43,
0.62, 0.85, 0.15, 1.36, 0.54, 0.52, 0.47, 0.85, 0.15, 0.81, 1.12,
0.52, 0.47, 0.43, 0.62, 0.81, 1.12, 1.36, 0.54, 0.43, 0.62, 0.85,
0.15, 1.36, 0.54, 0.52, 0.47, 0.85, 0.15, 0.81, 1.12, 0.52, 0.47,
0.43, 0.62, 0.81, 1.12, 1.36, 0.54, 0.43, 0.62, 0.85, 0.15, 1.36,
0.54, 0.52, 0.47, 0.85, 0.15, 0.81, 1.12, 0.52, 0.47, 0.43, 0.62,
0.81, 1.12, 1.36, 0.54, 0.43, 0.62, 0.85, 0.15, 1.36), uniqueID = c(1L,
4L, 5L, 6L, 7L, 10L, 11L, 12L, 13L, 16L, 17L, 18L, 19L, 22L,
23L, 24L, 25L, 28L, 29L, 30L, 31L, 34L, 35L, 36L, 37L, 40L, 41L,
42L, 43L, 46L, 47L, 48L, 49L, 52L, 53L, 54L, 55L, 58L, 59L, 60L,
61L, 64L, 65L, 66L, 67L, 70L, 71L, 72L, 73L, 76L, 77L, 78L, 79L,
82L, 83L, 84L, 85L, 88L, 89L, 90L, 91L, 94L, 95L, 96L, 97L, 100L
)), row.names = c(NA, -66L), .internal.selfref = <pointer: 0x00000231152b1ef0>, class = c("data.table",
"data.frame"))
我不确定100%是否与结果完全相同,因为我无法导入子集的dput语句,但是我能够使用dplyr
过滤子集。主要思想来自于这个问题dplyr filter with condition on multiple columns
library(tidyverse)
subsetfun <- function (dataset, depvar, indepvars) {
sub_set <- dataset %>% filter_at(vars(indepvars,depvar), all_vars(!is.na(.))) %>% as.data.table(.)
return(sub_set)
}
sub_set <- subsetfun(DT, depvar="some_NA", indepvars=c("panelID", "some_NA_factor"))