删除具有子集的NA的行在函数内不起作用

问题描述 投票:0回答:1

我有一个data.table如下:

panelID = c(1:50)   
year= c(2001:2010)
country = c("NLD", "GRC", "GBR")
n <- 2
library(data.table)
set.seed(123)
DT <- data.table(panelID = rep(sample(panelID), each = n),
                 country = rep(sample(country, length(panelID), replace = T), each = n),
                 year = c(replicate(length(panelID), sample(year, n))),
                 some_NA = sample(0:5, 6),                                             
                 some_NA_factor = sample(0:5, 6),         
                 norm = round(runif(100)/10,2),
                 Income = round(rnorm(10,-5,5),2),
                 Happiness = sample(10,10),
                 Sex = round(rnorm(10,0.75,0.3),2),
                 Age = sample(100,100),
                 Educ = round(rnorm(10,0.75,0.3),2))        
DT [, uniqueID := .I]                                                                        # Creates a unique ID     
DT[DT == 0] <- NA    

我一直在编写一个函数,以自动使用二级最小二乘法。作为功​​能的一部分,我需要对所有具有NA的行进行子集化(但仅适用于2SLS中使用的变量)。但是由于某种原因,它似乎并没有在函数中子集化。

subsetfun <- function (dataset, depvar, indepvars) {
  sub_set <- subset(dataset, !is.na(depvar))
  sub_set <- subset(sub_set, !is.na(indepvars))
  return(sub_set)
}  

sub_set <- subsetfun(DT, depvar="some_NA", indepvars=c("panelID", "some_NA_factor")) 

子设置在函数外部起作用,但由于某种原因不在函数内部起作用。

编辑:函数中的第一行没有给出错误。第二行可能有语法问题。它给出了错误:

Error in `[.data.table`(x, r, vars, with = FALSE) : 
  i evaluates to a logical vector length 2 but there are 100 rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.

所需的输出:

> dput(sub_set)
structure(list(panelID = c(31L, 15L, 14L, 14L, 3L, 42L, 43L, 
43L, 37L, 48L, 25L, 25L, 26L, 27L, 5L, 5L, 40L, 28L, 9L, 9L, 
29L, 8L, 41L, 41L, 7L, 10L, 36L, 36L, 19L, 4L, 45L, 45L, 17L, 
11L, 32L, 32L, 21L, 12L, 49L, 49L, 50L, 13L, 24L, 24L, 30L, 33L, 
20L, 20L, 18L, 46L, 22L, 22L, 39L, 38L, 35L, 35L, 47L, 6L, 1L, 
1L, 2L, 23L, 44L, 44L, 16L, 34L), country = c("GBR", "NLD", "GRC", 
"GRC", "NLD", "NLD", "GBR", "GBR", "NLD", "GRC", "NLD", "NLD", 
"GBR", "NLD", "GBR", "GBR", "GRC", "GBR", "GRC", "GRC", "GRC", 
"GBR", "GRC", "GRC", "GRC", "GBR", "GBR", "GBR", "NLD", "GRC", 
"GRC", "GRC", "NLD", "GRC", "NLD", "NLD", "NLD", "GRC", "GBR", 
"GBR", "GBR", "NLD", "GRC", "GRC", "NLD", "GRC", "NLD", "NLD", 
"GBR", "GBR", "GRC", "GRC", "GBR", "NLD", "GRC", "GRC", "GRC", 
"GBR", "GRC", "GRC", "NLD", "GBR", "GBR", "GBR", "GBR", "GRC"
), year = c(2010L, 2008L, 2003L, 2002L, 2010L, 2006L, 2004L, 
2001L, 2006L, 2003L, 2008L, 2001L, 2007L, 2006L, 2007L, 2005L, 
2006L, 2007L, 2004L, 2003L, 2009L, 2009L, 2007L, 2002L, 2003L, 
2007L, 2004L, 2001L, 2008L, 2008L, 2006L, 2004L, 2008L, 2010L, 
2006L, 2001L, 2010L, 2007L, 2008L, 2005L, 2002L, 2008L, 2010L, 
2004L, 2005L, 2008L, 2008L, 2009L, 2008L, 2009L, 2007L, 2010L, 
2010L, 2007L, 2010L, 2001L, 2009L, 2005L, 2006L, 2009L, 2004L, 
2007L, 2010L, 2004L, 2008L, 2010L), some_NA = c(4L, 1L, 5L, 3L, 
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L), some_NA_factor = c(1L, 
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 
5L), norm = c(0.05, 0.07, 0.01, 0.04, 0.08, 0.04, 0.09, 0.09, 
0.03, 0.01, NA, 0.1, NA, 0.06, 0.03, 0.07, 0.08, 0.07, 0.06, 
0.06, 0.1, 0.05, 0.02, 0.05, 0.04, 0.02, 0.06, 0.01, 0.1, 0.06, 
0.1, 0.07, 0.04, 0.01, 0.02, 0.09, 0.03, 0.02, NA, 0.04, 0.05, 
0.09, 0.05, 0.05, 0.1, 0.03, 0.1, 0.06, 0.08, 0.05, 0.09, 0.02, 
0.03, 0.1, 0.03, 0.07, 0.08, 0.03, 0.01, 0.01, 0.1, 0.09, 0.06, 
0.04, 0.04, 0.03), Income = c(-2.65, -9.35, -7.31, -14.56, -3.15, 
-6.42, -2.65, -0.2, -5.03, -14.56, -3.15, -7.31, -3.39, -0.2, 
-5.03, -9.35, -7.31, -7.31, -3.39, -6.42, -2.65, -9.35, -7.31, 
-14.56, -3.15, -6.42, -2.65, -0.2, -5.03, -14.56, -3.15, -7.31, 
-3.39, -0.2, -5.03, -9.35, -7.31, -7.31, -3.39, -6.42, -2.65, 
-9.35, -7.31, -14.56, -3.15, -6.42, -2.65, -0.2, -5.03, -14.56, 
-3.15, -7.31, -3.39, -0.2, -5.03, -9.35, -7.31, -7.31, -3.39, 
-6.42, -2.65, -9.35, -7.31, -14.56, -3.15, -6.42), Happiness = c(1L, 
10L, 2L, 9L, 5L, 4L, 1L, 3L, 6L, 9L, 5L, 7L, 8L, 3L, 6L, 10L, 
2L, 7L, 8L, 4L, 1L, 10L, 2L, 9L, 5L, 4L, 1L, 3L, 6L, 9L, 5L, 
7L, 8L, 3L, 6L, 10L, 2L, 7L, 8L, 4L, 1L, 10L, 2L, 9L, 5L, 4L, 
1L, 3L, 6L, 9L, 5L, 7L, 8L, 3L, 6L, 10L, 2L, 7L, 8L, 4L, 1L, 
10L, 2L, 9L, 5L, 4L), Sex = c(1.08, 0.77, 0.54, 0.53, 1.02, 0.72, 
1.08, 0.96, 0.64, 0.53, 1.02, 0.45, 1.34, 0.96, 0.64, 0.77, 0.54, 
0.45, 1.34, 0.72, 1.08, 0.77, 0.54, 0.53, 1.02, 0.72, 1.08, 0.96, 
0.64, 0.53, 1.02, 0.45, 1.34, 0.96, 0.64, 0.77, 0.54, 0.45, 1.34, 
0.72, 1.08, 0.77, 0.54, 0.53, 1.02, 0.72, 1.08, 0.96, 0.64, 0.53, 
1.02, 0.45, 1.34, 0.96, 0.64, 0.77, 0.54, 0.45, 1.34, 0.72, 1.08, 
0.77, 0.54, 0.53, 1.02, 0.72), Age = c(63L, 2L, 19L, 77L, 13L, 
22L, 74L, 16L, 69L, 33L, 80L, 28L, 49L, 83L, 66L, 86L, 76L, 6L, 
15L, 96L, 59L, 48L, 45L, 10L, 42L, 39L, 18L, 17L, 20L, 98L, 44L, 
8L, 95L, 73L, 65L, 26L, 57L, 32L, 81L, 71L, 27L, 92L, 4L, 100L, 
62L, 56L, 54L, 87L, 23L, 35L, 75L, 58L, 38L, 52L, 46L, 34L, 72L, 
90L, 1L, 93L, 7L, 78L, 68L, 97L, 64L, 25L), Educ = c(0.54, 0.43, 
0.62, 0.85, 0.15, 1.36, 0.54, 0.52, 0.47, 0.85, 0.15, 0.81, 1.12, 
0.52, 0.47, 0.43, 0.62, 0.81, 1.12, 1.36, 0.54, 0.43, 0.62, 0.85, 
0.15, 1.36, 0.54, 0.52, 0.47, 0.85, 0.15, 0.81, 1.12, 0.52, 0.47, 
0.43, 0.62, 0.81, 1.12, 1.36, 0.54, 0.43, 0.62, 0.85, 0.15, 1.36, 
0.54, 0.52, 0.47, 0.85, 0.15, 0.81, 1.12, 0.52, 0.47, 0.43, 0.62, 
0.81, 1.12, 1.36, 0.54, 0.43, 0.62, 0.85, 0.15, 1.36), uniqueID = c(1L, 
4L, 5L, 6L, 7L, 10L, 11L, 12L, 13L, 16L, 17L, 18L, 19L, 22L, 
23L, 24L, 25L, 28L, 29L, 30L, 31L, 34L, 35L, 36L, 37L, 40L, 41L, 
42L, 43L, 46L, 47L, 48L, 49L, 52L, 53L, 54L, 55L, 58L, 59L, 60L, 
61L, 64L, 65L, 66L, 67L, 70L, 71L, 72L, 73L, 76L, 77L, 78L, 79L, 
82L, 83L, 84L, 85L, 88L, 89L, 90L, 91L, 94L, 95L, 96L, 97L, 100L
)), row.names = c(NA, -66L), .internal.selfref = <pointer: 0x00000231152b1ef0>, class = c("data.table", 
"data.frame"))
r function data.table subset
1个回答
0
投票

我不确定100%是否与结果完全相同,因为我无法导入子集的dput语句,但是我能够使用dplyr过滤子集。主要思想来自于这个问题dplyr filter with condition on multiple columns

library(tidyverse)
subsetfun <- function (dataset, depvar, indepvars) {

sub_set <- dataset %>% filter_at(vars(indepvars,depvar), all_vars(!is.na(.))) %>% as.data.table(.)
return(sub_set)
}  

sub_set <- subsetfun(DT, depvar="some_NA", indepvars=c("panelID", "some_NA_factor")) 
© www.soinside.com 2019 - 2024. All rights reserved.