假设我有若干年的个人层面数据,其中个人 ID(indiv)可以跨年识别同一个人,但家庭 ID(househ)只在同一年内有效:不同年份中相同的 househ 值不一定代表同一个家庭。我想构造一个跨年一致的家庭 ID,使成员完全相同的家庭在各年获得相同的 ID。
示例:
# dplyr provides bind_rows(), used below to stack the yearly tables.
library(dplyr)

# Year 1: individuals 1 & 2 form a household, 3 and 4 are single,
# 5 & 6 form a household.
test_01 <- data.frame(
  indiv = c(1, 2, 3, 4, 5, 6),
  househ = c(1, 1, 2, 3, 4, 4),
  time = rep(1, 6)
)

# Year 2: 1 exits, so 2 is now a new household; 3 & 4 now form a new
# household; 5 & 6 still do; 7 enters. Note that househ labels are reused
# with a different meaning than in year 1.
test_02 <- data.frame(
  indiv = c(2, 3, 4, 5, 6, 7),
  househ = c(1, 2, 2, 3, 3, 4),
  time = rep(2, 6)
)

# Year 3: built according to the same logic as above.
test_03 <- data.frame(
  indiv = c(2, 3, 4, 5, 7, 8, 9, 10),
  househ = c(1, 2, 2, 3, 4, 5, 5, 6),
  time = rep(3, 8)
)

# Stack the three yearly cross-sections into one long panel (20 rows).
data_test_panel <- bind_rows(test_01, test_02, test_03)
期望得到的、跨时间一致的家庭变量如下:
# Expected time-consistent household ID for each of the 20 panel rows,
# laid out one year per line in the row order of data_test_panel.
data_test_panel[["true_household"]] <- c(
  1, 1, 2, 3, 4, 4,        # time 1
  5, 6, 6, 4, 4, 7,        # time 2
  5, 6, 6, 8, 7, 9, 9, 10  # time 3
)
到目前为止我尝试过:
library(data.table)

# Attempt so far: convert the panel to a data.table by reference, then number
# every (time, househ) combination, where cons_household is the new household
# ID. However, this doesn't give the same household ID across time but assigns
# new values for every appearance of a household.
setDT(data_test_panel)
data_test_panel[, cons_household := .GRP, by = .(time, househ)]
衷心感谢您的帮助!
/塞维林
dplyr 方法
library(dplyr)

# Give every household a time-consistent ID: two household-years receive the
# same ID exactly when they contain the same set of individuals.
data_test_panel |>
  mutate(
    # Member list of each (househ, time) household, repeated on every row of
    # that household. sort() makes the key independent of the row order of
    # the members, so c(3, 4) and c(4, 3) identify the same household.
    indiv_in_household = list(sort(indiv)),
    .by = c(househ, time)
  ) |>
  mutate(
    # Rows with identical member lists fall into the same group, so the group
    # index serves as the household ID across years.
    cons_household = cur_group_id(),
    .by = indiv_in_household
  )
# data.table approach
# data_test_panel must already be a data.table (e.g. after setDT()).
# Store, on every row, the sorted member list of that row's (househ, time)
# household. Assigning with `:=` inside the grouped call keeps each list
# attached to the rows of its own group even when the table is not ordered by
# household, and sort() makes the list independent of member row order.
data_test_panel[
  ,
  indiv_in_household := .(list(sort(indiv))),
  by = .(househ, time)
]
head(data_test_panel)
# indiv househ time true_household indiv_in_household
# <num> <num> <num> <num> <list>
# 1: 1 1 1 1 1,2
# 2: 2 1 1 1 1,2
# 3: 3 2 1 2 3
# 4: 4 3 1 3 4
# 5: 5 4 1 4 5,6
# 6: 6 4 1 4 5,6
# Number the distinct member lists: toString() collapses each list to a single
# string key (e.g. "1, 2") and .GRP is the index of that key's group.
# vapply() guarantees a character vector for `by`; sapply() would return a
# list for an empty table.
data_test_panel[
  ,
  cons_household := .GRP,
  by = vapply(indiv_in_household, toString, character(1L))
]
# Compare with the expected IDs; no mismatching rows should be returned.
data_test_panel[cons_household != true_household]
# Empty data.table (0 rows and 6 cols): indiv,househ,time,true_household,cons_household,indiv_in_household