我在 R 中有一个包含重复 ID 的数据框,我想把同一个 ID 的所有记录对齐到同一行上。
# Example data: several ids occur more than once, each occurrence carrying
# its own date and iT value.
# tibble() is used because data_frame() is deprecated in the tibble package.
df <- tibble::tibble(
  id = c("A1", "A2", "C2", "A2", "C2", "A2"),
  date = c(
    "2010-01-15", "2016-03-05", "2017-05-21",
    "2013-09-03", "2015-11-25", "2011-07-07"
  ),
  iT = c("z", "z", "v", "w", "z", "v")
)
我想用 for 循环把每个 ID 的记录对齐到同一行上,但这样一次只能对一个变量取子集(而我实际上有 10 个变量)。
# Distinct ids, in order of first appearance.
unique_id <- unique(df$id)

# For one column of df, build a list named by id whose elements hold the
# values of that column belonging to each id.
collect_by_id <- function(values) {
  lapply(
    setNames(unique_id, unique_id),
    function(key) values[which(df$id == key)]
  )
}

datalist <- collect_by_id(df$date)
datalist1 <- collect_by_id(df$iT)

# Bind each list into a wide data frame (one row per id, key column ".id"),
# then join the date table and the iT table on that key.
df1 <- plyr::ldply(datalist, rbind)
df2 <- plyr::ldply(datalist1, rbind)
df3 <- merge(df1, df2, by = ".id")
我的数据有成千上万行,每个 ID 会重复出现 2 到 7 次。
这是我期望得到的结果:
# Desired result: one row per id, with the repeated date / iT pairs spread
# across columns. Cells with no corresponding record are real missing values
# (NA), not the literal string "NA".
df3 <- tibble::tibble(
  id = c("A1", "A2", "C2"),
  date1 = c("2010-01-15", "2016-03-05", "2017-05-21"),
  iT = c("z", "z", "v"),
  date2 = c(NA, "2013-09-03", "2015-11-25"),
  iT.2 = c(NA, "w", "z"),
  date3 = c(NA, "2011-07-07", NA),
  iT.3 = c(NA, "v", NA)
)
data.table 的 dcast() 可以一次重塑多个值列:
library(data.table)

# setDT() converts df to a data.table by reference (df itself is modified).
setDT(df)

# rowid(id) numbers the occurrences of each id (1, 2, 3, ...); every value
# column listed in value.var is spread over those occurrence numbers.
dcast(df, formula = id ~ rowid(id), value.var = c("date", "iT"))
#    id     date_1     date_2     date_3 iT_1 iT_2 iT_3
# 1: A1 2010-01-15       <NA>       <NA>    z <NA> <NA>
# 2: A2 2016-03-05 2013-09-03 2011-07-07    z    w    v
# 3: C2 2017-05-21 2015-11-25       <NA>    v    z <NA>
使用 dplyr 和 tidyr,您可以先把数据转换为长格式,为 id 和列名的每种组合创建唯一的标识,然后再把数据转换回宽格式。
library(dplyr)
library(tidyr)

# Step 1: stack every non-id column into (name, value) pairs.
long_df <- df %>%
  pivot_longer(cols = -id, names_to = "name", values_to = "value")

# Step 2: number the occurrences of each (id, name) pair, append that number
# to the column name, and spread the values back out to one row per id.
long_df %>%
  group_by(id, name) %>%
  mutate(name1 = paste0(name, row_number())) %>%
  ungroup() %>%
  select(id, value, name1) %>%
  pivot_wider(names_from = name1, values_from = value)
#  id    date1      iT1   date2      iT2   date3      iT3
#  <chr> <chr>      <chr> <chr>      <chr> <chr>      <chr>
#1 A1    2010-01-15 z     NA         NA    NA         NA
#2 A2    2016-03-05 z     2013-09-03 w     2011-07-07 v
#3 C2    2017-05-21 v     2015-11-25 z     NA         NA
也可以考虑使用基础 R 的 transform、ave、seq_along、reshape、grep、merge、paste0 和 outer:
# reshape() is unreliable on tibbles, so work on a plain data frame.
df <- as.data.frame(df)

# Create a running occurrence number within each id (1, 2, 3, ...); it serves
# as the "time" variable for reshaping.
df$id_num <- ave(seq_along(df$id), df$id, FUN = seq_along)
n_max <- max(df$id_num)

# Reshape date and iT to wide format separately, then merge the two wide sets
# on id. The two sets share no column names besides the key, so no suffixes
# are needed.
date_wide <- reshape(
  df[c("id", "id_num", "date")],
  v.names = "date", timevar = "id_num", idvar = "id", direction = "wide"
)
it_wide <- reshape(
  df[c("id", "id_num", "iT")],
  v.names = "iT", timevar = "id_num", idvar = "id", direction = "wide"
)
df <- merge(date_wide, it_wide, by = "id")

# Re-order columns so each date.k is followed by its iT.k, for every
# occurrence number actually present in the data (outer() fills column-wise).
df <- df[c("id", outer(c("date.", "iT."), seq_len(n_max), paste0))]
df
#   id     date.1 iT.1     date.2 iT.2     date.3 iT.3
# 1 A1 2010-01-15    z       <NA> <NA>       <NA> <NA>
# 2 A2 2016-03-05    z 2013-09-03    w 2011-07-07    v
# 3 C2 2017-05-21    v 2015-11-25    z       <NA> <NA>
“在同一行上对齐”其实就是重塑(reshape)数据。;)
基础 R(reshape):
# Convert to a plain data frame and sort the rows by id.
df <- as.data.frame(df)
df <- df[order(df$id), ]

# Add a "time" variable: the occurrence number of each row within its id.
df[["time"]] <- ave(df$id, df$id, FUN = seq_along)

# A single reshape() call spreads both value columns at once; idvar and
# timevar are spelled out here but equal reshape()'s defaults.
stats::reshape(
  df,
  v.names = c("date", "iT"),
  idvar = "id",
  timevar = "time",
  direction = "wide"
)
id date.1 iT.1 date.2 iT.2 date.3 iT.3
1 A1 2010-01-15 z <NA> <NA> <NA> <NA>
2 A2 2016-03-05 z 2013-09-03 w 2011-07-07 v
3 C2 2017-05-21 v 2015-11-25 z <NA> <NA>