我想根据任务数据框中的日期和逻辑(按订单号 Order 匹配),在主数据框中创建 2 列。下面的两个做法都是先用 spread() 把任务数据框展开成宽表,再合并到主数据框。示例本身运行得非常快,但在我的实际代码中,这种做法非常占用内存,计算时间也很长(主数据框有 800,000 行;任务数据框大约有 20 种任务,可能有 150 万行)。
是否有更好的解决方案,不必把任务数据框展开并合并到主数据框,就能得到列 G1 和 G2(示例1)以及 Gate1_End 和 Gate2_End(示例2)?
示例1
# Main table for example 1: one row per order, with the order's location and
# animal kept as plain character columns.
maindf <- data.frame(
  Order = c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
  Location = c(
    "US", "US", "Canada", "US", "France",
    "US", "Mexico", "Mexico", "UK", "UK"
  ),
  Animal = c(
    "Cow", "Dog", "Cat", "Dog", "Cat",
    "Cow", "Horse", "Dog", "Dog", "Cat"
  ),
  stringsAsFactors = FALSE
)
# Start tasks for example 1 in long format: one row per (order, task), with
# the task's date stored as an "m/d/yyyy" string.
Tasksdf_Start <- data.frame(
  # Every task name carries the "_Start" suffix, appended once here.
  Tasks_Start = paste0(
    c(
      "RTQT", "RTQR", "QUOT", "QUOG", # order 145
      "RTQT", "RTQR",                 # order 158
      "QUOT",                         # order 165
      "QUOG",                         # order 148
      "RTQT", "QUOT",                 # order 568
      "RTQT", "RTQR", "QUOT", "QUOG", # order 465
      "RTQT", "RTQR",                 # order 248
      "QUOT",                         # order 693
      "QUOG",                         # order 357
      "RTQT", "QUOT"                  # order 482
    ),
    "_Start"
  ),
  Dates = c(
    "1/1/2020", "1/2/2020", "1/6/2020", "1/20/2020",  # order 145
    "1/6/2020", "1/9/2020",                           # order 158
    "1/14/2020",                                      # order 165
    "1/17/2020",                                      # order 148
    "1/19/2020", "1/8/2020",                          # order 568
    "1/15/2020", "1/3/2020", "1/6/2020", "1/19/2020", # order 465
    "1/22/2020", "1/25/2020",                         # order 248
    "1/1/2020",                                       # order 693
    "1/6/2020",                                       # order 357
    "1/24/2020", "1/14/2020"                          # order 482
  ),
  # Each order id is repeated once per task row listed above.
  Order = rep(
    c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
    times = c(4, 2, 1, 1, 2, 4, 2, 1, 1, 2)
  ),
  stringsAsFactors = FALSE
)
# End tasks for example 1 in long format: one row per (order, task), with the
# task's date stored as an "m/d/yyyy" string.
Tasksdf_End <- data.frame(
  # Every task name carries the "_End" suffix, appended once here.
  Tasks_End = paste0(
    c(
      "CRDT", "CUST", "VEND", # order 145
      "CUST",                 # order 158
      "CRDT",                 # order 165
      "CUST",                 # order 148
      "CRDT", "CUST",         # order 568
      "VEND",                 # order 465
      "CRDT",                 # order 248
      "VEND",                 # order 693
      "VEND",                 # order 357
      "CRDT"                  # order 482
    ),
    "_End"
  ),
  Dates = c(
    "1/22/2020", "1/18/2020", "1/5/2020", # order 145
    "1/15/2020",                          # order 158
    "1/16/2020",                          # order 165
    "1/18/2020",                          # order 148
    "1/7/2020", "1/24/2020",              # order 568
    "1/1/2020",                           # order 465
    "1/18/2020",                          # order 248
    "1/8/2020",                           # order 693
    "1/4/2020",                           # order 357
    "1/6/2020"                            # order 482
  ),
  # Each order id is repeated once per task row listed above.
  Order = rep(
    c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
    times = c(3, 1, 1, 1, 2, 1, 1, 1, 1, 1)
  ),
  stringsAsFactors = FALSE
)
# Example 1, wide-format baseline: spread each task table to one column per
# task, attach those columns to maindf, then derive per order
#   G1 = earliest start date, G2 = latest end date.

# Start tasks -> one column per task name (values stay "m/d/yyyy" strings).
dfS <- left_join(maindf, Tasksdf_Start, by = "Order") %>%
  spread(Tasks_Start, Dates)

# End tasks -> one column per task name. The maindf attribute columns are
# dropped by name (rather than by position) so they are not duplicated in the
# join below; Order is kept as the join key.
dfE <- left_join(maindf, Tasksdf_End, by = "Order") %>%
  spread(Tasks_End, Dates) %>%
  select(-Location, -Animal)

df <- left_join(dfS, dfE, by = "Order")

# The task columns hold "m/d/yyyy" strings. Comparing those directly is
# lexicographic ("1/5/2020" sorts after "1/22/2020"), so each column is parsed
# to Date before taking the row-wise minimum / maximum. As a result G1 and G2
# are Date columns; the task columns themselves are left untouched.
start_cols <- c("QUOG_Start", "QUOT_Start", "RTQR_Start", "RTQT_Start")
end_cols <- c("CRDT_End", "CUST_End", "VEND_End")

start_dates <- unname(lapply(df[start_cols], as.Date, format = "%m/%d/%Y"))
end_dates <- unname(lapply(df[end_cols], as.Date, format = "%m/%d/%Y"))

# na.rm = TRUE: a task that an order does not have is simply ignored.
df$G1 <- do.call(pmin, c(start_dates, list(na.rm = TRUE)))
df$G2 <- do.call(pmax, c(end_dates, list(na.rm = TRUE)))
示例2 (非常类似于示例1,但逻辑更加复杂)
# Main table for example 2: one row per order, with the gate the order is
# currently at and its animal, both kept as plain character columns.
maindf <- data.frame(
  Order = c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
  Gates = paste0("Gate", c(1, 2, 2, 3, 2, 4, 1, 1, 2, 3)),
  Animal = c(
    "Cow", "Dog", "Cat", "Dog", "Cat",
    "Cow", "Horse", "Dog", "Dog", "Cat"
  ),
  stringsAsFactors = FALSE
)
# Start tasks for example 2 in long format: one row per (order, task), with
# the task's date stored as an "m/d/yyyy" string.
Tasksdf_Start <- data.frame(
  # Every task name carries the "_Start" suffix, appended once here.
  Tasks_Start = paste0(
    c(
      "RTQT", "RTQR", "QUOT", "QUOG", "CRDT", "CUST", "VEND", # order 145
      "RTQT", "RTQR", "CUST", "CRDT",                         # order 158
      "QUOT", "CRDT",                                         # order 165
      "QUOG", "CUST", "CRDT",                                 # order 148
      "RTQT", "QUOT", "CRDT", "CUST",                         # order 568
      "RTQT", "RTQR", "QUOT", "QUOG", "VEND",                 # order 465
      "RTQT", "RTQR", "CRDT",                                 # order 248
      "QUOT", "VEND", "CRDT",                                 # order 693
      "QUOG", "VEND",                                         # order 357
      "RTQT", "QUOT", "CRDT"                                  # order 482
    ),
    "_Start"
  ),
  Dates = c(
    # order 145
    "1/1/2020", "1/2/2020", "1/6/2020", "1/20/2020",
    "1/8/2020", "1/19/2020", "1/8/2020",
    # order 158
    "1/6/2020", "1/9/2020", "1/1/2020", "1/9/2020",
    # order 165
    "1/14/2020", "1/1/2020",
    # order 148
    "1/17/2020", "1/1/2020", "1/1/2020",
    # order 568
    "1/19/2020", "1/8/2020", "1/1/2020", "1/1/2020",
    # order 465
    "1/15/2020", "1/3/2020", "1/6/2020", "1/19/2020", "1/1/2020",
    # order 248
    "1/22/2020", "1/25/2020", "1/1/2020",
    # order 693
    "1/1/2020", "1/1/2020", "1/1/2020",
    # order 357
    "1/6/2020", "1/1/2020",
    # order 482
    "1/24/2020", "1/14/2020", "1/1/2020"
  ),
  # Each order id is repeated once per task row listed above.
  Order = rep(
    c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
    times = c(7, 4, 2, 3, 4, 5, 3, 3, 2, 3)
  ),
  stringsAsFactors = FALSE
)
# End tasks for example 2 in long format: one row per (order, task), with the
# task's date stored as an "m/d/yyyy" string.
Tasksdf_End <- data.frame(
  # Every task name carries the "_End" suffix, appended once here.
  Tasks_End = paste0(
    c(
      "CRDT", "CUST", "VEND", # order 145
      "CUST",                 # order 158
      "CRDT",                 # order 165
      "CUST",                 # order 148
      "CRDT", "CUST",         # order 568
      "VEND",                 # order 465
      "CRDT",                 # order 248
      "VEND",                 # order 693
      "VEND",                 # order 357
      "CRDT"                  # order 482
    ),
    "_End"
  ),
  Dates = c(
    "1/22/2020", "1/18/2020", "1/5/2020", # order 145
    "1/15/2020",                          # order 158
    "1/16/2020",                          # order 165
    "1/18/2020",                          # order 148
    "1/7/2020", "1/24/2020",              # order 568
    "1/1/2020",                           # order 465
    "1/18/2020",                          # order 248
    "1/8/2020",                           # order 693
    "1/4/2020",                           # order 357
    "1/6/2020"                            # order 482
  ),
  # Each order id is repeated once per task row listed above.
  Order = rep(
    c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
    times = c(3, 1, 1, 1, 2, 1, 1, 1, 1, 1)
  ),
  stringsAsFactors = FALSE
)
# Example 2, wide-format baseline: spread each task table to one column per
# task, attach those columns to maindf, then derive Gate1_End and Gate2_End
# from the task dates and the order's current gate.

# Start tasks -> one column per task name (values stay "m/d/yyyy" strings).
dfS <- left_join(maindf, Tasksdf_Start, by = "Order") %>%
  spread(Tasks_Start, Dates)

# End tasks -> one column per task name. The maindf attribute columns are
# dropped by name (rather than by position) so they are not duplicated in the
# join below; Order is kept as the join key.
dfE <- left_join(maindf, Tasksdf_End, by = "Order") %>%
  spread(Tasks_End, Dates) %>%
  select(-Gates, -Animal)

df <- left_join(dfS, dfE, by = "Order")

gate1_cols <- c("QUOG_Start", "QUOT_Start", "RTQR_Start", "RTQT_Start")
gate2_start_cols <- c("CRDT_Start", "CUST_Start", "VEND_Start")
gate2_end_cols <- c("CRDT_End", "CUST_End", "VEND_End")

# The task columns hold "m/d/yyyy" strings. Comparing those directly is
# lexicographic ("1/5/2020" sorts after "1/22/2020"), so each column is parsed
# to Date before taking the row-wise minimum / maximum. Gate1_End and
# Gate2_End are therefore Date columns.
gate1_dates <- unname(lapply(df[gate1_cols], as.Date, format = "%m/%d/%Y"))
gate2_dates <- unname(lapply(df[gate2_end_cols], as.Date, format = "%m/%d/%Y"))
earliest_gate1 <- do.call(pmin, c(gate1_dates, list(na.rm = TRUE)))
latest_gate2 <- do.call(pmax, c(gate2_dates, list(na.rm = TRUE)))

# Gate 1 has no end date while the order is still at Gate0 or Gate1.
df$Gate1_End <- if_else(
  df$Gates == "Gate1" | df$Gates == "Gate0",
  as.Date(NA),
  earliest_gate1
)

# Number of gate-2 tasks that have a start date / an end date for each order.
df$Gate2Open <- unname(rowSums(!is.na(df[gate2_start_cols])))
df$Gate2Close <- unname(rowSums(!is.na(df[gate2_end_cols])))

# Gate 2 has an end date only once the order is past Gate2 and every gate-2
# task that was started has also ended (equal counts).
df$Gate2_End <- if_else(
  df$Gates == "Gate2" | df$Gates == "Gate1" | df$Gates == "Gate0",
  as.Date(NA),
  if_else(
    (df$Gate2Close - df$Gate2Open) == 0,
    latest_gate2,
    as.Date(NA)
  )
)
使用 group_by() 配合 mutate()/summarise(),您不必创建宽数据框即可添加汇总列:
示例1:
# Long-format version of example 1: summarise each task table per order and
# join the two small summaries onto maindf. No wide table is built.
#
# Dates are parsed from "m/d/yyyy" strings first; min()/max() on the raw
# strings would compare them lexicographically ("1/5/2020" > "1/22/2020").
# G1 and G2 are therefore Date columns.

# calculate min for the START values (one row per Order)
start <- Tasksdf_Start %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  group_by(Order) %>%
  summarise(G1 = min(Dates, na.rm = TRUE))

# calculate max for the END values (one row per Order)
end <- Tasksdf_End %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  group_by(Order) %>%
  summarise(G2 = max(Dates, na.rm = TRUE))

# join everything: a join takes exactly two tables, so the summaries are
# attached one after the other (passing both to a single full_join() would
# bind the second one to the `copy` argument and G2 would never be added).
maindf <- maindf %>%
  full_join(start, by = "Order") %>%
  full_join(end, by = "Order")
示例2(使用 maindf 的列名代替 c('CRDT_Start', 'CUST_Start', 'VEND_Start'),见注释):
# Long-format version of example 2: summarise each task table per order, join
# the summaries onto maindf, then apply the gate rules. No wide table is built.
#
# Dates are parsed from "m/d/yyyy" strings first; min()/max() on the raw
# strings would compare them lexicographically ("1/5/2020" > "1/22/2020").
# Gate1_End and Gate2_End are therefore Date columns.

# Task groups, matching the columns the wide-format version compares/counts.
gate1_tasks <- c("QUOG_Start", "QUOT_Start", "RTQR_Start", "RTQT_Start")
gate2_start_tasks <- c("CRDT_Start", "CUST_Start", "VEND_Start")
gate2_end_tasks <- c("CRDT_End", "CUST_End", "VEND_End")

# Earliest / latest non-missing date, or NA when there is none. Plain
# min()/max() would return +/-Inf with a warning for an order that has no
# task in the requested group.
earliest_date <- function(dates) {
  dates <- dates[!is.na(dates)]
  if (length(dates) > 0) min(dates) else as.Date(NA)
}
latest_date <- function(dates) {
  dates <- dates[!is.na(dates)]
  if (length(dates) > 0) max(dates) else as.Date(NA)
}

# per Order: earliest gate-1 start date + number of gate-2 tasks started
start <- Tasksdf_Start %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  group_by(Order) %>%
  summarise(
    Gate1_End = earliest_date(Dates[Tasks_Start %in% gate1_tasks]),
    Gate2Open = sum(Tasks_Start %in% gate2_start_tasks & !is.na(Dates))
  )

# per Order: latest gate-2 end date + number of gate-2 tasks ended
end <- Tasksdf_End %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  group_by(Order) %>%
  summarise(
    Gate2_End = latest_date(Dates[Tasks_End %in% gate2_end_tasks]),
    Gate2Close = sum(Tasks_End %in% gate2_end_tasks & !is.na(Dates))
  )

# join everything & apply conditions (all row-wise, so no grouping is needed
# and the result is left ungrouped)
maindf <- maindf %>%
  full_join(end, by = "Order") %>%
  full_join(start, by = "Order") %>%
  mutate(
    # Gate 1 has no end date while the order is still at Gate0 or Gate1.
    Gate1_End = case_when(
      Gates == "Gate1" | Gates == "Gate0" ~ as.Date(NA),
      Gates != "Gate1" & Gates != "Gate0" ~ Gate1_End
    ),
    # Gate 2 has an end date only once the order is past Gate2 and every
    # started gate-2 task has ended; anything else falls through to NA.
    Gate2_End = case_when(
      Gates == "Gate2" | Gates == "Gate1" | Gates == "Gate0" ~ as.Date(NA),
      Gate2Close - Gate2Open == 0 ~ Gate2_End
    )
  )
也许还有更优雅的方法,但这样您就可以避免使用 spread()。