数据框
# Sample data: one row per subject, holding up to six viral-load tests
# (a result date plus a result string for each), followed by the two dates
# that bound the window of interest. All columns are character vectors;
# NA marks a test that was not done.
df <- data.frame(
  subject_id = c(
    "232-5467", "232-6784", "232-3457", "232-0987", "232-1245", "232-1945"
  ),
  HIV_VL_result_date_1 = c(
    "2015-10-11", "2015-10-10", "2015-11-06",
    "2016-02-02", "2017-12-04", "2019-02-15"
  ),
  VL_results_1 = c("LDL", "LDL", "LDL", "<100", "44405", "2322"),
  HIV_VL_result_date_2 = c(
    "2017-05-21", "2022-04-07", "2016-08-21", "2016-11-01", "2018-02-26", NA
  ),
  VL_results_2 = c("LDL", "5613", "LDL", "LDL", "93356", NA),
  HIV_VL_result_date_3 = c(
    "2018-06-27", "2022-07-15", "2022-04-13", "2017-03-01", "2018-05-19", NA
  ),
  VL_results_3 = c("LDL", "6590", "LDL", "LDL", "19078", NA),
  HIV_VL_result_date_4 = c(
    "2020-04-16", "2022-08-15", NA, "2022-06-07", "2020-01-16", NA
  ),
  VL_results_4 = c("LDL", "375", NA, "36", "44", NA),
  HIV_VL_result_date_5 = c("2021-03-25", "2023-01-28", NA, NA, "2022-05-03", NA),
  VL_results_5 = c("LDL", "9125", NA, NA, "LDL", NA),
  HIV_VL_result_date_6 = c("2022-03-07", NA, NA, NA, "2022-11-15", NA),
  VL_results_6 = c("LDL", NA, NA, NA, "<20", NA),
  preg_date = c(
    "2022-03-04", "2022-08-13", "2022-05-04", "2022-06-02", "2022-04-14", NA
  ),
  estimated_start_date = c(
    "2021-06-24", "2021-11-06", "2021-08-20", "2021-09-27", "2021-08-04", NA
  ),
  # Keep strings as character on R < 4.0 as well.
  stringsAsFactors = FALSE
)
我想筛选出日期位于 estimated_start_date 和 preg_date 之间的 HIV_VL_result_date 列,然后从这些列对应的 VL_results 中取最大值。
我尝试过的:
# Convert every VL_results_* column from text to numeric:
#   "LDL"                    -> 0
#   "<N" (e.g. "<20", "<100") -> N
#   anything else            -> as.numeric()
# Stripping the "<" generically covers thresholds that were not listed one by
# one (the sample data contains "<100", which previously became NA).
# Replacing "LDL" before the conversion also avoids the "NAs introduced by
# coercion" warning: case_when() evaluates every right-hand side on the whole
# vector, so as.numeric() used to run on "LDL" and "<..." values too.
# across() replaces the superseded mutate_at().
df <- df %>%
  mutate(
    across(
      starts_with("VL_results_"),
      # NA stays NA: if_else() returns a missing value for a missing condition.
      ~ as.numeric(sub("^<", "", if_else(.x == "LDL", "0", as.character(.x))))
    )
  )
# Highest viral load among the tests dated within [start, end] (inclusive).
#
# dates  - Date vector with one subject's test dates (may contain NA).
# values - results parallel to `dates`; coerced with as.numeric().
# start, end - single Dates bounding the window.
#
# Returns NA_real_ when no usable test falls inside the window, rather than
# the -Inf (plus warning) that max(..., na.rm = TRUE) gives for empty input.
max_vl_in_window <- function(dates, values, start, end) {
  values <- as.numeric(values)
  eligible <- !is.na(dates) & !is.na(values) & dates >= start & dates <= end
  if (any(eligible)) max(values[eligible]) else NA_real_
}

# The earlier version took max() over all six result columns and never
# consulted the dates, so it returned the row-wide maximum. Each result is now
# paired with its own test date and kept only when that date lies between
# estimated_start_date and preg_date.
# The pipeline starts from `df` (the frame whose results were converted above)
# rather than `viral_suppres_edit`, which is not defined in this example.
df <- df %>%
  filter(!is.na(preg_date), !is.na(estimated_start_date)) %>%
  mutate(
    across(
      c(preg_date, estimated_start_date, starts_with("HIV_VL_result_date_")),
      ymd
    )
  ) %>%
  rowwise() %>%
  mutate(
    # Both selections follow column order, so date k lines up with result k.
    highest_hivvl = max_vl_in_window(
      c_across(starts_with("HIV_VL_result_date_")),
      c_across(starts_with("VL_results_")),
      estimated_start_date,
      preg_date
    )
  ) %>%
  # Drop the rowwise grouping so later verbs operate on whole columns again.
  ungroup()
过滤没有生效:代码最终使用了整个 df,取到的是每一行中最高的 VL。我还尝试了其他各种方法(例如 pivot_longer),但仍未解决。请帮我修正代码,或提供更高效的方法。
您可以先用 reshape 将数据转换为长格式,用 mapply 创建一个 match 列并据此 subset,然后按 id 用 aggregate 取 max。
> reshape(df, idvar='subject_id', varying=2:13, direction='long',  # wide -> long: one row per subject and test
+ v.names=c('IV_VL_result_date', 'VL_results')) |>  # columns 2:13 alternate date/result, giving one pair per test
+ transform(match=mapply(\(x, y, z) x >= y & x <= z,  # TRUE when the test date lies within [start, preg], inclusive
+ IV_VL_result_date, estimated_start_date, preg_date),  # dates are still "YYYY-MM-DD" strings here, compared as text
+ VL_results=as.numeric(
+ stringi::stri_replace_all_fixed(VL_results,
+ c('LDL', '<'),  # 'LDL' becomes 0 and '<' is removed, so '<20' becomes 20
+ c(0, ''), vectorize_all=FALSE)  # vectorize_all=FALSE applies both replacements to every element
+ )
+ ) |>
+ subset(match) |>  # keep in-window rows only; rows where match is NA are dropped as well
+ aggregate(VL_results ~ subject_id, max)  # highest in-window result per subject
subject_id VL_results
1 232-3457 0
2 232-6784 6590