按日期范围过滤多列,并计算R中的最大值

问题描述 投票:0回答:1

数据框

# Sample data: one row per subject (six subjects), stored wide.
# Columns 2-13 are six (HIV_VL_result_date_i, VL_results_i) pairs; every value
# is a character string. Dates are written as "YYYY-MM-DD". Results are either
# digits or text codes such as "LDL" and "<100". NA marks a missing visit.
# preg_date and estimated_start_date are also character dates; the last
# subject has NA for both.
df <- structure (list(
  subject_id = c("232-5467", "232-6784", "232-3457", "232-0987", "232-1245", "232-1945"), 
  HIV_VL_result_date_1 = c("2015-10-11","2015-10-10","2015-11-06","2016-02-02","2017-12-04","2019-02-15"),
  VL_results_1 = c("LDL", "LDL", "LDL", "<100", "44405", "2322"), 
  HIV_VL_result_date_2 = c("2017-05-21", "2022-04-07", "2016-08-21", "2016-11-01", "2018-02-26",NA),
  VL_results_2 = c("LDL", "5613", "LDL", "LDL", "93356", NA), 
  HIV_VL_result_date_3 = c("2018-06-27", "2022-07-15", "2022-04-13", "2017-03-01","2018-05-19",NA), 
  VL_results_3 = c("LDL", "6590", "LDL", "LDL", "19078",NA), 
  HIV_VL_result_date_4 = c("2020-04-16", "2022-08-15", NA, "2022-06-07", "2020-01-16",NA),
  VL_results_4 = c("LDL", "375", NA, "36", "44",NA),
  HIV_VL_result_date_5 = c("2021-03-25", "2023-01-28", NA, NA, "2022-05-03",NA),
  VL_results_5 = c("LDL", "9125", NA, NA, "LDL",NA),
  HIV_VL_result_date_6 = c("2022-03-07", NA, NA, NA, "2022-11-15",NA),
  VL_results_6 = c("LDL", NA, NA, NA, "<20",NA),
  preg_date = c("2022-03-04","2022-08-13","2022-05-04","2022-06-02","2022-04-14",NA),
  estimated_start_date = c("2021-06-24", "2021-11-06", "2021-08-20","2021-09-27","2021-08-04",NA)), 
  class = "data.frame", row.names = c(NA, -6L))

我想选择日期位于 estimated_start_date 和 preg_date 之间的 HIV_VL_result_date 列,然后取这些列所对应的 VL_results 中的最大值。

我尝试过的:

# Convert every VL_results_* column from text to a numeric viral load:
#   "LDL" -> 0
#   "<N"  -> N   (e.g. "<20" -> 20, "<50" -> 50, "<100" -> 100)
#   digits are parsed as-is; NA stays NA.
# The previous version only mapped "<20" and "<50", so other thresholds such
# as "<100" became NA, and it called as.numeric() on the text codes, which
# raised "NAs introduced by coercion" warnings.
# Any remaining unparseable text still becomes NA with a warning, so
# unexpected codes are not hidden.
df <- df %>%
  mutate(
    across(
      starts_with("VL_results_"),
      function(vl) {
        # as.character() keeps this step safe to re-run on numeric columns.
        vl <- as.character(vl)
        # %in% never returns NA, so missing results are left untouched.
        vl[vl %in% "LDL"] <- "0"
        as.numeric(sub("^<", "", vl))
      }
    )
  )

# Add `highest_hivvl`: for each subject, the largest viral load whose result
# date lies between estimated_start_date and preg_date (both inclusive).
# Subjects with no measurement inside that window get NA.
#
# The previous version parsed the dates but never compared them, so it
# returned the maximum over the whole row. It also read `viral_suppres_edit`,
# which is not defined in this example; `df` (prepared above) is used instead.
df <- df %>%
  filter(!is.na(preg_date), !is.na(estimated_start_date)) %>%
  mutate(
    across(
      c(preg_date, estimated_start_date, starts_with("HIV_VL_result_date_")),
      ymd
    )
  ) %>%
  rowwise() %>%
  mutate(
    highest_hivvl = {
      # c_across() returns the selected columns in data-frame order, so
      # result_dates[i] and viral_loads[i] describe the same visit as long as
      # the date and result columns appear in the same visit order.
      result_dates <- c_across(starts_with("HIV_VL_result_date_"))
      viral_loads <- as.numeric(c_across(starts_with("VL_results_")))
      in_window <- !is.na(result_dates) &
        !is.na(viral_loads) &
        result_dates >= estimated_start_date &
        result_dates <= preg_date
      # max() of an empty vector is -Inf with a warning; report NA instead.
      if (any(in_window)) max(viral_loads[in_window]) else NA_real_
    }
  ) %>%
  # Drop the rowwise grouping so later verbs work on whole columns again.
  ungroup()

过滤没有生效:代码最终使用了整行的全部数据,得到的是该行中最高的 VL,而不是日期范围内的最高值。我还尝试了其他各种方法(例如 pivot_longer),但仍未成功。请帮我修正代码,或提供更高效的方法。

r dplyr
1个回答
0
投票

您可能需要先用 `reshape` 将数据转换为长格式,用 `mapply` 创建一个 `match` 列,按该列进行 `subset`,然后按 id 用 `aggregate` 取 `max`。

# Console transcript: the leading ">" and "+" are R prompts, not code.
# Steps performed by the pipeline below:
#   1. reshape() to long format. Columns 2:13 are passed as `varying` and
#      become the two columns named in `v.names` (the date column is named
#      IV_VL_result_date here), one row per subject and visit.
#   2. transform() adds `match`, which is TRUE when the result date is
#      >= estimated_start_date and <= preg_date, and converts VL_results to
#      numeric after replacing "LDL" with 0 and removing "<".
#   3. subset() keeps the rows where `match` is TRUE.
#   4. aggregate() takes the maximum VL_results per subject_id.
# NOTE(review): this appears to assume `df` is the original sample data frame
# (character "YYYY-MM-DD" dates, unconverted VL_results) -- confirm before
# running it after the dplyr steps above.
> reshape(df, idvar='subject_id', varying=2:13, direction='long', 
+         v.names=c('IV_VL_result_date', 'VL_results')) |> 
+   transform(match=mapply(\(x, y, z) x >= y & x <= z, 
+                          IV_VL_result_date, estimated_start_date, preg_date),
+             VL_results=as.numeric(
+               stringi::stri_replace_all_fixed(VL_results, 
+                                               c('LDL', '<'), 
+                                               c(0, ''), vectorize_all=FALSE)
+             )
+   ) |> 
+   subset(match) |>
+   aggregate(VL_results ~ subject_id, max)
  subject_id VL_results
1   232-3457          0
2   232-6784       6590
© www.soinside.com 2019 - 2024. All rights reserved.