我的问题是这篇帖子的一个变体:我需要根据跨 5 列的代码构建一个以逗号分隔的值字段。我需要检查每一列,以确定该年是否存在几种不同作物中的任何一种。不过,每种作物都可能用多个代码表示。我可以为每种代码组合各写一条 grepl 语句,但这样会非常冗长。grepl 的模式可以由变量构建吗?或者还有其他方法可以做到这一点吗?
这是一个示例:数据仅限 4 年和 3 种作物(玉米、大豆和小麦),每种作物只有少数几个不同的代码。
# Codes that identify each crop; a crop may be recorded under several codes.
corn_codes <- 1
soybean_codes <- c(2, 446)
wheat_codes <- c(3, 427)

# One regex per crop, built from its code vector, e.g. "(^|,)(2|446)(,|$)".
# The (^|,) and (,|$) anchors force a match on a whole comma-separated field,
# so the "2" inside "427" is never mistaken for soybean code 2.
crop_patterns <- vapply(
  list(corn_codes, soybean_codes, wheat_codes),
  function(codes) paste0("(^|,)(", paste(codes, collapse = "|"), ")(,|$)"),
  character(1)
)

# TRUE when every crop has at least one of its codes among `codes`.
# Order of the codes is irrelevant because each crop is tested independently.
has_all_crops <- function(codes) {
  seen <- paste(unique(codes), collapse = ",")
  all(vapply(crop_patterns, grepl, logical(1), x = seen))
}

# Every code that can appear in a single year column.
year_codes <- as.character(c(1, 2, 3, 446, 427))

# All 5^4 = 625 combinations of codes over the four years, flagged "1,2,3"
# when corn, soybean and wheat all occur in the row and "x" otherwise.
df <-
  crossing(
    yr2019 = year_codes,
    yr2020 = year_codes,
    yr2021 = year_codes,
    yr2022 = year_codes
  ) %>%
  rowwise() %>%
  mutate(
    new_column = if (has_all_crops(c(yr2019, yr2020, yr2021, yr2022))) {
      "1,2,3"
    } else {
      "x"
    }
  ) %>%
  print(n = 20)
前 20/625 行:
# A tibble: 625 × 5
# Rowwise:
yr2019 yr2020 yr2021 yr2022 new_column
<chr> <chr> <chr> <chr> <chr>
1 1 1 1 1 x
2 1 1 1 2 x
3 1 1 1 3 x
4 1 1 1 427 x
5 1 1 1 446 x
6 1 1 2 1 x
7 1 1 2 2 x
8 1 1 2 3 1,2,3
9 1 1 2 427 1,2,3
10 1 1 2 446 x
11 1 1 3 1 x
12 1 1 3 2 1,2,3
13 1 1 3 3 x
14 1 1 3 427 1,2,3
15 1 1 3 446 1,2,3
16 1 1 427 1 x
17 1 1 427 2 1,2,3
18 1 1 427 3 x
19 1 1 427 427 x
20 1 1 427 446 1,2,3
# … with 605 more rows
这段代码查找四年内三种作物以任意顺序全部出现的行。有两个问题:
感谢所有建议。
这似乎是错误的分析思路。在我看来,先把代码替换为作物名称会更容易,然后使用一个简单的 `ifelse` 来确定是否所有作物都存在:
df %>%
  # Translate every column from raw codes to crop names; any code that is not
  # listed for corn, soybean or wheat becomes "other".
  mutate(
    across(
      everything(),
      function(code) {
        case_when(
          code %in% corn_codes ~ "corn",
          code %in% soybean_codes ~ "soybean",
          code %in% wheat_codes ~ "wheat",
          TRUE ~ "other"
        )
      }
    )
  ) %>%
  rowwise() %>%
  # A row is flagged only when none of the three crops is missing from it.
  mutate(
    new_column = ifelse(
      length(
        setdiff(c("corn", "soybean", "wheat"), c_across(everything()))
      ) == 0,
      "1, 2, 3",
      "x"
    )
  )
#> # A tibble: 625 x 5
#> # Rowwise:
#> yr2019 yr2020 yr2021 yr2022 new_column
#> <chr> <chr> <chr> <chr> <chr>
#> 1 corn corn corn corn x
#> 2 corn corn corn soybean x
#> 3 corn corn corn wheat x
#> 4 corn corn corn wheat x
#> 5 corn corn corn soybean x
#> 6 corn corn soybean corn x
#> 7 corn corn soybean soybean x
#> 8 corn corn soybean wheat 1, 2, 3
#> 9 corn corn soybean wheat 1, 2, 3
#> 10 corn corn soybean soybean x
#> 11 corn corn wheat corn x
#> 12 corn corn wheat soybean 1, 2, 3
#> 13 corn corn wheat wheat x
#> 14 corn corn wheat wheat x
#> 15 corn corn wheat soybean 1, 2, 3
#> 16 corn corn wheat corn x
#> 17 corn corn wheat soybean 1, 2, 3
#> 18 corn corn wheat wheat x
#> 19 corn corn wheat wheat x
#> 20 corn corn wheat soybean 1, 2, 3
#> # i 605 more rows
创建于 2023-08-23,使用 reprex v2.0.2