我有这个示例 data.frame:
# Example data: every (v1, v2) pair also appears reversed with the same value.
df1 <- data.frame(
  v1 = c("A", "B", "C", "D", "E", "F", "G", "H"),
  v2 = c("B", "A", "D", "C", "F", "E", "H", "G"),
  value = c(1.12, 1.12, 12.52, 12.52, 3.19, 3.19, 12.52, 12.52)
)
> df1
v1 v2 value
1 A B 1.12
2 B A 1.12
3 C D 12.52
4 D C 12.52
5 E F 3.19
6 F E 3.19
7 G H 12.52
8 H G 12.52
第 1 行(row 1)中 A 和 B 这样的组合,对我来说与 B 和 A 这样的组合是相同的,而且它们在 value 列中的值也相同。如何删除这些就我的目的而言属于重复的行?
预期结果:
# Expected result: one row per unordered (v1, v2) pair.
df2 <- data.frame(
  v1 = c("A", "C", "E", "G"),
  v2 = c("B", "D", "F", "H"),
  value = c(1.12, 12.52, 3.19, 12.52)
)
> df2
v1 v2 value
1 A B 1.12
2 C D 12.52
3 E F 3.19
4 G H 12.52
这个想法是考虑 v1 和 v2 可以互换。
# Input for the reprex: eight rows in which each pair occurs in both orders.
df1 <- data.frame(
  v1 = c("A", "B", "C", "D", "E", "F", "G", "H"),
  v2 = c("B", "A", "D", "C", "F", "E", "H", "G"),
  value = c(1.12, 1.12, 12.52, 12.52, 3.19, 3.19, 12.52, 12.52)
)
### with tidyverse:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(purrr)
# Drop rows whose unordered (v1, v2) pair already appeared in an earlier row.
# pmin()/pmax() order every pair in one vectorised step, so (A, B) and (B, A)
# get the same two-column key. Comparing the key columns directly (instead of
# pasting them into one "x,y" string) also avoids false matches when a value
# itself contains a comma, e.g. ("a,b", "c") vs. ("a", "b,c").
df2 <- df1 %>%
  filter(!duplicated(data.frame(lo = pmin(v1, v2), hi = pmax(v1, v2))))
df2
#> v1 v2 value
#> 1 A B 1.12
#> 2 C D 12.52
#> 3 E F 3.19
#> 4 G H 12.52
### Base R:
# Drop rows whose unordered (v1, v2) pair already appeared in an earlier row.
# pmin()/pmax() order each pair element-wise in a single vectorised call,
# replacing the per-row apply(..., 1, sort), which coerced the data frame to a
# matrix and called sort() once per row. Original row names are kept.
df2 <- df1[!duplicated(data.frame(lo = pmin(df1$v1, df1$v2),
                                  hi = pmax(df1$v1, df1$v2))), ]
df2
#> v1 v2 value
#> 1 A B 1.12
#> 3 C D 12.52
#> 5 E F 3.19
#> 7 G H 12.52
创建于 2023-12-24,使用 reprex v2.0.2
您可以按元素用 `c` 连接 `v1` 和 `v2` 的 `min` 和 `max`,然后检查它们是否 `duplicated`。
# Rebuild the example data from its dput() representation.
> df1 <- structure(list(v1 = c("A", "B", "C", "D", "E", "F", "G", "H"),
+ v2 = c("B", "A", "D", "C", "F", "E", "H", "G"), value = c(1.12,
+ 1.12, 12.52, 12.52, 3.19, 3.19, 12.52, 12.52)), class = "data.frame", row.names = c(NA,
+ -8L))
# For each row, mapply() returns c(min, max) of the (v1, v2) pair, so A/B and
# B/A produce the same column; t() turns those columns into rows and
# duplicated() flags every repeated row. `|>` binds tighter than `!`, so the
# negation applies to the whole piped result, keeping only first occurrences.
> df1[!with(df1, mapply(\(...) c(min(...), max(...)), v1, v2)) |> t() |> duplicated(), ]
v1 v2 value
1 A B 1.12
3 C D 12.52
5 E F 3.19
7 G H 12.52
速度相当快。
# Benchmark data: 10,000 rows drawn with replacement from df1.
> dfB <- df1[sample.int(nrow(df1), 1e4, replace=TRUE), ]
> library(dplyr); library(purrr)
# Time the three approaches (mapply, apply, tidyverse) on the same dfB.
# NOTE(review): check='equivalent' presumably makes microbenchmark verify that
# all expressions return equivalent results (attributes such as row names
# ignored) -- confirm against the microbenchmark documentation.
> microbenchmark::microbenchmark(
+ mapply=dfB[!with(dfB, mapply(\(...) c(min(...), max(...)), v1, v2)) |> t() |> duplicated(), ],
+ apply=dfB[!duplicated(t(apply(dfB[, c("v1", "v2")], 1, sort))), ],
+ tidy=dfB %>%
+ mutate(combination = pmap_chr(list(v1, v2), ~ paste(sort(c(..1, ..2)), collapse = ","))) %>%
+ filter(!duplicated(combination)) %>%
+ select(-combination),
+ check='equivalent'
+ )
$ Rscript --vanilla foo.R
Unit: milliseconds
expr min lq mean median uq max neval cld
mapply 69.62856 76.20623 83.45243 81.29934 89.74701 110.8039 100 a
apply 514.42432 559.80302 593.04269 594.35724 620.06086 728.9266 100 b
tidy 339.91324 368.55064 389.65171 386.97757 410.54651 485.9414 100 c
数据:
> dput(df1)
structure(list(v1 = c("A", "B", "C", "D", "E", "F", "G", "H"),
v2 = c("B", "A", "D", "C", "F", "E", "H", "G"), value = c(1.12,
1.12, 12.52, 12.52, 3.19, 3.19, 12.52, 12.52)), class = "data.frame", row.names = c(NA,
-8L))