从 R 中的两个数据框中按组减去向量

问题描述 投票:0回答:3

我在 R 中有两个数据框。第一个数据框包含若干特征列,以及一个因子列,用于指示每个样本(行)所属的组。第二个数据框包含相同数量的列,行数等于不同组的数量。我想从第一个数据框的每个样本中减去第二个数据框中对应的向量,对应关系由两个数据框中同名的 group 列的取值确定。

这里是主要数据集的例子:

# Example samples: three numeric feature columns (f1-f3) plus a two-level
# factor `group` ("A" for rows 1-5, "B" for rows 6-10), classed as a tibble.
df_repr <- structure(
  list(
    f1 = c(
      -3.9956064225704, -0.52380279948658, 0.61089389331505,
      -3.47273625634875, -4.486918671214, -6.1761970731672,
      -4.62305749757367, -4.42540643005429, -3.61613137597131,
      -3.29821425516253
    ),
    f2 = c(
      -1.57918114753228, -4.10523012500727, -1.80270009366593,
      -0.00905317702835884, -0.899585192079915, -2.89341515186212,
      0.0132542126386332, -3.32639898550135, -0.867793877742314,
      0.0911950321630834
    ),
    f3 = c(
      -6.02532301769732, -4.90073348094302, -3.73159604513274,
      -3.55290209472808, -6.63194560195811, 2.69409789701296,
      -4.17675978927128, -3.84141885970095, -1.20571283849034,
      1.54287440902102
    ),
    # Integer codes 1,1,1,1,1,2,2,2,2,2 mapped onto the levels "A" and "B".
    group = structure(
      rep(1:2, each = 5L),
      levels = c("A", "B"),
      class = "factor"
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -10L)
)

这是一个示例数据帧,其中包含要从第一个数据帧的相应组的每一行中减去的向量:

# Per-group vectors to subtract: one row for each level of `group`
# ("A", "B"), with the same feature columns f1-f3 as the sample table.
to_subtract <- structure(
  list(
    group = structure(1:2, levels = c("A", "B"), class = "factor"),
    f1 = c(-2.78048744402161, -2.33583431665818),
    f2 = c(-2.56086962108741, -0.689157827347865),
    f3 = c(-3.60224982918457, -0.782365376308658)
  ),
  row.names = c(NA, -2L),
  class = c("tbl_df", "tbl", "data.frame")
)

我试着这样做:

# Attempt from the question; it fails with the error quoted below.
# Inside across(), `.` is the current column (a plain numeric vector such as
# f1), not the grouped data frame, so `.$group` raises
# "$ operator is invalid for atomic vectors".
df_repr %>%
  group_by(group) %>%
  mutate(across(where(is.numeric), ~ . - to_subtract[to_subtract$group == unique(.$group), -1]))

但是我得到以下错误:

Error in `mutate()`:
ℹ️ In argument: `across(...)`.
ℹ️ In group 1: `group = A`.
Caused by error in `across()`:
! Can't compute column `f1`.
Caused by error in `f1$group`:
! $ operator is invalid for atomic vectors

此示例的预期输出:

       f1     f2      f3 group
    <dbl>  <dbl>   <dbl> <fct>
 1 -1.22   0.982 -2.42   A    
 2  2.26  -1.54  -1.30   A    
 3  3.39   0.758 -0.129  A    
 4 -0.692  2.55   0.0493 A    
 5 -1.71   1.66  -3.03   A    
 6 -3.84  -2.20   3.48   B    
 7 -2.29   0.702 -3.39   B    
 8 -2.09  -2.64  -3.06   B    
 9 -1.28  -0.179 -0.423  B    
10 -0.962  0.780  2.33   B 
r dataframe dplyr tidyverse
3个回答
1
投票

您可以将您的目标数据框与

to_subtract
组合在一起,同时添加一个逻辑列,用来标记哪些行是要被减去的向量(即来自 to_subtract 的行)。然后在
mutate
中按组做减法,最后过滤掉这些被标记的行并删除该逻辑列,即可得到您想要的格式。

要使用

mutate(.by)
功能,您需要
dplyr
版本 >= 1.1.0。如果版本较低,请在
mutate
之前使用传统的
group_by(group)
方法。

library(dplyr)

# Stack the per-group vectors on top of the samples and flag them with
# `target = TRUE` (to_subtract is expected to hold one row per group).
# TRUE/FALSE are spelled out because `T`/`F` are ordinary variables that
# can be reassigned.
rbind(
  to_subtract %>% mutate(target = TRUE),
  df_repr %>% mutate(target = FALSE)
) %>%
  # Within each group, subtract the flagged row from every numeric column.
  mutate(across(where(is.numeric), ~ .x - .x[target]), .by = group) %>%
  # Drop the helper rows and then the helper column.
  filter(!target) %>%
  select(-target)

# A tibble: 10 × 4
   group     f1     f2      f3
   <fct>  <dbl>  <dbl>   <dbl>
 1 A     -1.22   0.982 -2.42  
 2 A      2.26  -1.54  -1.30  
 3 A      3.39   0.758 -0.129 
 4 A     -0.692  2.55   0.0493
 5 A     -1.71   1.66  -3.03  
 6 B     -3.84  -2.20   3.48  
 7 B     -2.29   0.702 -3.39  
 8 B     -2.09  -2.64  -3.06  
 9 B     -1.28  -0.179 -0.423 
10 B     -0.962  0.780  2.33  

1
投票

您可以使用

powerjoin

library(powerjoin)

# Left-join the per-group vectors onto the samples by `group`. The columns
# f1-f3 exist in both tables; the conflict function `-` resolves each such
# pair as (value from df_repr) minus (value from to_subtract).
power_left_join(
  x = df_repr,
  y = to_subtract,
  by = "group",
  conflict = `-`
)

# A tibble: 10 × 4
   group     f1     f2      f3
   <fct>  <dbl>  <dbl>   <dbl>
 1 A     -1.22   0.982 -2.42
 2 A      2.26  -1.54  -1.30  
 3 A      3.39   0.758 -0.129
 4 A     -0.692  2.55   0.0493
 5 A     -1.71   1.66  -3.03
 6 B     -3.84  -2.20   3.48
 7 B     -2.29   0.702 -3.39
 8 B     -2.09  -2.64  -3.06  
 9 B     -1.28  -0.179 -0.423
10 B     -0.962  0.780  2.33

0
投票

另一种方法是使用

group_modify()
并进行
data.frame
操作。为此,我们需要将
to_subtract
中对应的行重复若干次,使其行数与
df_repr
中当前分组的行数一致:

library(dplyr)

df_repr %>%
  group_by(group) %>%
  group_modify(function(rows, key) {
    # Pick the row of `to_subtract` belonging to the current group and drop
    # its first column (`group`), leaving only the feature values.
    offsets <- to_subtract[to_subtract$group == key$group, -1]
    # Repeat that single row once per sample so both operands have the same
    # number of rows, then subtract element-wise.
    rows - offsets[rep(1, nrow(rows)), ]
  })
#> # A tibble: 10 × 4
#> # Groups:   group [2]
#>    group     f1     f2      f3
#>    <fct>  <dbl>  <dbl>   <dbl>
#>  1 A     -1.22   0.982 -2.42  
#>  2 A      2.26  -1.54  -1.30  
#>  3 A      3.39   0.758 -0.129 
#>  4 A     -0.692  2.55   0.0493
#>  5 A     -1.71   1.66  -3.03  
#>  6 B     -3.84  -2.20   3.48  
#>  7 B     -2.29   0.702 -3.39  
#>  8 B     -2.09  -2.64  -3.06  
#>  9 B     -1.28  -0.179 -0.423 
#> 10 B     -0.962  0.780  2.33

来自 OP 的数据

# Example samples: three numeric feature columns (f1-f3) plus a two-level
# factor `group` ("A" for rows 1-5, "B" for rows 6-10), classed as a tibble.
df_repr <- structure(
  list(
    f1 = c(
      -3.9956064225704, -0.52380279948658, 0.61089389331505,
      -3.47273625634875, -4.486918671214, -6.1761970731672,
      -4.62305749757367, -4.42540643005429, -3.61613137597131,
      -3.29821425516253
    ),
    f2 = c(
      -1.57918114753228, -4.10523012500727, -1.80270009366593,
      -0.00905317702835884, -0.899585192079915, -2.89341515186212,
      0.0132542126386332, -3.32639898550135, -0.867793877742314,
      0.0911950321630834
    ),
    f3 = c(
      -6.02532301769732, -4.90073348094302, -3.73159604513274,
      -3.55290209472808, -6.63194560195811, 2.69409789701296,
      -4.17675978927128, -3.84141885970095, -1.20571283849034,
      1.54287440902102
    ),
    # Integer codes 1,1,1,1,1,2,2,2,2,2 mapped onto the levels "A" and "B".
    group = structure(
      rep(1:2, each = 5L),
      levels = c("A", "B"),
      class = "factor"
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -10L)
)


# Per-group vectors to subtract: one row for each level of `group`
# ("A", "B"), with the same feature columns f1-f3 as the sample table.
to_subtract <- structure(
  list(
    group = structure(1:2, levels = c("A", "B"), class = "factor"),
    f1 = c(-2.78048744402161, -2.33583431665818),
    f2 = c(-2.56086962108741, -0.689157827347865),
    f3 = c(-3.60224982918457, -0.782365376308658)
  ),
  row.names = c(NA, -2L),
  class = c("tbl_df", "tbl", "data.frame")
)

使用 reprex v2.0.2 创建于 2023-03-09

© www.soinside.com 2019 - 2024. All rights reserved.