按组匹配等于或小于指定值的最接近值

问题描述 投票:0回答:4

我有一个包含列的数据框:

# Group label per observation: six "AAHON" rows followed by nine "AAJCQ" rows.
ID <- rep(c("AAHON", "AAJCQ"), times = c(6, 9))

# Lookup values, listed in ascending order within each ID.
value <- c(
  0, 0.001, 0.002, 0.003, 0.004, 0.006,
  0, 0.000333, 0.0005, 0.001, 0.002, 0.002333, 0.003667, 0.023, 0.0245
)

# Value reported alongside each matched `value`.
newval <- c(
  0.990295749, 0.993992606, 0.994454713, 0.995378928, 0.995841035,
  0.996303142,
  0.989833641, 0.990295749, 0.990757856, 0.99676525, 0.997227357,
  0.997689464, 0.998151571, 0.998613678, 0.999075786
)

和相应的数据框:

ID     value      newval
AAHON   0         0.990295749
AAHON   0.001     0.993992606
AAHON   0.002     0.994454713
AAHON   0.003     0.995378928
AAHON   0.004     0.995841035
AAHON   0.006     0.996303142
AAJCQ   0         0.989833641
AAJCQ   0.000333  0.990295749
AAJCQ   0.0005    0.990757856
AAJCQ   0.001     0.99676525
AAJCQ   0.002     0.997227357
AAJCQ   0.002333  0.997689464
AAJCQ   0.003667  0.998151571
AAJCQ   0.023     0.998613678
AAJCQ   0.0245    0.999075786

我需要一个函数:给定一个值,在每个组(ID)中返回第二列(value)与该值完全匹配的那一行。例如,如果我要求“0.002”,那么预期的输出将是:

ID      value   newval
AAHON   0.002   0.994454713
AAJCQ   0.002   0.997227357

这很简单,也很容易做到。但是,如果我询问的值在某个组中不存在,我还需要这个函数返回该组中排在它之前的那个值(即小于该值的最大值)。例如,如果我要求“0.005”,那么预期的输出将是:

ID      value       newval
AAHON   0.004       0.995841035
AAJCQ   0.003667    0.998151571

如果一个值存在于一个组中而不存在于另一个组中(例如“0.003”),结果将如下所示:

ID      value       newval
AAHON   0.003       0.995378928
AAJCQ   0.002333    0.997689464
r dataframe subset
4个回答
2
投票

`filter` + `slice_max`:过滤出小于或等于目标值的值后,使用 `slice_max` 会给我们最接近的值。

library(dplyr)

# Drop every row above the target, then keep the largest remaining value
# within each ID.
df1 |>
  filter(value <= 0.003) |>
  slice_max(order_by = value, by = ID)

或作为函数:

#' Closest row at or below a target value, per group
#'
#' Keeps the rows of `dat` whose `val_col` is less than or equal to `val`,
#' then returns, for each `grp`, the row(s) holding the largest remaining
#' `val_col`. A group with no value at or below `val` does not appear in the
#' result.
#'
#' @param dat A data frame.
#' @param grp Unquoted grouping column.
#' @param val_col Unquoted column compared against `val`.
#' @param val Target value.
#' @return The matching rows of `dat`. Tied maxima within a group are all
#'   returned (the `slice_max()` default).
closest_leq <- function(dat, grp, val_col, val) {
  # Namespace-qualified calls replace `require(dplyr)`: `require()` merely
  # returns FALSE when the package is missing and attaches it to the search
  # path as a side effect of calling this function.
  dat |>
    dplyr::filter({{ val_col }} <= val) |>
    dplyr::slice_max({{ val_col }}, by = {{ grp }})
}

closest_leq(df1, ID, value, 0.003)

`filter` + `abs`:另外,我们可以先筛选小于等于目标值的值,然后筛选最接近目标值(最小绝对差)的值。

library(dplyr)  

# Step 1: keep the rows at or below the target.
# Step 2: per ID, keep the row(s) whose absolute distance to the target is
# the smallest among the rows that survived step 1.
df1 |>
  filter(value <= 0.003, .by = ID) |>
  filter(
    abs(value - 0.003) == min(abs(value - 0.003)),
    .by = ID
  )

或者如果你想要它作为一个函数:

#' Closest row at or below a target value, per group (absolute-difference form)
#'
#' First keeps the rows of `dat` whose `val_col` is less than or equal to
#' `val`; then, within each `grp`, keeps the row(s) whose absolute difference
#' from `val` is the smallest. A group with no value at or below `val` does
#' not appear in the result.
#'
#' @param dat A data frame.
#' @param grp Unquoted grouping column.
#' @param val_col Unquoted numeric column compared against `val`.
#' @param val Target value.
#' @return The matching rows of `dat`, in their original order.
closest_leq <- function(dat, grp, val_col, val) {
  # Namespace-qualified calls replace `require(dplyr)`: `require()` merely
  # returns FALSE when the package is missing and attaches it to the search
  # path as a side effect of calling this function.
  dat |>
    dplyr::filter({{ val_col }} <= val, .by = {{ grp }}) |>
    # The second filter must stay a separate step: the minimum has to be taken
    # over the rows that are already known to be at or below `val`.
    dplyr::filter(
      abs({{ val_col }} - val) == min(abs({{ val_col }} - val)),
      .by = {{ grp }}
    )
}

closest_leq(df1, ID, value, 0.003)

或者使用 base R:

# Subset once to the rows at or below the target, then keep, per ID, the
# row(s) whose value equals that group's maximum.
(\(below) below[ave(below$value, below$ID, FUN = max) == below$value, ])(
  subset(df1, value <= 0.003)
)

结果:

#>      ID    value    newval
#> 1 AAHON 0.003000 0.9953789
#> 2 AAJCQ 0.002333 0.9976895

如果您想一次检查多个值,我们可以使用 `purrr::map_dfr`:

# Targets to look up, e.g.
x <- c(0.002, 0.003, 0.005)

# Run one lookup per target; the per-target results are row-bound into a
# single data frame.
purrr::map_dfr(x, \(target) closest_leq(df1, ID, value, target))

     ID    value    newval
1 AAHON 0.002000 0.9944547
2 AAJCQ 0.002000 0.9972274
3 AAHON 0.003000 0.9953789
4 AAJCQ 0.002333 0.9976895
5 AAHON 0.004000 0.9958410
6 AAJCQ 0.003667 0.9981516

或者按组使用 `base::findInterval`,配合 `by` 或 `ave`:

# by
# Split df1 by ID. Within each piece, findInterval(x, dd$value) gives, for
# every target in x, the position of the last value that is <= that target
# (0 when there is none, which selects no row). This relies on dd$value being
# sorted in non-decreasing order within each ID. The per-ID results are then
# row-bound together.
do.call(rbind, 
        by(df1, df1$ID, \(dd) dd[findInterval(x, dd$value), ]))

# ave
# Same lookup without splitting the data frame: per ID, flag the positions
# that findInterval() returns for any target in x. The flags end up as 0/1
# in the numeric vector that ave() returns, hence the `== 1`. The flagged
# rows of df1 are kept in their original order.
df1[ave(df1$value, df1$ID, FUN = \(v) seq_along(v) %in% findInterval(x, v)) == 1, ]

数据:

# Example data: six rows for "AAHON" followed by nine rows for "AAJCQ".
df1 <- data.frame(
  ID = rep(c("AAHON", "AAJCQ"), times = c(6, 9)),
  value = c(
    0, 0.001, 0.002, 0.003, 0.004, 0.006,
    0, 0.000333, 0.0005, 0.001, 0.002, 0.002333, 0.003667, 0.023, 0.0245
  ),
  newval = c(
    0.990295749, 0.993992606, 0.994454713, 0.995378928, 0.995841035,
    0.996303142,
    0.989833641, 0.990295749, 0.990757856, 0.99676525, 0.997227357,
    0.997689464, 0.998151571, 0.998613678, 0.999075786
  )
)

1
投票

使用 tidyverse 工具

library(tidyverse)


# Group label per observation: six "AAHON" rows followed by nine "AAJCQ" rows.
ID <- rep(c("AAHON", "AAJCQ"), times = c(6, 9))
value <- c(
  0, 0.001, 0.002, 0.003, 0.004, 0.006,
  0, 0.000333, 0.0005, 0.001, 0.002, 0.002333, 0.003667, 0.023, 0.0245
)
newval <- c(
  0.990295749, 0.993992606, 0.994454713, 0.995378928, 0.995841035,
  0.996303142,
  0.989833641, 0.990295749, 0.990757856, 0.99676525, 0.997227357,
  0.997689464, 0.998151571, 0.998613678, 0.999075786
)
df <- data.frame(ID, value, newval)

# Set the value you want to ask for.
ask <- 0.003

# Keep the rows at or below `ask`, sort them by value within each ID, and take
# the last (largest) row of every ID.
df %>%
  filter(value <= ask) %>%
  group_by(ID) %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  arrange(value, .by_group = TRUE) %>%
  slice_tail(n = 1) %>%
  # Drop the grouping so it does not leak into later operations on the result.
  ungroup()

你得到

  ID      value newval
  <chr>   <dbl>  <dbl>
1 AAHON 0.003    0.995
2 AAJCQ 0.00233  0.998

希望有帮助


0
投票

我能想到的最简单的方法是使用 R 中的 `dplyr` 包。

library(dplyr)

df <- data.frame(ID, value, newval)
val <- 0.0023

# Per ID, keep the rows at or below `val`; after sorting by value, the last
# row of each group is the closest one at or below the target.
df |>
  group_by(ID) |>
  filter(value <= val) |>
  arrange(value) |>
  filter(row_number() == n())

这给你:

  ID    value newval
  <chr> <dbl>  <dbl>
1 AAHON 0.002  0.994
2 AAJCQ 0.002  0.997

0.005 的例子:

  ID      value newval
  <chr>   <dbl>  <dbl>
1 AAJCQ 0.00367  0.998
2 AAHON 0.004    0.996

0.003

  ID      value newval
  <chr>   <dbl>  <dbl>
1 AAJCQ 0.00233  0.998
2 AAHON 0.003    0.995

0
投票

在 Base R 中你可以做:

# Per ID, find the largest value at or below 0.005, then join back to `df`
# (on the shared ID and value columns) to recover the full rows.
merge(
  df,
  aggregate(value ~ ID, data = df, FUN = max, subset = value <= 0.005)
)
     ID    value    newval
1 AAHON 0.004000 0.9958410
2 AAJCQ 0.003667 0.9981516

# Per ID, find the largest value at or below 0.002, then join back to `df`
# (on the shared ID and value columns) to recover the full rows.
merge(
  df,
  aggregate(value ~ ID, data = df, FUN = max, subset = value <= 0.002)
)
     ID value    newval
1 AAHON 0.002 0.9944547
2 AAJCQ 0.002 0.9972274

您还可以:

# Within each ID (values assumed sorted ascending), sum(x) counts the rows at
# or below the target, which is also the position of the last such row.
# Unlike `which.min(x) - 1`, this still selects the last row when every value
# in the group is at or below the target (which.min() would then return 1 and
# the whole group would be dropped).
subset(df, ave(value <= 0.005, ID, FUN = \(x) seq_along(x) == sum(x)))
      ID    value    newval
5  AAHON 0.004000 0.9958410
13 AAJCQ 0.003667 0.9981516

# Within each ID (values assumed sorted ascending), sum(x) is the position of
# the last row at or below the target. Unlike `which.min(x) - 1`, it does not
# drop a group whose values are all at or below the target.
subset(df, ave(value <= 0.002, ID, FUN = \(x) seq_along(x) == sum(x)))
      ID value    newval
3  AAHON 0.002 0.9944547
11 AAJCQ 0.002 0.9972274

# Within each ID (values assumed sorted ascending), sum(x) is the position of
# the last row at or below the target. Unlike `which.min(x) - 1`, it does not
# drop a group whose values are all at or below the target.
subset(df, ave(value <= 0.003, ID, FUN = \(x) seq_along(x) == sum(x)))
      ID    value    newval
4  AAHON 0.003000 0.9953789
12 AAJCQ 0.002333 0.9976895
© www.soinside.com 2019 - 2024. All rights reserved.