我有一个包含列的数据框:
# Three parallel vectors of length 15: six "AAHON" entries, then nine "AAJCQ".
ID <- rep(c("AAHON", "AAJCQ"), times = c(6, 9))
value <- c(
  0, 0.001, 0.002, 0.003, 0.004, 0.006,
  0, 0.000333, 0.0005, 0.001, 0.002, 0.002333, 0.003667, 0.023, 0.0245
)
newval <- c(
  0.990295749, 0.993992606, 0.994454713, 0.995378928, 0.995841035,
  0.996303142,
  0.989833641, 0.990295749, 0.990757856, 0.99676525, 0.997227357,
  0.997689464, 0.998151571, 0.998613678, 0.999075786
)
和相应的数据框:
ID value newval
AAHON 0 0.990295749
AAHON 0.001 0.993992606
AAHON 0.002 0.994454713
AAHON 0.003 0.995378928
AAHON 0.004 0.995841035
AAHON 0.006 0.996303142
AAJCQ 0 0.989833641
AAJCQ 0.000333 0.990295749
AAJCQ 0.0005 0.990757856
AAJCQ 0.001 0.99676525
AAJCQ 0.002 0.997227357
AAJCQ 0.002333 0.997689464
AAJCQ 0.003667 0.998151571
AAJCQ 0.023 0.998613678
AAJCQ 0.0245 0.999075786
我需要一个函数:给定一个目标值,在每个组中返回第二列(value)与该目标值完全匹配的那一行。例如,如果我要求“0.002”,那么预期的输出将是:
ID value newval
AAHON 0.002 0.994454713
AAJCQ 0.002 0.997227357
这很简单,也很容易做到。但是,如果我请求的值在某个组中不存在,我还需要这个函数返回该组中位于该值之前的那个值(即小于该值的最大值)。例如,如果我要求“0.005”,那么预期的输出将是:
ID value newval
AAHON 0.004 0.995841035
AAJCQ 0.003667 0.998151571
如果一个值存在于一个组中而不存在于另一个组中(例如“0.003”),结果将如下所示:
ID value newval
AAHON 0.003 0.995378928
AAJCQ 0.002333 0.997689464
filter
和slice_max
:过滤小于或等于目标值的值后,使用slice_max
会给我们最接近的值。
library(dplyr)
# Keep the rows at or below the target (0.003), then take the row(s) with the
# largest `value` within each ID group.
slice_max(
  filter(df1, value <= 0.003),
  value,
  by = ID
)
或作为函数:
#' Closest row at or below a target value, per group
#'
#' Keeps the rows of `dat` whose `val_col` is less than or equal to `val`,
#' then returns, for every group in `grp`, the row(s) holding the largest
#' remaining `val_col`. Groups with no value at or below `val` contribute no
#' row. `dplyr::slice_max()` keeps ties by default, so duplicated maxima
#' yield several rows.
#'
#' @param dat A data frame.
#' @param grp Unquoted grouping column(s), e.g. `ID`.
#' @param val_col Unquoted column compared against `val`.
#' @param val Target value (inclusive upper bound).
#' @return The matching rows of `dat`, with all of its columns.
closest_leq <- function(dat, grp, val_col, val) {
  # `require()` merely returns FALSE when the package is missing (and attaches
  # it to the caller's search path when present); check explicitly and call
  # dplyr through its namespace instead.
  if (!requireNamespace("dplyr", quietly = TRUE)) {
    stop("Package 'dplyr' is required for closest_leq().", call. = FALSE)
  }
  at_or_below <- dplyr::filter(dat, {{ val_col }} <= val)
  dplyr::slice_max(at_or_below, {{ val_col }}, by = {{ grp }})
}
# Example call: look up the target 0.003 in column `value`, grouped by `ID`.
closest_leq(df1, ID, value, 0.003)
filter
和 abs
: 另外,我们可以先筛选小于等于目标值的值,然后筛选最接近目标值(最小绝对差)的值。
library(dplyr)
# Inner call keeps values at or below 0.003; outer call keeps, per ID, the
# row(s) whose absolute distance to 0.003 is smallest.
filter(
  filter(df1, value <= 0.003, .by = ID),
  abs(value - 0.003) == min(abs(value - 0.003)),
  .by = ID
)
或者如果你想要它作为一个函数:
#' Closest row at or below a target value, per group (distance-based)
#'
#' Keeps the rows of `dat` whose `val_col` is less than or equal to `val`,
#' then returns, for every group in `grp`, the row(s) whose absolute distance
#' to `val` is smallest. Groups with no value at or below `val` contribute no
#' row.
#'
#' @param dat A data frame.
#' @param grp Unquoted grouping column(s), e.g. `ID`.
#' @param val_col Unquoted numeric column compared against `val`.
#' @param val Target value (inclusive upper bound).
#' @return The matching rows of `dat`, in their original order.
closest_leq <- function(dat, grp, val_col, val) {
  # `require()` merely returns FALSE when the package is missing (and attaches
  # it to the caller's search path when present); check explicitly and call
  # dplyr through its namespace instead.
  if (!requireNamespace("dplyr", quietly = TRUE)) {
    stop("Package 'dplyr' is required for closest_leq().", call. = FALSE)
  }
  # The upper-bound test is row-wise, so it needs no grouping.
  at_or_below <- dplyr::filter(dat, {{ val_col }} <= val)
  # Both sides of `==` come from the same computation, so exact comparison of
  # the distances is safe here.
  dplyr::filter(
    at_or_below,
    abs({{ val_col }} - val) == min(abs({{ val_col }} - val)),
    .by = {{ grp }}
  )
}
# Example call: look up the target 0.003 in column `value`, grouped by `ID`.
closest_leq(df1, ID, value, 0.003)
或在
base
:
# Base R: restrict to rows at or below 0.003 once, then keep the rows whose
# value equals the per-ID maximum of that subset.
local({
  at_or_below <- subset(df1, value <= 0.003)
  is_group_max <- with(at_or_below, ave(value, ID, FUN = max) == value)
  at_or_below[is_group_max, ]
})
结果:
#> ID value newval
#> 1 AAHON 0.003000 0.9953789
#> 2 AAJCQ 0.002333 0.9976895
如果您想一次检查多个值,我们可以使用
purrr::map_dfr
:
# Example targets to look up in one go.
x <- c(0.002, 0.003, 0.005)
# Run closest_leq() once per target and row-bind the per-target results.
purrr::map_dfr(x, function(target) closest_leq(df1, ID, value, target))
ID value newval
1 AAHON 0.002000 0.9944547
2 AAJCQ 0.002000 0.9972274
3 AAHON 0.003000 0.9953789
4 AAJCQ 0.002333 0.9976895
5 AAHON 0.004000 0.9958410
6 AAJCQ 0.003667 0.9981516
或按组
base::findInterval
,使用by
或ave
:
# by: split df1 by ID and, within each group, pick the rows at the positions
# findInterval() returns for the targets in `x`; rbind the per-group results.
# NOTE(review): findInterval() expects its second argument to be sorted in
# non-decreasing order, so this relies on `value` being sorted within each ID.
do.call(rbind,
by(df1, df1$ID, \(dd) dd[findInterval(x, dd$value), ]))
# ave: per ID, flag the row positions that findInterval() returns for `x`,
# then keep the rows of df1 whose flag compares equal to 1. A row matched by
# several targets is flagged (and returned) only once.
df1[ave(df1$value, df1$ID, FUN = \(v) seq_along(v) %in% findInterval(x, v)) == 1, ]
# Reproducible example data: 15 rows, six for "AAHON" followed by nine for
# "AAJCQ", with `value` ascending inside each ID.
df1 <- data.frame(
  ID = rep(c("AAHON", "AAJCQ"), times = c(6, 9)),
  value = c(
    0, 0.001, 0.002, 0.003, 0.004, 0.006,
    0, 0.000333, 0.0005, 0.001, 0.002, 0.002333, 0.003667, 0.023, 0.0245
  ),
  newval = c(
    0.990295749, 0.993992606, 0.994454713, 0.995378928, 0.995841035,
    0.996303142,
    0.989833641, 0.990295749, 0.990757856, 0.99676525, 0.997227357,
    0.997689464, 0.998151571, 0.998613678, 0.999075786
  )
)
使用 tidyverse 工具
library(tidyverse)
# Example columns (six "AAHON" entries, then nine "AAJCQ") and the data frame
# assembled from them.
ID <- rep(c("AAHON", "AAJCQ"), times = c(6, 9))
value <- c(
  0, 0.001, 0.002, 0.003, 0.004, 0.006,
  0, 0.000333, 0.0005, 0.001, 0.002, 0.002333, 0.003667, 0.023, 0.0245
)
newval <- c(
  0.990295749, 0.993992606, 0.994454713, 0.995378928, 0.995841035,
  0.996303142,
  0.989833641, 0.990295749, 0.990757856, 0.99676525, 0.997227357,
  0.997689464, 0.998151571, 0.998613678, 0.999075786
)
df <- data.frame(ID = ID, value = value, newval = newval)
# Set the value you want to ask for.
ask <- 0.003
# Keep rows at or below the target, sort by value within each ID, take the
# last (largest) row of each group, then drop the grouping so the result does
# not stay grouped for later steps.
df %>%
  filter(value <= ask) %>%
  group_by(ID) %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  arrange(value, .by_group = TRUE) %>%
  slice_tail(n = 1) %>%
  ungroup()
你得到
ID value newval
<chr> <dbl> <dbl>
1 AAHON 0.003 0.995
2 AAJCQ 0.00233 0.998
希望有帮助
我能想到的最简单的方法是使用 R 中的
dplyr
包。
library(dplyr)
# Assemble the example data frame from the ID, value and newval vectors.
df <- data.frame(ID = ID, value = value, newval = newval)
# Target value to look up.
val <- 0.0023
# Per ID: keep rows at or below the target, order the rows by value, and keep
# the last row of each group.
df |>
  group_by(ID) |>
  filter(value <= val) |>
  arrange(value) |>
  filter(row_number() == n())
这给你:
ID value newval
<chr> <dbl> <dbl>
1 AAHON 0.002 0.994
2 AAJCQ 0.002 0.997
0.005 的例子:
ID value newval
<chr> <dbl> <dbl>
1 AAJCQ 0.00367 0.998
2 AAHON 0.004 0.996
0.003
ID value newval
<chr> <dbl> <dbl>
1 AAJCQ 0.00233 0.998
2 AAHON 0.003 0.995
在 Base R 中你可以做:
# Per-ID maximum of the values at or below 0.005, merged back onto df to
# recover the full matching rows.
merge(
  x = df,
  y = aggregate(value ~ ID, data = df, FUN = max, subset = value <= 0.005)
)
ID value newval
1 AAHON 0.004000 0.9958410
2 AAJCQ 0.003667 0.9981516
# Per-ID maximum of the values at or below 0.002, merged back onto df to
# recover the full matching rows.
merge(
  x = df,
  y = aggregate(value ~ ID, data = df, FUN = max, subset = value <= 0.002)
)
ID value newval
1 AAHON 0.002 0.9944547
2 AAJCQ 0.002 0.9972274
您还可以:
# Per ID, keep the last row whose value is <= 0.005 (values must be sorted
# ascending within each ID). With sorted values the number of TRUEs is the
# position of that row. Unlike `which.min(x) - 1`, this also works when every
# value in a group is <= the target: which.min() then returns 1, so the
# original expression selected position 0 and dropped the group.
subset(df, ave(value <= 0.005, ID, FUN = \(x) seq_along(x) == sum(x)))
ID value newval
5 AAHON 0.004000 0.9958410
13 AAJCQ 0.003667 0.9981516
# Per ID, keep the last row whose value is <= 0.002 (values must be sorted
# ascending within each ID). With sorted values the number of TRUEs is the
# position of that row. Unlike `which.min(x) - 1`, this also works when every
# value in a group is <= the target: which.min() then returns 1, so the
# original expression selected position 0 and dropped the group.
subset(df, ave(value <= 0.002, ID, FUN = \(x) seq_along(x) == sum(x)))
ID value newval
3 AAHON 0.002 0.9944547
11 AAJCQ 0.002 0.9972274
# Per ID, keep the last row whose value is <= 0.003 (values must be sorted
# ascending within each ID). With sorted values the number of TRUEs is the
# position of that row. Unlike `which.min(x) - 1`, this also works when every
# value in a group is <= the target: which.min() then returns 1, so the
# original expression selected position 0 and dropped the group.
subset(df, ave(value <= 0.003, ID, FUN = \(x) seq_along(x) == sum(x)))
ID value newval
4 AAHON 0.003000 0.9953789
12 AAJCQ 0.002333 0.9976895