是否有更快的方法来执行非等值连接并找到 R 中连接值的最大值？

Question

我正在尝试加快一些 R 代码的速度。由于数据量较大（数千万行），处理需要一定的时间。本质上，我有一个名为

parameters

的小型 data.table，其中包含税率和阈值，以及一个名为

taxation_data

的大型 data.table，其中包含个人收入水平数据。我想计算每个人的

gross tax

，需要从

parameters

表中查找相关税率和起征点。

我的第一次尝试（未显示）是执行非等连接并过滤连接值的最大值。这非常慢，我找到了一种使用

cut

函数来提高速度的方法（参见下面的示例）。我仍然认为必须有一种更快的方法来做到这一点。特别是，我发现有趣的是

cut

步骤非常快，但合并步骤很慢。有什么想法吗？

这是我能想到的最好的：

library(tidyverse)
library(data.table)

# Tax schedule: one row per bracket ("Component") and Year.
parameters <- data.table(
  Component = rep(c("A", "B", "C"), each = 2),
  Year = rep(c(2020, 2021), times = 3),
  Threshold_lower = c(0, 0, 18000, 18000, 40000, 50000),
  Threshold_upper = c(18000, 18000, 40000, 50000, Inf, Inf),
  Rate = c(0, 0, 0.2, 0.2, 0.4, 0.45),
  `Tax paid (up to MTR)` = c(0, 0, 0, 0, 4400, 6400)
)

# Individual income records: one row per person and Year.
taxation_data <- data.table(
  Year = rep(c(2020, 2021), each = 2),
  Income = c(20000, 15000, 80000, 45000)
)
  

# Based on the parameters, determine which "Component" (tax bracket) applies to
# each individual in taxation_data. Every Year has its own schedule, so the
# incomes are labelled one Year at a time.
for (yr in unique(parameters$Year)) {
  brackets <- parameters[Year == yr][order(Threshold_lower)]
  # Start the breaks at the lowest bracket's lower bound and continue through
  # every upper bound, so that all brackets -- including the lowest one --
  # receive a label. (Starting at the first upper bound would leave the lowest
  # bracket unlabelled and dependent on the NA fill further down.)
  breaks <- c(brackets$Threshold_lower[1L], brackets$Threshold_upper)
  taxation_data[
    Year == yr,
    Component := cut(
      Income,
      breaks = breaks,
      labels = brackets$Component,
      include.lowest = TRUE
    )
  ]
}

# Merge in the other variables from parameters
taxation_data <- merge(
  taxation_data,
  parameters[, .(Component, Year, Threshold_lower, Rate, `Tax paid (up to MTR)`)],
  by = c("Year", "Component"),
  all.x = TRUE
)
# Rows that matched no bracket have NA parameters after the merge; they are
# filled with 0 here.
setnafill(taxation_data, fill = 0, cols = c("Rate", "Tax paid (up to MTR)", "Threshold_lower"))
# Calculate `Gross tax`
taxation_data[, `Gross tax` := (Income - Threshold_lower) * Rate + `Tax paid (up to MTR)`]

Answer 1

不确定我是否遗漏了一些东西，这不是只是一个简单的非等值合并，不需要特殊处理吗？

# Non-equi join: match each income to the bracket of the same Year whose
# half-open range [Threshold_lower, Threshold_upper) contains it. The strict
# upper comparison keeps an income that sits exactly on a threshold from
# matching two adjacent brackets (which would duplicate the row).
#
# `thlow` duplicates Threshold_lower because names/values of the join columns
# are lost in the merge: after the join `thlow` holds the income, which is then
# moved into `Income`. Working on a copy keeps this helper column out of the
# shared `parameters` table.
copy(parameters)[, thlow := Threshold_lower
  ][taxation_data, on = .(Year, thlow <= Income, Threshold_upper > Income)
  ][, c("Income", "thlow", "Threshold_upper") := .(thlow, NULL, NULL)
  ][, tax := (Income - Threshold_lower) * Rate + `Tax paid (up to MTR)`
  ][]
#    Component  Year Threshold_lower  Rate Tax paid (up to MTR) Income   tax
#       <char> <num>           <num> <num>                <num>  <num> <num>
# 1:         B  2020           18000  0.20                    0  20000   400
# 2:         A  2020               0  0.00                    0  15000     0
# 3:         C  2021           50000  0.45                 6400  80000 19900
# 4:         B  2021           18000  0.20                    0  45000  5400

Answer 2

通过每年向

Income

添加固定金额，我们可以通过单个

findInterval

调用手动执行连接。作为函数：

library(data.table)

#' Look up tax brackets with a single findInterval() call and compute the tax
#'
#' Every Year's lower thresholds are shifted by a Year-specific offset so that
#' the brackets of all years form one increasing vector of break points. Each
#' income is shifted by the offset of its own Year and located in that vector.
#'
#' @param parameters data.table with columns `Component`, `Year`,
#'   `Threshold_lower`, `Rate` and `Tax paid (up to MTR)`. It is copied, not
#'   modified.
#' @param taxation_data data.table with columns `Year` and `Income`. The
#'   bracket columns and `tax` are added to it BY REFERENCE.
#' @return `taxation_data`, with the columns ordered as `Component`, `Year`,
#'   `Threshold_lower`, `Rate`, `Tax paid (up to MTR)`, `Income`, `tax`. Rows
#'   whose income cannot be placed in a bracket of their own Year get `NA` in
#'   the bracket columns and in `tax`.
tax_join2 <- function(parameters, taxation_data) {
  # Select the looked-up columns by name so the result does not depend on the
  # column order of `parameters`.
  bracket_cols <- c("Component", "Threshold_lower", "Rate", "Tax paid (up to MTR)")

  # add an amount every year after the first so there is no overlap in
  # components between years. na.rm = TRUE: a single missing income must not
  # turn the offset (and with it every break point) into NA.
  interval <- max(
    parameters$Threshold_lower, taxation_data$Income,
    na.rm = TRUE
  ) + 1
  min_year <- min(parameters$Year)
  parameters2 <- setorder(copy(parameters), Year, Threshold_lower)[
    , Threshold_upper := Threshold_lower + interval * (Year - min_year)
  ]

  bracket <- findInterval(
    taxation_data$Income + interval * (taxation_data$Year - min_year),
    parameters2$Threshold_upper
  )
  # An index of 0 (below the very first break point) or a bracket belonging to
  # a different Year means the row has no bracket of its own; mark it as NA
  # instead of failing or silently borrowing a neighbouring year's bracket.
  bracket[which(bracket == 0L)] <- NA_integer_
  bracket[which(parameters2$Year[bracket] != taxation_data$Year)] <- NA_integer_

  matched <- parameters2[bracket, bracket_cols, with = FALSE]
  taxation_data[, (bracket_cols) := matched]
  taxation_data[, tax := (Income - Threshold_lower) * Rate + `Tax paid (up to MTR)`]
  setcolorder(
    taxation_data,
    c(
      "Component",
      "Year",
      "Threshold_lower",
      "Rate",
      "Tax paid (up to MTR)",
      "Income",
      "tax"
    )
  )
}

对示例数据进行测试：

# Tax schedule: one row per bracket ("Component") and Year.
parameters <- data.table(
  Component = rep(c("A", "B", "C"), each = 2),
  Year = rep(c(2020, 2021), times = 3),
  Threshold_lower = c(0, 0, 18000, 18000, 40000, 50000),
  Threshold_upper = c(18000, 18000, 40000, 50000, Inf, Inf),
  Rate = c(0, 0, 0.2, 0.2, 0.4, 0.45),
  `Tax paid (up to MTR)` = c(0, 0, 0, 0, 4400, 6400)
)

# Individual income records: one row per person and Year.
taxation_data <- data.table(
  Year = rep(c(2020, 2021), each = 2),
  Income = c(20000, 15000, 80000, 45000)
)

# The trailing [] makes the returned table print.
tax_join2(parameters, taxation_data)[]
#>    Component Year Threshold_lower Rate Tax paid (up to MTR) Income   tax
#> 1:         B 2020           18000 0.20                    0  20000   400
#> 2:         A 2020               0 0.00                    0  15000     0
#> 3:         C 2021           50000 0.45                 6400  80000 19900
#> 4:         B 2021           18000 0.20                    0  45000  5400

将时间与 @r2evans 提议的简单非等值连接（作为函数）进行比较。

#' Compute the tax with a plain non-equi join
#'
#' Each income is matched to the bracket of the same Year whose half-open range
#' [Threshold_lower, Threshold_upper) contains it. The upper comparison is
#' strict so that an income sitting exactly on a threshold matches one bracket
#' only instead of two adjacent ones.
#'
#' @param parameters data.table with columns `Component`, `Year`,
#'   `Threshold_lower`, `Threshold_upper`, `Rate` and `Tax paid (up to MTR)`.
#'   It is copied, not modified.
#' @param taxation_data data.table with columns `Year` and `Income`.
#' @return A new data.table with the bracket columns, `Income` and `tax`.
tax_join1 <- function(parameters, taxation_data) {
  # `thlow` duplicates Threshold_lower: after the join the join columns hold
  # the income, so the bracket's own lower bound has to survive separately.
  brackets <- copy(parameters)[, thlow := Threshold_lower]
  joined <- brackets[
    taxation_data,
    on = .(Year, thlow <= Income, Threshold_upper > Income)
  ]
  # Move the income out of the join column and drop the helper columns.
  joined[, c("Income", "thlow", "Threshold_upper") := .(thlow, NULL, NULL)]
  joined[, tax := (Income - Threshold_lower) * Rate + `Tax paid (up to MTR)`]
}

更大的示例数据集，有 100M 行：

set.seed(1673481669)

# Same three brackets for each of the 13 years 2010-2022.
parameters <- data.table(
  Component = rep(LETTERS[1:3], each = 13),
  Year = rep(2010:2022, 3),
  Threshold_lower = rep(c(0, 18000, 40000), each = 13),
  Threshold_upper = rep(c(18000, 40000, Inf), each = 13),
  Rate = rep(c(0, 0.2, 0.4), each = 13),
  `Tax paid (up to MTR)` = rep(c(0, 0, 4400), each = 13)
)

# 1e8 rows. Only 1e5 distinct incomes are drawn; they are repeated explicitly
# with rep_len() rather than relying on data.table() recycling a shorter
# column, and `replace` is spelled out instead of being passed as a positional 1.
taxation_data <- data.table(
  Year = sample(2010:2022, 1e8, replace = TRUE),
  Income = rep_len(runif(1e5, 0, max(parameters$Threshold_lower) * 1.3), 1e8)
)

时间：

# Time both implementations on the same inputs. tax_join1() is run first:
# tax_join2() adds its columns to taxation_data by reference, so running it
# first would change the table that tax_join1() receives.
system.time(dt1 <- tax_join1(parameters, taxation_data))
#>    user  system elapsed 
#>   41.21    3.86   42.06
system.time(dt2 <- tax_join2(parameters, taxation_data))
#>    user  system elapsed 
#>    9.06    2.17   12.41
# Check that both approaches return exactly the same table.
identical(dt1, dt2)
#> [1] TRUE

Answer 3

我发现了一种既简单又快得多的方法。无需加入。

library(tidyverse)
library(data.table)

# Tax schedule: one row per bracket ("Component") and Year.
parameters <- data.table(
  Component = rep(c("A", "B", "C"), each = 2),
  Year = rep(c(2020, 2021), times = 3),
  Threshold_lower = c(0, 0, 18000, 18000, 40000, 50000),
  Threshold_upper = c(18000, 18000, 40000, 50000, Inf, Inf),
  Rate = c(0, 0, 0.2, 0.2, 0.4, 0.45),
  `Tax paid (up to MTR)` = c(0, 0, 0, 0, 4400, 6400)
)

# Individual income records: one row per person and Year.
taxation_data <- data.table(
  Year = rep(c(2020, 2021), each = 2),
  Income = c(20000, 15000, 80000, 45000)
)

# Compute `Gross tax` directly from each Year's own bracket schedule.
#
# The schedules differ between years (the top bracket starts at 40000 in 2020
# but at 50000 in 2021), so the thresholds of all years must not be pooled into
# one vector: that would apply a single blended schedule to every row. Instead,
# each Year's brackets are read from `parameters` and every income is located
# with one vectorised findInterval() call -- no join and no code built from
# strings.
taxation_data[, `Gross tax` := {
  this_year <- .BY$Year
  brackets <- parameters[Year == this_year][order(Threshold_lower)]
  lower <- brackets$Threshold_lower
  rate <- brackets$Rate
  # mtr = tax paid up to last threshold
  mtr <- cumsum(c(0, diff(lower) * head(rate, -1L)))
  # left.open = TRUE keeps an income equal to a threshold in the lower bracket
  # (the `Income <= upper threshold` rule); pmax() sends incomes at or below
  # the lowest threshold to the first bracket.
  bracket <- pmax(findInterval(Income, lower, left.open = TRUE), 1L)
  (Income - lower[bracket]) * rate[bracket] + mtr[bracket]
}, by = Year]

是否有更快的方法来执行非等值连接并找到 R 中连接值的最大值？

问题描述投票：0回答：3

3个回答

最新问题

是否有更快的方法来执行非等值连接并找到 R 中连接值的最大值？

问题描述 投票：0回答：3

3个回答

最新问题

问题描述投票：0回答：3