R:满足条件的所有数字组合

问题描述 投票:0回答:6

我正在使用 R 编程语言。

假设我有数字:2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020

我的问题:我想找出这些数字的所有可能的差值大于1的对。 (2013-2010)、(2019-2011)等

目前,我正在使用一种非常笨拙的方法来做到这一点:

x = 2010:2020
grid = expand.grid(x,x)
grid$diff = grid$Var1 - grid$Var2
grid$condition = ifelse(grid$Var1 - grid$Var2 > 1, "yes", "no")

final = grid[grid$condition == "yes",]

有更有效的方法吗?

例如,这里我首先生成所有可能的组合,然后消除无效的组合。当数量很多时,这可能无效。有更好的方法吗?

谢谢!

r combinations
6个回答
4
投票

在你的问题中,目标是获得向量中数字的组合,使得该对的差值大于 1,但你不想进行不必要的比较,浪费处理时间。虽然我喜欢 Ritchie Sacramento's answer 对于连续序列,我想提供一个可以处理序列中的间隙、未排序序列和非整数序列的答案。我认为要使其更“高效”的唯一方法是迭代序列并收集比当前迭代数至少大 1 的所有数字。

在我的方法中,我首先强制序列排序。然后,我循环遍历序列中的每个元素,并仅比较当前元素之后的索引。如果这些满足差异标准,我会将它们添加到运行的配对列表中。这假设您没有重复的数字,否则您可能需要先运行

unique()

#' Find pairs in a sequence with a difference greater than a specified offset.
#'
#' This function sorts the input sequence, and then iterates through the sorted sequence 
#' to find pairs of numbers with a difference greater than the specified difference offset. 
#' Pairs are returned in a matrix where pair[1] > pair[2].
#'
#' @param x A numeric vector representing the input sequence.
#' @param differenceOffset A numeric value representing the minimum difference between pairs. Default is 1.
#' @return A matrix where each row represents a pair with a difference greater than the difference offset.
#' @examples
#' find_pairs(c(1,2,3,4,5))  # uses default differenceOffset of 1
#' find_pairs(c(2,4,6,8,10), 2)
find_pairs <- function(x, differenceOffset = 1) {
  # sort the sequence
  x_sorted <- sort(x)
  len <- length(x_sorted)

  # initialize a list to store the pairs
  pairs <- list()

  # iterate through the sorted sequence
  for(current in 1:(len - 1)) {
    # iterate and test the next index forward of the current
    for(test in (current + 1):len) {
      # if the difference is > differenceOffset, add the pair to the list
      if(x_sorted[test] - x_sorted[current] > differenceOffset) {
        # flip the order to put pair[1] > pair[2] in the output
        pairs <- c(pairs, list(c(x_sorted[test], x_sorted[current])))
      }
    }
  }

  # convert the list to a matrix
  do.call(rbind, pairs)
}

这里有几个用法示例。

# unsorted example
> find_pairs(c(4,3,2,5,1))
     [,1] [,2]
[1,]    3    1
[2,]    4    1
[3,]    5    1
[4,]    4    2
[5,]    5    2
[6,]    5    3

# use difference offset of >2
> find_pairs(c(1,2,3,4,5), 2)
     [,1] [,2]
[1,]    4    1
[2,]    5    1
[3,]    5    2

# your example
> find_pairs(2010:2020)
      [,1] [,2]
 [1,] 2012 2010
 [2,] 2013 2010
 [3,] 2014 2010
 [4,] 2015 2010
 [5,] 2016 2010
 [6,] 2017 2010
 [7,] 2018 2010
 [8,] 2019 2010
 [9,] 2020 2010
[10,] 2013 2011
...
[40,] 2018 2016
[41,] 2019 2016
[42,] 2020 2016
[43,] 2019 2017
[44,] 2020 2017
[45,] 2020 2018

# gap example
> find_pairs(c(1:2, 6:8))
     [,1] [,2]
[1,]    6    1
[2,]    7    1
[3,]    8    1
[4,]    6    2
[5,]    7    2
[6,]    8    2
[7,]    8    6


# non-integer example
> find_pairs(seq(1, 3, by = 0.25), 1)
      [,1] [,2]
 [1,] 2.25 1.00
 [2,] 2.50 1.00
 [3,] 2.75 1.00
 [4,] 3.00 1.00
 [5,] 2.50 1.25
 [6,] 2.75 1.25
 [7,] 3.00 1.25
 [8,] 2.75 1.50
 [9,] 3.00 1.50
[10,] 3.00 1.75

就时间复杂度而言,上述解决方案的总时间复杂度 [O(n log n + n)] 可能小于使用

expand.grid
[O(n^2)] 对于长向量的解决方案。

干杯!


3
投票

第一次尝试:

nums <- 2010:2020

combn(nums, 2)[,combn(nums, 2, diff) > 1]

更新:从头开始创建对:

library(purrr)

f <- function(d, start, end){
    map(start:(end-d), \(x) c(x, x + d))
}

pairs <- function(min_diff, start, end){
  map(min_diff:(end-start), \(d) f(d, start, end)) |> 
    flatten()
}

pairs(2, 2010, 2020)

3
投票

使用非等连接进行比较的data.table选项。
我简化了您的选择以使其进行更公平的比较,因为不需要

ifelse
子步骤

library(data.table)
x = 2010:4500

system.time({
    xd <- data.table(x)
    xd[, xp1 := x + 1]
    out1 <- xd[xd, on=.(x>xp1), nomatch=0L, .(Var1=x.x, Var2=i.x, diff=x.x-i.x)]
})
##   user  system elapsed 
##   0.04    0.01    0.07 

system.time({
    grid = expand.grid(x,x)
    out2 <- grid[grid$Var1 - grid$Var2 > 1,]
    out2$diff <- out2$Var1 - out2$Var2
})
##   user  system elapsed 
##   0.23    0.08    0.32 

每一行的结果都匹配:

all(mapply(\(x,y) all(x==y), out1, out2))
## [1] TRUE

3
投票

dist
解决方案、使用循环的高效基本解决方案以及高效
data.table
解决方案,所有这些都以迄今为止提出的一些解决方案为基准。

dist
解决方案:

library(parallelDist)

f1 <- function(x, mindiff = 1) {
  n <- 2*length(x)
  d <- parDist(as.matrix(x))
  y <- which(d > mindiff)
  i <- (n + 1 - sqrt((n - 1)^2 - 8*(y - 1)))%/%2
  data.frame(Var1 = x[i], Var2 = x[i + y - (n - i)*(i - 1)/2], diff = d[y])
}

内存高效的基本解决方案,可打破

for
循环:

f2 <- function(x, mindiff = 1) {
  x <- sort(x)
  n <- length(x)
  n1 <- n + 1L
  idx <- rep(n1, n - 1L)
  j <- 2L
  
  for (i in 1:(n - 1L)) {
    for (j in j:n) {
      if (x[j] - x[i] > mindiff) {
        idx[i] <- j
        break
      }
    }
    
    if (idx[i] == n1) break
  }
  
  span <- n - idx + 1L
  within(
    out <- data.frame(
      Var1 = x[rep.int(1:(n - 1L), span)], Var2 = x[sequence(span, idx)]
    ),
    diff <- Var2 - Var1
  )
}

先前解决方案的矢量化版本:

library(data.table)

f3 <- function(x, mindiff = 1) {
  x <- sort(x)
  n <- length(x)

  setorder(
    data.table(
      x = c(x, x + mindiff), idx1 = rep(1:n, 2), keep = rep(0:1, each = n)
    ), x
  )[,idx2 := cummax(idx1)][
    keep == 1L, {
      span <- n - idx2
      .(
        Var1 = x[rep.int(idx1, span)],
        Var2 = x[sequence(span, idx2 + 1L)]
      )
    }
  ][,diff := Var2 - Var1]
}

为目前提出的其他解决方案定义函数:

f0 <- function(x, mindiff = 1) {
  # modified from OP
  grid = expand.grid(x, x)
  out <- grid[grid$Var1 - grid$Var2 > mindiff,]
  out$diff <- out$Var1 - out$Var2
  out
}

f4 <- function(x, mindiff = 1) {
  # from @thelatemail
  xd <- data.table(x)
  xd[, xp1 := x + mindiff]
  xd[xd, on=.(x>xp1), nomatch=0L, .(Var1=x.x, Var2=i.x, diff=x.x-i.x)]
}

find_pairs <- function(x, differenceOffset = 1) {
  # from @Khlick
  # sort the sequence
  x_sorted <- sort(x)
  len <- length(x_sorted)
  
  # initialize a list to store the pairs
  pairs <- list()
  
  # iterate through the sorted sequence
  for(current in 1:(len - 1)) {
    # iterate and test the next index forward of the current
    for(test in (current + 1):len) {
      # if the difference is > differenceOffset, add the pair to the list
      if(x_sorted[test] - x_sorted[current] > differenceOffset) {
        # flip the order to put pair[1] > pair[2] in the output
        pairs <- c(pairs, list(c(x_sorted[test], x_sorted[current])))
      }
    }
  }
  
  # convert the list to a matrix
  do.call(rbind, pairs)
}

使用较小数据集进行基准测试:

x <- runif(1e2, 0, 10)

microbenchmark::microbenchmark(
  expand.grid = nrow(f0(x)),
  loop1 = nrow(find_pairs(x)),
  dist = nrow(f1(x)),
  loop2 = nrow(f2(x)),
  data.table1 = nrow(f3(x)),
  data.table2 = nrow(f4(x)),
  check = "equal"
)
#> Unit: microseconds
#>         expr     min       lq      mean   median       uq      max neval
#>  expand.grid   606.9   673.15   746.301   705.05   754.60   3534.9   100
#>        loop1 66948.0 68710.55 71555.309 70563.85 73099.55 113492.4   100
#>         dist   534.5   578.20   634.820   620.00   671.80    898.3   100
#>        loop2   319.0   393.90   466.185   435.85   514.00    920.0   100
#>  data.table1  1870.9  1980.05  2158.900  2079.15  2302.05   3229.2   100
#>  data.table2  1804.9  2032.10  2746.983  2719.85  3284.35   5810.3   100

具有更大数据集的基准:

x <- runif(1e4, 0, 10)

microbenchmark::microbenchmark(
  expand.grid = nrow(f0(x)),
  # loop1 = nrow(find_pairs(x)), # runs for several minutes without completing
  dist = nrow(f1(x)),
  loop2 = nrow(f2(x)),
  data.table1 = nrow(f3(x)),
  data.table2 = nrow(f4(x)),
  check = "equal",
  times = 10
)
#> Unit: milliseconds
#>         expr       min        lq      mean    median        uq       max neval
#>  expand.grid 4333.9824 4526.9577 4578.6956 4580.5573 4625.4831 4792.6571    10
#>         dist 1996.4856 2121.0837 2134.5839 2161.4953 2168.5389 2215.8524    10
#>        loop2  667.5903  748.8159  838.5044  788.3667  930.6512 1043.5914    10
#>  data.table1  432.8631  570.3818  635.3953  633.7162  706.0260  791.4238    10
#>  data.table2 1592.9779 1619.0466 1736.3997 1764.8470 1826.4533 1872.4085    10

2
投票

对于示例中的一组连续数字,您可以使用

rep()
sequence()
来构建向量。这应该和使用基本 R 一样高效。

f <- function(start, end, difference) {
  ds <- ((end - start) - difference + 1):1
  data.frame(
    v1 = sequence(ds, (start + difference):end),
    v2 = rep(start:(end - difference), ds)
    )
}

f(2010, 2020, 2)

     v1   v2
1  2012 2010
2  2013 2010
3  2014 2010
4  2015 2010
5  2016 2010
6  2017 2010
7  2018 2010
8  2019 2010
9  2020 2010
10 2013 2011
...
43 2019 2017
44 2020 2017
45 2020 2018

0
投票

一个简单的基本 R 选项,带有

lapply

do.call(
    rbind,
    lapply(x, \(i) if (any(lg <- (i - x > 1))) cbind(i, j = x[lg]))
)

给予

         i    j
 [1,] 2012 2010
 [2,] 2013 2010
 [3,] 2013 2011
 [4,] 2014 2010
 [5,] 2014 2011
 [6,] 2014 2012
 [7,] 2015 2010
 [8,] 2015 2011
 [9,] 2015 2012
[10,] 2015 2013
[11,] 2016 2010
[12,] 2016 2011
[13,] 2016 2012
[14,] 2016 2013
[15,] 2016 2014
[16,] 2017 2010
[17,] 2017 2011
[18,] 2017 2012
[19,] 2017 2013
[20,] 2017 2014
[21,] 2017 2015
[22,] 2018 2010
[23,] 2018 2011
[24,] 2018 2012
[25,] 2018 2013
[26,] 2018 2014
[27,] 2018 2015
[28,] 2018 2016
[29,] 2019 2010
[30,] 2019 2011
[31,] 2019 2012
[32,] 2019 2013
[33,] 2019 2014
[34,] 2019 2015
[35,] 2019 2016
[36,] 2019 2017
[37,] 2020 2010
[38,] 2020 2011
[39,] 2020 2012
[40,] 2020 2013
[41,] 2020 2014
[42,] 2020 2015
[43,] 2020 2016
[44,] 2020 2017
[45,] 2020 2018

另一个基本 R 选项可以使用

combn

do.call(
    rbind,
    Filter(
        length,
        combn(x[order(-x)],
            2,
            \(...) if (diff(...) < -1) data.frame(t(...)),
            simplify = FALSE
        )
    )
)

这给出了

     X1   X2
1  2020 2018
2  2020 2017
3  2020 2016
4  2020 2015
5  2020 2014
6  2020 2013
7  2020 2012
8  2020 2011
9  2020 2010
10 2019 2017
11 2019 2016
12 2019 2015
13 2019 2014
14 2019 2013
15 2019 2012
16 2019 2011
17 2019 2010
18 2018 2016
19 2018 2015
20 2018 2014
21 2018 2013
22 2018 2012
23 2018 2011
24 2018 2010
25 2017 2015
26 2017 2014
27 2017 2013
28 2017 2012
29 2017 2011
30 2017 2010
31 2016 2014
32 2016 2013
33 2016 2012
34 2016 2011
35 2016 2010
36 2015 2013
37 2015 2012
38 2015 2011
39 2015 2010
40 2014 2012
41 2014 2011
42 2014 2010
43 2013 2011
44 2013 2010
45 2012 2010
© www.soinside.com 2019 - 2024. All rights reserved.