统计每个组在整个数据集中是第几次出现

问题描述 投票:1回答:3

我需要统计每个组在整个数据集中是第几次出现,并把这个序号标在该组每一行的旁边。下面是样本数据。如果某个组是第一次出现,就在旁边标 1;如果它在后面再次出现,就在旁边标 2,依此类推……每个组都单独计数。请参考样本数据和预期输出,以便更清楚地理解。

样本数据:

  Group
Group1
Group1
Group1
Group1
Group1
Group1
Group2
Group2
Group2
Group2
Group2
Group2
Group2
Group2
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group2
Group2
Group2
Group2
Group2
Group2
Group2
Group2
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group3
Group3
Group3
Group3
Group3
Group3
Group3
Group3
Group3
Group3

预期输出:

   Group    No of times
Group1  1
Group1  1
Group1  1
Group1  1
Group1  1
Group1  1
Group2  1
Group2  1
Group2  1
Group2  1
Group2  1
Group2  1
Group2  1
Group2  1
Group1  2
Group1  2
Group1  2
Group1  2
Group1  2
Group1  2
Group1  2
Group1  2
Group2  2
Group2  2
Group2  2
Group2  2
Group2  2
Group2  2
Group2  2
Group2  2
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group1  3
Group3  1
Group3  1
Group3  1
Group3  1
Group3  1
Group3  1
Group3  1
Group3  1
Group3  1
Group3  1
r
3个回答
3
投票

一种做法是使用 data.table 的 rleid:先创建一个 new 列,使 Group 列的值每变化一次,new 就得到一个不同的值。然后按 Group 分组(group_by),在每个组内,new 的值每变化一次,就用 cumsum 把计数加 1。

library(data.table)
library(dplyr)

# Number each Group by the order in which its runs of consecutive rows appear.
# `df` is expected to be a data frame with a `Group` column, as in the
# question's sample data.
df %>%
  # rleid() gives every run of identical consecutive Group values its own id.
  mutate(new = rleid(Group)) %>%
  group_by(Group) %>%
  # Within one group, each change of run id marks the next appearance of that
  # group, so the running sum of those changes is the appearance number.
  mutate(no_of_times = cumsum(c(1, diff(new) != 0))) %>%
  # Drop the grouping so that later verbs are not silently applied per group.
  ungroup() %>%
  select(-new)


#   Group no_of_times
#1  Group1           1
#2  Group1           1
#3  Group1           1
#4  Group1           1
#5  Group1           1
#6  Group1           1
#7  Group2           1
#8  Group2           1
#9  Group2           1
#10 Group2           1
#11 Group2           1
#12 Group2           1
#13 Group2           1
#14 Group2           1
#15 Group1           2
#.....

1
投票

与我的答案相比,Ronak 的答案要好得多,不过这里还是给出我想到的 dplyr / tidyr 解决方案。思路是:

  1. 按原始顺序给各行编号。
  2. 按组对表排序。同一组内各簇之间的断点表现为行号的跳变(例如 4, 5, 6, 15, 16, 17 ...)。
  3. 给每个簇的第一条记录分配簇号,然后向下填充所有 NA。
library(dplyr)
library(tidyr)

# For every Group, number its runs ("clusters") of consecutive rows in order of
# appearance. `df` is expected to be a data frame with a `Group` column.
df_clustered <-
    df %>%
    # Remember the original row order so it can be restored at the end.
    mutate(rownum = row_number()) %>%
    # Bring the rows of each group together, keeping their original order.
    arrange(Group, rownum) %>%
    # Compare neighbouring row numbers *within* each group only. Without this
    # grouping, the first row of a group is compared with the last row of the
    # previous group; if those two happen to be adjacent in the original data,
    # the new group's first cluster is not marked and inherits the previous
    # group's cluster number from fill().
    group_by(Group) %>%
    # mark is TRUE on the first row of every cluster (including the first row
    # of each group, where lag() is NA) and NA on all other rows.
    mutate(mark = case_when((rownum - lag(rownum)) == 1 ~ NA, TRUE ~ TRUE)) %>%
    group_by(Group, mark) %>%
    # Among the marked rows of a group, row_number() is the cluster's ordinal.
    mutate(cluster_number = if_else(is.na(mark), NA_integer_, row_number())) %>%
    ungroup() %>%
    # Carry each cluster's number down to the remaining rows of that cluster.
    fill(cluster_number) %>%
    arrange(rownum) %>%
    select(-rownum, -mark)

head(df_clustered, 20)
#> # A tibble: 20 x 3
#>    Group  Value cluster_number
#>    <chr>  <dbl>          <int>
#>  1 Group1   1                1
#>  2 Group1   2                1
#>  3 Group1   1                1
#>  4 Group1   1.3              1
#>  5 Group1   1.2              1
#>  6 Group1   1                1
#>  7 Group2   7                1
#>  8 Group2   6                1
#>  9 Group2   2                1
#> 10 Group2   1                1
#> 11 Group2  25                1
#> 12 Group2  23                1
#> 13 Group2  24                1
#> 14 Group2  25                1
#> 15 Group1  24                2
#> 16 Group1  23                2
#> 17 Group1  26                2
#> 18 Group1  23                2
#> 19 Group1  17                2
#> 20 Group1  11                2

1
投票

这是一个纯 data.table 的解决方案,它基于 rle() 和 rep()。

library(data.table)

# Sample data: five alternating runs of Group1 / Group2 (36 rows) with a
# numeric Value column.
DT <- data.table(
  Group = rep(
    c("Group1", "Group2", "Group1", "Group2", "Group1"),
    times = c(6, 8, 8, 8, 6)
  ),
  Value = c(
    1, 2, 1, 1.3, 1.2, 1,
    7, 6, 2, 1, 25, 23, 24, 25,
    24, 23, 26, 23, 17, 11, 2, 1,
    1, 2, 2.3, 1, 3, 4, 1, 1,
    2, 25, 26, 11, 17, 16
  ),
  stringsAsFactors = FALSE
)

# Run-length encode the Group column: one entry per run of identical values.
lengthEncoding <- rle(DT$Group)

# Turn the encoding into a data.table (columns `lengths` and `values`) and
# number the runs of each distinct value in order of appearance.
setDT(lengthEncoding)
lengthEncoding[, group_count := seq_len(.N), by = "values"]

# Expand the per-run counter back to one value per original row.
DT[, "No of times" := rep(
  lengthEncoding[["group_count"]],
  times = lengthEncoding[["lengths"]]
)]

print(DT)

顺便说一句,这个解决方案比接受的答案更快:

编辑:加入了 @chinsoon12 漂亮的单行解法,它当之无愧地夺冠!

library(microbenchmark)
library(data.table)
library(dplyr)

# Benchmark input: five alternating runs of Group1 / Group2 (36 rows) with a
# numeric Value column.
df <- data.frame(
  Group = rep(
    c("Group1", "Group2", "Group1", "Group2", "Group1"),
    times = c(6, 8, 8, 8, 6)
  ),
  Value = c(
    1, 2, 1, 1.3, 1.2, 1,
    7, 6, 2, 1, 25, 23, 24, 25,
    24, 23, 26, 23, 17, 11, 2, 1,
    1, 2, 2.3, 1, 3, 4, 1, 1,
    2, 25, 26, 11, 17, 16
  ),
  stringsAsFactors = FALSE
)

# data.table version of the same sample data, used by the two data.table
# candidates below.
DT <- data.table(df)

# Time the three proposed solutions on the sample data. Each named argument is
# one candidate; the name is the label shown in the `expr` column of the output.
results <- microbenchmark(
  # dplyr pipeline with data.table::rleid(), run on the plain data frame.
  RonakShah = {
    df %>% 
      mutate(new = rleid(Group)) %>%
      group_by(Group) %>%
      mutate(no_of_times = cumsum(c(1,diff(new) != 0))) %>%
      select(-new)
  },
  # rle() + rep(): number the runs per value, then expand back to row level.
  # NOTE(review): `:=` assigns into DT, so after the first evaluation the
  # column already exists and is overwritten on later runs.
  ismirsehregal = {
    lengthEncoding <- rle(DT$Group)
    setDT(lengthEncoding)[, group_count := seq_len(.N), by="values"]
    DT[, "No of times" := rep(lengthEncoding$group_count, lengthEncoding$lengths)]
  },
  # One-liner: within each Group, a gap of more than 1 between consecutive row
  # indices (.I) starts a new appearance of that group.
  chinsoon12 = {DT[, numtimes := 1L + c(0L, cumsum(diff(.I) > 1L)), by=.(Group)]}
)

# Show the timing summary and plot the timing distributions.
print(results)
plot(results)

          expr      min       lq     mean   median       uq      max neval cld
     RonakShah 3.980914 4.253103 4.898788 4.500009 5.063746 8.021481   100   c
 ismirsehregal 1.494078 1.653283 1.937947 1.828487 2.023246 5.678442   100  b 
    chinsoon12 1.050436 1.239666 1.469426 1.440154 1.646369 2.572168   100 a  
© www.soinside.com 2019 - 2024. All rights reserved.