我需要统计每个组在整个数据集中是第几次(以连续一段为一次)出现,并把这个次数标在该组旁边。下面是样本数据。如果某个组第一次出现,那么它旁边的数字为 1;同样,如果它在后面的行中再次出现,那么它旁边的数字为 2,依此类推……对每个组都是如此。请参阅样本数据和预期输出,以便更清楚地理解。
样本数据:
Group
Group1
Group1
Group1
Group1
Group1
Group1
Group2
Group2
Group2
Group2
Group2
Group2
Group2
Group2
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group2
Group2
Group2
Group2
Group2
Group2
Group2
Group2
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group3
Group3
Group3
Group3
Group3
Group3
Group3
Group3
Group3
Group3
预期输出:
Group No of times
Group1 1
Group1 1
Group1 1
Group1 1
Group1 1
Group1 1
Group2 1
Group2 1
Group2 1
Group2 1
Group2 1
Group2 1
Group2 1
Group2 1
Group1 2
Group1 2
Group1 2
Group1 2
Group1 2
Group1 2
Group1 2
Group1 2
Group2 2
Group2 2
Group2 2
Group2 2
Group2 2
Group2 2
Group2 2
Group2 2
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group3 1
Group3 1
Group3 1
Group3 1
Group3 1
Group3 1
Group3 1
Group3 1
Group3 1
Group3 1
一种方式是使用 data.table 的 rleid:我们先创建一个 new 列,使 Group 列每发生一次变化就得到一个不同的值。然后按 Group 分组(group_by),每当 new 的值发生变化时,就用 cumsum 使计数加一。
library(data.table)
library(dplyr)
# Number each consecutive run of a Group value in order of appearance.
# rleid() gives a new id every time Group changes from one row to the next;
# within each Group the counter then goes up each time that id changes.
df %>%
  mutate(new = rleid(Group)) %>%
  group_by(Group) %>%
  # Integer literals keep the counter an integer instead of a double.
  mutate(no_of_times = cumsum(c(1L, diff(new) != 0L))) %>%
  # Drop the grouping so later verbs do not silently operate per Group.
  ungroup() %>%
  select(-new)
# Group no_of_times
#1 Group1 1
#2 Group1 1
#3 Group1 1
#4 Group1 1
#5 Group1 1
#6 Group1 1
#7 Group2 1
#8 Group2 1
#9 Group2 1
#10 Group2 1
#11 Group2 1
#12 Group2 1
#13 Group2 1
#14 Group2 1
#15 Group1 2
#.....
与我的答案相比,Ronak 的答案很棒,不过这里还是给出我想到的 dplyr / tidyr 解决方案。思路是:
library(dplyr)
library(tidyr)
# Add `cluster_number`: for every row, the ordinal of the consecutive run of
# its Group value (1 for the first run of that Group, 2 for the second, ...).
#
# Run starts are detected *within* each Group: rows keep their original order
# inside a group, so a gap other than 1 between consecutive original row
# numbers means the group was interrupted and a new run begins. Comparing
# row numbers across group boundaries (as a global lag after sorting would)
# can miss a run start when one group's last row directly precedes another
# group's first row, so the comparison is done per group.
df_clustered <-
  df %>%
  mutate(rownum = row_number()) %>%
  group_by(Group) %>%
  # The first row of every group always starts a run, hence the leading TRUE.
  mutate(cluster_number = cumsum(c(TRUE, diff(rownum) != 1L))) %>%
  ungroup() %>%
  select(-rownum)
head(df_clustered, 20)
#> # A tibble: 20 x 3
#> Group Value cluster_number
#> <chr> <dbl> <int>
#> 1 Group1 1 1
#> 2 Group1 2 1
#> 3 Group1 1 1
#> 4 Group1 1.3 1
#> 5 Group1 1.2 1
#> 6 Group1 1 1
#> 7 Group2 7 1
#> 8 Group2 6 1
#> 9 Group2 2 1
#> 10 Group2 1 1
#> 11 Group2 25 1
#> 12 Group2 23 1
#> 13 Group2 24 1
#> 14 Group2 25 1
#> 15 Group1 24 2
#> 16 Group1 23 2
#> 17 Group1 26 2
#> 18 Group1 23 2
#> 19 Group1 17 2
#> 20 Group1 11 2
这是一个纯 data.table 的解决方案,它基于 rle() 和 rep():
library(data.table)
# Example table: five alternating runs of Group1/Group2 (36 rows) plus a
# numeric Value column.
DT <- data.table(
  Group = rep(
    c("Group1", "Group2", "Group1", "Group2", "Group1"),
    times = c(6L, 8L, 8L, 8L, 6L)
  ),
  Value = c(
    1, 2, 1, 1.3, 1.2, 1, 7, 6, 2, 1, 25, 23, 24, 25, 24, 23, 26, 23,
    17, 11, 2, 1, 1, 2, 2.3, 1, 3, 4, 1, 1, 2, 25, 26, 11, 17, 16
  ),
  stringsAsFactors = FALSE
)

# Run-length encode Group: one entry per consecutive run (`lengths`, `values`).
lengthEncoding <- rle(DT$Group)

# Convert the rle result to a data.table in place, then number the runs of
# each distinct value in order of appearance.
setDT(lengthEncoding)
lengthEncoding[, group_count := seq_len(.N), by = values]

# Expand the per-run counter back to one entry per row of DT.
DT[, ("No of times") := rep(
  lengthEncoding$group_count,
  times = lengthEncoding$lengths
)]
print(DT)
顺便说一句,这个解决方案比被采纳的答案更快:
编辑:添加了 @chinsoon12 漂亮的单行代码,它值得夺冠!
library(microbenchmark)
library(data.table)
library(dplyr)
# Benchmark input: five alternating runs of Group1/Group2 (36 rows) with a
# numeric Value column; Group is kept as character, not factor.
df <- data.frame(
  Group = rep(
    c("Group1", "Group2", "Group1", "Group2", "Group1"),
    times = c(6L, 8L, 8L, 8L, 6L)
  ),
  Value = c(
    1, 2, 1, 1.3, 1.2, 1, 7, 6, 2, 1, 25, 23, 24, 25, 24, 23, 26, 23,
    17, 11, 2, 1, 1, 2, 2.3, 1, 3, 4, 1, 1, 2, 25, 26, 11, 17, 16
  ),
  stringsAsFactors = FALSE
)
# data.table version of df, used by the two data.table-based candidates below.
DT <- data.table(df)
# Time the three candidate solutions with microbenchmark. The argument names
# (RonakShah, ismirsehregal, chinsoon12) are the labels shown in the result.
results <- microbenchmark(
# dplyr pipeline: rleid() gives each consecutive run of Group its own id, and
# within each Group the counter increases whenever that id changes.
RonakShah = {
df %>%
mutate(new = rleid(Group)) %>%
group_by(Group) %>%
mutate(no_of_times = cumsum(c(1,diff(new) != 0))) %>%
select(-new)
},
# rle()/rep(): number the runs of each value, then expand the per-run counter
# back to one entry per row of DT.
# NOTE(review): `:=` assigns into DT by reference, so the added column
# persists between iterations -- presumably harmless for timing; confirm.
ismirsehregal = {
lengthEncoding <- rle(DT$Group)
setDT(lengthEncoding)[, group_count := seq_len(.N), by="values"]
DT[, "No of times" := rep(lengthEncoding$group_count, lengthEncoding$lengths)]
},
# Single grouped expression: within each Group, a jump of more than 1 between
# consecutive row indices (.I) starts a new run.
chinsoon12 = {DT[, numtimes := 1L + c(0L, cumsum(diff(.I) > 1L)), by=.(Group)]}
)
# Print the timing summary, then plot the results.
print(results)
plot(results)
expr min lq mean median uq max neval cld
RonakShah 3.980914 4.253103 4.898788 4.500009 5.063746 8.021481 100 c
ismirsehregal 1.494078 1.653283 1.937947 1.828487 2.023246 5.678442 100 b
chinsoon12 1.050436 1.239666 1.469426 1.440154 1.646369 2.572168 100 a