我正在处理两个不同的数据框,它们分别记录了两个规模不同的组(第2组大于第1组)在特定年份对某个分类变量的观测值及其频率。
现在,我想把第2组中与第1组在分类变量(a)上取值相同、且属于同一年的观测的频率,添加到第1组的数据框中。到目前为止,我还没有找到合适的函数或循环。
到目前为止我尝试过的:
library(tidyverse)
library(dplyr)
# Group 1: one observed category (a_1) per year.
year_1 <- c(1914, 1915, 1916, 1917)
a_1 <- c("blue", "green", "green", "blue")
df_1 <- data.frame(a_1, year_1)

# Count the rows for each (a_1, year_1) pair and express every count as a
# share of all rows. The pipeline has to start from the data frame itself,
# and the assignment belongs at the head of the chain, not in its last step.
df_1 <- df_1 %>%
  count(a_1, year_1, name = "counts") %>%
  mutate(freq_1 = counts / sum(counts))
# Group 2: the larger group, again one observed category (a_2) per year.
year_2 <- c(1912, 1913, 1914, 1915, 1916, 1917, 1918)
a_2 <- c("black", "pink", "blue", "green", "green", "pink", "blue")
df_2 <- data.frame(a_2, year_2)

# Count the rows for each (a_2, year_2) pair and express every count as a
# share of all rows. The pipeline has to start from the data frame itself,
# and the assignment belongs at the head of the chain, not in its last step.
df_2 <- df_2 %>%
  count(a_2, year_2, name = "counts") %>%
  mutate(freq_2 = counts / sum(counts))
# Attach freq_2 from df_2 to every df_1 row that has the same category and
# the same year; df_1 rows without a partner in df_2 get NA.
# A keyed join replaces the loop: `for (i in df_1)` walks over the columns of
# df_1 rather than its rows, `if ()` needs a single TRUE/FALSE instead of a
# whole logical vector, and `return()` is only valid inside a function.
df_1 <- df_1 %>%
  left_join(
    select(df_2, a_2, year_2, freq_2),
    by = c("a_1" = "a_2", "year_1" = "year_2")
  )
我不知道这是不是你想要的:
library(dplyr)
# Recreate group 1 from the question, with small fixes so that it runs.
a_1 <- c("blue", "green", "green", "blue")
year_1 <- c(1914, 1915, 1916, 1917)
df_1 <- data.frame(a_1 = a_1, year_1 = year_1, stringsAsFactors = FALSE)

# One row per (a_1, year_1) pair with its row count `n`, followed by the
# share of each count in the total.
df_1 <- df_1 %>%
  group_by(a_1, year_1) %>%
  tally() %>%
  ungroup() %>%
  mutate(freq_1 = n / sum(n))
# Recreate group 2 from the question in the same way.
a_2 <- c("black", "pink", "blue", "green", "green", "pink", "blue")
year_2 <- c(1912, 1913, 1914, 1915, 1916, 1917, 1918)
df_2 <- data.frame(a_2 = a_2, year_2 = year_2, stringsAsFactors = FALSE)

# One row per (a_2, year_2) pair with its row count `n`, followed by the
# share of each count in the total.
df_2 <- df_2 %>%
  group_by(a_2, year_2) %>%
  tally() %>%
  ungroup() %>%
  mutate(freq_2 = n / sum(n))
# Match each df_1 row to the df_2 row with the same year and category. Both
# inputs carry a column `n`, so the join returns them as n.x (from df_1) and
# n.y (from df_2).
df_1 %>%
  left_join(df_2, by = c("year_1" = "year_2", "a_1" = "a_2")) %>%
  mutate(
    # A df_1 row without a partner in df_2 has n.y = NA and keeps its own count.
    count = ifelse(is.na(n.y), n.x, n.x + n.y),
    freq = count / sum(count)
  ) %>%
  select(-c(n.x, n.y, freq_1, freq_2))
# A tibble: 4 x 4
a_1 year_1 count freq
<chr> <dbl> <int> <dbl>
1 blue 1914 2 0.286
2 blue 1917 1 0.143
3 green 1915 2 0.286
4 green 1916 2 0.286
这会找出 df_2 和 df_1 中在同一年同时出现在两个数据框里的所有类别,把两个数据框中各自的计数相加,并计算一个类似于 freq_1 和 freq_2 列的新百分比。
# The two input tables from the question.
df1 <- data.frame(
  a = c("blue", "green", "green", "blue"),
  year = c(1914, 1915, 1916, 1917)
)
df2 <- data.frame(
  a = c("black", "pink", "blue", "green", "green", "pink", "blue"),
  year = c(1912, 1913, 1914, 1915, 1916, 1917, 1918)
)

# Number of rows for every (a, year) pair in each table.
df1 <- aggregate(
  list(count = df1$year),
  by = list(a = df1$a, year = df1$year),
  FUN = length
)
df2 <- aggregate(
  list(count = df2$year),
  by = list(a = df2$a, year = df2$year),
  FUN = length
)

# Stack both tables, total the counts per (a, year) pair, and keep only the
# pairs that occur in df1 by merging on the two key columns.
merge(
  df1[c("a", "year")],
  aggregate(count ~ a + year, data = rbind(df1, df2), FUN = sum),
  by = c("a", "year")
)
a year count
1 blue 1914 2
2 blue 1917 1
3 green 1915 2
4 green 1916 2
# Include counts of df2 in df1 using ave
# Stack both tables, total `count` within every (a, year) pair, and keep the
# totals that belong to df1's rows, which rbind() places first.
# seq_len() keeps the index empty when df1 has no rows, whereas 1:NROW(df1)
# would evaluate to c(1, 0) and pick up a row of df2.
df1$count <- with(
  rbind(df1, df2),
  ave(count, a, year, FUN = sum)
)[seq_len(nrow(df1))]
df1
a year count
1 blue 1914 2
2 green 1915 2
3 green 1916 2
4 blue 1917 1