以下代码生成数据:
# Set the seed for reproducibility
set.seed(123)
# Generate random data
n <- 490
PTSD <- sample(c(1, 2, NA), n, replace = TRUE) #class(PTSD) = "numeric"
ANX <- sample(c(1, 2, NA), n, replace = TRUE) #class(ANX) = "numeric"
DEP <- sample(c(1, 2, NA), n, replace = TRUE) #class(DEP) = "numeric"
# Create the data frame
df <- data.frame(PTSD, ANX, DEP) #class(df) = "data.frame"
# Label the values: 1 = Low, 2 = High
expss::val_lab(df$PTSD) = expss::num_lab("1 Low
2 High")
expss::val_lab(df$ANX) = expss::num_lab("1 Low
2 High")
expss::val_lab(df$DEP) = expss::num_lab("1 Low
2 High")
# Create a list of tables for each variable to count 1s, 2s, and NAs
count_results <- list(
PTSD = table(df$PTSD, useNA = "ifany"),
ANX = table(df$ANX, useNA = "ifany"),
DEP = table(df$DEP, useNA = "ifany")
)
这部分代码进行了一些频率计数并总结了数据:
# Combine the count tables into a single table
count_table <- do.call(rbind, count_results)
# Initialize empty vectors to store results
variable_names <- character()
sample_sizes <- numeric()
# Loop through the test results and extract relevant information
for (variable_name in names(count_results)) {
sample_sizes <- c(sample_sizes, sum(count_results[[variable_name]]))
variable_names <- c(variable_names, variable_name)
}
# Create summary data frame
summary_df <- data.frame(
Variable = variable_names,
N = sample_sizes
)
# Combine the count table and chi-squared summary table by columns
final_result <- cbind(count_table, summary_df)
# Remove Variable column in the middle of the table
final_result <- subset(final_result, select = -c(Variable))
这部分代码执行我所说的“组合分析”(它基于此SO线程接受的答案):
library(dplyr)
out <- df %>%
mutate(id = row_number())%>%
tidyr::pivot_longer(PTSD:DEP) %>%
filter(value == 2)%>%
summarise(combination = toString(name),.by=id) %>%
summarise(n = n(), .by = combination)
打印频率计数和摘要会生成以下内容:
> print(final_result)
Low High NA N
PTSD 164 167 159 490
ANX 157 156 177 490
DEP 168 156 166 490
打印频率计数和摘要会生成以下内容:
# A tibble: 7 × 2
combination n
<chr> <int>
1 ANX 72
2 ANX, DEP 28
3 PTSD 82
4 DEP 76
5 PTSD, DEP 29
6 PTSD, ANX 33
7 PTSD, ANX, DEP 23
我真正感兴趣的是“高”频率及其组合(即 PTSD == 2、ANX == 2 和 DEP == 2)。
因此,我预计两个表之间的 PTSD High、ANX High 和 DEP High 是相同的,但事实并非如此!
为了检查第二个表(即 tibble 表)应显示的内容,我将
df
导出到 CSV 文件并将其导入到电子表格中。
我使用了
COUNTIFS
函数(具有以下语法 COUNTIFS(criteria_range1, criteria1, [criteria_range2, criteria2]…)
),得到了下表:
Combination n
--------------------
PTSD 167
ANX 156
DEP 156
PTSD + ANX 56
PTSD + DEP 52
ANX + DEP 51
PTSD + ANX + DEP 23
我的问题:
以下是您必须在 Excel 中使用 countif 完成的操作的再现
library(tidyverse)
library(rlang)
t3 <- c("PTSD","ANX","DEP")
(combs <- map(seq_along(t3),\(n)combn(t3,n,simplify = FALSE)) |> flatten())
(filts <- parse_exprs(map_chr(combs,\(x)paste0(x ,'== 2',collapse=' & '))))
(filtsnames <- parse_exprs(map_chr(combs,\(x)paste0(x ,collapse=' + '))))
names(filts) <- filtsnames
(out2 <- map_int(filts,\(x){
df %>%
mutate(id = row_number())%>%
filter(!!(x))%>%
summarise(
n = n())
} |> pull(n)
))
enframe(out2)