我们正在处理面板数据,Stata 中有一个命令,
xtsum
,它可以报告数据集中变量的组内(within)方差和组间(between)方差。
R 是否有类似的命令可以产生干净的输出?
我用了一个小函数来做到这一点。
函数 XTSUM 需要三个输入:
data -- 数据集
varname -- 需要汇总的变量(不加引号的列名)
unit -- 面板个体(组内维度)的标识符列
library(rlang)
library(dplyr)
#' Panel-data summary in the spirit of Stata's `xtsum`
#'
#' Decomposes the variation of one variable into overall, between-unit and
#' within-unit components.
#'
#' @param data A data frame (or tibble) holding the panel.
#' @param varname Unquoted name of the column to summarise.
#' @param unit Unquoted name of the column that identifies the panel unit.
#' @return A named list of three one-row tibbles:
#'   `ores` (ovr.mean, ovr.sd, ovr.min, ovr.max, ovr.N),
#'   `bres` (between.sd, between.min, between.max, Units, t.bar) and
#'   `wres` (within.sd, within.min, within.max). The within statistics are
#'   taken on deviations from each unit's own mean; the overall mean is not
#'   added back.
XTSUM <- function(data, varname, unit) {
  varname <- enquo(varname)
  loc.unit <- enquo(unit)

  # Overall: all non-missing observations pooled together.
  ores <- data %>%
    summarise(
      ovr.mean = mean(!!varname, na.rm = TRUE),
      ovr.sd = sd(!!varname, na.rm = TRUE),
      ovr.min = min(!!varname, na.rm = TRUE),
      ovr.max = max(!!varname, na.rm = TRUE),
      ovr.N = as.numeric(sum(!is.na(!!varname)))
    )

  # Between: one mean and one non-missing count per unit.
  bmeans <- data %>%
    group_by(!!loc.unit) %>%
    summarise(
      meanx = mean(!!varname, na.rm = TRUE),
      t.count = as.numeric(sum(!is.na(!!varname)))
    ) %>%
    ungroup()

  # t.count is never NA, so units are counted via t.count > 0. A unit whose
  # values are all missing contributes no mean (meanx is NaN and is dropped by
  # na.rm) and must not be counted in Units or drag t.bar towards zero.
  bres <- bmeans %>%
    summarise(
      between.sd = sd(meanx, na.rm = TRUE),
      between.min = min(meanx, na.rm = TRUE),
      between.max = max(meanx, na.rm = TRUE),
      Units = sum(as.numeric(t.count > 0)),
      t.bar = mean(t.count[t.count > 0])
    )

  # Within: scale(scale = FALSE) only centres, i.e. subtracts the unit mean.
  wdat <- data %>%
    group_by(!!loc.unit) %>%
    mutate(W.x = scale(!!varname, center = TRUE, scale = FALSE)) %>%
    ungroup()

  wres <- wdat %>%
    summarise(
      within.sd = sd(W.x, na.rm = TRUE),
      within.min = min(W.x, na.rm = TRUE),
      within.max = max(W.x, na.rm = TRUE)
    )

  list(ores = ores, bres = bres, wres = wres)
}
library(haven)
# Read the nlswork example file from the Stata Press site, then summarise
# `hours` using `idcode` as the panel unit.
nlswork <- read_stata(
  file = "http://www.stata-press.com/data/r13/nlswork.dta"
)
XTSUM(data = nlswork, varname = hours, unit = idcode)
此代码改编自上文 Rob 和 Robert Walker 的代码,输出的表格按照 Stata xtsum 的输出格式排版。
#' Panel-data summary laid out like Stata's `xtsum` table
#'
#' @param data A data frame (or tibble) holding the panel.
#' @param varname Unquoted name of the column to summarise.
#' @param unit Unquoted name of the column that identifies the panel unit.
#' @return A three-row data frame (Overall / Between / Within) with columns
#'   `Variable`, `Comparison`, `Mean`, `sd`, `min`, `max`, `Observations`.
#'   `Variable` and `Mean` are character and filled only on the first row;
#'   `sd`, `min` and `max` are numeric; `Observations` holds the labels
#'   "N = ", "n = " and "T-bar = ".
XTSUM <- function(data, varname, unit) {
  varname <- enquo(varname)
  loc.unit <- enquo(unit)

  # Overall: all non-missing observations pooled together.
  ores <- data %>%
    summarise(
      Mean = mean(!!varname, na.rm = TRUE),
      sd = sd(!!varname, na.rm = TRUE),
      min = min(!!varname, na.rm = TRUE),
      max = max(!!varname, na.rm = TRUE),
      N = as.numeric(sum(!is.na(!!varname)))
    )

  # Between: one mean and one non-missing count per unit.
  bmeans <- data %>%
    group_by(!!loc.unit) %>%
    summarise(
      meanx = mean(!!varname, na.rm = TRUE),
      t.count = as.numeric(sum(!is.na(!!varname)))
    ) %>%
    ungroup()

  # t.count is never NA, so units are counted via t.count > 0. Units with no
  # non-missing value must not be counted in n or pull T-bar towards zero.
  bres <- bmeans %>%
    summarise(
      sd = sd(meanx, na.rm = TRUE),
      min = min(meanx, na.rm = TRUE),
      max = max(meanx, na.rm = TRUE),
      n = sum(as.numeric(t.count > 0)),
      `T-bar` = mean(t.count[t.count > 0])
    )

  # Within: scale(scale = FALSE) only centres, i.e. subtracts the unit mean.
  wdat <- data %>%
    group_by(!!loc.unit) %>%
    mutate(W.x = scale(!!varname, center = TRUE, scale = FALSE)) %>%
    ungroup()

  wres <- wdat %>%
    summarise(
      sd = sd(W.x, na.rm = TRUE),
      min = min(W.x, na.rm = TRUE),
      max = max(W.x, na.rm = TRUE)
    )

  # Shift the within range by the overall mean so min/max are reported on the
  # scale of the variable instead of as raw deviations.
  wres$min <- wres$min + ores$Mean
  wres$max <- wres$max + ores$Mean

  # Assemble the table column by column. The variable label comes from the
  # captured expression, not from the quosure object itself.
  data.frame(
    Variable = c(rlang::as_label(varname), "", ""),
    Comparison = c("Overall", "Between", "Within"),
    Mean = c(format(ores$Mean), "", ""),
    sd = c(ores$sd, bres$sd, wres$sd),
    min = c(ores$min, bres$min, wres$min),
    max = c(ores$max, bres$max, wres$max),
    Observations = c(
      paste0("N = ", ores$N),
      paste0("n = ", bres$n),
      paste0("T-bar = ", round(bres[["T-bar"]], 4))
    ),
    stringsAsFactors = FALSE
  )
}
我刚刚在 CRAN 上发布了一个名为 xtsum 的包,用于生成面板数据的描述性统计。
# Install from CRAN
install.packages("xtsum")
# Load the library
library(xtsum)
# Load the nlswork example data from the sampleSelection package
data("nlswork", package = "sampleSelection")
# Summarise `hours` with `idcode` as the unit id and `year` as the time index
xtsum(nlswork, "hours", id = "idcode", t = "year", na.rm = TRUE, dec = 6)
默认情况下
xtsum
返回一个kableExtra对象,您可以设置return.data.frame = TRUE
返回一个dataframe
# Same summary, returned as a data frame
xtsum(nlswork, "hours", id = "idcode", t = "year",
      na.rm = TRUE, dec = 6, return.data.frame = TRUE)
Variable Dim Mean SD Min Max Observations
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ___________ _________ NA NA NA NA NA
2 hours overall 36.55956 9.869623 1 168 N = 28467
3 NA between NA 7.846585 1 83.5 n = 4710
4 NA within NA 7.520712 -2.154726 130.05956 T = 6.043949
您还可以单独获取每个统计数据
between_sd(nlswork, "hours", id = "idcode", t = "year", na.rm = TRUE)
7.846585
between_max(nlswork, "hours", id = "idcode", t = "year", na.rm = TRUE)
83.5
between_min(nlswork, "hours", id = "idcode", t = "year", na.rm = TRUE)
1
within_sd(nlswork, "hours", id = "idcode", t = "year", na.rm = TRUE)
7.520712
within_max(nlswork, "hours", id = "idcode", t = "year", na.rm = TRUE)
130.0596
within_min(nlswork, "hours", id = "idcode", t = "year", na.rm = TRUE)
-2.154726
用户指南和参考资料
CRAN 中的 xtsum
xtsum简介
报告错误
# Print the citation entry for the xtsum package
citation("xtsum")
To cite package ‘xtsum’ in publications use:
Macosso JC (2023). _xtsum: Summary Statistics for Panel Data_. R package version
0.1.0, <https://CRAN.R-project.org/package=xtsum>.
A BibTeX entry for LaTeX users is
@Manual{,
title = {xtsum: Summary Statistics for Panel Data},
author = {Joao Claudio Macosso},
year = {2023},
note = {R package version 0.1.0},
url = {https://CRAN.R-project.org/package=xtsum},
}