我使用以下代码从 pdf 中提取了表格:
pdf_file <- "UBI Kenya paper.pdf"
# Extract the tables found on PDF pages 59-68, one page per iteration, and
# bind each result to a global variable named table_E.1_full, ...,
# table_E.10_full (page number minus 58 gives the table index).
for (i in seq(59L, 68L)) {
  tname <- sprintf("table_E.%d_full", i - 58L)
  table_i <- extract_tables(pdf_file, pages = i)
  assign(tname, table_i)
}
我没有使用as.data.frame,因为它返回了错误消息。相反,我这样做了:
# Convert the extracted object with data.table (as.data.frame() raised an
# error here), then discard rows 10 through 16 to keep only the table body.
table_E.9_full <- data.table::as.data.table(table_E.9_full)
table_E.9 <- table_E.9_full[-(10:16), ]
这给了我一张表格,其中每个类别(零售、制造、运输、服务)的“企业数”和“净收入”两列被合并成了一列,两个值之间用空格分隔。
如何将 V2-V5 的第 2-9 行分成两列,各列标题为“零售贸易 - # 企业”、“零售贸易 - 净收入”、“制造 - # 企业”、“制造 -净收入”等,在正确的列中包含正确的值?
# Print a reproducible representation of the first rows of the table.
table_E.9 |> head() |> dput()
返回:
structure(list(V1 = c("", "", "", "Long Term Arm", "", "Short Term Arm"
), V2 = c("Retail Trade", "# Enterprises Net Revenues", "(1) (2)",
"3.89�\u0088\u0097�\u0088\u0097�\u0088\u0097 1601.42�\u0088\u0097",
"[1.28] [824.74]", "2.34�\u0088\u0097�\u0088\u0097 464.60�\u0088\u0097"
), V3 = c("Manufacturing", "# Enterprises Net Revenues", "(3) (4)",
"0.02 51.90", "[.27] [120.79]", "0.03 17.82"), V4 = c("Transportation",
"# Enterprises Net Revenues", "(5) (6)", "0.53�\u0088\u0097 100.76",
"[.29] [85.37]", "-0.12 -3.85"), V5 = c("Services", "# Enterprises Net Revenues",
"(7) (8)", "0.23 198.64", "[.33] [205.6]", "-0.04 70.95")), row.names = c(NA,
-6L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x000001716fd35930>)
下面是一个完整的、快速但粗糙(quick and dirty)的解决方案。
但是,如果您需要更稳健的方案,应该改进从 pdf 抓取(提取)表格的流程。由于问题是如何拆分多列,答案是:使用 tidyr 包的 separate_wider_delim 函数。
library(tidyverse)

# Reproducible copy of the extracted table. Rows 1-3 hold the header
# (category / measure / column number) and rows 4-6 hold the values. In
# V2:V5 every cell packs two table columns into one space-separated string.
# NOTE(review): the question's dput() shows 1601.42 and 464.60 for the Retail
# Trade net revenues; the values here (601.42, 464.6) are kept as posted --
# confirm against the PDF.
df <- structure(
  list(
    V1 = c("", "", "", "Long Term Arm", "", "Short Term Arm"),
    V2 = c(
      "Retail Trade", "# Enterprises Net Revenues", "(1) (2)",
      "3.89*** 601.42*", "[1.28] [824.74]", "2.34** 464.6*"
    ),
    V3 = c(
      "Manufacturing", "# Enterprises Net Revenues", "(3) (4)",
      "0.02 51.90", "[.27] [120.79]", "0.03 17.82"
    ),
    V4 = c(
      "Transportation", "# Enterprises Net Revenues", "(5) (6)",
      "0.53* 100.76", "[.29] [85.37]", "-0.12 -3.85"
    ),
    V5 = c(
      "Services", "# Enterprises Net Revenues", "(7) (8)",
      "0.23 198.64", "[.33] [205.6]", "-0.04 70.95"
    )
  ),
  row.names = c(NA, -6L),
  class = c("data.table", "data.frame")
)

# Value rows: drop the three header rows, then split every packed cell on the
# single space into two columns (V2A/V2B, ..., V5A/V5B).
df_9 <- df[-(1:3), ] |>
  separate_wider_delim(
    cols = V2:V5, delim = " ",
    names = LETTERS[1:2], names_sep = ""
  )

# Header rows: mark the split point with "_" first, because the measure names
# themselves contain spaces ("# Enterprises_Net Revenues", "(1)_(2)"). The
# category row has no "_", so `too_few = "align_start"` leaves its second
# half as NA.
dfn <- df[1:3, ] |>
  mutate(across(
    V2:V5,
    \(x) x |>
      str_replace_all("s ", "s_") |>
      str_replace_all("\\) ", "\\)_")
  )) |>
  separate_wider_delim(
    cols = V2:V5, delim = "_",
    names = LETTERS[1:2], names_sep = "",
    too_few = "align_start"
  )

# Build the column names in one vectorised step. Going through a character
# matrix makes the indexing independent of the data-frame class (tibble,
# data.frame or data.table). Where the category cell is NA, reuse the category
# from the column immediately to its left.
header <- unname(as.matrix(dfn))
value_cols <- seq_len(ncol(header))[-1]
category <- coalesce(header[1, value_cols], header[1, value_cols - 1])

# The first column (row labels) keeps an empty name, as in the posted output.
df_names <- c(
  "",
  paste(category, header[2, value_cols], header[3, value_cols])
)
names(df_9) <- df_names
> df_9|>as.data.frame()
Retail Trade # Enterprises (1) Retail Trade Net Revenues (2) Manufacturing # Enterprises (3) Manufacturing Net Revenues (4)
1 Long Term Arm 3.89*** 601.42* 0.02 51.90
2 [1.28] [824.74] [.27] [120.79]
3 Short Term Arm 2.34** 464.6* 0.03 17.82
Transportation # Enterprises (5) Transportation Net Revenues (6) Services # Enterprises (7) Services Net Revenues (8)
1 0.53* 100.76 0.23 198.64
2 [.29] [85.37] [.33] [205.6]
3 -0.12 -3.85 -0.04 70.95
>