将从 PDF 中提取的表格里被合并在一起的列拆分为多列

问题描述 投票:0回答:1

我使用 `extract_tables()` 从 PDF 中提取了表格:
# PDF the tables are read from.
pdf_file <- "UBI Kenya paper.pdf"

# Run extract_tables() on pages 59 through 68, one page per call, and store
# each result in a variable named table_E.1_full ... table_E.10_full
# (page number minus 58 gives the table index).
for (page in 59:68) {
  assign(
    paste0("table_E.", page - 58, "_full"),
    extract_tables(pdf_file, pages = page)
  )
}

我没有使用as.data.frame,因为它返回了错误消息。相反,我这样做了:

# Coerce the extraction result for table E.9 to a data.table, then keep every
# row except rows 10 through 16.
table_E.9_full <- data.table::as.data.table(table_E.9_full)
table_E.9 <- table_E.9_full[-(10:16), ]

这给了我一张如下所示的表格,其中每个类别(零售、制造、运输、服务)下的“企业数”和“净收入”两列被合并成了一列,两个值之间用空格分隔。

而我希望它看起来像原始表格那样:

如何将 V2-V5 各列的第 2-9 行分别拆分成两列,使各列标题为“零售贸易 - # 企业”、“零售贸易 - 净收入”、“制造 - # 企业”、“制造 - 净收入”等,并让正确的值落在正确的列中?

# Print the first rows of table_E.9 (head() defaults to six) as R code that
# recreates them, so the example can be reproduced.
dput(head(table_E.9))

返回:

structure(list(V1 = c("", "", "", "Long Term Arm", "", "Short Term Arm"
), V2 = c("Retail Trade", "# Enterprises Net Revenues", "(1) (2)", 
"3.89�\u0088\u0097�\u0088\u0097�\u0088\u0097 1601.42�\u0088\u0097", 
"[1.28] [824.74]", "2.34�\u0088\u0097�\u0088\u0097 464.60�\u0088\u0097"
), V3 = c("Manufacturing", "# Enterprises Net Revenues", "(3) (4)", 
"0.02 51.90", "[.27] [120.79]", "0.03 17.82"), V4 = c("Transportation", 
"# Enterprises Net Revenues", "(5) (6)", "0.53�\u0088\u0097 100.76", 
"[.29] [85.37]", "-0.12 -3.85"), V5 = c("Services", "# Enterprises Net Revenues", 
"(7) (8)", "0.23 198.64", "[.33] [205.6]", "-0.04 70.95")), row.names = c(NA, 
-6L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x000001716fd35930>)
r split multiple-columns
1个回答
0
投票

下面给出一个完整的、快速但粗糙(quick and dirty)的解决方案。

但是,如果您需要更稳健的方案,应当优化您的 PDF 提取流程。由于问题是如何拆分多列,答案是:使用 tidyr 函数 `separate_wider_delim()`。

library(tidyverse)

# Hand-typed copy of the question's six-row table, with "*" standing in for
# the garbled significance markers of the original dput() output.
# Rows 1-3 are header rows (sector, measure names, column numbers); rows 4-6
# hold the values, two per cell separated by a single space.
# NOTE(review): the question's dput() shows "1601.42" and "464.60" where this
# copy has "601.42*" and "464.6*" -- confirm against the source table.
df <- structure(
  list(
    V1 = c("", "", "", "Long Term Arm", "", "Short Term Arm"),
    V2 = c(
      "Retail Trade", "# Enterprises Net Revenues", "(1) (2)",
      "3.89*** 601.42*", "[1.28] [824.74]", "2.34** 464.6*"
    ),
    V3 = c(
      "Manufacturing", "# Enterprises Net Revenues", "(3) (4)",
      "0.02 51.90", "[.27] [120.79]", "0.03 17.82"
    ),
    V4 = c(
      "Transportation", "# Enterprises Net Revenues", "(5) (6)",
      "0.53* 100.76", "[.29] [85.37]", "-0.12 -3.85"
    ),
    V5 = c(
      "Services", "# Enterprises Net Revenues", "(7) (8)",
      "0.23 198.64", "[.33] [205.6]", "-0.04 70.95"
    )
  ),
  row.names = c(NA, -6L),
  class = c("data.table", "data.frame")
)

# Value rows only (the three header rows are dropped): split every cell of
# V2..V5 at its single space into two columns. With names_sep = "" the new
# columns are named V2A, V2B, ..., V5A, V5B.
df_9 <- separate_wider_delim(
  df[-(1:3), ],
  cols = V2:V5,
  delim = " ",
  names = c("A", "B"),
  names_sep = ""
)

# Header rows only. A plain space cannot be used as the delimiter here because
# the labels themselves contain spaces, so the split point is first marked
# with "_": after the "s" that ends "# Enterprises" and after the ")" that
# ends the first column number. The sector names in row 1 contain neither
# pattern, so they stay in one piece; too_few = "align_start" puts them in the
# first column of each pair and fills the second with NA.
dfn <- df[1:3, ] |>
  mutate(
    across(
      V2:V5,
      function(cell) {
        cell <- str_replace_all(cell, "s ", "s_")
        str_replace_all(cell, "\\) ", "\\)_")
      }
    )
  ) |>
  separate_wider_delim(
    cols = V2:V5,
    delim = "_",
    names = c("A", "B"),
    names_sep = "",
    too_few = "align_start"
  )

# Build one header per column of df_9 from the three header rows in dfn:
# "<sector> <measure> <column number>". The first column (row labels) gets an
# empty name. The column count is taken from dfn instead of being hard-coded,
# and the names are assembled in one vectorised step rather than by growing a
# vector inside a loop.
df_names <- local({
  # One element per column of dfn for each header row.
  sector <- unlist(dfn[1, ], use.names = FALSE)
  measure <- unlist(dfn[2, ], use.names = FALSE)
  number <- unlist(dfn[3, ], use.names = FALSE)

  # Every column except the first (row-label) column.
  value_cols <- seq_along(sector)[-1]

  # The sector name is present only in the first column of each split pair;
  # where it is NA, borrow it from the column immediately to the left.
  gap <- is.na(sector[value_cols])
  sector[value_cols[gap]] <- sector[value_cols[gap] - 1L]

  c("", paste(sector[value_cols], measure[value_cols], number[value_cols]))
})
names(df_9) <- df_names


> df_9|>as.data.frame()
                 Retail Trade # Enterprises (1) Retail Trade Net Revenues (2) Manufacturing # Enterprises (3) Manufacturing Net Revenues (4)
1  Long Term Arm                        3.89***                       601.42*                            0.02                          51.90
2                                        [1.28]                      [824.74]                           [.27]                       [120.79]
3 Short Term Arm                         2.34**                        464.6*                            0.03                          17.82
  Transportation # Enterprises (5) Transportation Net Revenues (6) Services # Enterprises (7) Services Net Revenues (8)
1                            0.53*                          100.76                       0.23                    198.64
2                            [.29]                         [85.37]                      [.33]                   [205.6]
3                            -0.12                           -3.85                      -0.04                     70.95
> 
© www.soinside.com 2019 - 2024. All rights reserved.