R 数据操作初学者,我在处理多级嵌套列表时遇到困难。
问题:有没有办法将这个
dat0
3级列表转换为下面的全局dat1
数据框?
fulltext
列连接每个小标题中的 text
变量。nbsum
列添加了每个小标题中的 nb
变量。注意:欢迎使用基于
purrr
和 dplyr
函数(mutate
...)的方法,以便在特定上下文中更好地理解这些工具。也欢迎其他方法!
感谢您的帮助
初始数据:
dat0 <- list(pdf1 =
list(page1 =
list(tibble1 = tibble(x = c(1,2,3,4), y = c(1,1,1,1), text = c("ha","r","r","y"), nb = c(1,2,3,4)),
tibble2 = tibble(x = c(1,2,3,4), y = c(2,2,2,2), text = c("p","ot","t","er"), nb = c(1,2,3,4))),
page2 =
list(tibble1 = tibble(x = c(1,2,3), y = c(3,3,3), text = c("her","m","ione"), nb = c(1,2,3)),
tibble2 = tibble(x = c(1,2,3), y = c(4,4,4), text = c("gra","ng","er"), nb = c(1,2,3)))),
pdf2 =
list(page1 =
list(tibble1 = tibble(x = c(1,2), y = c(5,5), text = c("vol","de"), nb = c(1,2)),
tibble2 = tibble(x = c(1,2), y = c(6,6), text = c("m","ort"), nb = c(1,2))),
page2 =
list(tibble1 = tibble(x = c(1,2,3,4,5), y = c(7,7,7,7,7), text = c("a","l","b","u","s"), nb = c(1,2,3,4,5)),
tibble2 = tibble(x = c(1,2,3,4,5), y = c(8,8,8,8,8), text = c("du","m","ble","do","re"), nb = c(1,2,3,4,5))),
page3 =
list(tibble1 = tibble(x = c(1,2,3,4), y = c(9,9,9,9), text = c("dr","a","g","o"), nb = c(1,2,3,4)),
tibble2 = tibble(x = c(1,2,3,4), y = c(10,10,10,10), text = c("ma","lf","o","y"), nb = c(1,2,3,4)))),
pdf3 =
list(page1 =
list(tibble1 = tibble(x = c(1,2,3,4,5), y = c(11,11,11,11,11), text = c("s","ev","e","ru","s"), nb = c(1,2,3,4,5)),
tibble2 = tibble(x = c(1,2,3,4,5), y = c(12,12,12,12,12), text = c("r","o","g","u","e"), nb = c(1,2,3,4,5))),
page2 =
list(tibble1 = tibble(x = c(1,2,3), y = c(13,13,13), text = c("r","o","n"), nb = c(1,2,3)),
tibble2 = tibble(x = c(1,2,3), y = c(14,14,14), text = c("we","as","ley"), nb = c(1,2,3))),
page3 =
list(tibble1 = tibble(x = c(1,2,3,4,5,6), y = c(15,15,15,15,15,15), text = c("be","l","la","t","ri","x"), nb = c(1,2,3,4,5,6)),
tibble2 = tibble(x = c(1,2,3,4,5,6), y = c(16,16,16,16,16,16), text = c("l","est","r","a","ng","e"), nb = c(1,2,3,4,5,6))),
page4 =
list(tibble1 = tibble(x = c(1,2), y = c(17,17), text = c("sir","ius"), nb = c(1,2)),
tibble2 = tibble(x = c(1,2), y = c(18,18), text = c("bl","ack"), nb = c(1,2)))))
所需的输出(费力构建;下面的dput脚本):
dat1 <-
structure(list(pdf = c("pdf1", "pdf1", "pdf1", "pdf1", "pdf1",
"pdf1", "pdf1", "pdf1", "pdf1", "pdf1", "pdf1", "pdf1", "pdf1",
"pdf1", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2",
"pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2",
"pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf3",
"pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3",
"pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3",
"pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3",
"pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3"), page = c("page1",
"page1", "page1", "page1", "page1", "page1", "page1", "page1",
"page2", "page2", "page2", "page2", "page2", "page2", "page1",
"page1", "page1", "page1", "page2", "page2", "page2", "page2",
"page2", "page2", "page2", "page2", "page2", "page2", "page3",
"page3", "page3", "page3", "page3", "page3", "page3", "page3",
"page1", "page1", "page1", "page1", "page1", "page1", "page1",
"page1", "page1", "page1", "page2", "page2", "page2", "page2",
"page2", "page2", "page3", "page3", "page3", "page3", "page3",
"page3", "page3", "page3", "page3", "page3", "page3", "page3",
"page4", "page4", "page4", "page4"), x = c(1, 2, 3, 4, 1, 2,
3, 4, 1, 2, 3, 1, 2, 3, 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 2, 3, 4,
5, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2,
3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 1, 2),
y = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6,
6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10,
10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 13, 13, 13, 14,
14, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17,
17, 18, 18), text = c("ha", "r", "r", "y", "p", "ot", "t",
"er", "her", "m", "ione", "gra", "ng", "er", "vol", "de",
"m", "ort", "a", "l", "b", "u", "s", "du", "m", "ble", "do",
"re", "dr", "a", "g", "o", "ma", "lf", "o", "y", "s", "ev",
"e", "ru", "s", "r", "o", "g", "u", "e", "r", "o", "n", "we",
"as", "ley", "be", "l", "la", "t", "ri", "x", "l", "est",
"r", "a", "ng", "e", "sir", "ius", "bl", "ack"), nb = c(1,
2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 1, 2, 3, 1, 2, 1, 2, 1, 2,
3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6,
1, 2, 3, 4, 5, 6, 1, 2, 1, 2), fulltext = c("harry", "harry",
"harry", "harry", "potter", "potter", "potter", "potter",
"hermione", "hermione", "hermione", "granger", "granger",
"granger", "volde", "volde", "mort", "mort", "albus", "albus",
"albus", "albus", "albus", "dumbledore", "dumbledore", "dumbledore",
"dumbledore", "dumbledore", "drago", "drago", "drago", "drago",
"malfoy", "malfoy", "malfoy", "malfoy", "severus", "severus",
"severus", "severus", "severus", "rogue", "rogue", "rogue",
"rogue", "rogue", "ron", "ron", "ron", "weasley", "weasley",
"weasley", "bellatrix", "bellatrix", "bellatrix", "bellatrix",
"bellatrix", "bellatrix", "lestrange", "lestrange", "lestrange",
"lestrange", "lestrange", "lestrange", "sirius", "sirius",
"black", "black"), nbsum = c(10, 10, 10, 10, 10, 10, 10,
10, 6, 6, 6, 6, 6, 6, 3, 3, 3, 3, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 10, 10, 10, 10, 10, 10, 10, 10, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 6, 6, 6, 6, 6, 6, 21, 21, 21,
21, 21, 21, 21, 21, 21, 21, 21, 21, 3, 3, 3, 3)), row.names = c(NA,
-68L), class = "data.frame")
此任务最灵活的功能是 IMO
collapse::unlist2d
:
library(dplyr)
dat2 <-
collapse::unlist2d(dat0, idcols = c("pdf", "page", "tibble")) |>
mutate(fulltext = paste(text, collapse = ""),
nbsum = sum(nb),
.by = c(pdf, page, tibble)) |>
select(-tibble)
identical(dat1, dat2)
#[1] TRUE
我相信这有效:
library(dplyr)
library(tidyr)
dat0 |>
bind_rows(.id = "pdf") |>
pivot_longer(starts_with("page"), names_to = "page") |>
unnest(value) |>
arrange(pdf, page, y, x) |>
mutate(fulltext = paste(text, collapse = ""), nbsum = sum(nb), .by = c(pdf, page, y))
# # A tibble: 68 × 8
# pdf page x y text nb fulltext nbsum
# <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
# 1 pdf1 page1 1 1 ha 1 harry 10
# 2 pdf1 page1 2 1 r 2 harry 10
# 3 pdf1 page1 3 1 r 3 harry 10
# 4 pdf1 page1 4 1 y 4 harry 10
# 5 pdf1 page1 1 2 p 1 potter 10
# 6 pdf1 page1 2 2 ot 2 potter 10
# 7 pdf1 page1 3 2 t 3 potter 10
# 8 pdf1 page1 4 2 er 4 potter 10
# 9 pdf1 page2 1 3 her 1 hermione 6
# 10 pdf1 page2 2 3 m 2 hermione 6
# # ℹ 58 more rows
# # ℹ Use `print(n = ...)` to see more rows
如果您想随时创建新变量,您可能可以使用
lapply
来完成此操作,因为您的数据嵌套不太深:
lapply(dat0, unlist, recursive = FALSE) |>
lapply(function(x) {
lapply(x, \(x) x %>%
mutate(fulltext = paste(text, collapse = ''),
nbsum = sum(nb)))}) %>%
lapply(dplyr::bind_rows, .id = 'page') %>%
dplyr::bind_rows(.id = 'pdf') %>%
mutate(page = gsub('\\..*', '', page))
#> # A tibble: 68 x 8
#> pdf page x y text nb fulltext nbsum
#> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
#> 1 pdf1 page1 1 1 ha 1 harry 10
#> 2 pdf1 page1 2 1 r 2 harry 10
#> 3 pdf1 page1 3 1 r 3 harry 10
#> 4 pdf1 page1 4 1 y 4 harry 10
#> 5 pdf1 page1 1 2 p 1 potter 10
#> 6 pdf1 page1 2 2 ot 2 potter 10
#> 7 pdf1 page1 3 2 t 3 potter 10
#> 8 pdf1 page1 4 2 er 4 potter 10
#> 9 pdf1 page2 1 3 her 1 hermione 6
#> 10 pdf1 page2 2 3 m 2 hermione 6
#> # i 58 more rows
#> # i Use `print(n = ...)` to see more rows
这是一个纯粹的
tidyverse
解决方案:
library(dplyr)
library(purrr)
map_dfr(dat0, ~ map_dfr(.x,
~bind_rows(.x),
.id = 'page'),
.id = 'pdf') %>%
mutate(fulltext = paste(text, collapse=""),
nbsum = sum(nb),
.by = y)
#> # A tibble: 68 × 8
#> pdf page x y text nb fulltext nbsum
#> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
#> 1 pdf1 page1 1 1 ha 1 harry 10
#> 2 pdf1 page1 2 1 r 2 harry 10
#> 3 pdf1 page1 3 1 r 3 harry 10
#> 4 pdf1 page1 4 1 y 4 harry 10
#> 5 pdf1 page1 1 2 p 1 potter 10
#> 6 pdf1 page1 2 2 ot 2 potter 10
#> 7 pdf1 page1 3 2 t 3 potter 10
#> 8 pdf1 page1 4 2 er 4 potter 10
#> 9 pdf1 page2 1 3 her 1 hermione 6
#> 10 pdf1 page2 2 3 m 2 hermione 6
#> # ℹ 58 more rows
我们还可以使用
rrapply
包:
library(rrapply)
library(tidyr)
library(dplyr)
rrapply(dat0, how = 'melt') %>%
pivot_wider(names_from = L4, values_from = value) %>%
unnest(cols = everything()) %>%
mutate(fulltext = paste(text, collapse = ""),
nbsum = sum(nb),
.by = c(L1, L2, L3),
L3 = NULL) %>%
rename(setNames(c("L1", "L2"), c("pdf", "page")))
#> # A tibble: 68 × 8
#> pdf page x y text nb fulltext nbsum
#> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
#> 1 pdf1 page1 1 1 ha 1 harry 10
#> 2 pdf1 page1 2 1 r 2 harry 10
#> 3 pdf1 page1 3 1 r 3 harry 10
#> 4 pdf1 page1 4 1 y 4 harry 10
#> 5 pdf1 page1 1 2 p 1 potter 10
#> 6 pdf1 page1 2 2 ot 2 potter 10
#> 7 pdf1 page1 3 2 t 3 potter 10
#> 8 pdf1 page1 4 2 er 4 potter 10
#> 9 pdf1 page2 1 3 her 1 hermione 6
#> 10 pdf1 page2 2 3 m 2 hermione 6
#> # ℹ 58 more rows
处理嵌套数据集时,rrapply
会派上用场。例如,如果我们只想获取全文,可以像下面这样完成:
library(rrapply)
library(dplyr)
library(tidyr)
library(purrr)
rrapply(dat0, how = 'bind') %>%
mutate(fulltext = map_chr(text, .f = paste, collapse="")) %>%
unnest(cols = everything()) %>%
slice(n(), .by = y)
#> # A tibble: 18 × 5
#> x y text nb fulltext
#> <dbl> <dbl> <chr> <dbl> <chr>
#> 1 4 1 y 4 harry
#> 2 4 2 er 4 potter
#> 3 3 3 ione 3 hermione
#> 4 3 4 er 3 granger
#> 5 2 5 de 2 volde
#> 6 2 6 ort 2 mort
#> 7 5 7 s 5 albus
#> 8 5 8 re 5 dumbledore
#> 9 4 9 o 4 drago
#> 10 4 10 y 4 malfoy
#> 11 5 11 s 5 severus
#> 12 5 12 e 5 rogue
#> 13 3 13 n 3 ron
#> 14 3 14 ley 3 weasley
#> 15 6 15 x 6 bellatrix
#> 16 6 16 e 6 lestrange
#> 17 2 17 ius 2 sirius
#> 18 2 18 ack 2 black
创建于 2023-12-19,使用 reprex v2.0.2