如何对存储在多级嵌套列表中的数据帧进行行绑定,并为每个级别添加标识符列?

问题描述 投票:0回答:4

R 数据操作初学者,我在处理多级嵌套列表时遇到困难。

问题:有没有办法将这个

dat0
3级列表转换为下面的全局
dat1
数据框?

  • 新的
    fulltext
    列连接每个小标题中的
    text
    变量。
  • 新的
    nbsum
    列添加了每个小标题中的
    nb
    变量。

注意:欢迎使用基于

purrr
dplyr
函数(
mutate
...)的方法,以便在特定上下文中更好地理解这些工具。也欢迎其他方法!

感谢您的帮助

初始数据:

dat0 <- list(pdf1 =
               list(page1 =
                      list(tibble1 = tibble(x = c(1,2,3,4), y = c(1,1,1,1), text = c("ha","r","r","y"), nb = c(1,2,3,4)),
                           tibble2 = tibble(x = c(1,2,3,4), y = c(2,2,2,2), text = c("p","ot","t","er"), nb = c(1,2,3,4))),
                    page2 = 
                      list(tibble1 = tibble(x = c(1,2,3), y = c(3,3,3), text = c("her","m","ione"), nb = c(1,2,3)),
                           tibble2 = tibble(x = c(1,2,3), y = c(4,4,4), text = c("gra","ng","er"), nb = c(1,2,3)))),
             pdf2 =
               list(page1 =
                      list(tibble1 = tibble(x = c(1,2), y = c(5,5), text = c("vol","de"), nb = c(1,2)),
                           tibble2 = tibble(x = c(1,2), y = c(6,6), text = c("m","ort"), nb = c(1,2))),
                    page2 =
                      list(tibble1 = tibble(x = c(1,2,3,4,5), y = c(7,7,7,7,7), text = c("a","l","b","u","s"), nb = c(1,2,3,4,5)),
                           tibble2 = tibble(x = c(1,2,3,4,5), y = c(8,8,8,8,8), text = c("du","m","ble","do","re"), nb = c(1,2,3,4,5))),
                    page3 = 
                      list(tibble1 = tibble(x = c(1,2,3,4), y = c(9,9,9,9), text = c("dr","a","g","o"), nb = c(1,2,3,4)),
                           tibble2 = tibble(x = c(1,2,3,4), y = c(10,10,10,10), text = c("ma","lf","o","y"), nb = c(1,2,3,4)))),
             pdf3 =
               list(page1 =
                      list(tibble1 = tibble(x = c(1,2,3,4,5), y = c(11,11,11,11,11), text = c("s","ev","e","ru","s"), nb = c(1,2,3,4,5)),
                           tibble2 = tibble(x = c(1,2,3,4,5), y = c(12,12,12,12,12), text = c("r","o","g","u","e"), nb = c(1,2,3,4,5))),
                    page2 =
                      list(tibble1 = tibble(x = c(1,2,3), y = c(13,13,13), text = c("r","o","n"), nb = c(1,2,3)),
                           tibble2 = tibble(x = c(1,2,3), y = c(14,14,14), text = c("we","as","ley"), nb = c(1,2,3))),
                    page3 =
                      list(tibble1 = tibble(x = c(1,2,3,4,5,6), y = c(15,15,15,15,15,15), text = c("be","l","la","t","ri","x"), nb = c(1,2,3,4,5,6)),
                           tibble2 = tibble(x = c(1,2,3,4,5,6), y = c(16,16,16,16,16,16), text = c("l","est","r","a","ng","e"), nb = c(1,2,3,4,5,6))),
                    page4 = 
                      list(tibble1 = tibble(x = c(1,2), y = c(17,17), text = c("sir","ius"), nb = c(1,2)),
                           tibble2 = tibble(x = c(1,2), y = c(18,18), text = c("bl","ack"), nb = c(1,2)))))

所需的输出(费力构建;下面的dput脚本):

dat1 <-
structure(list(pdf = c("pdf1", "pdf1", "pdf1", "pdf1", "pdf1", 
"pdf1", "pdf1", "pdf1", "pdf1", "pdf1", "pdf1", "pdf1", "pdf1", 
"pdf1", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", 
"pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", 
"pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf2", "pdf3", 
"pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", 
"pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", 
"pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", 
"pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3", "pdf3"), page = c("page1", 
"page1", "page1", "page1", "page1", "page1", "page1", "page1", 
"page2", "page2", "page2", "page2", "page2", "page2", "page1", 
"page1", "page1", "page1", "page2", "page2", "page2", "page2", 
"page2", "page2", "page2", "page2", "page2", "page2", "page3", 
"page3", "page3", "page3", "page3", "page3", "page3", "page3", 
"page1", "page1", "page1", "page1", "page1", "page1", "page1", 
"page1", "page1", "page1", "page2", "page2", "page2", "page2", 
"page2", "page2", "page3", "page3", "page3", "page3", "page3", 
"page3", "page3", "page3", "page3", "page3", "page3", "page3", 
"page4", "page4", "page4", "page4"), x = c(1, 2, 3, 4, 1, 2, 
3, 4, 1, 2, 3, 1, 2, 3, 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 2, 3, 4, 
5, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 
3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 1, 2), 
    y = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 
    6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 
    10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 13, 13, 13, 14, 
    14, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 
    17, 18, 18), text = c("ha", "r", "r", "y", "p", "ot", "t", 
    "er", "her", "m", "ione", "gra", "ng", "er", "vol", "de", 
    "m", "ort", "a", "l", "b", "u", "s", "du", "m", "ble", "do", 
    "re", "dr", "a", "g", "o", "ma", "lf", "o", "y", "s", "ev", 
    "e", "ru", "s", "r", "o", "g", "u", "e", "r", "o", "n", "we", 
    "as", "ley", "be", "l", "la", "t", "ri", "x", "l", "est", 
    "r", "a", "ng", "e", "sir", "ius", "bl", "ack"), nb = c(1, 
    2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 1, 2, 3, 1, 2, 1, 2, 1, 2, 
    3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 
    4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 
    1, 2, 3, 4, 5, 6, 1, 2, 1, 2), fulltext = c("harry", "harry", 
    "harry", "harry", "potter", "potter", "potter", "potter", 
    "hermione", "hermione", "hermione", "granger", "granger", 
    "granger", "volde", "volde", "mort", "mort", "albus", "albus", 
    "albus", "albus", "albus", "dumbledore", "dumbledore", "dumbledore", 
    "dumbledore", "dumbledore", "drago", "drago", "drago", "drago", 
    "malfoy", "malfoy", "malfoy", "malfoy", "severus", "severus", 
    "severus", "severus", "severus", "rogue", "rogue", "rogue", 
    "rogue", "rogue", "ron", "ron", "ron", "weasley", "weasley", 
    "weasley", "bellatrix", "bellatrix", "bellatrix", "bellatrix", 
    "bellatrix", "bellatrix", "lestrange", "lestrange", "lestrange", 
    "lestrange", "lestrange", "lestrange", "sirius", "sirius", 
    "black", "black"), nbsum = c(10, 10, 10, 10, 10, 10, 10, 
    10, 6, 6, 6, 6, 6, 6, 3, 3, 3, 3, 15, 15, 15, 15, 15, 15, 
    15, 15, 15, 15, 10, 10, 10, 10, 10, 10, 10, 10, 15, 15, 15, 
    15, 15, 15, 15, 15, 15, 15, 6, 6, 6, 6, 6, 6, 21, 21, 21, 
    21, 21, 21, 21, 21, 21, 21, 21, 21, 3, 3, 3, 3)), row.names = c(NA, 
-68L), class = "data.frame")
r dataframe dplyr purrr nested-lists
4个回答
5
投票

此任务最灵活的功能是 IMO

collapse::unlist2d
:

library(dplyr)
dat2 <- 
  collapse::unlist2d(dat0, idcols = c("pdf", "page", "tibble")) |> 
  mutate(fulltext = paste(text, collapse = ""), 
         nbsum = sum(nb),
         .by = c(pdf, page, tibble)) |> 
  select(-tibble)

identical(dat1, dat2)
#[1] TRUE

3
投票

我相信这有效:

library(dplyr)
library(tidyr)
dat0 |>
  bind_rows(.id = "pdf") |>
  pivot_longer(starts_with("page"), names_to = "page") |>
  unnest(value) |>
  arrange(pdf, page, y, x) |>
  mutate(fulltext = paste(text, collapse = ""), nbsum = sum(nb), .by = c(pdf, page, y))
# # A tibble: 68 × 8
#    pdf   page      x     y text     nb fulltext nbsum
#    <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr>    <dbl>
#  1 pdf1  page1     1     1 ha        1 harry       10
#  2 pdf1  page1     2     1 r         2 harry       10
#  3 pdf1  page1     3     1 r         3 harry       10
#  4 pdf1  page1     4     1 y         4 harry       10
#  5 pdf1  page1     1     2 p         1 potter      10
#  6 pdf1  page1     2     2 ot        2 potter      10
#  7 pdf1  page1     3     2 t         3 potter      10
#  8 pdf1  page1     4     2 er        4 potter      10
#  9 pdf1  page2     1     3 her       1 hermione     6
# 10 pdf1  page2     2     3 m         2 hermione     6
# # ℹ 58 more rows
# # ℹ Use `print(n = ...)` to see more rows

2
投票

如果您想随时创建新变量,您可能可以使用

lapply
来完成此操作,因为您的数据嵌套不太深:

lapply(dat0, unlist, recursive = FALSE) |>
  lapply(function(x) {
    lapply(x, \(x) x %>% 
             mutate(fulltext = paste(text, collapse = ''),
                    nbsum = sum(nb)))}) %>% 
  lapply(dplyr::bind_rows, .id = 'page') %>%
  dplyr::bind_rows(.id = 'pdf') %>%
  mutate(page = gsub('\\..*', '', page))
#> # A tibble: 68 x 8
#>    pdf   page      x     y text     nb fulltext nbsum
#>    <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr>    <dbl>
#>  1 pdf1  page1     1     1 ha        1 harry       10
#>  2 pdf1  page1     2     1 r         2 harry       10
#>  3 pdf1  page1     3     1 r         3 harry       10
#>  4 pdf1  page1     4     1 y         4 harry       10
#>  5 pdf1  page1     1     2 p         1 potter      10
#>  6 pdf1  page1     2     2 ot        2 potter      10
#>  7 pdf1  page1     3     2 t         3 potter      10
#>  8 pdf1  page1     4     2 er        4 potter      10
#>  9 pdf1  page2     1     3 her       1 hermione     6
#> 10 pdf1  page2     2     3 m         2 hermione     6
#> # i 58 more rows
#> # i Use `print(n = ...)` to see more rows

1
投票

这是一个纯粹的

tidyverse
解决方案:

library(dplyr)
library(purrr)

map_dfr(dat0, ~ map_dfr(.x, 
                        ~bind_rows(.x), 
                        .id = 'page'),
        .id = 'pdf') %>% 
  mutate(fulltext = paste(text, collapse=""),
         nbsum = sum(nb),
         .by = y)

#> # A tibble: 68 × 8
#>    pdf   page      x     y text     nb fulltext nbsum
#>    <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr>    <dbl>
#>  1 pdf1  page1     1     1 ha        1 harry       10
#>  2 pdf1  page1     2     1 r         2 harry       10
#>  3 pdf1  page1     3     1 r         3 harry       10
#>  4 pdf1  page1     4     1 y         4 harry       10
#>  5 pdf1  page1     1     2 p         1 potter      10
#>  6 pdf1  page1     2     2 ot        2 potter      10
#>  7 pdf1  page1     3     2 t         3 potter      10
#>  8 pdf1  page1     4     2 er        4 potter      10
#>  9 pdf1  page2     1     3 her       1 hermione     6
#> 10 pdf1  page2     2     3 m         2 hermione     6
#> # ℹ 58 more rows

我们还可以使用

rrapply
包:

library(rrapply)
library(tidyr)
library(dplyr)

rrapply(dat0, how = 'melt') %>%
  pivot_wider(names_from = L4, values_from = value) %>% 
  unnest(cols = everything()) %>% 
  mutate(fulltext = paste(text, collapse = ""), 
         nbsum = sum(nb),
         .by = c(L1, L2, L3), 
         L3 = NULL) %>% 
  rename(setNames(c("L1", "L2"), c("pdf", "page")))
 
#> # A tibble: 68 × 8
#>    pdf   page      x     y text     nb fulltext nbsum
#>    <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr>    <dbl>
#>  1 pdf1  page1     1     1 ha        1 harry       10
#>  2 pdf1  page1     2     1 r         2 harry       10
#>  3 pdf1  page1     3     1 r         3 harry       10
#>  4 pdf1  page1     4     1 y         4 harry       10
#>  5 pdf1  page1     1     2 p         1 potter      10
#>  6 pdf1  page1     2     2 ot        2 potter      10
#>  7 pdf1  page1     3     2 t         3 potter      10
#>  8 pdf1  page1     4     2 er        4 potter      10
#>  9 pdf1  page2     1     3 her       1 hermione     6
#> 10 pdf1  page2     2     3 m         2 hermione     6
#> # ℹ 58 more rows
处理嵌套数据集时,

rrapply
会派上用场。例如,如果我们只想获取全文,可以像下面这样完成:

library(rrapply)
library(dplyr)
library(tidyr)
library(purrr)
  
rrapply(dat0, how = 'bind') %>% 
  mutate(fulltext = map_chr(text, .f = paste, collapse="")) %>% 
  unnest(cols = everything()) %>% 
  slice(n(), .by = y)

#> # A tibble: 18 × 5
#>        x     y text     nb fulltext  
#>    <dbl> <dbl> <chr> <dbl> <chr>     
#>  1     4     1 y         4 harry     
#>  2     4     2 er        4 potter    
#>  3     3     3 ione      3 hermione  
#>  4     3     4 er        3 granger   
#>  5     2     5 de        2 volde     
#>  6     2     6 ort       2 mort      
#>  7     5     7 s         5 albus     
#>  8     5     8 re        5 dumbledore
#>  9     4     9 o         4 drago     
#> 10     4    10 y         4 malfoy    
#> 11     5    11 s         5 severus   
#> 12     5    12 e         5 rogue     
#> 13     3    13 n         3 ron       
#> 14     3    14 ley       3 weasley   
#> 15     6    15 x         6 bellatrix 
#> 16     6    16 e         6 lestrange 
#> 17     2    17 ius       2 sirius    
#> 18     2    18 ack       2 black

创建于 2023-12-19,使用 reprex v2.0.2

© www.soinside.com 2019 - 2024. All rights reserved.