下面是从一个基于网络的数据集返回的对象。它以列表形式返回,其中嵌套着对象和文本的组合。实际上,我是通过 reticulate 的
source_python()
运行一个 Python 脚本,转换后得到的 R 对象与下面的对象类似。
# The reprex
# Reproducible example: a list of two records. Each record carries two
# unneeded top-level fields plus a "values" list holding the wanted fields.
obj <- list(
  list(
    dontneed1 = "oh well",
    dontneed2 = "doesnt matter",
    values = list(
      need1 = list(list(value = "1231", text = "abc1")),
      need2 = "1232",
      need3 = "1/1/2023",
      # Note: the 'text' observation of dont_need_value is needed
      dont_need_value = list(list(value = "12", text = "abc2")),
      need5 = list(list(value = "1234", text = "abc3")),
      need6 = list()
    )
  ),
  list(
    dontneed3 = "oh well",
    dontneed4 = "no thank you",
    values = list(
      need1 = list(list(value = "1235", text = "abc4")),
      need2 = "1236",
      need3 = "1/2/2023",
      # Note: the 'text' observation of dont_need_value is needed
      dont_need_value = list(list(value = "12", text = "abc4")),
      need5 = list(list(value = "1238", text = "abc5")),
      need6 = list()
    )
  )
)
下面是我目前清理数据、将其整理成所需表格格式的做法(这些数据随后会输出为 Power BI 表):
# Keep only the "values" sub-list of every top-level element of obj
new_list <- map(obj, function(entry) entry[["values"]])

# Row-bind the per-element lists into one table, adding an "element_number"
# id column, then unnest the list-columns
df <- data.table::rbindlist(
  new_list,
  use.names = TRUE,
  fill = TRUE,
  idcol = "element_number"
)
df <- unnest(df, cols = c(need1, dont_need_value, need5, need6))

# Column-bind every column and convert the result back to a data frame.
# Note: at this point the result still contains the unwanted
# dont_need_value observations; the filter below removes those rows.
df <- as.data.frame(do.call(cbind, df))

# Drop the junk rows: keep only rows whose dont_need_value has more than
# two characters
df <- filter(df, nchar(dont_need_value) > 2)
我的问题是:当通过 API 从网站检索到的对象以这种奇怪的嵌套列表格式返回时,是否有比上面更好的方法,把数据整理成数据框并提取“需要”的变量及其观测值?请注意,“dont_need_value”中的 value 观测值是不需要的,但其中的 text 观测值是需要的。
这是另一种使用
map
和pluck
的方法。
注意:这里使用
modify_depth()
将空列表替换为 NA。
library(tidyverse)

# Fields whose wanted payload is the "text" entry of their first sub-list
set_1 <- c("need1", "dont_need_value", "need5") |> set_names()
# Fields that are read directly from "values"
set_2 <- c("need2", "need3", "need6") |> set_names()

# Replace empty depth-3 elements (e.g. need6 = list()) with NA so that every
# field yields a value. A plain if/else is used instead of ifelse(): with a
# scalar test, ifelse() would silently keep only the first element of x.
obj2 <- modify_depth(
  obj,
  .depth = 3,
  .f = \(x) if (length(x) == 0) NA else x,
  .ragged = TRUE
)

# Build one tibble per top-level element and stack them. seq_along() replaces
# the hard-coded 1:2 so that any number of records is handled.
map(
  seq_along(obj2),
  \(lvl1) {
    map(set_1, \(lvl2) pluck(obj2, lvl1, "values", lvl2, 1, "text")) |>
      bind_cols(map(set_2, \(lvl2) pluck(obj2, lvl1, "values", lvl2)))
  }
) |>
  bind_rows()