我需要处理包含嵌套文档的 JSON 文档,其中在某一层级上会出现由文档组成的数组(array);在 R 中读取/解析该 JSON 时,这些文档在概念上对应于“数据框的行”。
我正在寻找一种方法,确保对于任意嵌套结构,要么所有数据框(data frames)始终被转换为 tibbles,要么至少“叶子数据框”变为 tibbles、而“父数据框”变为 lists——无论是在通过 {jsonlite} 解析时直接完成,还是在解析之后通过 {purrr} 完成。
我该如何以 {purrr} 的“方式”遍历列表并递归地应用 map?
# Example input: a JSON array holding two documents. Each document has scalar
# fields (`_id`, `createdAt`, `schema`), a string array (`labels`), and nested
# objects levelOne -> levelTwo whose `levelThree` field is an array of
# {x, y, z} objects.
json <- '[
{
"_id": "1234",
"createdAt": "2020-01-13 09:00:00",
"labels": ["label-a", "label-b"],
"levelOne": {
"levelTwo": {
"levelThree": [
{
"x": "A",
"y": 1,
"z": true
},
{
"x": "B",
"y": 2,
"z": false
}
]
}
},
"schema": "0.0.1"
},
{
"_id": "5678",
"createdAt": "2020-01-13 09:01:00",
"labels": ["label-a", "label-b"],
"levelOne": {
"levelTwo": {
"levelThree": [
{
"x": "A",
"y": 1,
"z": true
},
{
"x": "B",
"y": 2,
"z": false
}
]
}
},
"schema": "0.0.1"
}
]'
解析并转换为 tibble 后的结果:
x <- jsonlite::fromJSON(json) %>%
tibble::as_tibble()
x %>% str()
# Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 5 variables:
# $ _id : chr "1234" "5678"
# $ createdAt: chr "2020-01-13 09:00:00" "2020-01-13 09:01:00"
# $ labels :List of 2
# ..$ : chr "label-a" "label-b"
# ..$ : chr "label-a" "label-b"
# $ levelOne :'data.frame': 2 obs. of 1 variable:
# ..$ levelTwo:'data.frame': 2 obs. of 1 variable:
# .. ..$ levelThree:List of 2
# .. .. ..$ :'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 1 2
# .. .. .. ..$ z: logi TRUE FALSE
# .. .. ..$ :'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 1 2
# .. .. .. ..$ z: logi TRUE FALSE
# $ schema : chr "0.0.1" "0.0.1"
# Parse the JSON, pass the result through tidy_nested_data_frames(), convert
# the outcome to a tibble, and print its structure.
x <- jsonlite::fromJSON(json) %>%
tidy_nested_data_frames() %>%
tibble::as_tibble()
x %>% str()
# Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 5 variables:
# $ _id : chr "1234" "5678"
# $ createdAt: chr "2020-01-13 09:00:00" "2020-01-13 09:01:00"
# $ labels :List of 2
# ..$ : chr "label-a" "label-b"
# ..$ : chr "label-a" "label-b"
# $ levelOne :List of 2
# ..$ levelTwo:List of 1
# .. ..$ levelThree:List of 2
# .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 1 2
# .. .. .. ..$ z: logi TRUE FALSE
# .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 1 2
# .. .. .. ..$ z: logi TRUE FALSE
# ..$ levelTwo:List of 1
# .. ..$ levelThree:List of 2
# .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 1 2
# .. .. .. ..$ z: logi TRUE FALSE
# .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 1 2
# .. .. .. ..$ z: logi TRUE FALSE
# $ schema : chr "0.0.1" "0.0.1"
我有一个可行的方法,但它看起来既复杂又脆弱,因为它是针对某个特定的用例/JSON 结构设计的:
# Recursively tidy the data frames nested inside a parsed JSON structure.
#
# Each element of `x` (each column, when `x` is a data frame) is handled as
# follows:
#   * a "parent" data frame -- one that has at least one data frame column --
#     is turned into a plain list;
#   * a list whose elements are all data frames is processed recursively;
#   * a data frame without data frame columns has every list column mapped
#     through tibble::as_tibble(), so data frames stored in a list column
#     become tibbles;
#   * anything else is returned unchanged.
#
# x: a data frame or list, e.g. the result of jsonlite::fromJSON().
# Returns a list with one entry per element of `x`.
tidy_nested_data_frames <- function(x) {
  # `&&` needs a single TRUE/FALSE on each side. The per-column flags are
  # therefore collapsed with any(): passing the raw vector made multi-column
  # nested data frames depend on their first column only, and is an error
  # since R 4.3.
  is_data_frame_that_should_be_list <- function(x) {
    is.data.frame(x) && any(purrr::map_lgl(x, is.data.frame))
  }

  y <- x %>%
    purrr::map_if(is_data_frame_that_should_be_list, as.list)

  # Check for next data frame columns to handle: for every list-like element
  # record which of its members are data frames; non-lists are flagged FALSE.
  false <- function(.x) FALSE
  class_info <- y %>%
    purrr::map_if(is.list, ~ .x %>% purrr::map(is.data.frame), .else = false)

  trans_to_tibble <- function(x) {
    x %>% purrr::map(tibble::as_tibble)
  }

  purrr::map2(class_info, y, function(.x, .y) {
    go_deeper <- .x %>% as.logical() %>% all()
    if (go_deeper) {
      # Continue if data frame columns have been detected. `go_deeper` is TRUE
      # here, so the subset keeps every element of `.y`.
      tidy_nested_data_frames(.y[go_deeper])
    } else {
      # Handle data frames that have list columns that themselves carry the
      # data frames we want to turn into tibbles.
      # NOTE:
      # This probably does not generalize well yet, as the logic is closely
      # tied to the JSON structure it was written for.
      if (.y %>% is.data.frame()) {
        .y %>%
          purrr::map_if(is.list, trans_to_tibble)
      } else {
        .y
      }
    }
  })
}
以下函数可与 fromJSON 一起使用:
# Recursively checks the depth of the list element but excludes tibbles
# Compute the nesting depth of a parsed JSON element.
#
# Atomic values and tibbles have depth 0 (tibbles are treated as finished
# leaves and are never descended into). Any other list -- including a plain
# data frame -- is one level deeper than its deepest element; an empty list
# has depth 1.
#
# list_entry: any R object, typically an element of a parsed JSON list.
# Returns a single non-negative number.
depth <- function(list_entry) {
  # inherits(, "tbl_df") is the class test behind tibble::is_tibble(); it
  # replaces the deprecated is.tibble() and needs no attached package.
  if (!is.list(list_entry) || inherits(list_entry, "tbl_df")) {
    return(0)
  }
  # vapply() always yields a numeric vector (sapply() returns list() for empty
  # input, which makes max() fail); the leading 0 covers the empty list.
  max(0, vapply(list_entry, depth, numeric(1))) + 1
}
# Walk a parsed JSON list and convert its innermost records to tibbles.
#
# For every element of `json_list`, depending on depth():
#   * depth 0  -> the element is returned untouched;
#   * depth 1  -> the element is converted with as_tibble();
#   * deeper   -> recursive_tibble() is applied to the element itself.
#
# json_list: a list (possibly nested).
# Returns a list of the same length as `json_list`.
recursive_tibble <- function(json_list) {
  convert_entry <- function(entry) {
    entry_depth <- depth(entry)
    if (entry_depth == 0) {
      entry
    } else if (entry_depth == 1) {
      as_tibble(entry)
    } else {
      recursive_tibble(entry)
    }
  }

  lapply(json_list, convert_entry)
}
所以您现在可以做:
recursive_tibble(x)
#> List of 2
#> $ :List of 5
#> ..$ _id : chr "1234"
#> ..$ createdAt: chr "2020-01-13 09:00:00"
#> ..$ labels : chr [1:2] "label-a" "label-b"
#> ..$ levelOne :List of 1
#> .. ..$ levelTwo:List of 1
#> .. .. ..$ levelThree:List of 2
#> .. .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 1 obs. of 3 variables:
#> .. .. .. .. ..$ x: chr "A"
#> .. .. .. .. ..$ y: num 1
#> .. .. .. .. ..$ z: logi TRUE
#> .. .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 1 obs. of 3 variables:
#> .. .. .. .. ..$ x: chr "B"
#> .. .. .. .. ..$ y: num 2
#> .. .. .. .. ..$ z: logi FALSE
#> ..$ schema : chr "0.0.1"
#> $ :List of 5
#> ..$ _id : chr "5678"
#> ..$ createdAt: chr "2020-01-13 09:01:00"
#> ..$ labels : chr [1:2] "label-a" "label-b"
#> ..$ levelOne :List of 1
#> .. ..$ levelTwo:List of 1
#> .. .. ..$ levelThree:List of 2
#> .. .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 1 obs. of 3 variables:
#> .. .. .. .. ..$ x: chr "A"
#> .. .. .. .. ..$ y: num 1
#> .. .. .. .. ..$ z: logi TRUE
#> .. .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 1 obs. of 3 variables:
#> .. .. .. .. ..$ x: chr "B"
#> .. .. .. .. ..$ y: num 2
#> .. .. .. .. ..$ z: logi FALSE
#> ..$ schema : chr "0.0.1"