我有一个列表(从 API 的 JSON 导入),其嵌套结构比较复杂,需要整理成规整的形式。目前我只能用 for 循环来实现,但这似乎并不是最佳做法:代码写起来冗长,元素很多时计算耗时,而且一旦结构出现新的变化就容易出错。是否可以用 tidyverse 中的 map()、unnest()、flatten()、squash() 等函数,或 base R 中的等价函数,得到类似的结果?我在 StackOverflow 上没有找到同时包含这些问题的示例(具体问题在示例下方说明)。下面是可重现的示例:
# Reproducible example of the nested structure returned by the API.
# Each top-level element describes one table. `attribute1` sits directly on
# the table, while everything else is nested under `level_to_disard`.
metadata <- list(
  table1 = list(
    attribute1 = "tb1_att1",
    level_to_disard = list(
      attribute2 = "tb1_att2",
      columns = list(
        column1 = list(col_name = "tb1_col1_name", col_type = "tb1_col1_type"),
        column2 = list(col_name = "tb1_col2_name", col_type = "tb1_col2_type")
      ),
      tags = c("tag1", "tag2", "tag3"),
      irrelevant = list(irrelveant1 = "blabla", irrelevant2 = "blibli")
    )
  ),
  # table2 has no `attribute2`, and its first column has no `col_type`.
  table2 = list(
    attribute1 = "tb2_att1",
    level_to_disard = list(
      columns = list(
        column1 = list(col_name = "tb2_col1_name", col_irrelevant = "bloblo"),
        column2 = list(col_name = "tb2_col2_name", col_type = "tb2_col2_type")
      ),
      tags = c("tag1", "tag3")
    )
  )
)

# Print the structure of the example for inspection.
str(metadata)
# Output in console:
List of 2
$ table1:List of 2
..$ attribute1 : chr "tb1_att1"
..$ level_to_disard:List of 4
.. ..$ attribute2: chr "tb1_att2"
.. ..$ columns :List of 2
.. .. ..$ column1:List of 2
.. .. .. ..$ col_name: chr "tb1_col1_name"
.. .. .. ..$ col_type: chr "tb1_col1_type"
.. .. ..$ column2:List of 2
.. .. .. ..$ col_name: chr "tb1_col2_name"
.. .. .. ..$ col_type: chr "tb1_col2_type"
.. ..$ tags : chr [1:3] "tag1" "tag2" "tag3"
.. ..$ irrelevant:List of 2
.. .. ..$ irrelveant1: chr "blabla"
.. .. ..$ irrelevant2: chr "blibli"
$ table2:List of 2
..$ attribute1 : chr "tb2_att1"
..$ level_to_disard:List of 2
.. ..$ columns:List of 2
.. .. ..$ column1:List of 2
.. .. .. ..$ col_name : chr "tb2_col1_name"
.. .. .. ..$ col_irrelevant: chr "bloblo"
.. .. ..$ column2:List of 2
.. .. .. ..$ col_name: chr "tb2_col2_name"
.. .. .. ..$ col_type: chr "tb2_col2_type"
.. ..$ tags : chr [1:2] "tag1" "tag3"
请注意:属性位于不同的层级;必须丢弃某些元素(名称中带有 “irrelevant” 的元素);table2 缺少 “attribute2”;table2 的 column1 缺少 “col_type”。下面是使用 for 循环的解决方案及预期结果。
# Extract column information from a list of column descriptions.
#
# Args:
#   x: A list whose elements each describe one column. The column name is
#      read from `col_name` and the column type from `col_type`; the
#      unprefixed keys `name` and `type` are accepted as fallbacks.
#
# Returns:
#   A tibble with one row per element of `x` and two character columns,
#   `name` and `type`. A field that is absent yields "". Empty or NULL
#   input returns an empty tibble.
extract_cols <- function(x) {
  if (length(x) == 0) {
    return(tibble())
  }

  # Return the first field found among `keys` as a string, or "" if none
  # of them is present in this column description.
  first_field <- function(col, keys) {
    hit <- intersect(keys, names(col))
    if (length(hit) == 0) {
      return("")
    }
    value <- col[[hit[1]]]
    if (length(value) == 0) "" else as.character(value[[1]])
  }

  # Build both columns over the whole list at once instead of growing the
  # tibble row by row; every element of `x` contributes exactly one row.
  tibble(
    name = vapply(
      x, first_field, character(1),
      keys = c("col_name", "name"), USE.NAMES = FALSE
    ),
    type = vapply(
      x, first_field, character(1),
      keys = c("col_type", "type"), USE.NAMES = FALSE
    )
  )
}
# Build the tidy metadata as a tibble with one row per table:
#   attribute1, attribute2 - character columns ("" when the attribute is absent)
#   cols                   - list column holding the result of extract_cols()
#   tags                   - list column holding the character vector of tags
library(tibble)

meta <- local({
  # Everything except `attribute1` is nested one level down, under
  # `level_to_disard`; this includes the `columns` list.
  tables <- unname(metadata)
  nested <- lapply(tables, function(tbl) tbl[["level_to_disard"]])

  # Return the first element of `value` as a string, or "" when it is absent.
  string_or_empty <- function(value) {
    if (length(value) > 0) as.character(value[[1]]) else ""
  }

  tibble(
    attribute1 = vapply(
      tables,
      function(tbl) string_or_empty(tbl[["attribute1"]]),
      character(1)
    ),
    attribute2 = vapply(
      nested,
      function(lvl) string_or_empty(lvl[["attribute2"]]),
      character(1)
    ),
    cols = lapply(nested, function(lvl) extract_cols(lvl[["columns"]])),
    tags = lapply(nested, function(lvl) lvl[["tags"]])
  )
})

str(meta)
# Output in console:
tibble [2 × 4] (S3: tbl_df/tbl/data.frame)
$ attribute1: chr [1:2] "tb1_att1" "tb2_att1"
$ attribute2: chr [1:2] "tb1_att2" ""
$ cols :List of 2
..$ : tibble [0 × 0] (S3: tbl_df/tbl/data.frame)
Named list()
..$ : tibble [0 × 0] (S3: tbl_df/tbl/data.frame)
Named list()
$ tags :List of 2
..$ : chr [1:3] "tag1" "tag2" "tag3"
..$ : chr [1:2] "tag1" "tag3"
是否有更直接的方法来获得此结果?输出可以是列表、tibble 或数据框,只要整理成与上面的 “meta” 类似的结构即可。
下面是使用 tidyverse 的解决方案;我添加了一些注释,使各个步骤更易读。
# Flatten `metadata` to one row per leaf value, derive grouping labels from the
# dot-separated names produced by unlist(), collapse the values of each group
# into one comma-separated string, and pivot to one row per table.
# NOTE(review): the unqualified calls (`%>%`, rownames_to_column(), ungroup(),
# row_number()) presumably rely on dplyr and tibble being attached - confirm.
df<- unlist(metadata) %>%
data.frame() %>% # turn the flattened named vector into a one-column data frame
rownames_to_column() %>% # move the element names from the row names into a column
magrittr::set_colnames(c("Var","value")) %>% # rename the two columns to Var / value
tidyr::separate('Var', paste("Tag", 1:5, sep="_"), sep="[.]", extra="drop") %>% # split the name on "." into Tag_1..Tag_5; pieces beyond the fifth are dropped
# Level1: "attribute" when Tag_2 or Tag_3 mentions an attribute, otherwise
# Tag_2 and Tag_3 joined by "_"; everything from the first digit on is removed.
# Level2: Level1 joined with Tag_5, with the "_N" / "_NA" suffix removed
# (presumably the suffix left behind when Tag_5 is missing - confirm).
dplyr::mutate(Level1= ifelse(grepl("attribute",Tag_2)|grepl("attribute",Tag_3), "attribute",paste0(Tag_2,"_",Tag_3)),
Level1= gsub("\\d.*", "", Level1),
Level2=paste0(Level1,"_",Tag_5),
Level2= gsub("_NA*", "", Level2)
) %>% # first cleaning pass, based on name patterns
ungroup()%>%
# Number the rows inside each (Tag_1, Level2) group.
dplyr::group_by(Tag_1,Level2) %>%
dplyr::mutate(n= row_number()) %>%
dplyr::ungroup() %>%
# Append the row number to Level2, strip the "level_to_disard_" prefix, and
# derive Level3..Level5 as alternative grouping labels. Level5 is Level2 for
# attribute rows and the cleaned Level1 for all other rows.
dplyr::mutate(Level2=paste0(Level2,n),
Level3= ifelse(Level1=="attribute",Level1,Level2),
Level4= ifelse(is.na(Tag_5),Level1,Tag_5),
Level4= gsub("level_to_disard_","",Level4),
Level1= gsub("level_to_disard_","",Level1),
Level5= ifelse(Level1=="attribute",Level2,Level1)
) %>% # second cleaning pass, based on name patterns
dplyr::select(Tag_1,Level1,Level2,Level3,Level4,Level5,value) %>%
dplyr::group_by(Tag_1,Level5) %>% # group by another Level column here to aggregate at a different level
dplyr::mutate(value_1=paste0(value,collapse = ",")) %>% # join all values of a group into one comma-separated string
dplyr::select(Tag_1,Level5,value_1) %>%
dplyr::distinct() %>% # keep one row per (Tag_1, Level5) group
# NOTE(review): the data is never ungrouped after the last group_by(), so the
# printed result is still grouped by Tag_1.
tidyr::pivot_wider(names_from =c(Level5),values_from = value_1) # reshape to wide format: one column per Level5 value
output:
df
# A tibble: 2 x 6
# Groups: Tag_1 [2]
Tag_1 attribute1 attribute2 columns tags irrelevant
<chr> <chr> <chr> <chr> <chr> <chr>
1 table1 tb1_att1 tb1_att2 tb1_col1_name,tb1_col1_type,tb1_col2_name,tb1_col2_type tag1,tag2,tag3 blabla,blibli
2 table2 tb2_att1 NA tb2_col1_name,bloblo,tb2_col2_name,tb2_col2_type tag1,tag3 NA
您可以使用 select() 删除不需要的列。