我将国家行政区的替代名称存储在JSON字符串中,并编写了一个清除其中NA和重复值的函数;但是这个函数非常慢,尤其是当数据有成千上万行时。我一直在使用tidyverse(整洁)语法,因为我对它最熟悉。代码可以正常工作,但有人知道如何优化这个函数吗?
这里是一些示例数据和该函数:
library(tidyverse)
library(magrittr)
library(rio)
library(jsonlite)
library(tictoc)
# Load the sample data from Google Sheets into a tibble.
sheet_url <-
  'https://docs.google.com/spreadsheets/d/1Zd_gUj4ejZoTph5i7l_XTRg3pIS2_6tM2mde0SFykzM/edit?usp=sharing'
data <- import(sheet_url, setclass = 'tibble')
# Clean a vector of JSON-encoded alternate-name lists.
#
# Each element is parsed from JSON, genuine NAs and the literal string
# "NA" are dropped, duplicate values are removed (first occurrence wins,
# regardless of which key it appeared under), and the result is
# re-serialised to JSON.
#
# @param altNames character vector of JSON strings; elements may be NA.
# @return a list with one JSON string per input element. NA inputs fall
#   through the `if` (which has no else branch) and become NULL entries.
# NOTE(review): the per-element ldply/pivot_longer/pivot_wider round trip
# is what makes this slow — a data frame is built and reshaped for every
# single row.
cleanAlternateNames <- function(altNames) {
  tic()
  altNames <- lapply(altNames, function(x) {
    if (!is.na(x)) {
      x %>%
        fromJSON() %>%
        plyr::ldply(rbind) %>%
        pivot_longer(-one_of('.id')) %>%
        # drop real NAs and the literal string "NA"
        filter(!is.na(value), value != 'NA') %>%
        # keep only the first occurrence of each value
        distinct(value, .keep_all = TRUE) %>%
        select(-name) %>%
        pivot_wider(names_from = '.id', values_from = 'value') %>%
        toJSON()
    }
  })
  toc()
  altNames
}
# Un-escape doubled quotes, then clean each JSON string.
data <- data %>%
  mutate(
    AlternateNames =
      cleanAlternateNames(str_replace_all(AlternateNames, '""', '"'))
  )
我修改了示例数据,使其更能代表我实际要转换的数据范围。
【更新】感谢Ian Campbell和另一位用户提供的使用map的示例,我得以回到代码中最初合并JSON列表的地方,直接用map函数删除NA和重复项。这段代码明显更快,并且不再需要我之前用来清理数据的那个函数。我把它发布在这里,以供遇到类似问题的人参考。
library(tidyverse)
library(magrittr)
library(rio)
library(jsonlite)
# Drop from x1 every value that already appears anywhere in x2.
# Returns either a one-element named list of the surviving values, or an
# empty vector when nothing survives the filter.
.lfilter <- function(x1, x2) {
  nm <- names(x1)
  vals <- unlist(x1)
  vals <- vals[!vals %in% unlist(x2)]
  if (length(vals) > 0) {
    vals <- setNames(list(unname(vals)), nm)
  }
  vals
}
# Merge two pipe-separated JSON strings into one JSON list, removing
# from the first string any values already present in the second.
.combineJSONlists <- function(x) {
  parts <- str_split(x, "\\|")[[1]]
  existing <- fromJSON(parts[1])
  incoming <- fromJSON(parts[-1])
  deduped <- map(existing, .lfilter, x2 = incoming)
  toJSON(c(deduped, incoming))
}
# Load the sample data from Google Sheets into a tibble.
gsheet_url <-
  'https://docs.google.com/spreadsheets/d/1Zd_gUj4ejZoTph5i7l_XTRg3pIS2_6tM2mde0SFykzM/edit?usp=sharing'
data <- import(gsheet_url, setclass = 'tibble')
# Glue the old (ISO) alternate names onto the new ones with a "|"
# separator, merge/dedupe the pair of JSON lists, then drop ISO.
data <- data %>%
  mutate(
    ISO = str_replace_all(ISO, '""', '"'),
    AlternateNames = paste0(
      str_replace_all(AlternateNames, '""', '"'), "|", ISO
    ),
    AlternateNames = map_chr(AlternateNames, .combineJSONlists)
  ) %>%
  select(-ISO)