我在使用下面的代码时遇到了问题。函数 test 用于从网站获取数据,单独调用时,对于从 2 到 33000(或任意其他上限)的所有 i 值都能正常工作。但是,当使用循环获取所有页面时,我会遇到解析错误,并且数据框中会出现多个相同的行。
library(rvest)
library(chromote)
library(jsonlite)
library(dplyr)
# Fetch one town record from the SRU API through a headless Chrome session.
#
# Args:
#   i: Town identifier appended to the API URL.
#
# Returns:
#   The value produced by jsonlite::fromJSON() on the text of the loaded page.
test <- function(i) {
  b <- ChromoteSession$new()
  # Close the session on every exit path (including errors); without this,
  # each call leaves its browser session open and they pile up in a long loop.
  on.exit(b$close(), add = TRUE)

  # Create the load-event promise before navigating so the event is not missed.
  p <- b$Page$loadEventFired(wait_ = FALSE)
  b$Page$navigate(
    paste0("https://www.ecologie.gouv.fr/sru_api/api/towns/", i),
    wait_ = FALSE
  )
  b$wait_for(p)

  # The JSON payload is read back as the text content of the rendered document.
  html <- b$Runtime$evaluate("document.documentElement.outerHTML")
  content <- read_html(html$result$value)
  data_json <- html_text(content)
  fromJSON(data_json)
}
# Fetch towns 2..n one at a time and stack the successful results.
n <- 100

# seq_len(n)[-1L] is empty when n < 2, whereas 2:n would count down to 1.
ma_liste <- lapply(seq_len(n)[-1L], function(i) {
  tryCatch(
    test(i),
    # A tryCatch() without an error handler catches nothing; handle the error
    # explicitly so one failing id is reported and skipped instead of aborting.
    error = function(e) {
      warning("test(", i, ") failed: ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
})

# Drop the ids whose request failed so only real records are combined.
ma_liste <- Filter(Negate(is.null), ma_liste)
ma_liste
dataframe <- do.call(rbind, ma_liste)
dataframe <- as.data.frame(dataframe)
我尝试使用 tryCatch 忽略有问题的行,但这并不能解决重复行的问题(而且会跳过大量数据)。你能帮我解决这个问题吗?谢谢。
library(tidyverse)
library(jsonlite)
# Download one town record from the SRU API and make it row-bindable.
#
# Args:
#   index: Town identifier appended to the API URL.
#
# Returns:
#   The parsed JSON object with every NULL field replaced by NA.
scraper <- function(index) {
  town_url <- str_c("https://www.ecologie.gouv.fr/sru_api/api/towns/", index)
  record <- fromJSON(town_url)
  # NULL fields are swapped for NA so each field keeps a value.
  modify(record, function(field) {
    if (is.null(field)) {
      NA
    } else {
      field
    }
  })
}
# Scrape ids 2 to 20 and row-bind the results; possibly() makes a failing id
# yield NULL instead of raising an error.
df <- map_dfr(
  .x = 2:20,
  .f = possibly(.f = scraper, otherwise = NULL)
)
# A tibble: 19 × 46
sru_id sru_structure sru_region sru_dep sru_insee sru_commune sru_pop_commune sru_tx_lls_obj
<int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2 "" "" 01 01043 Beynost 4557 25%
2 3 "" "" 01 01142 Dagneux 4706 25%
3 4 "" "" 01 01160 Ferney-Voltaire 9637 25%
4 5 "" "" 01 01249 Miribel 9742 25%
5 6 "" "" 01 01262 Montluel 7005 25%
6 7 "" "" 01 01281 Ornex 4400 25%
7 8 "" "" 01 01313 Prévessin-Moëns 7991 25%
8 9 "" "" 01 01322 Reyrieux 4670 25%
9 10 "" "" 01 01344 Saint-Denis-lès-Bourg 5667 20
10 11 "" "" 01 01354 Saint-Genis-Pouilly 11892 25%
11 12 "" "" 01 01419 Thoiry 6094 25%
12 13 "" "" 01 01451 Viriat 6350 20
13 14 "" "" 03 03013 Avermes 3907 20%
14 15 "" "" 03 03023 Bellerive-sur-Allier 8501 20%
15 16 "" "" 03 03310 Vichy 24383 20%
16 17 "" "" 03 03321 Yzeure 13230 20%
17 18 "" "" 04 04112 Manosque 21868 25%
18 19 "" "" 04 04143 Oraison 5917 25%
19 20 "" "" 04 04152 Pierrevert 3743 25%
# ℹ 38 more variables: sru_nom_agglo <chr>, sru_nom_epci <chr>, sru_pfh <chr>, sru_nb_res_prin <chr>,
# sru_nb_lls2019 <chr>, sru_tx_lls2019 <chr>, sru_nb_lls2014 <chr>, sru_tx_lls2014 <chr>,
# sru_tx_lls2011 <chr>, sru_tx_lls2008 <chr>, sru_tx_lls2005 <chr>, sru_tx_lls2002 <chr>,
# sru_exo <chr>, sru_prel_brut <chr>, sru_maj_brut <chr>, sru_prel_brut_tot <chr>,
# sru_constat_car <chr>, sru_date_arrete <chr>, sru_tx_maj_prel_brut <chr>, sru_benef_locaux <chr>,
# sru_prel_net <chr>, sru_prel_maj_nette <chr>, sru_condition <chr>, sru_condition_2_motif <int>,
# sru_commune_prelv <lgl>, sru_commune_no_prelv <lgl>, sru_commune_no_info <lgl>, …