我想使用 R 从这个网站下载 pdf。问题是必须先单击网站上的“Maak een pdf”按钮,而这个按钮是通过 JavaScript 的 onclick 属性触发的。我能够找到该属性,但不知道如何下载这个 pdf 文件。这是元素检查的屏幕截图:
这是我尝试过的代码:
library(tidyverse)
library(rvest)
# Document page that carries the "Maak een pdf" link
link <- "https://puc.overheid.nl/natuurvergunningen/doc/PUC_746615_17/1/"
# Collect the href of every <a> nested under a ".download-als" node
button <-
  read_html(link) |>
  html_nodes(".download-als") |>
  html_nodes("a") |>
  html_attr("href")
button
#> [1] "javascript:WebForm_DoPostBackWithOptions(new WebForm_PostBackOptions(\"ctl00$cphContent$Main$ctl00$DocumentHeader$ctl00\", \"\", true, \"\", \"\", false, true))"
# `button` is the javascript: pseudo-URL printed above; it is handed to
# download.file() unchanged
download.file(button, destfile = "Downloads/test.pdf")
#> Warning in download.file(button, destfile = "Downloads/test.pdf"): URL
#> javascript:WebForm_DoPostBackWithOptions(new
#> WebForm_PostBackOptions("ctl00$cphContent$Main$ctl00$DocumentHeader$ctl00", "",
#> true, "", "", false, true)): cannot open destfile 'Downloads/test.pdf', reason
#> 'No such file or directory'
#> Warning in download.file(button, destfile = "Downloads/test.pdf"): download had
#> nonzero exit status
创建于 2024-02-05,使用 reprex v2.0.2
我尝试用 `download.file` 直接下载该文件,但这当然行不通。看来需要使用 RSelenium 通过浏览器对按钮执行单击操作。我找到了这个问题:How to web-scrape on-click information with R? 但没能找到针对“onclick”属性的做法。所以我想知道是否有人知道如何通过 onclick 属性下载 pdf 文件?
为了从文档页面得到最终的下载链接,我们需要来回进行几轮请求/响应,以模仿页面上 JavaScript 的行为:首先向后端提交请求,然后等待后端处理完成,最后再下载文件。
library(tidyverse)
library(rvest)
library(httr2)
# Timestamp helper: current time as whole milliseconds since the Unix epoch,
# formatted as a digit string (used as the `_` query parameter in the requests)
timestamp_ <- function() {
  epoch_ms <- as.numeric(Sys.time()) * 1000
  sprintf("%.0f", epoch_ms)
}
# get request parameters --------------------------------------------------
link <- "https://puc.overheid.nl/natuurvergunningen/doc/PUC_746615_17/1/"
# onclick attribute of the link(s) inside the ".download-als" element
onclick <-
  link %>%
  read_html() %>%
  html_elements(".download-als a") %>%
  html_attr("onclick")
# Stop with a readable message when the page has no such link or the link has
# no onclick attribute, instead of failing later with "subscript out of bounds"
# or sending NA values to the backend
if (length(onclick) == 0 || is.na(onclick[1])) {
  stop("No onclick attribute found for '.download-als a' on ", link,
       call. = FALSE)
}
# Every whitespace-free token enclosed in single quotes in the first onclick
# value; the later requests use elements 1 to 3 of this vector
req_param <- str_extract_all(onclick[1], "(?<=')[^\\s']+(?=')")[[1]]
if (length(req_param) < 3) {
  stop("Expected at least 3 quoted parameters in onclick, found ",
       length(req_param), call. = FALSE)
}
req_param
#> [1] "PUC_746615_17_1" "natuurvergunningen" "pdf"
# submit request / get ticket ---------------------------------------------
# Build the "maakmanifestatie" call from the three extracted parameters
ticket_req <- req_url_query(
  request("https://puc.overheid.nl/PUC/Handlers/ManifestatieService.ashx"),
  actie = "maakmanifestatie",
  kanaal = req_param[2],
  identifier = req_param[1],
  soort = req_param[3],
  `_` = timestamp_()
)
ticket_resp <- req_perform(ticket_req)
# check_type = FALSE: parse the body as JSON without checking its Content-Type
ticket <- resp_body_json(ticket_resp, check_type = FALSE)
# Show the parsed reply
jsonlite::toJSON(ticket, auto_unbox = TRUE, pretty = TRUE)
#> {
#> "ticket": "70337706-d27d-463e-8b6b-8ca2ba47662d"
#> }
# submit ticket / get url -------------------------------------------------
# The backend needs a few moments to finish our request. Instead of trusting a
# single fixed pause, ask for the status repeatedly (pausing before each try)
# until it is reported as "available", and give up after max_attempts tries.
max_attempts <- 10
pdf_url <- NULL
for (attempt in seq_len(max_attempts)) {
  Sys.sleep(2)
  pdf_url <-
    request("https://puc.overheid.nl/PUC/Handlers/ManifestatieService.ashx") %>%
    req_url_query(actie = "haalstatus",
                  ticket = ticket$ticket,
                  `_` = timestamp_()) %>%
    req_perform() %>%
    resp_body_json(check_type = FALSE)
  # identical() also copes with a reply that has no result/status field
  if (identical(pdf_url$result$status, "available")) {
    break
  }
}
if (!identical(pdf_url$result$status, "available")) {
  stop("PDF was not reported as available after ", max_attempts,
       " status checks", call. = FALSE)
}
jsonlite::toJSON(pdf_url, auto_unbox = TRUE, pretty = TRUE)
#> {
#> "result": {
#> "status": "available",
#> "url": "/puc-opendata/request-result/70337706-d27d-463e-8b6b-8ca2ba47662d/Verlenging%20van%20de%20looptijd%20van%20de%20vergunning%20Wet%20Natuurbescherming%20%28Wnb%29%20voor%20het%20project%20Afsluitdij.pdf",
#> "filename": "Verlenging van de looptijd van de vergunning Wet Natuurbescherming (Wnb) voor het project Afsluitdij.pdf"
#> }
#> }
# download pdf ------------------------------------------------------------
# Build the "download" call from the url and filename returned by the status
# request
download_req <- req_url_query(
  request("https://puc.overheid.nl/PUC/Handlers/ManifestatieService.ashx"),
  actie = "download",
  identifier = req_param[1],
  url = pdf_url$result$url,
  filename = pdf_url$result$filename
)
# Left unassigned so the response summary is printed; the body is written to a
# local file named after the returned filename
req_perform(download_req, path = pdf_url$result$filename)
#> <httr2_response>
#> GET
#> https://puc.overheid.nl/PUC/Handlers/ManifestatieService.ashx?actie=download&identifier=PUC_746615_17_1&url=%2Fpuc-opendata%2Frequest-result%2F70337706-d27d-463e-8b6b-8ca2ba47662d%2FVerlenging%2520van%2520de%2520looptijd%2520van%2520de%2520vergunning%2520Wet%2520Natuurbescherming%2520%2528Wnb%2529%2520voor%2520het%2520project%2520Afsluitdij.pdf&filename=Verlenging%20van%20de%20looptijd%20van%20de%20vergunning%20Wet%20Natuurbescherming%20%28Wnb%29%20voor%20het%20project%20Afsluitdij.pdf
#> Status: 200 OK
#> Content-Type: application/pdf
#> Body: On disk 'body'
# Look up the downloaded file on disk and show the first three columns of its
# file info (path, type and size in the output below)
fs::file_info(pdf_url$result$filename)[1:3]
#> # A tibble: 1 × 3
#> path type size
#> <fs::path> <fct> <fs:>
#> 1 …nning Wet Natuurbescherming (Wnb) voor het project Afsluitdij.pdf file 171K
创建于 2024-02-05,使用 reprex v2.0.2
另一类做法是使用能够执行 JavaScript 的工具,例如 Chromote 或 RSelenium,或许还可以使用 webdriver 配合 PhantomJS。