使用 2 个 .aspx 表单下拉菜单进行网页抓取

问题描述 投票:0回答:1

我正在尝试在 R 中抓取此表单 - https://profiles.doe.mass.edu/statereport/enrollmentbygrade.aspx。该表格有两个下拉菜单:“报告类型”和“年份”。我一直使用此回复作为指导。

我基本上想抓取各学校每一年的数据,但一直没能完全弄明白该怎么做。

如有任何帮助,我们将不胜感激...

library(tidyverse)
library(rvest)

# Start session, extract years 
# NOTE(review): read_html() yields a parsed document rather than an rvest
# session, so despite its name `url` holds the page content.
url <- read_html("https://profiles.doe.mass.edu/statereport/enrollmentbygrade.aspx")
# Take the second form on the page; its fifth field is treated as the "year"
# dropdown (positional indexes - presumably fragile; verify against the page).
form <- html_form(url)[[2]]
years <- form$fields[[5]]$options

# Update form with years, submit, and extract type.
form <- html_form_set(form, 'ctl00$ContentPlaceHolder1$ddYear' = years[1])
# NOTE(review): `session` is never defined anywhere in this snippet, so this
# call cannot succeed as written.
url<- html_form_submit(session, form, "<unnamed>") #This doesn't seem to work 
form <- html_form(url)[[2]]
# NOTE(review): the trailing `type` token after `options[2]` is not valid R
# (likely a copy/extraction artifact). The fourth field is presumably the
# "report type" dropdown - confirm.
type<- form$fields[[4]]$options[2]type 

这里似乎分崩离析了 -

# Update form with type, submit, and extract data frame(s).
# NOTE(review): the report-type value is written into the 'ddYear' field, the
# same field that received the year earlier - presumably the report-type
# dropdown's field name was intended; confirm against the form.
form <- html_form_set(form, 'ctl00$ContentPlaceHolder1$ddYear' = type[1])
url <- html_form_submit(url, form, "ctl00$ContentPlaceHolder1$hfExport")

# NOTE(review): `T` is passed positionally three times; named arguments with
# TRUE would be clearer, since `T` is an ordinary variable that can be
# reassigned.
df_list <- html_table(url, T, T, T)
forms screen-scraping rvest
1个回答
0
投票

要使用 `rvest` 会话,您应该将对 `rvest::read_html()` 和 `rvest::html_form_submit()` 的调用分别替换为 `rvest::session()` 和 `rvest::session_submit()`。

library(tidyverse)
library(rvest)

# helpers: extract pieces of a fetched report page

# Text of the report title heading(s), with surrounding whitespace trimmed.
header <- function(s) {
  title_nodes <- html_elements(s, "h1#ctl00_ContentPlaceHolder1_hTitle")
  html_text(title_nodes, trim = TRUE)
}

# The report table (first match of the selector) parsed by html_table().
rprt_table <- function(s) {
  table_node <- html_element(s, "table#tblAccountability")
  html_table(table_node)
}

# Open a session on the report page, then read the year choices from the
# fifth field of the second form on that page
s <- session("https://profiles.doe.mass.edu/statereport/enrollmentbygrade.aspx")
years <- html_form(s)[[2]][["fields"]][[5]][["options"]]

# throttled variant of session_submit(): requests are spaced 1 second apart
slow_session_submit <- slowly(session_submit, rate = rate_delay(pause = 1))

# Reports for (at most) the first 4 years offered by the dropdown.
# head() is used instead of `years[1:4]` so that a dropdown with fewer than
# four options does not pad the sequence with NA and submit NA as a year.
for (year in head(years, 4)) {
  # Rebuild the form from the most recent response on every iteration: the
  # hidden form values that carry ASP session state will likely change
  # between requests, so a form captured earlier may be stale.
  form <- html_form_set(
    html_form(s)[[2]],
    "ctl00$ContentPlaceHolder1$ddYear" = year
  )
  # Keep the returned session so the next iteration builds on this response.
  s <- slow_session_submit(s, form)

  # Parse the response: print the report title and the first 3 table rows.
  print(header(s))
  print(rprt_table(s), n = 3)
}
#> [1] "2023-24 Enrollment by Grade Report (District)"
#> # A tibble: 400 × 18
#>   `District Name`      `District Code` PK    K     `1`   `2`   `3`   `4`   `5`  
#>   <chr>                          <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Abby Kelley Foster …         4450000 0     121   119   123   128   123   124  
#> 2 Abington                       10000 78    177   161   168   192   160   170  
#> 3 Academy Of the Paci…         4120000 0     0     0     0     0     0     30   
#> # ℹ 397 more rows
#> # ℹ 9 more variables: `6` <chr>, `7` <chr>, `8` <chr>, `9` <chr>, `10` <chr>,
#> #   `11` <chr>, `12` <chr>, SP <chr>, Total <chr>

#> [1] "2022-23 Enrollment by Grade Report (District)"
#> # A tibble: 399 × 18
#>   `District Name`      `District Code` PK    K     `1`   `2`   `3`   `4`   `5`  
#>   <chr>                          <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Abby Kelley Foster …         4450000 0     120   120   118   121   124   121  
#> 2 Abington                       10000 87    160   173   188   166   169   157  
#> 3 Academy Of the Paci…         4120000 0     0     0     0     0     0     36   
#> # ℹ 396 more rows
#> # ℹ 9 more variables: `6` <chr>, `7` <chr>, `8` <chr>, `9` <chr>, `10` <chr>,
#> #   `11` <chr>, `12` <chr>, SP <chr>, Total <chr>

#> [1] "2021-22 Enrollment by Grade Report (District)"
#> # A tibble: 401 × 18
#>   `District Name`      `District Code` PK    K     `1`   `2`   `3`   `4`   `5`  
#>   <chr>                          <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Abby Kelley Foster …         4450000 0     119   118   119   119   120   124  
#> 2 Abington                       10000 76    174   186   171   166   148   169  
#> 3 Academy Of the Paci…         4120000 0     0     0     0     0     0     49   
#> # ℹ 398 more rows
#> # ℹ 9 more variables: `6` <chr>, `7` <chr>, `8` <chr>, `9` <chr>, `10` <chr>,
#> #   `11` <chr>, `12` <chr>, SP <chr>, Total <chr>

#> [1] "2020-21 Enrollment by Grade Report (District)"
#> # A tibble: 401 × 18
#>   `District Name`      `District Code` PK    K     `1`   `2`   `3`   `4`   `5`  
#>   <chr>                          <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Abby Kelley Foster …         4450000 0     117   116   120   118   117   115  
#> 2 Abington                       10000 51    173   168   161   144   162   140  
#> 3 Academy Of the Paci…         4120000 0     0     0     0     0     0     53   
#> # ℹ 398 more rows
#> # ℹ 9 more variables: `6` <chr>, `7` <chr>, `8` <chr>, `9` <chr>, `10` <chr>,
#> #   `11` <chr>, `12` <chr>, SP <chr>, Total <chr>

创建于 2024-05-02,使用 reprex v2.1.0

© www.soinside.com 2019 - 2024. All rights reserved.