我正在尝试使用 R 抓取下表
https://www.nba.com/stats/teams/opponent-shooting
我写的代码如下
library(rvest)

# Page that shows the table we want to scrape.
url <- "https://www.nba.com/stats/teams/opponent-shooting"

# Download and parse the HTML, then pull out the tables found in the document.
page <- read_html(url)
table_data <- html_table(page, fill = TRUE)
然而,这似乎返回了一个看起来像是日历月的表格。
知道如何成功地抓取预期的表格吗?
这是一个由 JavaScript 动态渲染的页面,表格内容(JSON)是通过 API 调用获取的,您可以在浏览器的"网络"选项卡中追踪到这个请求。要自己发出该请求(并成功拿到响应),需要稍微修改请求标头;下面是使用 httr2 的一种做法:
library(httr2)

# API endpoint that delivers the table content as JSON.
req_url <- "https://stats.nba.com/stats/leaguedashteamshotlocations?Conference=&DateFrom=&DateTo=&DistanceRange=5ft%20Range&Division=&GameScope=&GameSegment=&ISTRound=&LastNGames=0&Location=&MeasureType=Opponent&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=2023-24&SeasonSegment=&SeasonType=Regular%20Season&ShotClockRange=&StarterBench=&TeamID=0&VsConference=&VsDivision="

# Build the request with a browser user agent plus Accept / Origin / Referer
# headers, perform it, and parse the response body as JSON.
json <-
  request(req_url) |>
  req_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |>
  req_headers(
    Accept = "*/*",
    Origin = "https://www.nba.com",
    Referer = "https://www.nba.com/"
  ) |>
  req_perform() |>
  resp_body_json()
hdr <- json$resultSets$headers

# The response describes its columns with a two-level header:
#   * hdr[[1]] holds the group labels; each label is repeated for three
#     consecutive columns, and the first `columnsToSkip` columns get an empty
#     group label.
#   * hdr[[2]] holds the per-column labels.
# Paste the two levels together element-wise and normalise the result into
# syntactic, unique names.
clean_names <- janitor::make_clean_names(
  paste(
    c(
      rep("", hdr[[1]]$columnsToSkip),
      rep(unlist(hdr[[1]]$columnNames), each = 3)
    ),
    unlist(hdr[[2]]$columnNames)
  )
)
clean_names
#> [1] "team_id" "team_name"
#> [3] "less_than_5_ft_opp_fgm" "less_than_5_ft_opp_fga"
#> [5] "less_than_5_ft_opp_fg_pct" "x5_9_ft_opp_fgm"
#> [7] "x5_9_ft_opp_fga" "x5_9_ft_opp_fg_pct"
#> [9] "x10_14_ft_opp_fgm" "x10_14_ft_opp_fga"
#> [11] "x10_14_ft_opp_fg_pct" "x15_19_ft_opp_fgm"
#> [13] "x15_19_ft_opp_fga" "x15_19_ft_opp_fg_pct"
#> [15] "x20_24_ft_opp_fgm" "x20_24_ft_opp_fga"
#> [17] "x20_24_ft_opp_fg_pct" "x25_29_ft_opp_fgm"
#> [19] "x25_29_ft_opp_fga" "x25_29_ft_opp_fg_pct"
#> [21] "x30_34_ft_opp_fgm" "x30_34_ft_opp_fga"
#> [23] "x30_34_ft_opp_fg_pct" "x35_39_ft_opp_fgm"
#> [25] "x35_39_ft_opp_fga" "x35_39_ft_opp_fg_pct"
#> [27] "x40_ft_opp_fgm" "x40_ft_opp_fga"
#> [29] "x40_ft_opp_fg_pct"
# Each element of rowSet is one row of values. Naming every row with
# clean_names turns it into a named list, which lets dplyr::bind_rows() stack
# all rows into a single data frame with one column per name.
dplyr::bind_rows(
  lapply(json$resultSets$rowSet, function(row) setNames(row, clean_names))
)
结果:
#> # A tibble: 30 × 29
#> team_id team_name less_than_5_ft_opp_fgm less_than_5_ft_opp_fga
#> <int> <chr> <dbl> <dbl>
#> 1 1610612737 Atlanta Hawks 22 33.6
#> 2 1610612738 Boston Celtics 17.4 28.5
#> 3 1610612751 Brooklyn Nets 17.9 28.8
#> 4 1610612766 Charlotte Hornets 20.5 31.8
#> 5 1610612741 Chicago Bulls 18 28.5
#> 6 1610612739 Cleveland Cavaliers 17.2 28.8
#> 7 1610612742 Dallas Mavericks 20 29.5
#> 8 1610612743 Denver Nuggets 18.8 30.3
#> 9 1610612765 Detroit Pistons 20.5 32
#> 10 1610612744 Golden State Warrio… 17 25.8
#> # ℹ 20 more rows
#> # ℹ 25 more variables: less_than_5_ft_opp_fg_pct <dbl>, x5_9_ft_opp_fgm <dbl>,
#> # x5_9_ft_opp_fga <dbl>, x5_9_ft_opp_fg_pct <dbl>, x10_14_ft_opp_fgm <dbl>,
#> # x10_14_ft_opp_fga <dbl>, x10_14_ft_opp_fg_pct <dbl>,
#> # x15_19_ft_opp_fgm <dbl>, x15_19_ft_opp_fga <dbl>,
#> # x15_19_ft_opp_fg_pct <dbl>, x20_24_ft_opp_fgm <dbl>,
#> # x20_24_ft_opp_fga <dbl>, x20_24_ft_opp_fg_pct <dbl>, …
创建于 2024-01-24,使用 reprex v2.0.2