网络抓取 NBA.com

问题描述 投票:0回答:1

我正在尝试使用 R 抓取下表

https://www.nba.com/stats/teams/opponent-shooting

我写的代码如下

# Load rvest, which supplies read_html() and html_table()
library(rvest)

# Page that displays the opponent-shooting table
url <- "https://www.nba.com/stats/teams/opponent-shooting"

# Download and parse the HTML document served at that URL
page <- read_html(url)

# Pull the tables out of the parsed document.
# NOTE(review): only markup present in the downloaded HTML is visible here;
# the accompanying text reports that the result looks like a calendar-month
# table rather than the stats table.
table_data <- page %>% 
  html_table(fill=TRUE)

然而,这似乎返回了一个看起来像是日历月的表格。

知道如何成功地抓取预期的表格吗?

r web-scraping rvest
1个回答
0
投票

这是一个由 JavaScript 动态驱动的页面,表格内容(JSON)是通过 API 调用获取的。您可以在浏览器的“网络”选项卡中追踪到该请求。要自己发出该请求(并成功获得响应),需要对请求标头稍作修改;下面是使用 `httr2` 的一种做法:

library(httr2)

# JSON endpoint that backs the opponent-shooting table (5 ft distance ranges,
# per-game opponent numbers, 2023-24 regular season).
req_url <- "https://stats.nba.com/stats/leaguedashteamshotlocations?Conference=&DateFrom=&DateTo=&DistanceRange=5ft%20Range&Division=&GameScope=&GameSegment=&ISTRound=&LastNGames=0&Location=&MeasureType=Opponent&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=2023-24&SeasonSegment=&SeasonType=Regular%20Season&ShotClockRange=&StarterBench=&TeamID=0&VsConference=&VsDivision="

# Send a browser-like user agent plus Origin/Referer headers pointing at
# www.nba.com, i.e. the same kind of request the page itself makes.
browser_ua <- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

nba_request <- req_headers(
  req_user_agent(request(req_url), browser_ua),
  Accept = "*/*",
  Origin = "https://www.nba.com",
  Referer = "https://www.nba.com/"
)

# Perform the request and parse the response body as JSON (nested lists).
json <- resp_body_json(req_perform(nba_request))

hdr <- json$resultSets$headers

# Build column names from the 2-level header structure:
#   hdr[[1]] - group labels, plus the count of leading columns that belong to
#              no group (columnsToSkip)
#   hdr[[2]] - the per-column labels
group_labels <- unlist(hdr[[1]]$columnNames)
column_labels <- unlist(hdr[[2]]$columnNames)
n_ungrouped <- hdr[[1]]$columnsToSkip

# Width of each group, derived from the header itself rather than hard-coded,
# so a change in the response layout cannot silently misalign the labels.
n_per_group <- (length(column_labels) - n_ungrouped) / length(group_labels)
stopifnot(
  length(group_labels) > 0,
  n_per_group >= 1,
  n_per_group %% 1 == 0
)

# Prefix every per-column label with its group label ("" for ungrouped
# columns), then normalise the result to unique snake_case names.
clean_names <-
  c(
    rep("", n_ungrouped),
    rep(group_labels, each = n_per_group)
  ) |>
  paste(column_labels) |>
  janitor::make_clean_names()
clean_names
#>  [1] "team_id"                   "team_name"                
#>  [3] "less_than_5_ft_opp_fgm"    "less_than_5_ft_opp_fga"   
#>  [5] "less_than_5_ft_opp_fg_pct" "x5_9_ft_opp_fgm"          
#>  [7] "x5_9_ft_opp_fga"           "x5_9_ft_opp_fg_pct"       
#>  [9] "x10_14_ft_opp_fgm"         "x10_14_ft_opp_fga"        
#> [11] "x10_14_ft_opp_fg_pct"      "x15_19_ft_opp_fgm"        
#> [13] "x15_19_ft_opp_fga"         "x15_19_ft_opp_fg_pct"     
#> [15] "x20_24_ft_opp_fgm"         "x20_24_ft_opp_fga"        
#> [17] "x20_24_ft_opp_fg_pct"      "x25_29_ft_opp_fgm"        
#> [19] "x25_29_ft_opp_fga"         "x25_29_ft_opp_fg_pct"     
#> [21] "x30_34_ft_opp_fgm"         "x30_34_ft_opp_fga"        
#> [23] "x30_34_ft_opp_fg_pct"      "x35_39_ft_opp_fgm"        
#> [25] "x35_39_ft_opp_fga"         "x35_39_ft_opp_fg_pct"     
#> [27] "x40_ft_opp_fgm"            "x40_ft_opp_fga"           
#> [29] "x40_ft_opp_fg_pct"


# Give every row in rowSet the cleaned column names; dplyr::bind_rows() can
# then stack the list of named rows into a single tibble.
named_rows <- lapply(
  json$resultSets$rowSet,
  \(row) setNames(row, clean_names)
)
dplyr::bind_rows(named_rows)

结果:

#> # A tibble: 30 × 29
#>       team_id team_name            less_than_5_ft_opp_fgm less_than_5_ft_opp_fga
#>         <int> <chr>                                 <dbl>                  <dbl>
#>  1 1610612737 Atlanta Hawks                          22                     33.6
#>  2 1610612738 Boston Celtics                         17.4                   28.5
#>  3 1610612751 Brooklyn Nets                          17.9                   28.8
#>  4 1610612766 Charlotte Hornets                      20.5                   31.8
#>  5 1610612741 Chicago Bulls                          18                     28.5
#>  6 1610612739 Cleveland Cavaliers                    17.2                   28.8
#>  7 1610612742 Dallas Mavericks                       20                     29.5
#>  8 1610612743 Denver Nuggets                         18.8                   30.3
#>  9 1610612765 Detroit Pistons                        20.5                   32  
#> 10 1610612744 Golden State Warrio…                   17                     25.8
#> # ℹ 20 more rows
#> # ℹ 25 more variables: less_than_5_ft_opp_fg_pct <dbl>, x5_9_ft_opp_fgm <dbl>,
#> #   x5_9_ft_opp_fga <dbl>, x5_9_ft_opp_fg_pct <dbl>, x10_14_ft_opp_fgm <dbl>,
#> #   x10_14_ft_opp_fga <dbl>, x10_14_ft_opp_fg_pct <dbl>,
#> #   x15_19_ft_opp_fgm <dbl>, x15_19_ft_opp_fga <dbl>,
#> #   x15_19_ft_opp_fg_pct <dbl>, x20_24_ft_opp_fgm <dbl>,
#> #   x20_24_ft_opp_fga <dbl>, x20_24_ft_opp_fg_pct <dbl>, …

创建于 2024-01-24,使用 reprex v2.0.2

© www.soinside.com 2019 - 2024. All rights reserved.