如何循环遍历列表并将结果存储在数据框中?

问题描述 投票:0回答:1

这是我之前在这里提出的一个问题的后续。

我想学习如何循环遍历列表,然后将结果存储(或绑定)在一起。

目标是首先创建比赛 id 列表,把每个 id 拼接到通用 url 后面,然后循环这些 url 以获取 boxscore 表,最后将各场比赛的结果存储(或合并)到同一个数据框中。

我像这样定义了 game_ids:

# ESPN game ids whose box scores should be fetched.
game_ids <- list(401559239, 401559240)

和网址:

# URL prefix; a game id is appended after a "/" to form the box score URL.
url_ <- "https://www.espn.com/nhl/boxscore/_/gameId"

然后我将此循环代码添加到建议解决方案的开头:

for (game_id in game_ids) {
  url2 = paste(url_, game_id, sep = '/')

完整代码:

library(rvest)
library(dplyr)
library(purrr)
library(tidyr)

# Game ids to fetch and the URL prefix each id is appended to.
game_ids <- list(401559239,401559240)
url_ = "https://www.espn.com/nhl/boxscore/_/gameId"
    
for (game_id in game_ids) {
  # Box score URL for this game: "<url_>/<game_id>".
  url2 = paste(url_, game_id, sep = '/')
  
  # Named list with one element per team section; each element is a list
  # holding two tables, `tbl_1` and `tbl_2`.
  boxscore <- read_html(url2) %>% 
    # extract team sections (2)
    html_elements("div.Boxscore div.Wrapper") %>% 
    # extract team names, use as list element names
    set_names(html_elements(., ".BoxscoreItem__TeamName") %>% html_text()) %>% 
    # extract table elements, 4 per team
    map(\(team_section) html_elements(team_section, "table")) %>% 
    map(\(team_tables) list(
      # bind tables 1 & 2 (skaters/defensemen and data section)
      tbl_1 = html_table(team_tables[1:2]) %>% 
        bind_cols(.name_repair = "minimal") %>% 
        # column names from the first row
        set_names(.[1,]) %>% 
        rename(player = Skaters) %>% 
        # position to separate column: rows where G == "G" are header rows,
        # so their `player` value is copied into `position` and filled down
        mutate(position = if_else(G == "G", player, NA), .before = 1) %>% 
        fill(position, .direction = "down") %>% 
        # remove rows with header info
        filter(G != "G"),
      # bind tables 3 & 4 (goalies and data section)
      tbl_2 = html_table(team_tables[3:4]) %>% 
        bind_cols(.name_repair = "minimal") %>% 
        # column names from the first row, then drop rows where SA == "SA"
        set_names(.[1,]) %>% 
        filter(SA != "SA")
    ) 
    )
  # Stack the `tbl_1` table of every team into one table, with the list
  # names stored in a `team` column.
  # NOTE: `output` is reassigned on every iteration, so after the loop it
  # only holds the table built for the last game id.
  output = boxscore %>% 
    map("tbl_1") %>% 
    list_rbind(names_to = "team")   
}

我在循环的末尾(在 `list_rbind` 之后)添加了下面这一行:

print(res)

我可以看到两场比赛的结果:

[1] "https://www.espn.com/nhl/boxscore/_/gameId/401559239"
# A tibble: 36 × 22
   team  position player G     A     `+/-` S     SM    BS    PN    PIM   HT    TK    GV    SHFT  TOI   PPTOI SHTOI
   <chr> <chr>    <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
 1 Mont… Skaters  J. An… 0     0     -1    3     1     1     1     2     2     0     1     25    17:31 1:46  0:00 
 2 Mont… Skaters  C. Ca… 1     0     0     1     1     1     0     0     2     0     1     23    19:11 3:59  0:00 
 3 Mont… Skaters  K. Da… 0     2     2     3     0     0     0     0     3     1     0     23    21:22 3:59  0:15 
 4 Mont… Skaters  J. Ev… 1     0     1     1     0     0     1     2     3     2     0     19    11:22 0:12  2:27 
 5 Mont… Skaters  B. Ga… 0     0     0     0     0     1     0     0     1     0     0     17    12:34 1:50  0:00 
 6 Mont… Skaters  R. Ha… 0     1     0     0     1     2     0     0     3     0     1     18    13:39 0:11  2:56 
 7 Mont… Skaters  S. Mo… 0     0     -1    3     2     1     0     0     0     1     0     22    18:51 4:00  1:10 
 8 Mont… Skaters  A. Ne… 2     0     1     2     0     0     1     2     2     0     2     22    16:46 1:49  0:00 
 9 Mont… Skaters  T. Pe… 0     0     0     0     1     0     0     0     4     0     0     18    12:10 0:00  1:10 
10 Mont… Skaters  J. Sl… 0     1     2     1     3     0     0     0     3     2     1     20    15:25 1:28  0:00 
# ℹ 26 more rows
# ℹ 4 more variables: ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>
# ℹ Use `print(n = ...)` to see more rows
[1] "https://www.espn.com/nhl/boxscore/_/gameId/401559240"
# A tibble: 35 × 22
   team  position player G     A     `+/-` S     SM    BS    PN    PIM   HT    TK    GV    SHFT  TOI   PPTOI SHTOI
   <chr> <chr>    <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
 1 Otta… Skaters  D. Ba… 0     0     -1    5     3     0     0     0     1     0     0     24    18:21 4:44  0:18 
 2 Otta… Skaters  R. Ch… 0     0     1     3     0     0     0     0     4     0     0     24    13:28 0:07  2:51 
 3 Otta… Skaters  C. Gi… 0     0     -1    1     1     1     1     2     1     0     1     29    19:48 4:19  1:45 
 4 Otta… Skaters  R. Gr… 0     0     -1    2     0     0     0     0     0     0     0     26    16:04 3:49  3:15 
 5 Otta… Skaters  M. Jo… 1     1     1     1     2     2     0     0     0     1     0     29    17:52 0:21  6:00 
 6 Otta… Skaters  M. Ka… 0     0     -1    0     0     1     1     2     0     0     0     15    7:33  0:00  0:33 
 7 Otta… Skaters  P. Ke… 1     1     0     2     1     0     0     0     0     0     0     25    12:40 0:14  4:42 
 8 Otta… Skaters  D. Ku… 0     0     0     3     0     0     0     0     0     0     1     21    14:32 2:58  0:00 
 9 Otta… Skaters  T. St… 1     0     -1    3     0     0     2     4     1     0     2     28    21:44 5:25  2:12 
10 Otta… Skaters  V. Ta… 0     0     0     0     1     0     0     0     1     0     0     23    13:12 3:00  0:00 
# ℹ 25 more rows
# ℹ 4 more variables: ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>
# ℹ Use `print(n = ...)` to see more rows

但是我不知道如何将它们在数据框中绑定在一起。

我尝试了

# Attempt 1: call rbind() with `output` as its only argument.
res = rbind(output)
# Attempt 2: call rbind() with the elements of `output` as its arguments.
# Each assignment replaces the previous value of `res`.
res = do.call(rbind,output)

但是两者都只返回最后一场比赛的结果:

> head(res,10)
# A tibble: 10 × 22
   team  position player G     A     `+/-` S     SM    BS    PN    PIM   HT    TK    GV    SHFT  TOI   PPTOI SHTOI
   <chr> <chr>    <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
 1 Otta… Skaters  D. Ba… 0     0     -1    5     3     0     0     0     1     0     0     24    18:21 4:44  0:18 
 2 Otta… Skaters  R. Ch… 0     0     1     3     0     0     0     0     4     0     0     24    13:28 0:07  2:51 
 3 Otta… Skaters  C. Gi… 0     0     -1    1     1     1     1     2     1     0     1     29    19:48 4:19  1:45 
 4 Otta… Skaters  R. Gr… 0     0     -1    2     0     0     0     0     0     0     0     26    16:04 3:49  3:15 
 5 Otta… Skaters  M. Jo… 1     1     1     1     2     2     0     0     0     1     0     29    17:52 0:21  6:00 
 6 Otta… Skaters  M. Ka… 0     0     -1    0     0     1     1     2     0     0     0     15    7:33  0:00  0:33 
 7 Otta… Skaters  P. Ke… 1     1     0     2     1     0     0     0     0     0     0     25    12:40 0:14  4:42 
 8 Otta… Skaters  D. Ku… 0     0     0     3     0     0     0     0     0     0     1     21    14:32 2:58  0:00 
 9 Otta… Skaters  T. St… 1     0     -1    3     0     0     2     4     1     0     2     28    21:44 5:25  2:12 
10 Otta… Skaters  V. Ta… 0     0     0     0     1     0     0     0     1     0     0     23    13:12 3:00  0:00 
# ℹ 4 more variables: ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>

如何保存每场比赛的输出,并把它们合并到同一个数据框中?

r purrr rvest
1个回答
0
投票

代码比较多,问题确实不容易一眼看出来。 :) 不过,解决方案很简单:

在每次迭代中,您都会覆盖 `output` 变量。因此,循环结束后您只能看到最后一次迭代(即最后一场比赛)的结果。要解决此问题,您可以在循环之前将 `output` 创建为空列表,并在每次迭代时把当前比赛的结果存入其中。循环结束后,再把列表中的各个表按行绑定在一起。类似下面这样的代码应该可行:

# Collect one skater table per game in a list keyed by game id, then stack
# the list into a single data frame once the loop has finished.
output <- list()
for (game_id in game_ids) {
  print(game_id)
  url2 <- paste(url_, game_id, sep = "/")

  # Named list with one element per team section; each element holds the
  # skater table (`tbl_1`) and the goalie table (`tbl_2`).
  boxscore <- read_html(url2) %>%
    # team sections of the box score page (2)
    html_elements("div.Boxscore div.Wrapper") %>%
    # name each section after its team
    set_names(html_elements(., ".BoxscoreItem__TeamName") %>% html_text()) %>%
    # table elements of each section, 4 per team
    map(\(section) html_elements(section, "table")) %>%
    map(\(tables) {
      # tables 1 & 2: skaters/defensemen and their data section
      skaters_raw <- bind_cols(
        html_table(tables[1:2]),
        .name_repair = "minimal"
      )
      skaters <- skaters_raw %>%
        # column names come from the first row
        set_names(skaters_raw[1, ]) %>%
        dplyr::rename(player = Skaters) %>%
        # header rows (G == "G") carry the section label in `player`;
        # copy it into `position` and fill it down over the player rows
        mutate(position = if_else(G == "G", player, NA), .before = 1) %>%
        fill(position, .direction = "down") %>%
        # drop the header rows themselves
        filter(G != "G")

      # tables 3 & 4: goalies and their data section
      goalies_raw <- bind_cols(
        html_table(tables[3:4]),
        .name_repair = "minimal"
      )
      goalies <- goalies_raw %>%
        set_names(goalies_raw[1, ]) %>%
        filter(SA != "SA")

      list(tbl_1 = skaters, tbl_2 = goalies)
    })

  # Store this game's skater table under its id instead of overwriting
  # a single variable; the team names go into a `team` column.
  output[[as.character(game_id)]] <- boxscore %>%
    map("tbl_1") %>%
    list_rbind(names_to = "team")
}
# Stack all games; the list names (game ids) go into a `game` column.
output <- bind_rows(output, .id = "game")
© www.soinside.com 2019 - 2024. All rights reserved.