大学课程纪律作业的网络抓取

问题描述 投票:0回答:1

我想用 R 抓取大学课程目录。我的代码已经相当不错了,但是学科和子学科的课程分配尚未按照我想要的方式工作。

这是我的代码:

# loading needed libraries -----------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, rvest, xml2)
html_code <- read_html(https://www.uni-bremen.de/studium/starten-studieren/veranstaltungsverzeichnis?tx_hbulvp_pi1%5Bmodule%5D=93fdb6be384979f7300d263ba0c094be&tx_hbulvp_pi1%5Bsem%5D=39)
# Eine rekursive Funktion, um Informationen unter jedem h-Tags zu sammeln
extract_module_info <- function(node, module_path = c()) {
  # Basisfall: Wenn der Knoten leer ist, beenden Sie die Rekursion
  if (length(node) == 0) return(tibble())
  current_tag <- node %>% html_name()
  current_text <- node %>% html_text(trim = TRUE)
  # Aktualisieren des Pfades mit dem aktuellen Modul/Submodul
  new_path <- c(module_path, current_text)
  # Suchen nach dem nächsten div, das die Details enthält
  details_node <- node %>% html_node(xpath = "./following-sibling::div[1]")
  # Sammeln von Detailinformationen, wenn vorhanden
  if (!is.null(details_node) && length(html_nodes(details_node, 'tr')) > 0) {
    details <- html_nodes(details_node, 'tr') %>%
      map_df(~{
        tibble(
          ModulePath = list(new_path),
          CourseDesc = html_nodes(.x, '.expander') %>%
            html_text(trim = TRUE) %>%
            ifelse(length(.) == 0, NA_character_, .),
          CourseElse = html_nodes(.x, 'td:nth-child(2)') %>%
            html_text(trim = TRUE) %>%
            ifelse(length(.) == 0, NA_character_, .),
          CourseVAK = html_nodes(.x, 'td:nth-child(1)') %>%
            html_text(trim = TRUE) %>%
            ifelse(length(.) == 0, NA_character_, .),
          CourseTitle = html_nodes(.x, 'strong') %>%
            html_text(trim = TRUE) %>%
            ifelse(length(.) == 0, NA_character_, .),
          CourseTeacher = html_nodes(.x, 'td ~ td + td') %>%
            html_text(trim = TRUE) %>%
            ifelse(length(.) == 0, NA_character_, .)
        )
      })
  } else {
    details <- tibble(ModulePath = list(new_path))
  }
  # Rekursiver Abstieg zum nächsten h-Tag, falls vorhanden
  next_node <- node %>% html_node(xpath = "./following-sibling::*[self::h2 or self::h3 or self::h4][1]")
  child_details <- extract_module_info(next_node, new_path)
  # Kombinieren der aktuellen Details mit den rekursiv gesammelten Details
  bind_rows(details, child_details)
}
# Anwendung der Funktion auf das gesamte Dokument, startend mit dem ersten h2-Tag
results <- html_nodes(html_code, 'h2') %>% map_df(~extract_module_info(.x))
# Ausgabe der Ergebnisse
print(results)

问题出现在变量 ModulePath 上。让我们以 VAK ID SZHB 0806 为例来看看课程设置:

> results |> slice(544) |> select(CourseVAK )
# A tibble: 1 × 1
  CourseVAK
  <chr>
1 SZHB 0806

还有 ModulePath 的值:


 > results |> slice(544) |> select(ModulePath) |> pull()
[[1]]
 [1] "Language Center of the Universities in the State of Bremen"
 [2] "Arabic"
 [3] "Chinese"
 [4] "German"
 [5] "German sign language"
 [6] "English"
 [7] "French"
 [8] "Hebrew (modern)"
 [9] "Italian"
[10] "Japanese"
[11] "Catalan"
[12] "Korean"
[13] "Croatian"
[14] "Kurdish"
[15] "Latin"
[16] "Dutch"
[17] "Polish"

不幸的是,所有 h4 标题都已达到相应的课程报价。

我想要的是这样的结果:

[[1]]
 [1] "Language Center of the Universities in the State of Bremen"
 [2] "Polish"

当然,您现在可以删除除第一个和最后一个之外的所有元素。但考虑到我不知道其他页面的结构,这是一种可靠的方法吗?也许标题将来会更加嵌套。

r web-scraping rvest
1个回答
1
投票

很酷的代码和不错的项目。我知道当 html 代码没有正确嵌套时的痛苦。总是让事情变得更困难。

我重写了代码的逻辑,因为对我来说,利用不同样式标题的隐式排序似乎更容易。如果你利用保持这种方式的优势,你可以使用 tidyverse 中漂亮的

fill
函数复制以前节点的值。

请仔细检查,所有内容仍然按预期提取值。

编辑:我添加了一项修改,该修改还删除了偶尔的“p Strong”标签作为标题。当然,它变得更加复杂,只是没有时间考虑更漂亮的东西,但我认为它有效。

# loading needed libraries -----------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, rvest, xml2)
html_code_1 <- read_html("https://www.uni-bremen.de/studium/starten-studieren/veranstaltungsverzeichnis?tx_hbulvp_pi1%5Bmodule%5D=93fdb6be384979f7300d263ba0c094be&tx_hbulvp_pi1%5Bsem%5D=39")
html_code_2 <- read_html("https://www.uni-bremen.de/studium/starten-studieren/veranstaltungsverzeichnis?tx_hbulvp_pi1%5Bmodule%5D=54c6fd5c0b74c8c6b7f81ab2939a7196&tx_hbulvp_pi1%5Bsem%5D=40")

result <- html_elements(html_code_2, '.tx-hbulvp-pi1-module') |>
  ## map over the different modules
  map_dfr(function(main_module) {
    ## extract the children of these, which are studium generale, etc.
    html_children(main_module) |>
      map_dfr(function(headers) {
        if (as.character(headers) |> str_detect("h2")) {
          return(tibble(h2 = html_text(headers)))
        } else if (as.character(headers) |> str_detect("h3")) {
          return(tibble(h3 = html_text(headers)))
        } else if (as.character(headers) |> str_detect("h4")) {
          return(tibble(h4 = html_text(headers)))
        } else if (as.character(headers) |> str_detect("<p")) {
          return(tibble(p_strong = html_text(headers)))
        } else if (as.character(headers) |> str_detect("div")) {
          content <- map_dfr(headers |> html_elements("tr"),
                             function(tr) {
                               tibble(
                                 CourseDesc = html_nodes(tr, '.expander') %>%
                                   html_text(trim = TRUE) %>%
                                   ifelse(length(.) == 0, NA_character_, .),
                                 CourseElse = html_nodes(tr, 'td:nth-child(2)') %>%
                                   html_text(trim = TRUE) %>%
                                   ifelse(length(.) == 0, NA_character_, .),
                                 CourseVAK = html_nodes(tr, 'td:nth-child(1)') %>%
                                   html_text(trim = TRUE) %>%
                                   ifelse(length(.) == 0, NA_character_, .),
                                 CourseTitle = html_nodes(tr, 'strong') %>%
                                   html_text(trim = TRUE) %>%
                                   ifelse(length(.) == 0, NA_character_, .),
                                 CourseTeacher = html_nodes(tr, 'td ~ td + td') %>%
                                   html_text(trim = TRUE) %>%
                                   ifelse(length(.) == 0, NA_character_, .)
                               )
                               
                             })
          return(content)
        }
      })
  }) |>
  fill(h2, .direction = "down") |>
  group_by(h2) |>
  fill(h3, .direction = "down") |>
  group_by(h3,h2) |>
  fill(h4, .direction = "down")|>
  group_by(h2,h3,h4)

if("p_strong" %in% names(result)){
  result <- result |> 
    fill(p_strong, .direction = "down") |>
    filter(!is.na(CourseDesc)) |>
    select(h2, h3, h4,p_strong, everything())
} else {
  result <- result |> 
    filter(!is.na(CourseDesc)) |>
    select(h2, h3, h4, everything())
}

© www.soinside.com 2019 - 2024. All rights reserved.