使用 rvest 从 Skytrax 网站抓取网页

问题描述 投票:0回答:1

有人可以帮忙看一下下面这段从 Skytrax 网站抓取数据的代码吗?另外,我不确定应该如何根据评级值把星级评分转换为数字表示。任何提示都将不胜感激。提前致谢!

library(rvest)
library(dplyr)

# Scrape one page of Emirates reviews from Skytrax and collect the fields of
# interest into a single data frame.

# The query string has to be introduced by "?"; without it "sortby=..." is
# treated as part of the URL path and the sort/page-size options are not sent
# as parameters.
link <- "https://www.airlinequality.com/airline-reviews/emirates/page/1/?sortby=post_date%3ADesc&pagesize=100"

page <- read_html(link)

# NOTE(review): ".grippy-host" looks like a selector-tool artifact rather than
# part of the site's own markup -- confirm and drop it from the selectors below
# if it never matches anything in the downloaded page.
pax_name <- page %>% html_nodes(".grippy-host , .userStatusWrapper") %>% html_text()

date <- page %>% html_nodes("time , .grippy-host") %>% html_text()

pax_rating <- page %>% html_nodes(".grippy-host , .rating-10") %>% html_text()

traveller_type <- page %>% html_nodes(".type_of_traveller+ .review-value") %>% html_text()

trip_verified <- page %>% html_nodes("em , .grippy-host") %>% html_text()

# The ".stars" cells contain star icons rather than text, so html_text() does
# not yield a numeric rating for crew_service, food_drinks or ground_service.
crew_service <- page %>% html_nodes(".grippy-host , .cabin_staff_service+ .stars") %>% html_text()

# Was "t.recommended", which selects a (non-existent) <t> element; every other
# field here is addressed by class, so the leading "t" was a typo.
pax_recommended <- page %>% html_nodes(".recommended+ .review-value") %>% html_text()

food_drinks <- page %>% html_nodes(".food_and_beverages+ .stars") %>% html_text()

ground_service <- page %>% html_nodes(".ground_service+ .stars") %>% html_text()

travel_class <- page %>% html_nodes(".cabin_flown+ .review-value") %>% html_text()

# data.frame() stops with an error unless all vectors have compatible lengths.
# NOTE(review): the selectors above are page-wide, so a field that is missing
# from some reviews will produce a shorter vector -- verify the lengths agree.
emirates_reviews <- data.frame(
  pax_name, date, pax_rating, traveller_type, trip_verified,
  crew_service, pax_recommended, food_drinks, ground_service, travel_class,
  stringsAsFactors = FALSE
)
r web-scraping rvest
1个回答
0
投票

此页面上没有直接给出评级数值的 HTML 元素。相反,每一颗星都有各自的 HTML 元素,其 class 为 "star" 或 "star fill"。

所以看起来得到评级的唯一方法,是统计评分表中每一行被填充的星星数量。下面的示例演示了如何对第一条评论的评分表执行此操作。您仍然需要想办法把这些信息与评论的其余字段合并,但这里给出了一种只获取每条评论各项评分的方法:

# Count the filled stars per rating category for the first review on the page.

# define target url
url <- "https://www.airlinequality.com/airline-reviews/emirates/?sortby=post_date%3ADesc&pagesize=100"

page_html <- url %>% read_html()

# Find all the reviews
reviews <- page_html %>% html_nodes(".review-ratings")

# find each individual star's class ("star" or "star fill") in the first review
stars <- reviews[1] %>%
  html_nodes(".review-rating-stars.stars") %>%
  html_children() %>%
  html_attr("class")

# every rating category is drawn with this many star elements
stars_per_category <- 5L
stopifnot(length(stars) %% stars_per_category == 0)

# One row per review category, one column per star. The classes arrive as five
# consecutive entries per category, so the matrix must be filled row by row;
# the default column-wise fill would mix stars from different categories.
star_matrix <- matrix(stars, ncol = stars_per_category, byrow = TRUE)
stars_df <- as.data.frame(star_matrix)

# count the number of filled in stars and make it a new column
stars_df$rating <- as.integer(rowSums(star_matrix == "star fill", na.rm = TRUE))

# pull the review ratings table for this review
review_table <- reviews[1] %>% html_table() %>% .[[1]]

# attach the per-category star counts to the table
# NOTE(review): this assumes every table row has a star cell; rows without
# stars would make the lengths differ -- confirm against the page.
review_table$rating <- stars_df$rating


这是结果

> review_table
# A tibble: 5 × 3
  X1                        X2 rating
  <chr>                  <int>  <int>
1 Food & Beverages       12345      5
2 Inflight Entertainment 12345      5
3 Seat Comfort           12345      5
4 Staff Service          12345      1
5 Value for Money        12345      0
© www.soinside.com 2019 - 2024. All rights reserved.