Web scraping: looping into each product


I'm trying to download data from an online store. I already got help here :) but now I want to go one level deeper: instead of stopping at the category listing, I want to enter each category, click on every product image, and retrieve all the information from the product page itself. The home page is https://moweek.com.uy/. Right now I go to https://moweek.com.uy/vestimenta/camperas-y-tapados/1 and extract everything from there (name, price, image). What I want is to not stop there, but click through on each product name and retrieve everything from its page, for example https://moweek.com.uy/p/tapado-ponti/6174/17471, which has the name "Tapado Ponti":

<div id="titleRow"> <h1 id="productTitle" class="productInfoTitle">Tapado Ponti</h1> </div>

the "by" line before the brand "Zarvich":

<h2 class="productInfoTitle brandName"> " by " <a target="_blank" href="/b/zarvich"> <em>Zarvich</em> </a> </h2>

a box with some characteristics:

<div id="cocardasContainer"> </div>

two prices:

<div id="productPricesContainer" style="float: left;width: 100%;"> </div>

the sizes:

<div class="specGroup"> <div class="specGroupTitle"> Talle </div>

the color:

<div class="specGroup"> <div class="specGroupTitle"> Color </div>

and the description:

<div id="productInfoDescription" class="moreInfoDiv openDiv"> <div class="productGrouptitle">Descripción</div>

I don't know how to extract these pieces.
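
For the single example page, I would expect something along these lines to work (a rough sketch: class names such as .specGroupContent are guesses, since the snippets above do not show the containers that actually hold the values):

library(rvest)

p <- read_html("https://moweek.com.uy/p/tapado-ponti/6174/17471")

# Title and brand use the ids/classes visible in the snippets above
name  <- html_text(html_node(p, "#productTitle"), trim = TRUE)
brand <- html_text(html_node(p, ".productInfoTitle.brandName a em"), trim = TRUE)

# The containers for the sizes and the description text are assumptions
sizes <- html_text(html_nodes(p, ".specGroupTitle:contains('Talle') + .specGroupContent a"), trim = TRUE)
desc  <- html_text(html_node(p, "#productInfoDescription"), trim = TRUE)

My full attempt so far: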

pacman::p_load(tidyverse, rvest, httr)

url <- "https://moweek.com.uy/"
html_content <- GET(url)
webpage <- read_html(content(html_content, as = "text"))

category_nodes <- html_nodes(webpage, ".expandedCategory")

category_urls <- html_nodes(webpage, ".expandedCategory a") %>% 
  html_attr("href") %>% 
  str_subset("/vestimenta/|/calzado/|/accesorios/")

product_data <- tibble()
for (cat_path in category_urls) {
  # Derive the category/subcategory labels from the path,
  # e.g. "/vestimenta/camperas-y-tapados/1" -> "vestimenta", "camperas-y-tapados"
  path_parts  <- str_split(cat_path, "/", simplify = TRUE)
  cat_name    <- path_parts[2]
  subcat_name <- path_parts[3]
  
  cat_url <- paste0("https://moweek.com.uy", cat_path)
  cat_content <- RETRY("GET", cat_url)
  cat_page <- read_html(content(cat_content, as = "text"))
  
  image_tags <- html_nodes(cat_page, ".productViewContainer")
  
  for (tag in image_tags) {
    # Extract product information from the image tag
    name <- html_text(html_node(tag, ".productViewName"))
    price <- html_text(html_node(tag, ".productViewPrice"))
    product_url <- html_attr(html_node(tag, "a"), "href")
    img_url <- html_attr(html_node(tag, ".productViewTop"), "data-hover-image")
    
    # Visit the product page and extract additional information
    product_content <- RETRY("GET", paste0("https://moweek.com.uy", product_url))
    product_page <- read_html(content(product_content, as = "text"))
    
    # Both classes sit on the same <h2>, so no descendant space between them
    brand <- html_text(html_node(product_page, ".productInfoTitle.brandName a em"))
    characteristics <- html_text(html_node(product_page, "#cocardasContainer"))
    # html_nodes() can return several matches; collapse them so each field is a single string per row
    prices <- paste(html_text(html_nodes(product_page, "#productPricesContainer .productPrice")), collapse = " | ")
    sizes <- paste(html_text(html_nodes(product_page, ".specGroupTitle:contains('Talle') + .specGroupContent a")), collapse = ", ")
    color <- paste(html_text(html_nodes(product_page, ".specGroupTitle:contains('Color') + .specGroupContent")), collapse = ", ")
    description <- html_text(html_node(product_page, "#productInfoDescription .moreInfoText"))
    
    # Combine all the information into a single row and add it to the product_data data frame
    product_info <- tibble(
      Category = cat_name,
      Subcategory = subcat_name,
      Name = name,
      Price = price,
      Brand = brand,
      Characteristics = characteristics,
      Prices = prices,
      Sizes = sizes,
      Color = color,
      Description = description,
      Image_URL = img_url,
      Product_URL = product_url
    )
    # add_row() expects name-value pairs, so append the one-row tibble with bind_rows()
    product_data <- bind_rows(product_data, product_info)
  }
}

# Clean data
product_data <- product_data %>%
  mutate(Name = str_trim(Name),
         Price = str_trim(Price),
         Brand = str_trim(Brand),
         Characteristics = str_trim(Characteristics),
         Sizes = str_trim(Sizes),
         Color = str_trim(Color),
         Description = str_trim(Description)) %>% 
  filter(Image_URL != "/files/empty.png")
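
In case the + .specGroupContent sibling guess turns out to be wrong, an XPath that anchors on the visible "Talle" / "Color" titles might be more robust (still a sketch; everything beyond the snippets shown above is assumed):

p <- read_html("https://moweek.com.uy/p/tapado-ponti/6174/17471")

# All links inside the specGroup whose title div contains "Talle"
sizes <- html_text(
  html_nodes(p, xpath = "//div[@class='specGroup'][div[contains(@class,'specGroupTitle')][contains(., 'Talle')]]//a"),
  trim = TRUE
)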