Scraping images from a website into subfolders


I am trying to scrape images from this website: https://moweek.com.uy/.

The site has several top-level headings: "VESTIMENTA", "CALZADO", "ACCESORIOS", "BEAUTY", "MARCAS", "AGENDA", and "BLOG". I want to "click" on "VESTIMENTA" and then on each of its subheadings ("Activewear", "Blazers y chaquetas", and so on), but I am having trouble with code that worked in the past:

My main goal is to download the images from each subheading (category) into a folder called "images", with one subfolder per category ("activewear", "blazers_y_chaquetas", and so on). I originally meant for this script to also download the information for each picture, but for now I keep that part in a separate script.
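
For concreteness, the layout I am after (subfolder names are illustrative):

images/
├── activewear/
└── blazers_y_chaquetas/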

This is what I have so far:

library(tidyverse)
library(rvest)
library(httr)

url <- "https://moweek.com.uy/"
html_content <- GET(url)

webpage <- read_html(content(html_content, as = "text"))

category_nodes <- html_nodes(webpage, "a.headerOption.headerLink[data-category-id='1']")
category_urls <- html_attr(category_nodes, "href")

dir.create("images", showWarnings = FALSE)

# Loop through subcategories and download images
for (category_url in category_urls) {
  # Remove any extra slashes in the URL construction
  subcategory_url <- paste0(url, gsub("/+", "/", category_url))
  
  subcategory_page <- tryCatch({
    read_html(subcategory_url)
  }, error = function(e) {
    message(paste("Error accessing URL:", subcategory_url))
    return(NULL)
  })
  
  if (is.null(subcategory_page)) {
    next  # Skip to the next subcategory if an error occurred
  }
  
  subcategory_name <- subcategory_page %>% html_text()
  
  # Clean the subcategory name for folder creation
  subfolder_name <- gsub(" ", "_", tolower(subcategory_name))
  subfolder_path <- file.path("images", subfolder_name)
  dir.create(subfolder_path, showWarnings = FALSE)
  
  # Extract image URLs and download them into the subfolder
  image_urls <- subcategory_page %>% 
    html_nodes(".your-image-selector") %>%
    html_attr("src")
  
  for (image_url in image_urls) {
    image_name <- basename(image_url)
    download.file(image_url, file.path(subfolder_path, image_name))
  }
}
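
As an aside, ".your-image-selector" in the script above is only a placeholder. A minimal way to hunt for the real selector is to list every <img> node on one subcategory page together with its class and src attributes (the URL below is one of the site's subcategory pages):

library(rvest)

# One row per image node: its CSS class and source URL, to reveal which
# class the product images actually use
page <- read_html("https://moweek.com.uy/vestimenta/activewear/30")
imgs <- html_nodes(page, "img")
data.frame(
  class = html_attr(imgs, "class"),
  src   = html_attr(imgs, "src"),
  stringsAsFactors = FALSE
)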
Tags: r, web-scraping, rvest, httr
1 Answer

This works great!

pacman::p_load(tidyverse, rvest, httr)

url <- "https://moweek.com.uy"
webpage <- httr::GET(url)

webpage <- read_html(content(webpage, as = "text"))

# Menu nodes that contain the category links
category_nodes <- html_nodes(webpage, ".expandedCategory")

# Collect every link inside those nodes, keep the "vestimenta" ones, and
# replace every "1" with "30" (e.g. "/vestimenta/activewear/1" becomes
# "/vestimenta/activewear/30")
category_urls <- lapply(category_nodes, function(node) html_nodes(node, "a") %>%
                          html_attr("href")) %>%
  unlist() %>%
  str_subset("vestimenta") %>%
  str_replace_all("1", "30")

# Drop the subcategory URLs to skip (the scraped hrefs come back in a mix of
# relative and absolute forms, hence the two styles in this list)
category_urls <- category_urls[!(category_urls %in% c(
  "/vestimenta/30",
  "/vestimenta/activewear/30",
  "/vestimenta/bodywear/30",
  "/vestimenta/leggings-y-bikers/30",
  "/vestimenta/lenceria/30",
  "/vestimenta/hombre/30",
  "/vestimenta/ninos/30",
  "/vestimenta/pijamas-y-camisones/30",
  "https://moweek.com.uy/vestimenta/vestidos/30",
  "https://moweek.com.uy/vestimenta/trajes-de-bano/30"
))]

dir.create("images", showWarnings = FALSE)

# Function to limit folder name length
limit_folder_name_length <- function(category_url, max_length = 50) {
  # Extract the subcategory name from the category_url
  subcategory_name <- gsub(".*/vestimenta/(.*?)/30", "\\1", category_url)
  
  if (nchar(subcategory_name) > max_length) {
    subcategory_name <- substr(subcategory_name, 1, max_length)
  }
  return(subcategory_name)
}
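# Example (slug illustrative):
#   limit_folder_name_length("/vestimenta/blazers-y-chaquetas/30")
#   #> "blazers-y-chaquetas"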

# Loop through subcategories and download images
for (category_url in category_urls) {
  
  # Build the subfolder name from the URL slug: lower-case, spaces to
  # underscores, capped at 50 characters
  subcategory_name <- gsub(".*/vestimenta/(.*?)/30", "\\1", category_url)
  subfolder_name <- limit_folder_name_length(gsub(" ", "_", tolower(subcategory_name)))
  subfolder_path <- file.path("images", subfolder_name)
  dir.create(subfolder_path, showWarnings = FALSE, recursive = TRUE)
  
  subcategory_url <- paste0(url, gsub("/+", "/", category_url))
  
  # Sleep for a few seconds to avoid overloading the server
  Sys.sleep(3)
  
  # httr::RETRY re-issues the GET, backing off and retrying if a request fails
  cat_content <- RETRY("GET", subcategory_url)
  cat_page <- read_html(content(cat_content, as = "text"))
  
  image_urls <- cat_page %>%
    html_nodes(".productViewTopImage") %>%
    html_attr("src")
  
  # Prefix relative src paths with the site root so download.file gets full URLs
  image_urls <- ifelse(substr(image_urls, 1, 8) != "https://", paste0(url, image_urls), image_urls)
  
  for (image_url in image_urls) {
    if (!is.na(image_url) && image_url != "") {
      image_name <- basename(image_url)
      image_path <- file.path(subfolder_path, image_name)
      
      # Download and save the image to the subfolder
      download.file(image_url, image_path, mode = "wb")
    }
  }
}
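
The question mentions keeping per-image information in a separate script. As a follow-up sketch (not part of the original answer; names are illustrative), an index of everything downloaded can be built afterwards by walking the images/ folder:

library(tidyverse)

# One row per downloaded image; the category is the subfolder name
image_index <- tibble(
  path = list.files("images", recursive = TRUE, full.names = TRUE)
) %>%
  mutate(
    category = basename(dirname(path)),
    file = basename(path)
  )

write_csv(image_index, "image_index.csv")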