我正在尝试从该网站抓取图像:https://moweek.com.uy/。
网站上有不同的栏目标题:“VESTIMENTA”、“CALZADO”、“ACCESORIOS”、“BEAUTY”、“MARCAS”、“AGENDA”和“BLOG”。我想先“点击”“VESTIMENTA”,然后依次点击其中的每个子标题(“Activewear”、“Blazers y chaquetas”等),如下图所示,但我过去一直有效的代码现在出了问题:
我的主要目标是把每个子标题(类别)中的图像下载到名为“images”的文件夹下、以类别名称命名的子文件夹中(activewear、“blazer_y_chaquetas”等)。我原本还想在代码中下载每张图片的相关信息,但目前我把这部分拆分到了另一个脚本中。
这是我到目前为止所拥有的:
library(tidyverse)
library(rvest)
library(httr)
# Asker's original attempt: fetch the home page, find the category links,
# then visit each one and download its images into images/<category>/.
url <- "https://moweek.com.uy/"
html_content <- GET(url)
webpage <- read_html(content(html_content, as = "text"))
# Select header links whose data-category-id attribute is '1' and read
# their href attributes.
category_nodes <- html_nodes(webpage, "a.headerOption.headerLink[data-category-id='1']")
category_urls <- html_attr(category_nodes, "href")
# Top-level output folder; showWarnings = FALSE silences the warning when it
# already exists.
dir.create("images", showWarnings = FALSE)
# Loop through subcategories and download images
for (category_url in category_urls) {
# Remove any extra slashes in the URL construction
# NOTE(review): gsub() only collapses slashes inside the href. `url` already
# ends in "/", so an href starting with "/" still produces "//" after
# paste0(), and an absolute href would have its "https://" collapsed to
# "https:/" and the site root prepended -- confirm the href format.
subcategory_url <- paste0(url, gsub("/+", "/", category_url))
# Parse the subcategory page; on any error, log the URL and yield NULL so the
# loop can skip it.
subcategory_page <- tryCatch({
read_html(subcategory_url)
}, error = function(e) {
message(paste("Error accessing URL:", subcategory_url))
return(NULL)
})
if (is.null(subcategory_page)) {
next # Skip to the next subcategory if an error occurred
}
# NOTE(review): html_text() is applied to the whole parsed document, so this
# is the text of the entire page rather than a category title -- probably not
# the intended folder name.
subcategory_name <- subcategory_page %>% html_text()
# Clean the subcategory name for folder creation
subfolder_name <- gsub(" ", "_", tolower(subcategory_name))
subfolder_path <- file.path("images", subfolder_name)
dir.create(subfolder_path, showWarnings = FALSE)
# Extract image URLs and download them into the subfolder
# NOTE(review): ".your-image-selector" looks like a placeholder; it needs to
# be replaced with the site's real image selector.
image_urls <- subcategory_page %>%
html_nodes(".your-image-selector") %>%
html_attr("src")
for (image_url in image_urls) {
# File name is the last path component of the image URL.
image_name <- basename(image_url)
# NOTE(review): no mode = "wb" here; binary downloads may be corrupted on
# Windows -- confirm.
download.file(image_url, file.path(subfolder_path, image_name))
}
}
下面这段代码效果非常好!
pacman::p_load(tidyverse, rvest, httr)
# Site root, written without a trailing slash so that relative hrefs such as
# "/vestimenta/..." can be appended to it directly.
url <- "https://moweek.com.uy"

# Fetch the home page and parse it. The raw response is kept in its own
# variable instead of being overwritten by the parsed document.
response <- httr::GET(url)
webpage <- read_html(content(response, as = "text"))

# Each ".expandedCategory" element is searched for the links it contains.
category_nodes <- html_nodes(webpage, ".expandedCategory")

# Gather every href, keep the "vestimenta" ones and turn the trailing "/1"
# segment into "/30". The replacement is anchored to the end of the href so
# that a "1" appearing elsewhere in a slug is left untouched.
# NOTE(review): assumes hrefs end in "/1" (inferred from the "/30" entries in
# the exclusion list below) -- confirm against the live markup.
category_urls <- lapply(category_nodes, function(node) {
  html_nodes(node, "a") %>%
    html_attr("href")
}) %>%
  unlist() %>%
  str_subset("vestimenta") %>%
  str_replace("/1$", "/30") %>%
  unique()

# Category pages that are deliberately skipped.
excluded_urls <- c(
  "/vestimenta/30", "/vestimenta/activewear/30",
  "/vestimenta/bodywear/30", "/vestimenta/leggings-y-bikers/30",
  "/vestimenta/lenceria/30", "/vestimenta/hombre/30",
  "/vestimenta/ninos/30", "/vestimenta/pijamas-y-camisones/30",
  "https://moweek.com.uy/vestimenta/vestidos/30",
  "https://moweek.com.uy/vestimenta/trajes-de-bano/30"
)
category_urls <- category_urls[!(category_urls %in% excluded_urls)]

# Top-level output folder; no warning if it already exists.
dir.create("images", showWarnings = FALSE)
#' Derive a length-limited folder name from a category URL.
#'
#' Replaces the ".../vestimenta/<slug>/30" portion of the input with just
#' "<slug>" and truncates the result to at most `max_length` characters.
#' Input that does not match that pattern (for example a slug that has
#' already been extracted) is only truncated.
#'
#' @param category_url Character vector of category URLs or slugs.
#' @param max_length Maximum number of characters to keep (default 50).
#' @return Character vector the same length as `category_url`.
limit_folder_name_length <- function(category_url, max_length = 50) {
  # Extract the subcategory name from the category_url
  subcategory_name <- gsub(".*/vestimenta/(.*?)/30", "\\1", category_url)
  # substr() is vectorised and leaves shorter strings (and NA) unchanged, so
  # no scalar length check is needed; this also keeps zero-length, NA and
  # multi-element input from raising an error in an `if` condition.
  substr(subcategory_name, 1, max_length)
}
# Loop through subcategories and download images.
# For every category URL: derive a folder name from its slug, fetch the page,
# collect the "src" of each ".productViewTopImage" element and save the files
# under images/<slug>/. A failing page or image is reported and skipped so
# one bad request does not abort the whole run.
for (category_url in category_urls) {
  # limit_folder_name_length() extracts the slug and truncates it; lower-casing
  # and replacing spaces keep the length unchanged, so the order is irrelevant.
  subcategory_name <- limit_folder_name_length(category_url)
  subfolder_name <- gsub(" ", "_", tolower(subcategory_name))
  subfolder_path <- file.path("images", subfolder_name)
  dir.create(subfolder_path, showWarnings = FALSE, recursive = TRUE)

  # Absolute hrefs are used as they are (collapsing slashes would break the
  # "https://" prefix); relative ones are joined to the site root with
  # exactly one slash in between.
  if (grepl("^https?://", category_url)) {
    subcategory_url <- category_url
  } else {
    relative_path <- sub("^/+", "", gsub("/+", "/", category_url))
    subcategory_url <- paste0(sub("/+$", "", url), "/", relative_path)
  }

  # Sleep for a few seconds to avoid overloading the server
  Sys.sleep(3)

  # RETRY() raises an error when every attempt fails at the connection level
  # and returns the last response when the server answers with an error
  # status, so both cases are checked.
  cat_content <- tryCatch(
    RETRY("GET", subcategory_url),
    error = function(e) {
      message("Error accessing URL: ", subcategory_url, " (",
              conditionMessage(e), ")")
      NULL
    }
  )
  if (is.null(cat_content)) {
    next
  }
  if (http_error(cat_content)) {
    message("Skipping ", subcategory_url, ": HTTP status ",
            status_code(cat_content))
    next
  }

  cat_page <- read_html(content(cat_content, as = "text"))
  image_urls <- cat_page %>%
    html_nodes(".productViewTopImage") %>%
    html_attr("src")

  # Drop missing/empty sources up front, then make every URL absolute.
  image_urls <- image_urls[!is.na(image_urls) & image_urls != ""]
  is_protocol_relative <- grepl("^//", image_urls)
  image_urls[is_protocol_relative] <-
    paste0("https:", image_urls[is_protocol_relative])
  is_relative <- !grepl("^https?://", image_urls)
  image_urls[is_relative] <- paste0(
    sub("/+$", "", url), "/", sub("^/+", "", image_urls[is_relative])
  )

  for (image_url in unique(image_urls)) {
    # Strip any query string or fragment so it does not end up in the file
    # name.
    image_name <- basename(sub("[?#].*$", "", image_url))
    image_path <- file.path(subfolder_path, image_name)
    # Download and save the image to the subfolder; mode = "wb" keeps binary
    # data intact on every platform.
    tryCatch(
      download.file(image_url, image_path, mode = "wb"),
      error = function(e) {
        message("Failed to download ", image_url, ": ", conditionMessage(e))
      }
    )
  }
}