Python BeautifulSoup 只抓取了亚马逊评论的第一页,有人知道如何在没有代理的情况下理想地提取所有评论吗?

问题描述 投票:0回答:1

我正在 Python 上使用 BeautifulSoup 尝试抓取某个产品的所有亚马逊评论,但它只提取第一页(9 条评论)。这段代码过去似乎对其他用户有效,也许我需要一个代理来避免在更改页面的过程中被阻止?预先感谢!

我在 colab 上使用 Python 3 进行此测试

import requests

import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import logging

# Browser-like request headers, sent with every page fetch to make the
# client look like a regular Chrome session and reduce the chance of
# Amazon serving a bot/captcha page instead of the review HTML.
headers = {
    "authority": "www.amazon.com",
    "pragma": "no-cache",
    "cache-control": "no-cache",
    "dnt": "1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "sec-fetch-site": "none",
    "sec-fetch-mode": "navigate",
    "sec-fetch-dest": "document",
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
}

# Hard-coded review-listing URLs for one product (ASIN B07JXRWJ8D),
# pages 1-3 via the pageNumber query parameter.
# NOTE(review): Amazon may redirect pages 2+ to a login/captcha page for
# anonymous clients, which would explain only page 1 yielding reviews.
URLS = [
 "https://www.amazon.com/Heat-Storm-HS-1500-PHX-WIFI-Infrared-Heater/product-reviews/B07JXRWJ8D/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
 "https://www.amazon.com/Heat-Storm-HS-1500-PHX-WIFI-Infrared-Heater/product-reviews/B07JXRWJ8D/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2",
 "https://www.amazon.com/Heat-Storm-HS-1500-PHX-WIFI-Infrared-Heater/product-reviews/B07JXRWJ8D/ref=cm_cr_getr_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3"
]


def get_page_html(page_url: str, timeout: float = 30.0) -> str:
    """Download a review page and return its raw HTML.

    Args:
        page_url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for connect/read. requests has no default
            timeout, so the original call could hang indefinitely.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: On a non-2xx status (e.g. a 503 bot block),
            instead of silently handing an error page to the parser.
    """
    resp = requests.get(page_url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    return resp.text


def get_reviews_from_html(page_html: str) -> list:
    """Parse page HTML and return the per-review container elements.

    Returns a bs4 ``ResultSet`` (list-like) of ``<div class="a-section
    celwidget">`` tags, one per review — the original ``-> BeautifulSoup``
    annotation was wrong, as ``find_all`` never returns a soup object.
    """
    soup = BeautifulSoup(page_html, "lxml")
    reviews = soup.find_all("div", {"class": "a-section celwidget"})
    return reviews


def get_review_date(soup_object: BeautifulSoup) -> str:
    """Return the review's date line text, stripped of whitespace.

    Brought in line with the sibling getters: adds the missing ``-> str``
    annotation and a ``.strip()``. Raises AttributeError when no
    "review-date" span exists in the fragment (``find`` returns None).
    """
    date_string = soup_object.find("span", {"class": "review-date"}).get_text()
    return date_string.strip()


def get_review_text(soup_object: BeautifulSoup) -> str:
    """Return the body text of a single review, trimmed of whitespace."""
    body_node = soup_object.find(
        "span", {"class": "a-size-base review-text review-text-content"}
    )
    # AttributeError here means the review fragment lacks a body span.
    return body_node.get_text().strip()


def get_review_header(soup_object: BeautifulSoup) -> str:
    """Return the review's title/headline text, trimmed of whitespace."""
    title_classes = (
        "a-size-base a-link-normal review-title a-color-base "
        "review-title-content a-text-bold"
    )
    title_node = soup_object.find("a", {"class": title_classes})
    return title_node.get_text().strip()


def get_number_stars(soup_object: BeautifulSoup) -> str:
    """Return the rating icon's alt text (the star-rating string)."""
    rating_node = soup_object.find("span", {"class": "a-icon-alt"})
    return rating_node.get_text().strip()


def get_product_name(soup_object: BeautifulSoup) -> str:
    """Return the product/variant link text for this review, trimmed."""
    product_node = soup_object.find(
        "a", {"class": "a-size-mini a-link-normal a-color-secondary"}
    )
    return product_node.get_text().strip()


def orchestrate_data_gathering(single_review: BeautifulSoup) -> dict:
    """Collect every scraped field for one review element into a flat dict."""
    extractors = {
        "review_text": get_review_text,
        "review_date": get_review_date,
        "review_title": get_review_header,
        "review_stars": get_number_stars,
        "review_flavor": get_product_name,
    }
    # Dict insertion order matches the original literal, so the resulting
    # DataFrame columns come out in the same order.
    return {field: extract(single_review) for field, extract in extractors.items()}


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    all_results = []

    # Scrape every listed page and flatten all reviews into one record list.
    for u in URLS:
        logging.info(u)
        html = get_page_html(u)
        reviews = get_reviews_from_html(html)
        for rev in reviews:
            data = orchestrate_data_gathering(rev)
            all_results.append(data)

    out = pd.DataFrame.from_records(all_results)
    # Lazy %-style args instead of f-strings, per logging best practice.
    logging.info("%s Is the shape of the dataframe", out.shape[0])
    # BUG FIX: the original pattern '%Y-%m-%d-%m' repeated the *month*
    # directive; '%M' (minute) was almost certainly intended so that runs
    # on the same day get distinct filenames.
    save_name = f"{datetime.now().strftime('%Y-%m-%d-%M')}.xlsx"
    logging.info("saving to %s", save_name)
    out.to_excel(save_name)
python web-scraping beautifulsoup review
1个回答
0
投票

您可以尝试使用他们的 Review pagination Ajax API 来加载更多页面:

import re
from ast import literal_eval

import requests
from bs4 import BeautifulSoup

# Amazon's review-pagination AJAX endpoint: POSTing the form below returns
# the requested review page as a stream of JS "append" commands containing
# quoted HTML fragments.
url = "https://www.amazon.com/hz/reviews-render/ajax/reviews/get/ref=cm_cr_arp_d_paging_btm_next_2"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
}

# Form fields mirroring what the review page's own pagination JS submits.
payload = {
    "sortBy": "",
    "reviewerType": "all_reviews",
    "formatType": "",
    "mediaType": "",
    "filterByStar": "",
    "filterByAge": "",
    "pageNumber": "1",  # overwritten per request in the loop below
    "filterByLanguage": "",
    "filterByKeyword": "",
    "shouldAppend": "undefined",
    "deviceType": "desktop",
    "canShowIntHeader": "undefined",
    "reftag": "cm_cr_arp_d_paging_btm_next_2",
    "pageSize": "10",
    "asin": "B07JXRWJ8D",  # <--- change product asin here
    "scope": "reviewsAjax0",
}


# Fetch pages 1..3 from the AJAX endpoint and print reviewer name + body.
for page in range(1, 4):  # <--- change number of pages here
    payload["pageNumber"] = page

    raw = requests.post(url, data=payload, headers=headers).text

    # The response embeds each review block as a quoted string
    # '"<div id=...</div>"'; extract every such chunk and unquote it with
    # literal_eval, then parse the joined fragments as one document.
    fragments = [literal_eval(m) for m in re.findall(r'"<div id=.*?</div>"', raw)]
    soup = BeautifulSoup("\n".join(fragments), "html.parser")

    for review in soup.select('[data-hook="review"]'):
        print(review.select_one(".a-profile-name").text.strip())
        print(review.select_one('[data-hook="review-body"]').text.strip())
        print()

打印:


...

Kindle Customer
We have 4 of these throughout my house now and with the app they’re so easy to use.The they haven’t raised my electric bill at all yet and they’re so quiet. The wall installation is perfect for smaller rooms and children. Never having to worry about them pushing them over, and they have a child lock on the screen. So convenient

Josh
Heats up my work trailer perfectly without having to worry about a electric heater constantly being on and being able to have it set to a exact temp.
© www.soinside.com 2019 - 2024. All rights reserved.