我正在用 Python 和 BeautifulSoup 抓取某个产品的所有亚马逊评论,但它只能提取第一页(9 条评论)。这段代码过去对其他用户似乎是有效的;也许我需要使用代理,以免在翻页时被屏蔽?先谢谢大家!
我在 Colab 上使用 Python 3 进行此测试。
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import logging
# Request headers sent with every page download (see get_page_html).
headers = {
    # Routing / caching directives.
    "authority": "www.amazon.com",
    "pragma": "no-cache",
    "cache-control": "no-cache",
    "dnt": "1",
    "upgrade-insecure-requests": "1",
    # Browser identification and content negotiation.
    "user-agent": "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    # Fetch-metadata headers.
    "sec-fetch-site": "none",
    "sec-fetch-mode": "navigate",
    "sec-fetch-dest": "document",
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
}
# Review pages to download, in order: the landing page, then pages 2 and 3.
URLS = [
    "https://www.amazon.com/Heat-Storm-HS-1500-PHX-WIFI-Infrared-Heater/product-reviews/B07JXRWJ8D/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "https://www.amazon.com/Heat-Storm-HS-1500-PHX-WIFI-Infrared-Heater/product-reviews/B07JXRWJ8D/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2",
    "https://www.amazon.com/Heat-Storm-HS-1500-PHX-WIFI-Infrared-Heater/product-reviews/B07JXRWJ8D/ref=cm_cr_getr_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3",
]
def get_page_html(page_url: str) -> str:
    """Download one page and return its body as text.

    Args:
        page_url: Absolute URL of the page to fetch.

    Returns:
        The decoded response body. The body is returned for non-2xx
        responses as well, so callers that only parse the text keep
        working; such responses are logged so that an unexpectedly empty
        parse result can be traced back to the HTTP status.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            the timeout.
    """
    # Without an explicit timeout, requests waits indefinitely on a
    # stalled connection.
    resp = requests.get(page_url, headers=headers, timeout=30)
    if not resp.ok:
        logging.warning(
            "GET %s returned HTTP %s", page_url, resp.status_code
        )
    return resp.text
def get_reviews_from_html(page_html: str) -> list:
    """Parse a page and return the elements that wrap individual reviews.

    Args:
        page_html: Raw HTML of one review page.

    Returns:
        The result of ``find_all``: a list-like collection of every
        ``<div>`` whose class attribute is ``"a-section celwidget"``.
        It is empty when the page contains no such element. (The previous
        ``BeautifulSoup`` return annotation did not match this.)
    """
    soup = BeautifulSoup(page_html, "lxml")
    return soup.find_all("div", {"class": "a-section celwidget"})
def get_review_date(soup_object: BeautifulSoup) -> str:
    """Return the text of the review's date element.

    Args:
        soup_object: Parsed element holding a single review.

    Returns:
        The unstripped text of the first ``<span class="review-date">``
        inside ``soup_object``, or an empty string when there is none.
    """
    date_span = soup_object.find("span", {"class": "review-date"})
    # find() returns None when nothing matches; calling get_text() on that
    # would raise AttributeError and abort the whole scrape.
    if date_span is None:
        return ""
    return date_span.get_text()
def get_review_text(soup_object: BeautifulSoup) -> str:
    """Return the body text of a review.

    Args:
        soup_object: Parsed element holding a single review.

    Returns:
        The whitespace-stripped text of the first ``<span>`` whose class
        attribute is ``"a-size-base review-text review-text-content"``,
        or an empty string when there is none.
    """
    body_span = soup_object.find(
        "span", {"class": "a-size-base review-text review-text-content"}
    )
    # find() returns None when nothing matches; guard before get_text().
    if body_span is None:
        return ""
    return body_span.get_text().strip()
def get_review_header(soup_object: BeautifulSoup) -> str:
    """Return the title line of a review.

    Args:
        soup_object: Parsed element holding a single review.

    Returns:
        The whitespace-stripped text of the first ``<a>`` whose class
        attribute matches the review-title class string below, or an
        empty string when there is none.
    """
    title_link = soup_object.find(
        "a",
        {
            "class": "a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold"
        },
    )
    # find() returns None when nothing matches; guard before get_text().
    if title_link is None:
        return ""
    return title_link.get_text().strip()
def get_number_stars(soup_object: BeautifulSoup) -> str:
    """Return the text of the review's ``a-icon-alt`` span.

    Presumably this is the star-rating label (the caller stores it under
    ``review_stars``) -- confirm against the live markup.

    Args:
        soup_object: Parsed element holding a single review.

    Returns:
        The whitespace-stripped text of the first
        ``<span class="a-icon-alt">``, or an empty string when there is
        none.
    """
    stars_span = soup_object.find("span", {"class": "a-icon-alt"})
    # find() returns None when nothing matches; guard before get_text().
    if stars_span is None:
        return ""
    return stars_span.get_text().strip()
def get_product_name(soup_object: BeautifulSoup) -> str:
    """Return the text of the review's small secondary link.

    Presumably this link names the purchased product variant (the caller
    stores it under ``review_flavor``) -- confirm against the live markup.

    Args:
        soup_object: Parsed element holding a single review.

    Returns:
        The whitespace-stripped text of the first ``<a>`` whose class
        attribute is ``"a-size-mini a-link-normal a-color-secondary"``,
        or an empty string when there is none.
    """
    product_link = soup_object.find(
        "a", {"class": "a-size-mini a-link-normal a-color-secondary"}
    )
    # find() returns None when nothing matches; without this guard a
    # single review lacking the link would raise AttributeError.
    if product_link is None:
        return ""
    return product_link.get_text().strip()
def orchestrate_data_gathering(single_review: BeautifulSoup) -> dict:
    """Run every field extractor on one review and collect the results.

    Args:
        single_review: Parsed element holding a single review.

    Returns:
        A dict with the keys ``review_text``, ``review_date``,
        ``review_title``, ``review_stars`` and ``review_flavor``, each
        mapped to the value its extractor returned.
    """
    # (output key, extractor) pairs, applied in this order.
    extractors = (
        ("review_text", get_review_text),
        ("review_date", get_review_date),
        ("review_title", get_review_header),
        ("review_stars", get_number_stars),
        ("review_flavor", get_product_name),
    )
    return {field: extract(single_review) for field, extract in extractors}
if __name__ == '__main__':
    # Download every page in URLS, extract one record per review element,
    # and write all records to a timestamped Excel workbook.
    logging.basicConfig(level=logging.INFO)

    all_results = []
    for u in URLS:
        logging.info(u)
        html = get_page_html(u)
        reviews = get_reviews_from_html(html)
        if not reviews:
            # A page that yields nothing is otherwise invisible in the
            # output; surface it so a blocked or changed page can be
            # told apart from a product that simply has few reviews.
            logging.warning("no review elements found on %s", u)
        all_results.extend(orchestrate_data_gathering(rev) for rev in reviews)

    out = pd.DataFrame.from_records(all_results)
    logging.info(f"{out.shape[0]} Is the shape of the dataframe")
    # The original format was '%Y-%m-%d-%m', which repeats the month.
    # Hour and minute are used instead so runs on the same day do not
    # overwrite each other.
    save_name = f"{datetime.now().strftime('%Y-%m-%d-%H-%M')}.xlsx"
    logging.info(f"saving to {save_name}")
    out.to_excel(save_name)
您可以尝试使用他们的 Review pagination Ajax API 来加载更多页面:
import re
from ast import literal_eval
import requests
from bs4 import BeautifulSoup
# Review-pagination Ajax endpoint that the page loop posts to.
url = "https://www.amazon.com/hz/reviews-render/ajax/reviews/get/ref=cm_cr_arp_d_paging_btm_next_2"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
}

# Form fields sent with each POST; "pageNumber" is overwritten per page.
payload = {
    "sortBy": "",
    "reviewerType": "all_reviews",
    "formatType": "",
    "mediaType": "",
    "filterByStar": "",
    "filterByAge": "",
    "pageNumber": "1",
    "filterByLanguage": "",
    "filterByKeyword": "",
    "shouldAppend": "undefined",
    "deviceType": "desktop",
    "canShowIntHeader": "undefined",
    "reftag": "cm_cr_arp_d_paging_btm_next_2",
    "pageSize": "10",
    "asin": "B07JXRWJ8D",  # <--- change product asin here
    "scope": "reviewsAjax0",
}
import json  # needed only by the fragment decoder below


def _decode_fragment(raw: str) -> str:
    """Turn one quoted fragment from the response into plain HTML text.

    NOTE(review): the endpoint appears to serve JSON to the browser, so
    each quoted fragment should be a JSON string -- confirm against a
    captured response. Decoding it as JSON handles ``\\/`` and UTF-16
    surrogate-pair escapes, which a Python-literal parse leaves as a stray
    backslash or as lone surrogates that cannot be printed.

    Args:
        raw: The fragment including its surrounding double quotes.

    Returns:
        The decoded string.
    """
    try:
        # strict=False tolerates raw control characters inside the string,
        # as the literal-based decoding did.
        return json.loads(raw, strict=False)
    except ValueError:
        # Not valid JSON: fall back to the original Python-literal parse.
        return literal_eval(raw)


for page in range(1, 4):  # <--- change number of pages here
    payload["pageNumber"] = page
    # A timeout keeps a stalled connection from hanging the loop forever.
    t = requests.post(url, data=payload, headers=headers, timeout=30).text

    # Every review arrives as a quoted "<div id=...>...</div>" string;
    # decode each one and parse them together as a single document.
    fragments = re.findall(r'"<div id=.*?</div>"', t)
    soup = BeautifulSoup(
        "\n".join(map(_decode_fragment, fragments)),
        "html.parser",
    )

    for r in soup.select('[data-hook="review"]'):
        print(r.select_one(".a-profile-name").text.strip())
        print(r.select_one('[data-hook="review-body"]').text.strip())
        print()
打印:
...
Kindle Customer
We have 4 of these throughout my house now and with the app they’re so easy to use.The they haven’t raised my electric bill at all yet and they’re so quiet. The wall installation is perfect for smaller rooms and children. Never having to worry about them pushing them over, and they have a child lock on the screen. So convenient
Josh
Heats up my work trailer perfectly without having to worry about a electric heater constantly being on and being able to have it set to a exact temp.