我正在尝试编写一个脚本,用来获取 https://tonies.com/en-gb/tonies/ 上所有 Tonies 的时长。我还想获取每个 Tonie 的价格,但一直没能成功。我也尝试过用 Selenium 来编写脚本,但卡在了接受 Cookie 的弹窗上,因为它位于 Shadow DOM 中。我觉得我可能把问题想得过于复杂了。我是编程和 Python 新手,非常感谢任何建议。当前形式的脚本似乎只能抓取到前 21 项。
import re
import requests
from bs4 import BeautifulSoup
def get_tonie_info(tonie_url, timeout=10):
    """Download one tonie product page and collect its ``runTime`` values.

    Args:
        tonie_url: URL of the product page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection fails instead of blocking indefinitely.

    Returns:
        A dict with ``'url'`` (the URL that was passed in) and
        ``'durations'`` (a list of ints taken from every
        ``"runTime": <digits>`` occurrence; empty if none was found).

    Raises:
        Whatever ``requests.get`` raises for a failed or timed-out request
        is propagated to the caller.
    """
    response = requests.get(tonie_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    tonie_info = {'url': tonie_url, 'durations': []}

    for script_tag in soup.find_all('script'):
        script_content = script_tag.string
        # Skip script tags with no usable text or no mention of runTime.
        if not script_content or 'runTime' not in script_content:
            continue
        matches = re.findall(r'"runTime":\s*(\d+)', script_content)
        if matches:
            # A later matching script tag replaces the earlier result.
            tonie_info['durations'] = [int(value) for value in matches]

    return tonie_info
def scrape_tonies(page_number=9, timeout=10):
    """Scrape one listing page and gather duration info for each tonie on it.

    Args:
        page_number: Value appended to the ``?page=`` query of the listing
            URL. Defaults to 9, the page the script originally hard-coded.
        timeout: Seconds passed to ``requests.get`` for the listing-page
            request only, so a stalled connection cannot hang the script.

    Returns:
        A list of dicts as returned by ``get_tonie_info``, each extended
        with ``'name'`` (the stripped link text) and ``'duration'`` (the
        last entry of ``'durations'``). Products without any duration are
        reported on stdout and left out.
    """
    base_url = "https://tonies.com/en-gb/tonies/?page="
    current_url = base_url + str(page_number)
    response = requests.get(current_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    tonie_links = soup.find_all('a', class_='View__StretchedLink-sc-5t9da0-0 ivnTIu')

    all_tonie_info = []
    for tonie_link in tonie_links:
        # The href is prefixed with the site origin to form the product URL.
        tonie_url = "https://tonies.com" + tonie_link['href']
        tonie_info = get_tonie_info(tonie_url)

        if not tonie_info['durations']:
            print(f"Could not retrieve information for {tonie_url}")
            continue

        tonie_info['name'] = tonie_link.text.strip()
        # Several runTime values may be found; the last one is kept.
        tonie_info['duration'] = tonie_info['durations'][-1]
        all_tonie_info.append(tonie_info)

    return all_tonie_info
if __name__ == "__main__":
    # Run the scraper once, then print a numbered three-line summary per item.
    tonies_info = scrape_tonies()
    index = 0
    for tonie_info in tonies_info:
        index += 1  # numbering starts at 1
        print(f"Toni {index} Name: {tonie_info['name']}")
        print(f" URL: {tonie_info['url']}")
        print(f" Duration: {tonie_info['duration']}")
您可以尝试直接获取页面中 JSON 格式的 Tonies 数据,然后再进行后处理:
import json
# Parse the JSON text of the element with id "__NEXT_DATA__" and drill down
# to the list of product records.
page_data = json.loads(soup.select_one("#__NEXT_DATA__").text)
data = page_data["props"]["pageProps"]["page"]["productList"]["normalizedProducts"]

use_keys = ["name", "price", "runTime"]  # << ask for more if needed

# Keep only the requested keys for each product. "price" is a nested mapping
# and is flattened to its "amount"; a missing or null price yields None
# rather than raising AttributeError from calling .get on None.
tonies = [
    {
        k: (d.get(k) or {}).get("amount") if k == "price" else d.get(k)
        for k in use_keys
    }
    for d in data
]
输出:
# len(tonies) evaluated to 196 for the run whose output is shown below.
print(json.dumps(tonies, indent=4))
[
{
"name": "Chase",
"price": 14.99,
"runTime": 54
},
{
"name": "Elmer and Friends Story Collection",
"price": 14.99,
"runTime": 62
},
{
"name": "Frozen",
"price": 14.99,
"runTime": 24
},
...
]