尝试创建 Python 脚本来返回 tonie 的时长等信息

问题描述 投票:0回答:1

我正在尝试创建一个脚本,用来返回 https://tonies.com/en-gb/tonies/ 上所有 tonies 的时长。我还想返回每个 tonie 的价格,但遇到了困难。我也尝试过用 Selenium 编写脚本,但卡在了接受 Cookie 的弹窗上,因为它位于 Shadow DOM 中。我想我可能把事情搞得过于复杂了。我是编程和 Python 的新手,如有任何建议,不胜感激。当前形式的脚本似乎只抓取了前 21 项。

    import re
    import requests
    from bs4 import BeautifulSoup

    def get_tonie_info(tonie_url):
        """Fetch one tonie product page and collect its ``runTime`` values.

        The page is downloaded with ``requests`` and every ``<script>`` tag is
        searched for ``"runTime": <digits>`` pairs.

        Args:
            tonie_url: Absolute URL of the product page to download.

        Returns:
            A dict with two keys: ``'url'`` (the URL that was passed in) and
            ``'durations'`` (a list of ints parsed from the ``runTime``
            fields; empty when no script contains one). When several scripts
            contain matches, the values from the last such script are kept.

        Raises:
            requests.RequestException: Propagated unchanged when the download
                fails or exceeds the timeout.
        """
        # Without an explicit timeout requests waits forever on a stalled
        # server, which would hang the whole scrape.
        response = requests.get(tonie_url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')

        tonie_info = {'url': tonie_url, 'durations': []}

        for script_tag in soup.find_all('script'):
            script_content = script_tag.string

            # .string is None for empty tags or tags with several children.
            if not script_content or 'runTime' not in script_content:
                continue

            matches = re.findall(r'"runTime":\s*(\d+)', script_content)
            if matches:
                # Deliberately replaces (not extends) earlier results so the
                # last matching script wins, as callers read durations[-1].
                tonie_info['durations'] = list(map(int, matches))

        return tonie_info

   def scrape_tonies(page_number=9):
       """Scrape name, URL and duration for every tonie on one listing page.

       Each product link found on the listing page is followed with
       ``get_tonie_info``; products for which no duration could be parsed are
       reported on stdout and left out of the result.

       Args:
           page_number: Value used for the ``?page=`` query parameter of the
               listing URL. Defaults to 9, the page the script originally
               hard-coded.

       Returns:
           A list of dicts, one per product with at least one parsed
           duration. Each dict has the keys ``'url'``, ``'durations'``,
           ``'name'`` (stripped link text) and ``'duration'`` (the last entry
           of ``'durations'``).

       Raises:
           requests.RequestException: Propagated unchanged when a download
               fails or exceeds the timeout.
       """
       # Imported locally so this function does not depend on the file's
       # top-level import block being edited.
       from urllib.parse import urljoin

       site_root = "https://tonies.com"
       current_url = f"{site_root}/en-gb/tonies/?page={page_number}"

       # Without an explicit timeout requests waits forever on a stalled
       # server.
       response = requests.get(current_url, timeout=30)
       soup = BeautifulSoup(response.text, 'html.parser')

       # NOTE(review): this looks like a build-generated CSS class name and may
       # change when the site is redeployed - confirm it still matches.
       tonie_links = soup.find_all('a', class_='View__StretchedLink-sc-5t9da0-0 ivnTIu')

       all_tonie_info = []

       for tonie_link in tonie_links:
           href = tonie_link.get('href')
           if not href:
               # An anchor without a target cannot be followed; indexing it
               # directly would raise KeyError and abort the whole scrape.
               continue

           # urljoin copes with absolute hrefs and hrefs lacking a leading
           # slash, both of which plain concatenation would mangle.
           tonie_url = urljoin(site_root, href)
           tonie_info = get_tonie_info(tonie_url)

           if not tonie_info['durations']:
               print(f"Could not retrieve information for {tonie_url}")
               continue

           tonie_info['name'] = tonie_link.text.strip()
           tonie_info['duration'] = tonie_info['durations'][-1]
           all_tonie_info.append(tonie_info)

       return all_tonie_info

   if __name__ == "__main__":
       # Script entry point: scrape once, then print a numbered three-line
       # summary (name, URL, duration) for every tonie that was collected.
       tonies_info = scrape_tonies()

       for number, entry in enumerate(tonies_info, start=1):
           summary_lines = (
               f"Toni {number} Name: {entry['name']}",
               f"   URL: {entry['url']}",
               f"   Duration: {entry['duration']}",
           )
           # One print of the joined lines emits the same text as three
           # separate print calls.
           print("\n".join(summary_lines))
python selenium-webdriver web-scraping beautifulsoup python-requests
1个回答
0
投票

您可以尝试直接以 JSON 格式获取 tonies 数据,然后再进行后处理:

import json

# The listing page carries its product data as JSON inside the element with
# id "__NEXT_DATA__"; `soup` is the BeautifulSoup object of that page and must
# already exist when this snippet runs.
data = (json.loads(soup.select_one("#__NEXT_DATA__").text)
    ["props"]["pageProps"]["page"]["productList"]["normalizedProducts"])

use_keys = ["name", "price", "runTime"] # << ask for more if needed

# Build one flat record per product. "price" is a nested mapping, so only its
# "amount" field is kept. A product without a price yields None instead of
# raising AttributeError, matching how the other keys tolerate absence.
tonies = [
    {
        k: d.get(k) if k != "price" else (d.get(k) or {}).get("amount")
        for k in use_keys
    }
    for d in data
]

输出:

# len(tonies) evaluated to 196 when this answer was written.

# Pretty-print the collected records as JSON with 4-space indentation.
print(json.dumps(tonies, indent=4))

[
    {
        "name": "Chase",
        "price": 14.99,
        "runTime": 54
    },
    {
        "name": "Elmer and Friends Story Collection",
        "price": 14.99,
        "runTime": 62
    },
    {
        "name": "Frozen",
        "price": 14.99,
        "runTime": 24
    },
    ...
]
© www.soinside.com 2019 - 2024. All rights reserved.