Unable to extract data from a dynamic website


Here are the details of the task. Write a Python script that scrapes data from the following website, https://www.psychologytoday.com/us/therapists, as follows:

Step 1 – The script will go through the US states one at a time (Alaska through Wyoming), although it could also combine all states into a single scraping pass per gender in Step 2 below.

Step 2 – Once inside a state, e.g. https://www.psychologytoday.com/us/therapists/alabama, it will filter by gender (female, male, non-binary). Each gender gets its own CSV, for three output CSVs in total.

Step 3 – Within a state, it will visit each therapist one at a time (including the therapists on all subsequent pages) by clicking the "View" button.

Step 4 – Once on an individual therapist's page, e.g. https://www.psychologytoday.com/us/therapists/dannette-bivins-birmingham-al/443641, it will scrape the following information into the corresponding CSV, in the following order (see the sketch after this list):

  • State
  • City
  • Zip
  • Street Address 1
  • Street Address 2
  • Business Name
  • Person Name
  • Title
  • Telephone
  • Insurance
  • Specialties and Expertise
  • Types of Therapy
  • Age
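
For reference, one way to pin that column order down is a fieldnames list passed to csv.DictWriter (a sketch of my own, not part of the task spec; note that the script below instead derives its columns from data[0].keys(), labels the zip field zip_code, and never collects Street Address 2):

import csv

FIELDNAMES = [
    "state", "city", "zip_code", "street_address_1", "street_address_2",
    "business_name", "person_name", "title", "telephone", "insurance",
    "specialties_and_expertise", "types_of_therapy", "age",
]

with open("female_therapists_alabama.csv", "w", newline="", encoding="utf-8") as f:
    # restval="" keeps the columns stable even when a profile lacks a field
    writer = csv.DictWriter(f, fieldnames=FIELDNAMES, restval="")
    writer.writeheader()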

I have written the script below, but it runs indefinitely without returning anything. When I try to run scrape_therapist_data(soup) on its own for a single category within a state, it returns an empty list. So the main challenge is finding the correct HTML tags. I would appreciate any help fixing the script.
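
A quick sanity check along these lines (a minimal standalone sketch, not part of the script) shows whether the container the script looks for even exists in the server-rendered HTML:

import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
resp = requests.get("https://www.psychologytoday.com/us/therapists/alabama",
                    headers=headers)
print(resp.status_code)
soup = BeautifulSoup(resp.content, "html.parser")
# None here means the 'results' selector used by scrape_page_data below
# never matches, which would explain the empty output.
print(soup.find('div', class_='results'))

The full script: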

import csv
import requests
from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry

# Rate limits: 2 requests per second
RATE_LIMIT = 2
RATE_PERIOD = 1

# User-Agent header
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

# Headers with the User-Agent
headers = {"User-Agent": USER_AGENT}

# Rate limit decorator
@sleep_and_retry
@limits(calls=RATE_LIMIT, period=RATE_PERIOD)
def fetch_url(url):
    return requests.get(url, headers=headers)

def scrape_gender_data(state_url, gender):
    # Navigate to the state URL for the specified gender category
    gender_url = f"{state_url}?category={gender}"
    response = fetch_url(gender_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get the total number of pages for the gender category
        num_pages = get_num_pages(gender_url)

        therapists_data = []

        # Iterate over each page and scrape therapist data
        for page_num in range(1, num_pages + 1):
            page_url = f"{gender_url}&page={page_num}"
            response = fetch_url(page_url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                therapists_data.extend(scrape_page_data(soup))
            else:
                print(f"Failed to fetch page: {page_url}")

        return therapists_data
    else:
        print(f"Failed to fetch gender data: {gender_url}")
        return []

def get_num_pages(url):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.content, 'html.parser')
        pagination_div = soup.find('div', class_='pagination')
        if pagination_div:
            pages = pagination_div.find_all('a')
            highest_page_number = 0
            new_highest_page_number = True

            while new_highest_page_number:
                new_highest_page_number = False
                for page in pages:
                    try:
                        page_number = int(page.text)
                        if page_number > highest_page_number:
                            highest_page_number = page_number
                            page_url = page['href']
                            response = requests.get(page_url, headers=headers)
                            response.raise_for_status()
                            soup = BeautifulSoup(response.content, 'html.parser')
                            pagination_div = soup.find('div', class_='pagination')
                            if pagination_div:
                                pages = pagination_div.find_all('a')
                                new_highest_page_number = True
                                break
                    except (ValueError, KeyError) as e:
                        continue

            num_pages = highest_page_number
        else:
            num_pages = 1
    except Exception as e:
        print(f"An error occurred: {e}")
        num_pages = 1

    return num_pages

def scrape_page_data(soup):
    therapists_data = []
    results_container = soup.find('div', class_='results')
    if results_container:
        therapists = results_container.find_all('div', class_='results-row')
        if therapists:
            for therapist in therapists:
                view_button = therapist.find('a', class_='profile-title')
                if view_button:
                    view_url = view_button['href']
                    response = fetch_url(view_url)
                    if response.status_code == 200:
                        therapist_soup = BeautifulSoup(response.content, 'html.parser')
                        therapist_data = scrape_therapist_data(therapist_soup)
                        therapists_data.append(therapist_data)
                    else:
                        print(f"Failed to fetch therapist page: {view_url}")
                else:
                    print("View button not found.")
        else:
            print("No therapist elements found.")
    else:
        print("Results container not found.")
    return therapists_data

def scrape_therapist_data(soup):
    therapist_data = {}
    meta_section = soup.find('meta', attrs={'name': 'article:section'})
    if meta_section:
        therapist_data['state'] = meta_section['content']
    locality_span = soup.find('span', attrs={'itemprop': 'addressLocality'})
    if locality_span:
        therapist_data['city'] = locality_span.text.strip()
    postal_span = soup.find('span', attrs={'itemprop': 'postalCode'})
    if postal_span:
        therapist_data['zip_code'] = postal_span.text.strip()
    street_span = soup.find('span', attrs={'itemprop': 'streetAddress'})
    if street_span:
        therapist_data['street_address_1'] = street_span.text.strip()
    business_h1 = soup.find('h1', attrs={'itemprop': 'name'})
    if business_h1:
        therapist_data['business_name'] = business_h1.text.strip()
    name_span = soup.find('span', attrs={'itemprop': 'name'})
    if name_span:
        therapist_data['person_name'] = name_span.text.strip()
    title_span = soup.find('span', attrs={'itemprop': 'jobTitle'})
    if title_span:
        therapist_data['title'] = title_span.text.strip()
    telephone_span = soup.find('span', attrs={'itemprop': 'telephone'})
    if telephone_span:
        therapist_data['telephone'] = telephone_span.text.strip()
    insurance_div = soup.find('div', class_='professional-list-item--body')
    if insurance_div:
        therapist_data['insurance'] = insurance_div.find('p').text.strip()
    specialties_h2 = soup.find('h2', string='Specialties')
    if specialties_h2:
        specialties_ul = specialties_h2.find_next('ul')
        if specialties_ul:
            therapist_data['specialties_and_expertise'] = specialties_ul.text.strip()
    modalities_h2 = soup.find('h2', string='Therapy Modalities')
    if modalities_h2:
        modalities_ul = modalities_h2.find_next('ul')
        if modalities_ul:
            therapist_data['types_of_therapy'] = modalities_ul.text.strip()
    age_h2 = soup.find('h2', string='Age')
    if age_h2:
        age_ul = age_h2.find_next('ul')
        if age_ul:
            therapist_data['age'] = age_ul.text.strip()

    return therapist_data

def save_to_csv(data, state, gender):
    if not data:
        filename = f'{gender}_therapists_{state}.csv'
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["state", "city", "zip_code", "street_address_1", "business_name", "person_name", "title", "telephone", "insurance", "specialties_and_expertise", "types_of_therapy", "age"])
    else:
        filename = f'{gender}_therapists_{state}.csv'
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=data[0].keys())
            writer.writeheader()
            for therapist in data:
                writer.writerow(therapist)

def main():
    base_url = "https://www.psychologytoday.com/us/therapists/"
    states = ["alabama", "alaska", "arizona", "arkansas", "california", "colorado", "connecticut", "delaware", "florida", "georgia", "hawaii", "idaho", "illinois", "indiana", "iowa", "kansas", "kentucky", "louisiana", "maine", "maryland", "massachusetts", "michigan", "minnesota", "mississippi", "missouri", "montana", "nebraska", "nevada", "new-hampshire", "new-jersey", "new-mexico", "new-york", "north-carolina", "north-dakota", "ohio", "oklahoma", "oregon", "pennsylvania", "rhode-island", "south-carolina", "south-dakota", "tennessee", "texas", "utah", "vermont", "virginia", "washington", "west-virginia", "wisconsin", "wyoming"]
    genders = ['male', 'female', 'non-binary']

    state_urls = [f'{base_url}{state_name}' for state_name in states]

    for state_url in state_urls:
        for gender in genders:
            therapists_data = scrape_gender_data(state_url, gender)
            save_to_csv(therapists_data, state_url.split("/")[-1], gender)

if __name__ == "__main__":
    main()
html selenium-webdriver web-scraping beautifulsoup python-requests
1 Answer

There is a lot to unpack in this question, but the end goal seems to be understanding why this script returns empty results for the various therapist data points from the specified website.

I believe this is really a debugging problem tied to the page content. I found that the find call on this line of the scrape_therapist_data function returns no data (without raising an error):

meta_section = soup.find('meta', attrs={'name': 'article:section'})

This indicates that no element on the page matches the filter passed to find.

For example, if I change that line to

soup.find('div', class_='breadcrumb-xs-hide').contents

I can find the state on the same page; printed to the console, it outputs ['Alabama']. I suspect the remaining problems are similar. I don't really want to work through every bug in this code, but hopefully this helps guide you through the rest of the troubleshooting on your own.
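
For instance, a minimal rewrite of the state lookup along those lines could look like this (a sketch only: the breadcrumb-xs-hide class comes from my inspection above, the site's markup can change, and each of the other empty fields would need the same inspect-and-replace treatment):

def scrape_state(soup):
    # 'soup' is a BeautifulSoup object for one therapist page.
    # Pull the state from the breadcrumb element instead of the
    # absent 'article:section' meta tag.
    breadcrumb = soup.find('div', class_='breadcrumb-xs-hide')
    if breadcrumb:
        return breadcrumb.get_text(strip=True)
    return None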
