Python 抓取代码正在导出空 csv

问题描述 投票:0回答:1

我正在使用下面的Python代码来抓取这个网站(https://www.eeoc.gov/newsroom/search)。我希望它能抓取标题所链接的标题、描述、日期和网址。

代码正在创建一个空的 csv。知道我做错了什么吗?

import csv
import requests
from bs4 import BeautifulSoup

def scrape_eec_news():
    url = "https://www.eeoc.gov/newsroom/search"
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        results = []
        entries = soup.find_all("div", class_="views-row")
        for entry in entries:
            title_elem = entry.find("h3", class_="field-content")
            description_elem = entry.find("div", class_="field-content")
            date_elem = entry.find("span", class_="date-display-single")
            url_elem = entry.find("a")

            if title_elem and description_elem and date_elem and url_elem:
                title = title_elem.text.strip()
                description = description_elem.text.strip()
                date = date_elem.text.strip()
                url = url_elem["href"]
                results.append({"title": title, "description": description, "date": date, "url": url})
        return results
    else:
        print("Failed to fetch the webpage")
        return []

def export_to_csv(data, filename):
    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["title", "description", "date", "url"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for entry in data:
            writer.writerow(entry)

if __name__ == "__main__":
    news_entries = scrape_eec_news()
    export_to_csv(news_entries, "eec_news.csv")
    print("Data exported to eec_news.csv")

运行代码后清空 csv 文件

python web-scraping beautifulsoup
1个回答
0
投票

这里是稍微修改过的代码,将结果保存到 CSV 文件:

import csv

import requests
from bs4 import BeautifulSoup


def scrape_eec_news():
    url = "https://www.eeoc.gov/newsroom/search"
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    results = []
    entries = soup.find_all("div", class_="views-row")
    for entry in entries:
        title_elem = entry.h2
        description_elem = entry.p
        date_elem = entry.find("div", class_="field--type-datetime")
        url_elem = entry.a

        title = title_elem.text.strip()
        description = description_elem.text.strip()
        date = date_elem.text.strip()
        url = url_elem["href"]
        results.append(
            {
                "title": title,
                "description": description,
                "date": date,
                "url": url,
            }
        )
    return results


def export_to_csv(data, filename):
    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["title", "description", "date", "url"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for entry in data:
            writer.writerow(entry)


if __name__ == "__main__":
    news_entries = scrape_eec_news()
    export_to_csv(news_entries, "eec_news.csv")
    print("Data exported to eec_news.csv")

保存

eec_news.csv
(LibreOffice 的屏幕截图):

© www.soinside.com 2019 - 2024. All rights reserved.