"KeyError" when trying to access the contents of a column in a DataFrame

Problem description

I am getting a "KeyError" when trying to access the contents of a column in a DataFrame, in the code below:

def scrape_topics_repos():
    print('Scraping list of topics')
    topics_df = scrape_topics()

    os.makedirs('data', exist_ok=True)
    for index, row in topics_df.iterrows():
        print('Scraping top repositories for "{}"'.format(row['Topic Name']))
        scrape_topic(row['Topic Link'], 'data/{}.csv'.format(row['Topic Name']))
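
One way to narrow this down is to print the labels the frame was actually built with before indexing into a row. A minimal diagnostic sketch, not part of the original script:

topics_df = scrape_topics()
# A KeyError on row['Topic Name'] means that exact label is not among
# the column labels the DataFrame was constructed with.
print(topics_df.columns.tolist())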

Here is my full code. Please try running it and help me fix the error.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
# Use the requests library to download the webpage (exploratory code, kept commented out)
'''github_url = "https://github.com/topics"
r = requests.get(github_url)
response = r.text
if r.status_code != 200:
    raise Exception("Failed to load page {}".format(github_url))
doc1 = BeautifulSoup(response, 'lxml')'''


def get_topic_titles(doc):
    topic_title_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
    topic_title_tags = doc.find_all('p', {'class': topic_title_class})
    topic_titles = []
    for tag in topic_title_tags:
        topic_titles.append(tag.text)
    return topic_titles

# Extracting the description of the topic


def get_topic_description(doc):
    topic_description_class = "f5 color-fg-muted mb-0 mt-1"
    topic_description_tags = doc.find_all('p', {'class': topic_description_class})
    topic_descriptions = []
    for des in topic_description_tags:
        topic_descriptions.append(des.text.strip())
    return topic_descriptions


# Extracting the link to the topic
def get_topic_urls(doc):
    topic_link_class = "no-underline flex-1 d-flex flex-column"
    topic_link_tags = doc.find_all('a', {'class': topic_link_class})
    # topic_url = "https://github.com" + topic_link_tags[0]["href"]
    base_url = "https://github.com"
    topic_urls = []
    for url in topic_link_tags:
        topic_urls.append(base_url + url['href'])
    return topic_urls
# topic_urls = get_topic_urls(doc1)      # to iterate using index of the list


def scrape_topics():
    github_url = "https://github.com/topics"
    r = requests.get(github_url)
    response = r.text
    if r.status_code != 200:
        raise Exception("Failed to load page {}".format(github_url))
    doc1 = BeautifulSoup(response, 'lxml')
    table_dict = {
        'Topic Title': get_topic_titles(doc1),
        'Topic Link': get_topic_urls(doc1),
        'Topic Description': get_topic_description(doc1)
    }
    return pd.DataFrame(table_dict)
# topic_df = scrape_topics()
# print(topic_df)                  #to check if topics are scraped properly


def get_topic_page(topic_url):
    r = requests.get(topic_url)
    response = r.text
    if r.status_code != 200:
        raise Exception("Failed to load page {}".format(topic_url))
    topic_doc = BeautifulSoup(response, 'lxml')
    return topic_doc
# topic_doc = get_topic_page(topic_urls[0])


def get_repo_info(h3_tag, star_tag):
    a_tags = h3_tag.find_all('a')

    username = a_tags[0].text.strip()

    repo_name = a_tags[1].text.strip()
    base_url = "https://github.com"
    repo_url = base_url + a_tags[1]['href']

    star_count = star_tag.text.strip()
    return username, repo_name, repo_url, star_count


# FURTHER INFORMATION ABOUT TOPICS

def get_topic_repos(topic_doc):
    h3_selection_tag = 'f3 color-fg-muted text-normal lh-condensed'
    h3_tags = topic_doc.find_all('h3', {'class': h3_selection_tag})

    star_span_id = 'repo-stars-counter-star'
    star_tags = topic_doc.find_all('span', {'id': star_span_id})

    topic_repos_dict = {
        'username': [],
        'repo_name': [],
        'stars': [],
        'repo_url': []
    }

    for i in range(len(h3_tags)):
        repo_info = get_repo_info(h3_tags[i], star_tags[i])
        topic_repos_dict['username'].append(repo_info[0])
        topic_repos_dict['repo_name'].append(repo_info[1])
        # get_repo_info returns (username, repo_name, repo_url, star_count),
        # so index 3 is the star count and index 2 is the URL.
        topic_repos_dict['stars'].append(repo_info[3])
        topic_repos_dict['repo_url'].append(repo_info[2])

    return pd.DataFrame(topic_repos_dict)
# repo_df = get_topic_repos(get_topic_page(topic_urls[2]))   # to check if repo extraction is working fine
# print(repo_df)


def scrape_topic(topic_url, path):
    if os.path.exists(path):
        print("The file {} already exists. Skipping...".format(path))
        return
    topic_df = get_topic_repos(get_topic_page(topic_url))
    topic_df.to_csv(path, index=False)


def scrape_topics_repos():
    print('Scraping list of topics')
    topics_df = scrape_topics()

    os.makedirs('data', exist_ok=True)
    for index, row in topics_df.iterrows():
        print('Scraping top repositories for "{}"'.format(row['Topic Name']))
        scrape_topic(row['Topic Link'], 'data/{}.csv'.format(row['Topic Name']))


scrape_topics_repos()

I printed the DataFrame to make sure the df was actually created, and it was, but that didn't help.
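
Along the same lines, a small up-front check would report a missing label before the loop starts instead of raising mid-iteration. A sketch, where the expected list simply mirrors the keys the loop uses:

topics_df = scrape_topics()
# Compare the keys used inside the loop against the real column labels.
expected = ['Topic Name', 'Topic Link']
missing = [col for col in expected if col not in topics_df.columns]
if missing:
    raise KeyError('Columns missing from topics_df: {}'.format(missing))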

python dataframe web-scraping keyerror