IndexError: list index out of range (on a Reddit data crawler)


I expected the following to work without a problem.

The Reddit data crawler:

    import requests
    import re
    import praw
    from datetime import date
    import csv
    import pandas as pd
    import time
    import sys

    class Crawler(object):
        '''
            basic_url is the reddit site.
            headers is for requests.get method
            REX is to find submission ids.
        '''
        def __init__(self, subreddit="apple"):
            '''
                Initialize a Crawler object.
                    subreddit is the topic you want to parse. default is r"apple"
                basic_url is the reddit site.
                headers is for requests.get method
                REX is to find submission ids.
                submission_ids save all the ids of submission you will parse.
                reddit is an object created using praw API. Please check it before you use.
            '''
            self.basic_url = "https://www.reddit.com"
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
            self.REX = re.compile(r"<div class=\" thing id-t3_[\w]+")
            self.subreddit = subreddit
            self.submission_ids = []
            self.reddit = praw.Reddit(client_id="your_id", client_secret="your_secret", user_agent="subreddit_comments_crawler")

        def get_submission_ids(self, pages=2):
            '''
                Collect all ids of submissions..
                One page has 25 submissions.
                page url: https://www.reddit.com/r/subreddit/?count=25&after=t3_id
                    id(after) is the last submission from last page.
            '''
    #         This is page url.
            url = self.basic_url + "/r/" + self.subreddit

            if pages <= 0:
                return []

            text = requests.get(url, headers=self.headers).text
            ids = self.REX.findall(text)
            ids = list(map(lambda x: x[-6:], ids))
            if pages == 1:
                self.submission_ids = ids
                return ids

            count = 0
            after = ids[-1]
            for i in range(1, pages):
                count += 25
                temp_url = self.basic_url + "/r/" + self.subreddit + "?count=" + str(count) + "&after=t3_" + ids[-1]
                text = requests.get(temp_url, headers=self.headers).text
                temp_list = self.REX.findall(text)
                temp_list = list(map(lambda x: x[-6:], temp_list))
                ids += temp_list
                if count % 100 == 0:
                    time.sleep(60)
            self.submission_ids = ids
            return ids

        def get_comments(self, submission):
            '''
                Submission is an object created using praw API.
            '''
    #         Remove all "more comments".
            submission.comments.replace_more(limit=None)
            comments = []
            for each in submission.comments.list():
                try:
                    comments.append((each.id, each.link_id[3:], each.author.name, date.fromtimestamp(each.created_utc).isoformat(), each.score, each.body) )
                except AttributeError as e: # Some comments are deleted, we cannot access them.
    #                 print(each.link_id, e)
                    continue
            return comments

        def save_comments_submissions(self, pages):
            '''
                1. Save all the ids of submissions.
                2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
                3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
                4. Separately, save them to two csv file.
                Note: You can link them with submission_id.
                Warning: According to the rule of Reddit API, the get action should not be too frequent. Safely, use the default time span in this crawler.
            '''

            print("Start to collect all submission ids...")
            self.get_submission_ids(pages)
            print("Start to collect comments...This may cost a long time depending on # of pages.")
            submission_url = self.basic_url + "/r/" + self.subreddit + "/comments/"
            comments = []
            submissions = []
            count = 0
            for idx in self.submission_ids:
                temp_url = submission_url + idx
                submission = self.reddit.submission(url=temp_url)
                submissions.append((submission.name[3:], submission.num_comments, submission.score, submission.subreddit_name_prefixed, date.fromtimestamp(submission.created_utc).isoformat(), submission.title, submission.selftext))
                temp_comments = self.get_comments(submission)
                comments += temp_comments
                count += 1
                print(str(count) + " submissions have got...")
                if count % 50 == 0:
                    time.sleep(60)
            comments_fieldnames = ["comment_id", "submission_id", "author_name", "post_time", "comment_score", "text"]
            df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
            df_comments.to_csv("comments.csv")
            submissions_fieldnames = ["submission_id", "num_of_comments", "submission_score", "submission_subreddit", "post_date", "submission_title", "text"]
            df_submission = pd.DataFrame(submissions, columns=submissions_fieldnames)
            df_submission.to_csv("submissions.csv")
            return df_comments


    if __name__ == "__main__":
        args = sys.argv[1:]
        if len(args) != 2:
            print("Wrong number of args...")
            exit()

        subreddit, pages = args
        c = Crawler(subreddit)
        c.save_comments_submissions(int(pages))

But I got:

(base) UserAir:scrape_reddit user$ python reddit_crawler.py apple 2
Start to collect all submission ids...
Traceback (most recent call last):
  File "reddit_crawler.py", line 127, in <module>
    c.save_comments_submissions(int(pages))
  File "reddit_crawler.py", line 94, in save_comments_submissions
    self.get_submission_ids(pages)
  File "reddit_crawler.py", line 54, in get_submission_ids
    after = ids[-1]
IndexError: list index out of range

python praw

2 Answers

1 vote

When my_list[-1] throws an IndexError, it means that my_list is empty:

>>> ids = []
>>> ids[-1]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
IndexError: list index out of range
>>> ids = ['1']
>>> ids[-1]
'1'
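
In the crawler, that empty list means self.REX.findall(text) matched nothing in the page that was fetched, so ids[-1] blows up. A minimal guard (just a sketch against the asker's get_submission_ids; it surfaces the problem rather than fixing why the regex found no matches) could look like:

text = requests.get(url, headers=self.headers).text
ids = self.REX.findall(text)
ids = list(map(lambda x: x[-6:], ids))
if not ids:
    # Nothing matched the submission-id regex in the fetched HTML,
    # so bail out early instead of letting ids[-1] raise IndexError.
    print("No submission ids found at " + url)
    return []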

0 votes

Erik's answer diagnoses the specific cause of this error, but more broadly, I think it is caused by you not using PRAW to its fullest. Your script imports requests and performs a lot of manual requests for things that PRAW already has methods for. The whole point of PRAW is to save you from having to write these requests that do things such as paginate a listing, so I recommend you take advantage of that.

For example, your get_submission_ids function (which scrapes the web version of Reddit and handles pagination) could be replaced by:

def get_submission_ids(self, pages=2):
    return [
        submission.id
        for submission in self.reddit.subreddit(self.subreddit).hot(
            limit=25 * pages
        )
    ]

because the .hot() function does everything you were trying to do by hand.

I would go a step further here and make this function return just a list of Submission objects, because the rest of your code ends up doing things that would be better done by interacting with PRAW's Submission object. Here's that code (I've renamed the function to reflect its updated purpose):

def get_submissions(self, pages=2):
    return list(self.reddit.subreddit(self.subreddit).hot(limit=25 * pages))

(I've updated this function to just return its result, since your version both returns the value and sets it as self.submission_ids, unless pages is 0. That felt inconsistent, so I just have it return the value.)

Your get_comments function looks fine.

The save_comments_submissions function, like get_submission_ids, does a lot of manual work that PRAW can handle. You construct a temp_url that has the full URL of a post and then use it to create a PRAW Submission object, but we can replace that by just using the Submission objects returned by get_submissions directly. You also make some time.sleep() calls, which I removed because PRAW will automatically sleep the appropriate amount for you. Lastly, I removed the return value of this function, because the point of the function is to save data to disk, not to return it to anywhere else, and the rest of the script doesn't use the return value. Here's the updated version of that function:

def save_comments_submissions(self, pages):
    """
    1. Save all the ids of submissions.
    2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
    3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
    4. Separately, save them to two csv file.
    Note: You can link them with submission_id.
    Warning: According to the rule of Reddit API, the get action should not be too frequent. Safely, use the default time span in this crawler.
    """
    print("Start to collect all submission ids...")
    submissions = self.get_submissions(pages)
    print(
        "Start to collect comments...This may cost a long time depending on # of pages."
    )
    comments = []
    pandas_submissions = []
    for count, submission in enumerate(submissions):
        pandas_submissions.append(
            (
                submission.name[3:],
                submission.num_comments,
                submission.score,
                submission.subreddit_name_prefixed,
                date.fromtimestamp(submission.created_utc).isoformat(),
                submission.title,
                submission.selftext,
            )
        )
        temp_comments = self.get_comments(submission)
        comments += temp_comments
        print(str(count) + " submissions have got...")

    comments_fieldnames = [
        "comment_id",
        "submission_id",
        "author_name",
        "post_time",
        "comment_score",
        "text",
    ]
    df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
    df_comments.to_csv("comments.csv")

    submissions_fieldnames = [
        "submission_id",
        "num_of_comments",
        "submission_score",
        "submission_subreddit",
        "post_date",
        "submission_title",
        "text",
    ]
    df_submission = pd.DataFrame(pandas_submissions, columns=submissions_fieldnames)
    df_submission.to_csv("submissions.csv")

And here's an updated version of the whole script that uses PRAW fully:

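As a sketch of what that combined script could look like (assuming it is simply the pieces above assembled into one file, keeping the placeholder credentials and dropping the now-unused requests, re, csv, and time imports):

import sys
from datetime import date

import pandas as pd
import praw


# Sketch: the earlier pieces assembled into one script; credentials are placeholders.
class Crawler(object):
    def __init__(self, subreddit="apple"):
        self.subreddit = subreddit
        # reddit is created with the praw API; fill in your own credentials.
        self.reddit = praw.Reddit(
            client_id="your_id",
            client_secret="your_secret",
            user_agent="subreddit_comments_crawler",
        )

    def get_submissions(self, pages=2):
        # One "page" of the old crawler was 25 submissions.
        return list(self.reddit.subreddit(self.subreddit).hot(limit=25 * pages))

    def get_comments(self, submission):
        submission.comments.replace_more(limit=None)  # remove all "more comments"
        comments = []
        for each in submission.comments.list():
            try:
                comments.append(
                    (
                        each.id,
                        each.link_id[3:],
                        each.author.name,
                        date.fromtimestamp(each.created_utc).isoformat(),
                        each.score,
                        each.body,
                    )
                )
            except AttributeError:  # deleted comments have no author
                continue
        return comments

    def save_comments_submissions(self, pages):
        print("Start to collect all submission ids...")
        submissions = self.get_submissions(pages)
        print("Start to collect comments...This may cost a long time depending on # of pages.")
        comments = []
        pandas_submissions = []
        for count, submission in enumerate(submissions):
            pandas_submissions.append(
                (
                    submission.name[3:],
                    submission.num_comments,
                    submission.score,
                    submission.subreddit_name_prefixed,
                    date.fromtimestamp(submission.created_utc).isoformat(),
                    submission.title,
                    submission.selftext,
                )
            )
            comments += self.get_comments(submission)
            print(str(count) + " submissions have got...")

        comments_fieldnames = ["comment_id", "submission_id", "author_name", "post_time", "comment_score", "text"]
        pd.DataFrame(comments, columns=comments_fieldnames).to_csv("comments.csv")

        submissions_fieldnames = ["submission_id", "num_of_comments", "submission_score", "submission_subreddit", "post_date", "submission_title", "text"]
        pd.DataFrame(pandas_submissions, columns=submissions_fieldnames).to_csv("submissions.csv")


if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) != 2:
        print("Wrong number of args...")
        exit()

    subreddit, pages = args
    Crawler(subreddit).save_comments_submissions(int(pages))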

I realize that my answer here gets into Code Review territory, but I hope that it is helpful for understanding some of the things PRAW can do. Your "list index out of range" error would have been avoided by using pre-existing library code, so I do consider this a solution to your problem.
