从python中的特定reddit线程获取所有注释

问题描述 投票:11回答:3

官方的方式,

r = praw.Reddit('Comment Scraper 1.0 by u/_Daimon_ see '
                 'https://praw.readthedocs.org/en/latest/'
                 'pages/comment_parsing.html')
submission = r.get_submission(submission_id='11v36o')
submission.replace_more_comments(limit=None, threshold=0)

非常慢。有没有办法加快速度?有些人已将每个reddit注释提取到数据库中,因此必须有一些方法可以更快地完成此操作。

python performance praw reddit
3个回答
12
投票

编辑:新的praw api(6.0.0)有list(),使工作更轻松:

这也处理由于qazxsw poi通过使用AttributeError可能发生的qazxsw poi

more_comments

编辑:新的praw api(5.0.1)是神奇的,让这更容易。以下是如何做到这一点:

replace_more(limit=None)

用法示例:

submissionList = []
submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
    submissionList.append(comment)

def getSubComments(comment, allComments, verbose=True): allComments.append(comment) if not hasattr(comment, "replies"): replies = comment.comments() if verbose: print("fetching (" + str(len(allComments)) + " comments fetched total)") else: replies = comment.replies for child in replies: getSubComments(child, allComments, verbose=verbose) def getAll(r, submissionId, verbose=True): submission = r.submission(submissionId) comments = submission.comments commentsList = [] for comment in comments: getSubComments(comment, commentsList, verbose=verbose) return commentsList 在哪里

res = getAll(r, "6rjwo1")
#res = getAll(r, "6rjwo1", verbose=False) # This won't print out progress if you want it to be silent. Default is verbose=True

以前的东西(现在已经过时):

好吧,我编写的代码能够可靠地从一个帖子中提取每条评论,并且500条评论需要大约10秒钟,4000条评论大约需要一分钟。我把它命名为redApi.py这里是:

r

要使用此脚本,请在python提示符下键入以下内容:

username = 'myusernamehere'
userAgent = "MyAppName/0.1 by " + username
clientId = 'myClientId'
clientSecret = "myClientSecret"
password = "passwordformyusernamehere"
r = praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

4
投票

看起来praw已更新?在4.5.1中它看起来更像:

import time
import requests
import requests.auth
import praw

username = 'myusernamehere'
userAgent = "MyAppName/0.1 by " + username
clientId = 'myClientId'
clientSecret = "myClientSecret"
password = "passwordformyusernamehere"

def getPraw():
  return praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

global accessToken
accessToken = None

def getAccessToken():
  client_auth = requests.auth.HTTPBasicAuth(clientId, clientSecret)
  post_data = {"grant_type": "password", "username": username, "password": password}
  headers = {"User-Agent": userAgent}
  response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=headers)
  return response.json()

def makeRequest(apiUrl, useGet=True):
  global accessToken
  if accessToken is None:
    accessToken = getAccessToken()
  headers = {"Authorization": "bearer "  + accessToken['access_token'], "User-Agent": userAgent}
  if useGet:
    response = requests.get(apiUrl, headers=headers)
  else:
    response = requests.post(apiUrl, headers=headers)
  time.sleep(1.1)
  responseJson = response.json()
  if 'error' in responseJson:
    if responseJson['error'] == 401:
      print "Refreshing access token"
      time.sleep(1.1)
      accessToken = getAccessToken()
      headers = {"Authorization": "bearer "  + accessToken['access_token'], "User-Agent": userAgent}
      time.sleep(1.1)
      response = requests.get(apiUrl, headers=headers)
      responseJson = response.json()
  return responseJson


global prawReddit
prawReddit = praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

# Gets any number of posts
def getPosts(subredditName, numPosts=1000):
  global prawReddit
  subreddit = prawReddit.get_subreddit(subredditName)

  postGetter = praw.helpers.submissions_between(prawReddit, subreddit)

  postArray = []
  numGotten = 0
  while numGotten < numPosts:
    postArray.append(postGetter.next())
    numGotten += 1

  return postArray






# Get all comments from a post
# Submission is a praw submission, obtained via:
# r = redApi.getPraw()
# submission = r.get_submission(submission_id='2zysz7') # (or some other submission id, found via https://www.reddit.com/r/test/comments/2zysz7/ayy/ - the thing after /comments/)
# comments = redApi.getComments(submission)
def getComments(submission):
  requestUrl = 'https://oauth.reddit.com/' + submission.subreddit.url + 'comments/article?&limit=1000&showmore=true&article=' + submission.id
  allData = makeRequest(requestUrl)
  articleData = allData[0]
  comments = allData[1]
  curComments = comments['data']['children']

  resultComments = getCommentsHelper(curComments, submission.name, submission)

  return resultComments




# Print out the tree of comments
def printTree(comments):
  return printTreeHelper(comments, "")


def printTreeHelper(comments, curIndentation):
  resultString = ""
  for comment in comments:
    resultString += curIndentation + comment['data']['body'].replace("\n", "\n" + curIndentation) + "\n"
    if not comment['data']['replies'] == "":
      resultString += printTreeHelper(comment['data']['replies']['data']['children'], curIndentation + "  ")
  return resultString

# Get all comments as a single array  
def flattenTree(comments):
  allComments = []
  for comment in comments:
    allComments.append(comment)
    if not comment['data']['replies'] == "":
      allComments += flattenTree(comment['data']['replies']['data']['children'])
  return allComments





# Utility functions for getComments
def expandCommentList(commentList, submission):

  curComments = commentList
  allComments = {}
  while True:
    thingsToExpand = []
    nextComments = []
    allParents = {}
    for comment in curComments:
      if comment['kind'] == "more":
        thingsToExpand += comment['data']['children']
      else:
        if comment['data']['body'][:len("If they are shipping")] == "If they are shipping":
          print comment
        allComments[comment['data']['name']] = comment

    if len(thingsToExpand) == 0:
      curComments = []
      break

    curComments = []
    if not len(thingsToExpand) == 0:
      print "total things to expand: " + str(len(thingsToExpand))
      for i in range(0, len(thingsToExpand)/100+1):
        curCommentIds = thingsToExpand[i*100:min((i+1)*100, len(thingsToExpand))]
        requestUrl = 'https://oauth.reddit.com/api/morechildren.json?api_type=json&link_id=' + submission.name + '&limit=1000&showmore=true&children=' + ",".join(curCommentIds)
        curData = makeRequest(requestUrl)
        if 'json' in curData and 'data' in curData['json']:
          curComments += curData['json']['data']['things']
        print (i+1)*100


  for comment in curComments:
    allComments[comment['data']['name']] = comment

  return allComments.values()


def lookForMore(comment):
  if comment['kind'] == "more":
    return True
  if not comment['data']['replies'] == "":
    for reply in comment['data']['replies']['data']['children']:
      if lookForMore(reply):
        return True
  return False

def getCommentsHelper(curComments, rootId, submission):

  allComments = expandCommentList(curComments, submission)

  commentMap = {}
  for comment in allComments:
    commentMap[comment['data']['name']] = comment


  allRootComments = []
  for comment in allComments:
    if comment['data']['parent_id'] == rootId:
      allRootComments.append(comment)
    elif comment['data']['parent_id'] in commentMap:
      parentComment = commentMap[comment['data']['parent_id']]
      if parentComment['data']['replies'] == "":
        parentComment['data']['replies'] = {'data': {'children': []}}
      alreadyChild = False
      for childComment in parentComment['data']['replies']['data']['children']:
        if childComment['data']['name'] == comment['data']['name']:
          alreadyChild = True
          break
      if not alreadyChild:
        parentComment['data']['replies']['data']['children'].append(comment)
    else:
      print "pls halp"

  completedComments = []
  needMoreComments = []

  for comment in allRootComments:
    if not comment['data']['replies'] == "" or comment['kind'] == 'more':
      hasMore = lookForMore(comment)

      if hasMore:
        needMoreComments.append(comment)
      else:
        replyComments = getCommentsHelper(comment['data']['replies']['data']['children'], comment['data']['name'], submission)
        comment['data']['replies']['data']['children'] = replyComments
        completedComments.append(comment)
    else:
      completedComments.append(comment)
  for comment in needMoreComments:
    requestUrl = 'https://oauth.reddit.com/' + submission.subreddit.url + 'comments/article?&limit=1000&showmore=true&article=' + submission.id + "&comment=" + comment['data']['id']
    allData = makeRequest(requestUrl)
    articleData = allData[0]
    comment = allData[1]['data']['children'][0]
    if comment['data']['replies'] == "":
      completedComments.append(comment)
    else:
      comments = comment['data']['replies']['data']['children']
      actualComments = getCommentsHelper(comments, comment['data']['name'], submission)
      comment['data']['replies']['data']['children'] = actualComments
      completedComments.append(comment)

  return completedComments

编辑:似乎我能回来的最多是1000条评论?


2
投票

我添加了一堆打印和调试,但是现在@danielle你的脚本什么都不做。刚回来提示。

© www.soinside.com 2019 - 2024. All rights reserved.