在 R 中将 pullpush.io reddit API 查询从评论(comment)更改为提交(submission)

问题描述 投票:0回答:1

我正在尝试使用 pullpush.io 从 reddit API 提取提交内容(submissions),但我不知道如何修改代码,使其返回提交内容而不是评论。

我如何更改代码才能返回提交内容?我想使用关键字从特定的 Reddit 子版块获取提交内容。

我如何调整代码以搜索多个关键字,例如“GPT”和“code”?


library(jsonlite)
library(tidyverse)
library(httr)

# command = paste("https://api.pullpush.io/reddit/search/comment/?q=artificial%20intelligence&subreddit=machinelearning&sort=desc&size=100&before=",as.integer(before),sep ='')


# Accumulators for the scraped comment fields.
body <- c()
author_created_utc <- c()
author_fullname <- c()
permalink <- c()

# One-day window [after, before), advanced one day (86400 s) per iteration.
after <- as.POSIXct("2022-11-30", tz = "UTC")
before <- as.POSIXct("2022-12-01", tz = "UTC")

repeat {

  # Build the pullpush.io comment-search URL for the current window.
  command <- paste0(
    "https://api.pullpush.io/reddit/search/comment/?q=ChatGPT",
    "&sort=desc&size=100",
    "&before=", as.integer(before),
    "&after=", as.integer(after)
  )

  response <- GET(command)

  # The JSON payload carries the results under a single "data" element.
  posts <- content(response)$data

  for (post in posts) {
    body <- c(body, post$body)
    # Some records lack these fields; substitute placeholders so all four
    # accumulator vectors stay the same length.
    if (is.null(post$author_created_utc)) {
      author_created_utc <- c(author_created_utc, 0)
    } else {
      author_created_utc <- c(author_created_utc, post$author_created_utc)
    }
    if (is.null(post$author_fullname)) {
      author_fullname <- c(author_fullname, "Unknown")
    } else {
      author_fullname <- c(author_fullname, post$author_fullname)
    }
    # BUG FIX: the original tested post$author_fullname here, so a record
    # with an author but no permalink appended NULL (a no-op for c()) and
    # desynchronised the vectors, corrupting the final data frame rows.
    if (is.null(post$permalink)) {
      permalink <- c(permalink, "Unknown")
    } else {
      permalink <- c(permalink, post$permalink)
    }
  }

  # Advance the one-day window.
  after <- after + 86400
  before <- before + 86400
  print(before)

  if (before == as.POSIXct("2023-01-01", tz = "UTC")) {
    # Clean loop exit; stop() raised an error just to leave the loop.
    break
  }
}

output <- data.frame(body, author_created_utc, author_fullname, permalink)
r reddit httr
1个回答
0
投票
# load libraries
pacman::p_load(tidyverse, httr, jsonlite)

# function for getting data for an individual query
# Run one pullpush.io search and return the columns of interest.
#
# @param query     Search string (URL-encoded before sending).
# @param subreddit Subreddit name to restrict the search to.
# @param after     POSIXct lower bound of the time window.
# @param before    POSIXct upper bound of the time window.
# @param size      Maximum number of results per request.
# @param sort      Sort order passed to the API, e.g. "desc".
# @param type      Endpoint: "comment" or "submission".
# @return A data frame with author_created_utc, author_fullname and
#   permalink, or an empty list when the window has no results.
reddit_query <- function(query, subreddit, after, before, size, sort, type) {
  # glue() pastes its arguments together with no separator.
  command <- glue::glue(
    "https://api.pullpush.io/reddit/search/{type}/?q={URLencode(query)}",
    "&sort={sort}&size={size}",
    "&after={as.integer(after)}&before={as.integer(before)}",
    "&subreddit={subreddit}"
  )
  response <- GET(command)
  data <- fromJSON(content(response, "text", encoding = "utf-8"))$data
  # An empty window comes back as an empty list; pass it through so the
  # caller can detect it with length().
  if (length(data) == 0) {
    return(data)
  }
  select(data, author_created_utc, author_fullname, permalink)
}

# function for aggregating query results into a dataframe
# Aggregate reddit_query() results over successive time windows.
#
# @param query, subreddit, size, sort, type Passed through to reddit_query().
# @param after, before  POSIXct bounds of the first window.
# @param breakpoint     Scraping stops once `before` passes this datetime.
# @param period         Window width: "day", "week", "month" or "year";
#                       anything else falls back to "day".
# @return A tibble of author_created_utc (converted to datetime),
#   author_fullname and permalink for every result found.
reddit_scrape <- function(query, subreddit, after, before, size, sort, type,
                          breakpoint, period = "day") {

  # switch() is the idiomatic scalar dispatch; case_when() is for vectors.
  difference <- switch(period,
    day = days(1),
    week = weeks(1),
    month = months(1),
    year = years(1),
    days(1)  # default for unrecognised periods
  )

  # Initialise the output with the final column types.
  out <- tibble(
    author_created_utc = integer(),
    author_fullname = character(),
    permalink = character()
  )

  while (before <= breakpoint) {
    print(glue::glue("Scraping {after} to {before} (ending at {breakpoint})"))

    data <- reddit_query(query, subreddit, after, before, size, sort, type)

    # BUG FIX: the original `break` aborted the whole scrape on the first
    # window with no results, discarding all later windows. Skip the empty
    # window and keep advancing instead.
    if (length(data) > 0) {
      out <- bind_rows(out, data)
    }
    after <- after + difference
    before <- before + difference
  }

  mutate(out, author_created_utc = as_datetime(author_created_utc))
}

# Example: scrape "gpt code" submissions from r/programming in yearly
# windows, starting 2022-01-01, stopping once the window passes Sys.time().
reddit_scrape("gpt code", "programming", as_datetime("2022-01-01"), as_datetime("2022-01-01") + years(1), 100, "desc", "submission", Sys.time(), "year")
© www.soinside.com 2019 - 2024. All rights reserved.