我正在尝试使用 pullpush.io 从 Reddit API 提取提交内容(submissions),但我不知道如何修改代码,使它返回提交内容而不是评论(comments)。
我如何更改代码才能返回这个?我想使用关键字从特定的 Reddit 子版块获取提交内容。
我如何调整代码以搜索多个关键字,例如“GPT”和“code”?
library(jsonlite)
library(tidyverse)
library(httr)
# Example of the target endpoint, restricted to one subreddit:
# command = paste0("https://api.pullpush.io/reddit/search/comment/?q=artificial%20intelligence&subreddit=machinelearning&sort=desc&size=100&before=", as.integer(before))

# Scrape the PullPush comment-search API one day at a time, accumulating
# results in lists (appending to lists instead of growing atomic vectors
# with c(), which copies the whole vector on every append).
body <- list()
author_created_utc <- list()
author_fullname <- list()
permalink <- list()

# One-day window [after, before), stepped forward 86400 s (1 day) per pass.
after <- as.POSIXct("2022-11-30", tz = "UTC")
before <- as.POSIXct("2022-12-01", tz = "UTC")
end_date <- as.POSIXct("2023-01-01", tz = "UTC")

repeat {
  # Build the request URL (paste0 replaces paste(..., sep = "")).
  command <- paste0(
    "https://api.pullpush.io/reddit/search/comment/?q=ChatGPT",
    "&sort=desc&size=100",
    "&before=", as.integer(before),
    "&after=", as.integer(after)
  )
  response <- GET(command)
  # Parsed JSON has the results under $data; the original
  # content(response)[1]$data only worked by accident of element order.
  posts <- content(response)$data
  for (post in posts) {
    body <- c(body, post$body)
    # Some comments lack these fields; substitute sentinels so all four
    # columns stay the same length.
    author_created_utc <- c(
      author_created_utc,
      if (is.null(post$author_created_utc)) 0 else post$author_created_utc
    )
    author_fullname <- c(
      author_fullname,
      if (is.null(post$author_fullname)) "Unknown" else post$author_fullname
    )
    # BUG FIX: the original tested post$author_fullname here instead of
    # post$permalink, so a comment with an author but no permalink crashed.
    permalink <- c(
      permalink,
      if (is.null(post$permalink)) "Unknown" else post$permalink
    )
  }
  after <- after + 86400
  before <- before + 86400
  print(before)
  if (before == end_date) {
    # Leave the loop normally; the original used stop('End date'), which
    # raises an error as control flow.
    break
  }
}

output <- data.frame(
  body = unlist(body),
  author_created_utc = unlist(author_created_utc),
  author_fullname = unlist(author_fullname),
  permalink = unlist(permalink)
)
# load libraries
pacman::p_load(tidyverse, httr, jsonlite)
# Query the PullPush API for a single time window.
#
# @param query     Search term(s); URL-encoded before insertion into the URL.
# @param subreddit Subreddit name to restrict the search to.
# @param after     POSIXct lower bound of the window (converted to epoch secs).
# @param before    POSIXct upper bound of the window (converted to epoch secs).
# @param size      Maximum number of results per request (PullPush caps at 100).
# @param sort      Sort order, e.g. "desc".
# @param type      Endpoint type: "comment" or "submission".
# @return A data frame with whichever of author_created_utc, author_fullname
#   and permalink columns the API returned, or an empty list when the API
#   returned no hits.
reddit_query <- function(query, subreddit, after, before, size, sort, type){
  command <- "https://api.pullpush.io/reddit/search/{type}/?q={URLencode(query)}&sort={sort}&size={size}&after={as.integer(after)}&before={as.integer(before)}&subreddit={subreddit}"
  response <- GET(
    glue::glue(command)
  )
  data <- content(response, "text", encoding = "utf-8") %>%
    fromJSON() %>%
    .$data
  # If data is an empty list, just return it. Otherwise keep the columns of
  # interest. any_of() tolerates fields the API omitted (submission results
  # frequently lack author_created_utc); a bare select() would error there.
  if (length(data) == 0) {
    return(data)
  } else {
    return(select(data, any_of(c("author_created_utc", "author_fullname", "permalink"))))
  }
}
# Aggregate reddit_query() results across consecutive time windows.
#
# @param query,subreddit,size,sort,type Passed through to reddit_query().
# @param after      POSIXct start of the first window.
# @param before     POSIXct end of the first window.
# @param breakpoint POSIXct; scraping stops once `before` passes this moment.
# @param period     Window width: "day", "week", "month" or "year"
#                   (anything else falls back to "day").
# @return A tibble of all rows scraped, with author_created_utc converted to
#   a datetime when that column is present.
reddit_scrape <- function(query, subreddit, after, before, size, sort, type, breakpoint, period = "day"){
  # switch() dispatches on a single string. The original used case_when(),
  # which routes values through vctrs and cannot combine lubridate's S4
  # Period objects, so it errors; switch() also evaluates only one branch.
  difference <- switch(period,
    day = days(1),
    week = weeks(1),
    month = months(1),
    year = years(1),
    days(1)  # default window
  )
  # Initialise the output so bind_rows() has stable column types even when
  # early windows are empty.
  out <- tibble(
    author_created_utc = integer(),
    author_fullname = character(),
    permalink = character()
  )
  while (before <= breakpoint){
    print(glue::glue("Scraping {after} to {before} (ending at {breakpoint})"))
    data <- reddit_query(query, subreddit, after, before, size, sort, type)
    # An empty window should not abort the whole scrape (the original
    # break-ed here, silently losing every later window); just advance.
    if (length(data) > 0) {
      out <- bind_rows(out, data)
    }
    after <- after + difference
    before <- before + difference
  }
  # Convert epoch seconds to datetimes; guard against the column being
  # absent entirely (possible when every window was empty of that field).
  if ("author_created_utc" %in% names(out)) {
    out <- mutate(out, author_created_utc = as_datetime(author_created_utc))
  }
  return(out)
}
# Scrape one year of "gpt code" submissions from r/programming in
# year-sized windows, stopping once the window passes the current time.
reddit_scrape(
  query = "gpt code",
  subreddit = "programming",
  after = as_datetime("2022-01-01"),
  before = as_datetime("2022-01-01") + years(1),
  size = 100,
  sort = "desc",
  type = "submission",
  breakpoint = Sys.time(),
  period = "year"
)