使用rselenium和rvest通过while循环从多个页面的多个表格中提取数据。

问题描述 投票:0回答:1

我正在编写一段 R 代码:它会登录一个受密码保护的网站,进入该网站的某个特定页面,然后从一个特定的表格中收集数据。这个表格由某一天的销售数据组成。很多时候,同一天的数据不止一"页"(必须点击"下一页"按钮才能看到后面的内容)。所以,对于每一天,我都要抓取多个页面上的表格,并且要从一个起始日期开始,连续抓取多天的数据。

比如说,我打开显示 01012020 销售数据的页面,假设这一天的表格共有三页数据。这段代码应该抓取当天全部三页的表格数据,然后修改输入的日期,进入 01022020 的页面,并做同样的事情,如此重复,直到今天为止。

现在,我已经完成了大部分的工作,但是我遇到了这个恼人的

Error in UseMethod("html_table") : no applicable method for 'html_table' applied to an object of class "xml_missing"

错误。它存在于下面的循环函数中。

###loop to collect all data for each day when there are multiple pages
# Walks day by day from start.date.date to end.date.date. For each day it
# enters the date into the report form, reads the "table#top-items" table,
# follows the "Next →" link for further pages, and appends every collected
# row to items_table_final.df.
# remDr, start.date.date and end.date.date are used here but defined outside
# this snippet.

#set it so we can input custom date ranges
# (clicks the date dropdown button, then the 9th entry of its list)
remDr$findElement(using = 'xpath', '//*[@id="date-dropdown-container"]/button')$clickElement()
remDr$findElement(using = 'xpath', '//*[@id="date-dropdown-container"]/ul/li[9]/a')$clickElement()

#set up final dataframe
items_table_final.df <- data.frame()


# day cursor, advanced by one at the bottom of the outer loop
date <- start.date.date


#loop start for cycling through days
while (date <= end.date.date){


#create text version of the date to enter into the webpage
# NOTE(review): date is presumably already a Date here, in which case the
# inner format= argument has no effect -- confirm
date.char <- format(as.Date(date, format = "%d-%m-%Y"), "%m-%d-%Y")



#fill in the date range
# the first call on each field sends the keys named control / "a" / delete
# (apparently select-all then delete, to clear the old value); the second
# call types the new date. Start and end get the same day.
remDr$findElement("name", "reportDateStart")$sendKeysToElement(sendKeys = list(control = "\uE009", "a", delete = "\uE017"))
remDr$findElement("name", "reportDateStart")$sendKeysToElement(list(date.char))

remDr$findElement("name", "reportDateEnd")$sendKeysToElement(sendKeys = list(control = "\uE009", "a", delete = "\uE017"))
remDr$findElement("name", "reportDateEnd")$sendKeysToElement(list(date.char))

#setup and or clear temporary data frame
# (zero-row frame that collects all pages of the current day)
items_table.df <- data.frame("Menu Item" = character(),
                            "Menu Group" = character(),
                            "Menu" = character(),
                            "Item Quantity" = integer(),
                            "Net Amount" = integer(),
                            stringsAsFactors = FALSE)

#go to the data for the selected date range
remDr$findElement("id", "update-btn")$clickElement()


# page counter; the inner loop gives up after 100 pages
pages <- 1


 #loop start for cycling through pages within a specified day
 while (pages <= 100){
   #fills a second temp data frame with data from the displayed page
   # NOTE(review): the page source is read immediately after the click above,
   # with no wait. If "table#top-items" is not in the page yet, html_node()
   # yields an "xml_missing" object and html_table() fails with the reported
   # "no applicable method" error.
   items_html <- read_html(remDr$getPageSource()[[1]])
   items_table_new <- items_html %>%
       rvest::html_node("table#top-items") %>%
       rvest::html_table(fill = TRUE)


     #test if the page loop needs to stop
     # stops when the number of already collected rows that match the new
     # page equals the row count of the new page (match_df is presumably
     # plyr::match_df -- confirm)
     if(nrow(items_table_new) == nrow(match_df(items_table.df, items_table_new))){

       break

     } else {
       #add the new data to the earlier temp data frame IF it isnt a match to something already there
       items_table.df <- rbind(items_table.df, items_table_new)

       #hit the next page arrow button
       # NOTE(review): findElement() raises an error when no "Next →" link
       # exists on the page
       remDr$findElement("link text", "Next →")$clickElement()
     }
   pages <- pages + 1
}

#add the new data to the final data frame
items_table_final.df <- rbind(items_table_final.df, items_table.df)

# move on to the next calendar day
date <- date + 1
}

当我运行 traceback() 时,得到了下面的输出:

9: rvest::html_table(., fill = TRUE)
8: function_list[[k]](value)
7: withVisible(function_list[[k]](value))
6: freduce(value, `_function_list`)
5: `_fseq`(`_lhs`)
4: eval(quote(`_fseq`(`_lhs`)), env, env)
3: eval(quote(`_fseq`(`_lhs`)), env, env)
2: withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
1: items_html %>% rvest::html_node("table#top-items") %>% rvest::html_table(fill = TRUE)

所以我认为从表中抓取数据的那段代码有问题,但是当我手动运行它时,却没有任何问题。事实上,如果我手动逐步运行整个循环,不会遇到任何错误。我甚至可以单独运行嵌套循环(即翻页的内层循环,也就是包含所谓"有问题"代码的那个循环),一切正常。只有外层循环会出问题。

我已经用网站上的空表和填充表的数据进行了测试。我已经确认这些表的名称是一致的。我已经确认数据被正确地从网页上抓取并保存到我指定的数据框架中。

任何想法或建议我都将不胜感激!下面是完整的代码(网页地址和密码信息已删除)。

library(RSelenium)
library(rvest)
library(tidyverse)
library(plyr)


####adjustable variables####
# First day to scrape, written as MONTH-DAY-YEAR using digits only: at least
# two digits each for month and day, and four digits for the year.
start.date <- "01-01-2020"

# Last day to scrape. Defaults to the day this program runs; replace it with a
# string in the same format as start.date to choose a different end date.
end.date <- Sys.Date()

# Date-class versions of both bounds
start.date.date <- as.Date(start.date, "%m-%d-%Y")
end.date.date <- as.Date(end.date, "%m-%d-%Y")



####data retrieval code####
# Starts a Selenium server with a Chrome client, logs in to the site and opens
# the page that shows the data table. Leaves the client handle in remDr for
# the scraping code that follows.

#create a server based on the chrome browser. If you are running version 84, then put "Latest"
#verbose is spelled out as FALSE because the shorthand F can be reassigned
rD <- rsDriver(chromever = "83.0.4103.39", verbose = FALSE)
remDr <- rD$client

#navigate to login page
remDr$navigate("**LOGIN WEB PAGE LINK**")

#fill in login info and submit
remDr$findElement("id", "email")$sendKeysToElement(list("**LOGIN DETAIL:USERNAME**"))
remDr$findElement("id", "password")$sendKeysToElement(list("**LOGIN DETAIL: PASSWORD**"))
remDr$findElement("id", "log-in")$clickElement()

#go to the data page
remDr$navigate("**WEB PAGE THAT HAS THE DATA TABLE DISPLAYED**")



###loop to collect all data for each day when there are multiple pages
# Walks day by day from start.date.date to end.date.date. For each day it
# enters the date into the report form, reads the "table#top-items" table,
# follows the "Next →" link for further pages, and appends every collected
# row to items_table_final.df.

#set it so we can input custom date ranges
# (clicks the date dropdown button, then the 9th entry of its list)
remDr$findElement(using = 'xpath', '//*[@id="date-dropdown-container"]/button')$clickElement()
remDr$findElement(using = 'xpath', '//*[@id="date-dropdown-container"]/ul/li[9]/a')$clickElement()

#set up final dataframe
items_table_final.df <- data.frame()


# day cursor, advanced by one at the bottom of the outer loop
date <- start.date.date


#loop start for cycling through days
while (date <= end.date.date){


  #create text version of the date to enter into the webpage
  # date comes from as.Date() above, so the inner format= argument has no
  # effect here; only the outer "%m-%d-%Y" shapes the text
  date.char <- format(as.Date(date, format = "%d-%m-%Y"), "%m-%d-%Y")



  #fill in the date range
  # the first call on each field sends the keys named control / "a" / delete
  # (apparently select-all then delete, to clear the old value); the second
  # call types the new date. Start and end get the same day.
  remDr$findElement("name", "reportDateStart")$sendKeysToElement(sendKeys = list(control = "\uE009", "a", delete = "\uE017"))
  remDr$findElement("name", "reportDateStart")$sendKeysToElement(list(date.char))

  remDr$findElement("name", "reportDateEnd")$sendKeysToElement(sendKeys = list(control = "\uE009", "a", delete = "\uE017"))
  remDr$findElement("name", "reportDateEnd")$sendKeysToElement(list(date.char))

  #setup and or clear temporary data frame
  # (zero-row frame that collects all pages of the current day)
  items_table.df <- data.frame("Menu Item" = character(),
                               "Menu Group" = character(),
                               "Menu" = character(),
                               "Item Quantity" = integer(),
                               "Net Amount" = integer(),
                               stringsAsFactors = FALSE)

  #go to the data for the selected date range
  remDr$findElement("id", "update-btn")$clickElement()


  # page counter; the inner loop gives up after 100 pages
  pages <- 1


    #loop start for cycling through pages within a specified day
    while (pages <= 100){
      #fills a second temp data frame with data from the displayed page
      # NOTE(review): the page source is read immediately after the click
      # above, with no wait. If "table#top-items" is not in the page yet,
      # html_node() yields an "xml_missing" object and html_table() fails
      # with the reported "no applicable method" error.
      items_html <- read_html(remDr$getPageSource()[[1]])
      items_table_new <- items_html %>%
          rvest::html_node("table#top-items") %>%
          rvest::html_table(fill = TRUE)


        #test if the page loop needs to stop
        # stops when the number of already collected rows that match the new
        # page equals the row count of the new page
        if(nrow(items_table_new) == nrow(match_df(items_table.df, items_table_new))){

          break

        } else {
          #add the new data to the earlier temp data frame IF it isnt a match to something already there
          items_table.df <- rbind(items_table.df, items_table_new)

          #hit the next page arrow button
          # NOTE(review): findElement() raises an error when no "Next →"
          # link exists on the page
          remDr$findElement("link text", "Next →")$clickElement()
        }
      pages <- pages + 1
  }

  #add the new data to the final data frame
  items_table_final.df <- rbind(items_table_final.df, items_table.df)

  # move on to the next calendar day
  date <- date + 1
}

r web-scraping rvest rselenium
1个回答
0
投票

我刚刚解决了!原来有两个问题。首先,代码在页面加载完成之前,就执行了从表中收集数据的那一行。所以从技术上讲,此时表的 ID 还不存在,无法收集数据。为了解决这个问题,我只是在代码中添加了一条 Sys.sleep(5) 命令,让系统等待 5 秒。第二个问题是,如果某个页面的表是空的,或者某个表只有一页,页面上就没有用于翻页的 "Next" 元素。所以我在上面的 while 循环中加了一个 try 来处理这个错误,直接跳过它,让循环按计数器继续往下走,因为这只需要 2 秒钟的时间。我把修正后的循环贴出来,供有类似问题的人参考!

#seconds to give the site to render a report before its table is read
page.load.wait <- 5

#safety stop: most pages that will be followed for a single day
max.pages <- 20

# Read "table#top-items" from the page currently shown in the browser.
#
# Polls the page source every `interval` seconds for at most `timeout`
# seconds. Returns as soon as the table is present and, when `previous` is
# supplied, differs from `previous` (i.e. the next page has replaced the old
# one).
#
# remDr     RSelenium client to read the page source from
# previous  table scraped from the preceding page, or NULL for a first page
# timeout   longest time to keep polling, in seconds
# interval  pause between two polls, in seconds
#
# Returns the table as produced by rvest::html_table(). If the time runs out,
# the last table seen is returned (it may still equal `previous`), or NULL
# when the table never appeared at all.
read_items_table <- function(remDr, previous = NULL, timeout = 5, interval = 0.5) {
  latest <- NULL
  deadline <- Sys.time() + timeout

  repeat {
    items_node <- rvest::html_node(
      read_html(remDr$getPageSource()[[1]]),
      "table#top-items"
    )

    #html_node() yields an "xml_missing" object when the table is not there
    if (!inherits(items_node, "xml_missing")) {
      latest <- rvest::html_table(items_node, fill = TRUE)
      if (is.null(previous) || !identical(latest, previous)) {
        break
      }
    }

    if (Sys.time() >= deadline) {
      break
    }
    Sys.sleep(interval)
  }

  latest
}

#loop start for cycling through days
while (date <= end.date.date){

  #create text version of the date to enter into the webpage
  #(date is already a Date, so no input format is needed)
  date.char <- format(as.Date(date), "%m-%d-%Y")

  #fill in the date range: clear each field, then type the day into it
  remDr$findElement("name", "reportDateStart")$sendKeysToElement(sendKeys = list(control = "\uE009", "a", delete = "\uE017"))
  remDr$findElement("name", "reportDateStart")$sendKeysToElement(list(date.char))

  remDr$findElement("name", "reportDateEnd")$sendKeysToElement(sendKeys = list(control = "\uE009", "a", delete = "\uE017"))
  remDr$findElement("name", "reportDateEnd")$sendKeysToElement(list(date.char))

  #setup and or clear temporary data frame
  items_table.df <- data.frame("Menu Item" = character(),
                               "Menu Group" = character(),
                               "Menu" = character(),
                               "Item Quantity" = integer(),
                               "Net Amount" = integer(),
                               stringsAsFactors = FALSE)

  #go to the data for the selected date range
  remDr$findElement("id", "update-btn")$clickElement()

  pages <- 1

  #add a system pause to avoid an error where the page is not yet loaded
  Sys.sleep(page.load.wait)

  #table of the page scraped last; lets the reader wait for a page change
  previous.page <- NULL

  #loop start for cycling through pages within a specified day
  while (pages <= max.pages){
    #fills a second temp data frame with data from the displayed page
    items_table_new <- read_items_table(remDr,
                                        previous = previous.page,
                                        timeout = page.load.wait)

    #fail with a message that names the day instead of the opaque
    #"no applicable method for 'html_table'" error
    if (is.null(items_table_new)) {
      stop("table#top-items was not found on the page for ", date.char,
           call. = FALSE)
    }
    previous.page <- items_table_new

    #add the date of the data to the dataframe
    #(rep() keeps this valid when the table has no rows)
    items_table_new$date <- rep(date.char, nrow(items_table_new))

    #test if the page loop needs to stop: every row of this page is already collected
    if (nrow(items_table_new) == nrow(match_df(items_table.df, items_table_new))) {
      break
    }

    #add the new data to the earlier temp data frame
    items_table.df <- rbind(items_table.df, items_table_new)

    #hit the next page arrow button. When there is none (empty table or a
    #single page) findElement() raises an error; treat that as "no further
    #pages" and stop paging right away instead of scraping the page again
    next.clicked <- tryCatch({
      remDr$findElement("link text", "Next →")$clickElement()
      TRUE
    }, error = function(e) FALSE)

    if (!next.clicked) {
      break
    }

    pages <- pages + 1
  }

  #add the new data to the final data frame
  items_table_final.df <- rbind(items_table_final.df, items_table.df)

  date <- date + 1
}
© www.soinside.com 2019 - 2024. All rights reserved.