我正在使用axios和cheerio抓取网页:该网页具有许多链接,而向下滚动时会增加负载(例如facebook)。我想抓取每个链接while向下滚动直到到达结尾。这是我的代码的示例:
cheerio = require('cheerio')
axios = require('axios')
function getLink(id) {
return axios(options).then(function(response) {
// Do stuff...
})
}
function scrollDown() {
axios(scrollOptions).then(function(response) {
$ = cheerio.load(response['data'])
isScrollFinished = ($('.page_more').length == 0)
promises = []
newLinks = $('.link') // Get the new links that were loaded while scrolling
newLinks.each(function() {
promises.push(getLink($(this).attr('id')))
})
axios.all(promises).then(responseArr => {
if(isScrollFinished) {
// Exit script
}
})
if(!isScrollFinished) {
scrollDown()
}
})
}
scrollDown()
此代码的问题是,有时在我退出之前,它不会抓取所有链接。这是因为最后一个axios.all仅等待直到刮擦last滚动页面的所有链接。我该如何解决?
我创建了promises数组作为一个静态变量,并且在滚动结束时只在其上调用axios.all:
cheerio = require('cheerio')
axios = require('axios')
function getLink(id) {
return axios(options).then(function(response) {
// Do stuff...
})
}
function scrollDown() {
if (typeof scrollDown.promises === 'undefined') {
scrollDown.promises = [] // Define static variable if undefined
}
axios(scrollOptions).then(function(response) {
$ = cheerio.load(response['data'])
isScrollFinished = ($('.page_more').length == 0)
newLinks = $('.link') // Get the new links that were loaded while scrolling
newLinks.each(function() {
scrollDown.promises.push(getLink($(this).attr('id')))
})
if(isScrollFinished) {
axios.all(scrollDown.promises).then(responseArr => {
// Exit script
})
}
else {
scrollDown()
}
})
}
scrollDown()
更好的解决方案将很乐意被接受。