我有一个抓取亚马逊网站的项目。但是,我无法获得分页的最后页码。
import asyncio

import aiohttp
import telegram
async def main(self, links, semaphore=8):
    """Fetch every result page of every category link concurrently.

    For each category URL the total page count is scraped from the
    pagination widget, per-page URLs (``&page=N``) are generated, and all
    of them are fetched through ``self.fetch_eith_sem`` with concurrency
    bounded by a semaphore.

    Args:
        links: iterable of category URLs to paginate.
        semaphore: maximum number of concurrent fetches (default 8).
    """
    # Created outside the try so the finally-close below cannot hit an
    # unbound name if ClientSession construction fails.
    s = HTMLSession()
    try:
        async with aiohttp.ClientSession() as asession:
            sem = asyncio.Semaphore(semaphore)
            urls = []
            for cat_link in links:
                # NOTE(review): this GET is synchronous (requests_html) and
                # blocks the event loop while the pagination page downloads.
                resp = s.get(cat_link)
                # BUG FIX: the original predicate contains(text(), "") is
                # vacuously true, so [last()] matched whatever the final
                # a-disabled <li> happened to be (e.g. the "..." ellipsis),
                # not the page count. Select the last disabled item plainly,
                # then fall back to the newer span-based pagination markup.
                pages = resp.html.xpath(
                    '//li[@class="a-disabled"][last()]/text()', first=True)
                if pages is None:
                    # Newer Amazon markup:
                    # <span class="s-pagination-item s-pagination-disabled">N</span>
                    pages = resp.html.xpath(
                        '//span[contains(@class, "s-pagination-disabled")]'
                        '[last()]/text()',
                        first=True)
                print(pages)
                # Guard int() against missing or non-numeric scrape results
                # (single-page categories have no pagination widget at all).
                if not pages or not str(pages).strip().isdigit():
                    pages = 1
                urls.extend(cat_link + f'&page={p}'
                            for p in range(1, int(pages) + 1))
            tasks = [
                asyncio.ensure_future(self.fetch_eith_sem(sem, url, asession))
                for url in urls
            ]
            await asyncio.gather(*tasks)
            # No explicit asession.close(): "async with" already closes it.
    except aiohttp.ClientConnectionError:
        print('Error handled')
    finally:
        # Close the synchronous requests_html session in all cases.
        s.close()
问题出在这一行的 XPath 查询上，它取不到正确的最后页码：pages = resp.html.xpath('//li[@class="a-disabled" and contains(text(),"")][last()]/text()', first=True)
示例链接: