我正在尝试使用 scrapy spider 从 gtabase.com 抓取 GTA V 车辆信息。这就是我想要实现的目标 -
"https://www.gtabase.com/grand-theft-auto-v/vehicles/#sort=attr.ct3.frontend_value&sortdir=desc&page=1"
作为我的start_urls
name
和manufacturer
start_urls
发生)并重复 #2 和 #3。我知道还有其他方法可以循环浏览页面,但是我想通过单击“下一步”按钮(如果可用)来学习如何使用此方法。
在下面的代码中,我还试图通过仅从前 4 个链接和前 3 个页面(参见上面的#4)中提取信息(参见上面的#3)来测试蜘蛛。但是我只得到第一页上前 4 个链接的信息。我的代码似乎有问题,蜘蛛无法转到下一页,我无法弄清楚问题所在。请注意,在代码中,我使用
itertools
仅获取前 4 个链接,并使用 counter
循环浏览前 2 页。谁能帮我调试 -
蜘蛛代码-
# Packages
import scrapy
from scrapy_selenium import SeleniumRequest
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait as WDW
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains as AC
from webdriver_manager.chrome import ChromeDriverManager
import itertools
import time
# Spider Method
class VehiclesSpider(scrapy.Spider):
name = "vehicles"
allowed_domains = ["gtabase.com"]
start_urls = ["https://www.gtabase.com/grand-theft-auto-v/vehicles/#sort=attr.ct3.frontend_value&sortdir=desc&page=1"]
# Initialize Counter
# counter = 0
# Driver Init
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
chrome_options = Options()
chrome_options.add_argument("--headless")
self.driver = webdriver.Chrome(ChromeDriverManager().install(), options = chrome_options)
# Parse Method (Get Links)
def parse(self, response):
self.driver.get(response.url)
# Scroll Till The Middle of the Page
viewport_height = self.driver.execute_script("return Math.max(document.documentElement.clientHeight, window.innerHeight || 0);")
self.driver.execute_script(f"window.scrollTo(0, {viewport_height/2});")
# Store Response
resp = Selector(text=self.driver.page_source)
# Extract Links
for i, vehicle_card in itertools.islice(enumerate(resp.xpath("//div[contains(@class, 'product') and contains(@class, 'item') and contains(@class, 'ln-element')]")), 4): # Testing With 4 Links
link = response.urljoin(vehicle_card.xpath(".//a[contains(@class, 'product') and contains(@class, 'item-link')]/@href").get())
yield scrapy.Request(link, callback = self.parse_details)
# Extract Vehicle Details
def parse_details(self, response):
# Open URL
self.driver.get(response.url)
# Extract Data
for vehicle_info in response.xpath("//div[@class='article-content']"):
name = response.xpath(".//h2[5]/text()").get(default = "NA").strip()
manufacturer = response.xpath(".//dl/dd//span//a[contains(@title, 'Vehicle Class')]/text()").get(default = "NA").strip()
yield {
"name": name
, "manufacturer": manufacturer
}
# Increment Counter / Check if Counter is <= 3
self.counter += 1
if self.counter < 4:
# Get Next Page URL & Callback parse_details
next_page = response.xpath("(//a[@title='Next'])[2]/@href").get()
if next_page:
absolute_url = f"https://www.gtabase.com/grand-theft-auto-v/vehicles/{next_page}"
yield SeleniumRequest(
url = absolute_url,
wait_time = 5,
callback = self.parse_details
)
else:
self.log("REACHED PAGE LIMIT, SPIDER STOPPED")
# Close Spider
def spider_closed(self, reason):
self.driver.quit()
我的
settings.py
文件中也有以下设置-
from webdriver_manager.chrome import ChromeDriverManager
SELENIUM_DRIVER_NAME = "chrome"
SELENIUM_DRIVER_EXECUTABLE_PATH = ChromeDriverManager().install()
SELENIUM_DRIVER_ARGUMENTS=['-headless'] # '--headless' if using chrome instead of firefox