Reduce the time the code takes to finish scraping

Question

I want to reduce the time my code takes to finish scraping the pages; I'm using Selenium. I originally used Scrapy for this project, but the email element is rendered by JavaScript, so it is hidden from Scrapy.

Scrapy worked perfectly otherwise. Is there a way to reduce the time Selenium takes, or another approach, or a different tool or package better suited to this case?

I would appreciate any information or documentation that would help me learn more.

Here is the code:

# import the necessary packages
import csv
import logging
import time

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

logging.basicConfig(level=logging.INFO) 

path = r"C:\Users\HP\Desktop\scraping\chromedriver.exe"
options = Options()
options.headless = True
driver = Chrome(executable_path=path, options=options)
start_time = time.ctime()


data = []
# {} is filled in with the page number inside the loop
base_url = "https://www.mdpi.com/search?sort=pubdate&page_no={}&page_count=50&year_from=1996&year_to=2024&q=biomaterials&view=default"

print(start_time)

for page_no in range(218,219):
    # visit the link (the main web page)
    logging.info(f"Scraping page number : {page_no} ")

    url = base_url.format(page_no)
    driver.get(url)
    time.sleep(1)

    # extracting the articles links from the main page 
    article_links = driver.find_elements(By.XPATH, ".//a[@class='title-link']")
    article_hrefs = [lnk.get_attribute("href") for lnk in article_links]

    # loop through all the article links to extract the specified information 
    for href in article_hrefs:
        # Visit the article page
        logging.info(f"Scraping article: {href}")
        driver.get(href)
        time.sleep(1)
        
        # extracting  title, author name and his email  from article page  
        title_element = driver.find_element(By.XPATH,".//h1[contains(@class,'title')]")
        title = title_element.text

        author_element = driver.find_element(By.XPATH, ".//a[@class='profile-card-drop']")
        author_name = author_element.text

        email_elements = driver.find_elements(By.XPATH, ".//a[contains(@class,'email')]") 
        # Extract the first email element
        if email_elements:
            email_element = email_elements[0]
            email = email_element.get_attribute("href")
            data.append({"Title": title, "Link": href, "Author": author_name, "Email": email})

# record the finish time to know how much time it took to scrape all pages
end_time = time.ctime()
print(end_time)

driver.quit()

# saving the data in the csv file 
with open('emails.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=["Title","Author","Link", "Email"])
    writer.writeheader()
    writer.writerows(data)
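
For context on where the time goes in the Selenium version: the fixed time.sleep(1) calls always wait a full second, even when the page is ready sooner. A minimal sketch of swapping them for explicit waits, reusing the driver and url variables and the XPath from the code above (WebDriverWait and expected_conditions are standard Selenium APIs; the 10-second value is just an example upper bound):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

wait = WebDriverWait(driver, 10)  # upper bound of 10 s; returns as soon as the condition holds

driver.get(url)
# block only until the article links are present instead of always sleeping a fixed second
article_links = wait.until(
    EC.presence_of_all_elements_located((By.XPATH, "//a[@class='title-link']"))
)
article_hrefs = [lnk.get_attribute("href") for lnk in article_links]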
python performance selenium-webdriver web-scraping scrapy
1 Answer

The email addresses are protected by CloudFlare's email protection script (and are actually double-encoded). I found a decoding script online, but it was written for a single encoded string, so I had to modify it.

Here is how to scrape the site with Scrapy (no Selenium):

import scrapy
import logging


def decode_email_protection(encoded_string):
    # the href looks like ".../email-protection#<hex string>", keep only the hex part
    encoded_data = encoded_string.split('#')[-1]

    # first pass: standard CloudFlare decoding, the first byte is the XOR key
    r = int(encoded_data[:2], 16)
    email = ''.join([chr(int(encoded_data[i:i + 2], 16) ^ r) for i in range(2, len(encoded_data), 2)])

    # the result is itself another protected string, so decode it a second time
    encoded_data = email.split('#')[-1]

    # second pass: here the XOR key byte sits at offset 4, so cut it out before decoding
    r = int(encoded_data[4:6], 16)
    encoded_data = encoded_data[:4] + encoded_data[6:]
    email = ''.join([chr(int(encoded_data[i:i + 2], 16) ^ r) for i in range(0, len(encoded_data), 2)])
    return email


class ExampleSpider(scrapy.Spider):
    name = "example_spider"
    allowed_domains = ["mdpi.com"]
    base_url = "https://www.mdpi.com/search?sort=pubdate&page_no={}&page_count=50&year_from=1996&year_to=2024&q=biomaterials&view=default"

    def start_requests(self):
        for page_no in range(218, 219):
            yield scrapy.Request(url=self.base_url.format(page_no), cb_kwargs={"page_no": page_no})

    def parse(self, response, page_no):
        self.log(f"Scraping page number : {page_no}", logging.INFO)
        article_hrefs = response.xpath("//a[@class='title-link']/@href").getall()
        for href in article_hrefs:
            yield response.follow(url=href, callback=self.parse_page)

    def parse_page(self, response):
        self.log(f"Scraping article: {response.url}", logging.INFO)

        title = response.xpath("//h1[contains(@class,'title')]/text()").get(default="").strip()
        authors = response.xpath("//a[@class='profile-card-drop']//text()").getall()
        authors = [i.strip() for i in authors]
        email_href = response.xpath("//a[contains(@class,'email')]/@href").get(default="")
        email = decode_email_protection(email_href)

        yield {
            "Title": title,
            "Link": response.url,
            "Authors": authors,
            "Email": email
        }
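
If you don't have a full Scrapy project set up, the spider above can also be run as a standalone script. A minimal sketch using Scrapy's CrawlerProcess (the emails.csv file name is just an example; the FEEDS setting tells Scrapy to export every yielded item to that file):

from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        # write every item yielded by parse_page() straight to a CSV file
        "FEEDS": {"emails.csv": {"format": "csv"}},
    })
    process.crawl(ExampleSpider)
    process.start()  # blocks until the crawl is finished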