Eliminating duplicate URLs (web-scraping race condition)


I am trying to scrape a website for two years (2015 and 2022). I want to retrieve all the PDFs from the updates below. Each year has many PDF URLs, and I am trying to collect the data in the format {year: pdf_link}. I am able to get the URLs, but they end up under the wrong year. My assumption is that this is caused by the threading.

For example: in the 2015 screenshot below we have 12-40968.pdf, but the same 12-40968.pdf also gets generated under 2022, where it does not exist.
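My understanding is that both workers share the one WebDriver, so the second driver.get() can replace the page before the first worker has read driver.page_source. A rough sketch of what I think is happening (hypothetical code with a placeholder URL, not my actual script):

from concurrent.futures import ThreadPoolExecutor

def scrape_year_shared(driver, year):
    # navigation is global to the driver instance, not to the calling thread
    driver.get("https://example.com/" + str(year))
    # by the time we read the page, the other worker may already have navigated away
    return {year: driver.current_url}

# with ThreadPoolExecutor() as executor:
#     futures = [executor.submit(scrape_year_shared, driver, y) for y in (2022, 2015)]

Here is my full script: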

import os
from apify_client import ApifyClient
import concurrent.futures
from selenium.common.exceptions import TimeoutException
import requests
import subprocess
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import boto3
from datetime import datetime
from selenium.webdriver.chrome.service import Service
import stat
import json
import ast
import threading
import time
from concurrent.futures import ThreadPoolExecutor


def main():

    s3 = boto3.client('s3' )
    today_date=datetime.today().strftime('%Y-%m-%d')                  
    generate_xmlfiles(s3,today_date)
    print("Uploaded successfully")

def generate_xmlfiles(s3,today_date):
    s3.put_object(Bucket='data', Key='Bany/'+today_date+"/xmlfiles.txt", Body=(str(scrape_data())))
    print("Exported files to s3")


def scrape_data():  
        years = [2022,2015]
        data = []
        states = ["alnb"]

        options = webdriver.ChromeOptions()
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("headless")
        driver = webdriver.Chrome(#executable_path='/usr/src/app/chromedriver',
         options=options)
        # st = os.stat('/usr/src/app/chromedriver')
        # os.chmod('/usr/src/app/chromedriver', st.st_mode | stat.S_IEXEC)
    
        with requests.Session() as session:
            for state in states:
                with ThreadPoolExecutor() as executor:
                    futures = []
                    for year in years:
                        url = 'https://www.govinfo.gov/app/collection/uscourts/bankruptcy/'+state+'/'+str(year)+'/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D'
                        response = session.get(url)
                      #  print("response is"+str(response))
                        if response.status_code == 200:               
                            #  print(executor.submit(scrape_year, driver, url, state, year,data))     
                              futures.append(executor.submit(scrape_year, driver, url, state, year,data))  
                    for future in futures:
                      #  print("future is "+str(future.result()))
                        data += future.result()
                        print("hi")
                       # print(data)
    
                print("Loaded " +state)
        driver.quit()
        
        return data
    
def scrape_year(driver, url, state, year,data):
    print("scraping data for state "+state.capitalize() +" for "+str(year).capitalize())
    driver.get(url)
    driver.implicitly_wait(10)
    try_count = 0
    while try_count < 3:
        try:
            elements = WebDriverWait(driver, 120).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "panel-body")))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            bankruptcy_element = soup.findAll('div', {   "class":"panel-collapse collapse in","class":"panel-title","class":"panel-body","class":"panel panel-default", "class":"panel-collapse collapse in" ,"class":"btn-group-horizontal" })          
            return [{year:"https://www.govinfo.gov/metadata/granule"+xmlfile['href'].replace(".pdf","/mods.xml").replace("/pdf","").replace("/pkg/","/").replace("/content","")} for i in bankruptcy_element for xmlfile in i.findAll('a', href=True) if ("pdf" in (xmlfile['href']))]
        except TimeoutException:
                print(f"TimeoutException encountered. Retrying {try_count + 1} of 3...")
                try_count += 1
     
main()
python-3.x multithreading selenium-webdriver web-scraping race-condition
1 Answer

I have updated part of your code, and it now works for me.

  • scrape_data()

    Added

    futures.append(scrape_year(driver, url, state, year, data))

    in place of

    futures.append(executor.submit(scrape_year, driver, url, state, year, data))

    That is, I bypassed the ThreadPoolExecutor and called scrape_year serially rather than in parallel, so the single shared WebDriver is never driven by two years at once. (A consolidated sketch of both changes follows this list.)

  • scrape_year()

    Added

    res = soup.find(attrs={"data-href": "/uscourts/bankruptcy/alnb/"+str(year)})
    bankruptcy_element = soup.findAll('div', {"id": "collapseOne"+res['id']})

    return [{year:"https://www.govinfo.gov/metadata/granule"+xmlfile['href'].replace(".pdf","/mods.xml").replace("/pdf","").replace("/pkg/","/").replace("/content","")} for i in bankruptcy_element for j in i.findAll('div', {"class":"btn-group-horizontal"}) for xmlfile in j.findAll('a', href=True) if ("pdf" in (xmlfile['href']))]

    so that links are read only from the accordion panel whose data-href matches the requested year, rather than from every panel on the page.
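Putting both changes together, the revised functions look roughly like this on my side. This is a sketch rather than a drop-in replacement: I repeat your imports so it stands alone, drop the futures list entirely (nothing is submitted to an executor any more), extend data directly with the serial return value, drop the unused data parameter from scrape_year, build the data-href from state instead of hard-coding "alnb", and return an empty list after three timeouts so the caller never receives None.

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def scrape_data():
    years = [2022, 2015]
    states = ["alnb"]
    data = []

    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("headless")
    driver = webdriver.Chrome(options=options)

    with requests.Session() as session:
        for state in states:
            for year in years:
                url = ('https://www.govinfo.gov/app/collection/uscourts/bankruptcy/'
                       + state + '/' + str(year)
                       + '/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D')
                response = session.get(url)
                if response.status_code == 200:
                    # serial call: the shared driver only ever works on one year at a time
                    data += scrape_year(driver, url, state, year)
            print("Loaded " + state)
    driver.quit()
    return data


def scrape_year(driver, url, state, year):
    print("scraping data for state " + state.capitalize() + " for " + str(year))
    driver.get(url)
    driver.implicitly_wait(10)
    try_count = 0
    while try_count < 3:
        try:
            WebDriverWait(driver, 120).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "panel-body")))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # keep only the accordion panel that belongs to the requested year
            res = soup.find(attrs={"data-href": "/uscourts/bankruptcy/" + state + "/" + str(year)})
            panels = soup.findAll('div', {"id": "collapseOne" + res['id']})
            return [
                {year: "https://www.govinfo.gov/metadata/granule"
                       + a['href'].replace(".pdf", "/mods.xml")
                                  .replace("/pdf", "")
                                  .replace("/pkg/", "/")
                                  .replace("/content", "")}
                for panel in panels
                for group in panel.findAll('div', {"class": "btn-group-horizontal"})
                for a in group.findAll('a', href=True)
                if "pdf" in a['href']
            ]
        except TimeoutException:
            print(f"TimeoutException encountered. Retrying {try_count + 1} of 3...")
            try_count += 1
    return []  # give up on this year after three timeouts instead of returning None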
    