使用Python多线程将报废的信息附加到列表中,导致0个列表

问题描述 投票:0回答:1

我正在使用具有并发 future 线程的 Selenium 无头浏览器,以便从网站抓取信息并将其输出到 Excel 工作表。然而,虽然 Beautiful Soup 能够很好地找到元素并能够打印元素,但将它们附加到列表中不起作用,最终会得到空白列表,除了 1 个附加良好的列表之外。 这是代码(抱歉,如果它很混乱,我还在学习):

def init_driver():
    PROXY = 'http://p.webshare.io:9999'
    chrome_options = ChromeOptions()
    chrome_options.add_argument('log-level=3')
    chrome_options.add_argument('--proxy-server=%s' % PROXY)
    chrome_options.add_argument("--headless")  # Run headless
    driver_service = ChromeService(executable_path='chromedriver.exe')
    driver = webdriver.Chrome(service=driver_service, options=chrome_options)
    return driver


def Get_info(url2):
    driver = init_driver()
    driver.get(url2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    title = soup.find('h1', class_='DrugHeader__title-content___2ZaPo').text
    images = soup.find_all('img', alt = title)
    store = soup.find_all(class_='saltInfo DrugHeader__meta-value___vqYM0')
    for things in store:
        if 'store' in things.text:
            stor = thing.text
        else:
            pass
    with lock:
        pop = []
        for image in images:
            blah = image['src']
            final = blah.replace('l_watermark_346,w_480,h_480/a_ignore,w_480,h_480,c_fit,q_auto,f_auto/', '')
            print(final)
            pop.append(final)
        thing = ', '.join(pop)
        image_list.append(thing)
        names.append(title)
        storage.append(stor)
        manufacturer.append(soup.find_all(class_='DrugHeader__meta-value___vqYM0')[0].text)
        salt.append(soup.find_all(class_='DrugHeader__meta-value___vqYM0')[1].text)
        if soup.find(class_='DrugPriceBox__best-price___32JXw').text != None:
            price.append(soup.find(class_='DrugPriceBox__best-price___32JXw').text)
        elif soup.find(class_='PriceBoxPlanOption__offer-price___3v9x8 PriceBoxPlanOption__offer-price-cp___2QPU_').text != None:
            price.append(soup.find(class_='PriceBoxPlanOption__offer-price___3v9x8 PriceBoxPlanOption__offer-price-cp___2QPU_').text)
        elif soup.find(class_='DrugPriceBox__price___dj2lv').text != None:
            price.append(soup.find(class_='DrugPriceBox__price___dj2lv').text)
        pack = soup.find(class_='DrugPriceBox__quantity___2LGBX').text
        if 'vial' in pack:
            pack_type.append('vial')
        elif 'strip' in pack:
            pack_type.append('strip')
        elif 'bottle' in pack:
            pack_type.append('bottle')
        elif 'tube' in pack:
            pack_type.append('tube')
        elif 'packet' in pack:
            pack_type.append('packet')
        elif 'box' in pack:
            pack_type.append('box')
        elif 'vial' in pack:
            pack_type.append('vial')
        elif 'cartridge' in pack:
            pack_type.append('cartridge')
        elif 'ampoule' in pack:
            pack_type.append('ampoule')
        elif 'syringe' in pack:
            pack_type.append('syringe')
        else:
            pack_type.append('N/A')
        packaging.append(soup.find(class_='DrugPriceBox__quantity___2LGBX').text)
    driver.quit()


with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor2:
    print('Start!')
    print(len(url_list))
    for url2 in url_list:
        executor2.submit(Get_info, url2)

file2.close()


print('a' + str(len(names)))
print('b' + str(len(price)))
print('d' + str(len(packaging)))
print('salt = ' + str(len(salt)))
print('e'+str(len(storage)))
print('f'+str(len(manufacturer)))



listing_dict = {
    'Drug Name': names,
    'Price': price,
    'Packagaing type': pack_type,
    'Packaging': packaging,
    'Composition': salt,
    'Storage': storage,
    'Manufacterer': manufacturer,
    'Images': image_list
 }

print("______________________Job Finished!______________________")
df = pd.DataFrame(listing_dict)
df.to_excel(f"Test.xlsx")
python selenium-webdriver python-multithreading concurrent.futures
1个回答
0
投票

正如您所注意到的,线程不共享您要附加到的列表。

从线程收集结果的常用方法是保存

concurrent.futures.Future
创建的
submit()
对象并为每个对象调用
result()
方法。

这将是模板:

with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor2:
    print('Start!')
    print(len(url_list))
    futures = []
    for url2 in url_list:
        futures.append(executor2.submit(Get_Info, url2))
    concurrent.futures.wait(futures)
    results = [f.result() for f in futures]

但是,您需要更改

Get_Info()
代码以返回部分列表(我建议使用字典,因为这是您的最终目标)并编写代码以在最后合并结果。

© www.soinside.com 2019 - 2024. All rights reserved.