我正在使用具有并发 future 线程的 Selenium 无头浏览器,以便从网站抓取信息并将其输出到 Excel 工作表。然而,虽然 Beautiful Soup 能够很好地找到元素并能够打印元素,但将它们附加到列表中不起作用,最终会得到空白列表,除了 1 个附加良好的列表之外。 这是代码(抱歉,如果它很混乱,我还在学习):
def init_driver():
PROXY = 'http://p.webshare.io:9999'
chrome_options = ChromeOptions()
chrome_options.add_argument('log-level=3')
chrome_options.add_argument('--proxy-server=%s' % PROXY)
chrome_options.add_argument("--headless") # Run headless
driver_service = ChromeService(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=driver_service, options=chrome_options)
return driver
def Get_info(url2):
driver = init_driver()
driver.get(url2)
soup = BeautifulSoup(driver.page_source, 'html.parser')
title = soup.find('h1', class_='DrugHeader__title-content___2ZaPo').text
images = soup.find_all('img', alt = title)
store = soup.find_all(class_='saltInfo DrugHeader__meta-value___vqYM0')
for things in store:
if 'store' in things.text:
stor = thing.text
else:
pass
with lock:
pop = []
for image in images:
blah = image['src']
final = blah.replace('l_watermark_346,w_480,h_480/a_ignore,w_480,h_480,c_fit,q_auto,f_auto/', '')
print(final)
pop.append(final)
thing = ', '.join(pop)
image_list.append(thing)
names.append(title)
storage.append(stor)
manufacturer.append(soup.find_all(class_='DrugHeader__meta-value___vqYM0')[0].text)
salt.append(soup.find_all(class_='DrugHeader__meta-value___vqYM0')[1].text)
if soup.find(class_='DrugPriceBox__best-price___32JXw').text != None:
price.append(soup.find(class_='DrugPriceBox__best-price___32JXw').text)
elif soup.find(class_='PriceBoxPlanOption__offer-price___3v9x8 PriceBoxPlanOption__offer-price-cp___2QPU_').text != None:
price.append(soup.find(class_='PriceBoxPlanOption__offer-price___3v9x8 PriceBoxPlanOption__offer-price-cp___2QPU_').text)
elif soup.find(class_='DrugPriceBox__price___dj2lv').text != None:
price.append(soup.find(class_='DrugPriceBox__price___dj2lv').text)
pack = soup.find(class_='DrugPriceBox__quantity___2LGBX').text
if 'vial' in pack:
pack_type.append('vial')
elif 'strip' in pack:
pack_type.append('strip')
elif 'bottle' in pack:
pack_type.append('bottle')
elif 'tube' in pack:
pack_type.append('tube')
elif 'packet' in pack:
pack_type.append('packet')
elif 'box' in pack:
pack_type.append('box')
elif 'vial' in pack:
pack_type.append('vial')
elif 'cartridge' in pack:
pack_type.append('cartridge')
elif 'ampoule' in pack:
pack_type.append('ampoule')
elif 'syringe' in pack:
pack_type.append('syringe')
else:
pack_type.append('N/A')
packaging.append(soup.find(class_='DrugPriceBox__quantity___2LGBX').text)
driver.quit()
with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor2:
print('Start!')
print(len(url_list))
for url2 in url_list:
executor2.submit(Get_info, url2)
file2.close()
print('a' + str(len(names)))
print('b' + str(len(price)))
print('d' + str(len(packaging)))
print('salt = ' + str(len(salt)))
print('e'+str(len(storage)))
print('f'+str(len(manufacturer)))
listing_dict = {
'Drug Name': names,
'Price': price,
'Packagaing type': pack_type,
'Packaging': packaging,
'Composition': salt,
'Storage': storage,
'Manufacterer': manufacturer,
'Images': image_list
}
print("______________________Job Finished!______________________")
df = pd.DataFrame(listing_dict)
df.to_excel(f"Test.xlsx")
正如您所注意到的,线程不共享您要附加到的列表。
从线程收集结果的常用方法是保存
concurrent.futures.Future
创建的 submit()
对象并为每个对象调用 result()
方法。
这将是模板:
with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor2:
print('Start!')
print(len(url_list))
futures = []
for url2 in url_list:
futures.append(executor2.submit(Get_Info, url2))
concurrent.futures.wait(futures)
results = [f.result() for f in futures]
但是,您需要更改
Get_Info()
代码以返回部分列表(我建议使用字典,因为这是您的最终目标)并编写代码以在最后合并结果。