如何使用ThreadPoolExecutor填充字典和列表? 函数返回空字典和列表。 由于 Windows 问题,我无法使用多处理来执行此任务。 看起来 Manager() 的工作方式与多处理中的工作方式不同。 这是我的代码:
def parser(link, data, exceptions, driver=driver):
    """Scrape one company page and store its fields into *data*.

    Parameters:
        link: URL of the company page to scrape.
        data: shared mapping, keyed by brand name -> dict of field/value pairs.
        exceptions: shared list collecting links that failed to parse.
        driver: selenium WebDriver (defaults to the module-level instance).
    """
    try:
        details = {}
        driver.get(link)
        brand = driver.find_element(
            'xpath', '//*[@id="middle"]/section[1]/div/div/div[2]/h1'
        ).text
        name = driver.find_element(
            'xpath', '//*[@id="section-st-block9"]/div[2]/div/div[1]'
        ).find_elements('tag name', 'div')
        details[name[0].text] = name[1].text
        contacts = driver.find_element(
            'xpath', '//*[@id="section-st-block5"]/div[2]/div[1]'
        ).find_elements('class name', 'content-contact-item')
        for contact in contacts:
            fields = contact.find_elements('tag name', 'div')
            details[fields[0].text] = fields[1].text
        data[brand] = details
    # Narrowed from a bare `except:`: a bare except also swallows
    # KeyboardInterrupt/SystemExit, making the script impossible to stop
    # and hiding programming errors. Still best-effort per link.
    except Exception:
        exceptions.append(link)
from concurrent.futures import ThreadPoolExecutor
from functools import partial

# Threads share the parent process's memory, so a plain dict/list is enough;
# multiprocessing.Manager() is only needed when workers are separate
# processes (and its proxies behave differently with threads, which is what
# broke here on Windows).
data = {}
exceptions = []

# Collect company links from the listing pages...
# NOTE(fix): `links = []` must be initialized *before* the page loop —
# inside it, each page would throw away the links of the previous pages.
links = []
for i in range(132, 142):
    core = 'https://fabricators.ru/zavody?region=19569&page={}'
    url = core.format(i)
    driver.get(url)
    companies = driver.find_elements('class name', 'content-list-item')
    for c in companies:
        links.append(c.find_element('tag name', 'a').get_attribute('href'))

# ...and parse every link.
# NOTE(fix): the original `executor.map(parser, links, data, exceptions)`
# zips the three iterables together; since `data` and `exceptions` start
# empty, the zip is empty and parser never runs — hence the empty results.
# Bind the shared containers once with functools.partial instead.
with ThreadPoolExecutor(5) as executor:
    for _ in executor.map(partial(parser, data=data, exceptions=exceptions), links):
        pass
不需要使用重量级的 selenium,只需使用 requests/BeautifulSoup 获取数据即可。以下示例展示了如何使用两个 multiprocessing.Pool 快速完成此操作(一个用于获取链接,第二个用于从链接获取数据):
from itertools import chain
from multiprocessing import Pool
import requests
from bs4 import BeautifulSoup
# Listing-page URL template; `{}` is filled with the 1-based page number.
base_url = "https://fabricators.ru/zavody?region=19569&page={}"
def get_links(page_num):
    """Return the absolute company-page URLs found on listing page *page_num*."""
    url = base_url.format(page_num)
    # The "beget" cookie lets the request through the host's bot check.
    response = requests.get(url, cookies={"beget": "begetok"})
    soup = BeautifulSoup(response.content, "html.parser")
    return [
        "https://fabricators.ru" + anchor["href"]
        for anchor in soup.select("a.title-site--h3")
    ]
def get_data_from_link(link):
    """Fetch one company page and return its (title, address) pair."""
    response = requests.get(link, cookies={"beget": "begetok"})
    soup = BeautifulSoup(response.content, "html.parser")
    heading = soup.h1
    # The address is the first <p> following the <h1> title.
    # ... get other data here
    return heading.text, heading.find_next("p").text
if __name__ == "__main__":
    # First pool fans out over listing pages, second over individual
    # company links; imap_unordered streams results as they complete.
    with Pool(processes=2) as link_pool, Pool(processes=4) as data_pool:
        pages = link_pool.imap_unordered(get_links, range(132, 134))
        company_links = chain.from_iterable(pages)
        for title, address in data_pool.imap_unordered(get_data_from_link, company_links):
            # print the data, or store it to the dictionary etc...
            print(title)
            print(address)
            print()
打印:
...
Завод Энергокабель
Московская область, Ногинский р-н, Электроугли, ул. Полевая, 10
Завод Микропровод
Московская область, Подольск, ул. Бронницкая, 3
КабельЭлектроСвязь
Московская область, Ленинский р-н, Видное, территория Северная Промзона, ул. Проектируемый проезд №5208
Завод КвантКабель
Московская область, Дмитров, ул. Пушкинская, 1, стр.15
...