大家好,我正在尝试在多线程中使用 Selenium。我的代码如下:
import threading as th
import time
import base64
import mysql.connector as mysql
import requests
from bs4 import BeautifulSoup
from seleniumwire import webdriver
from selenium.webdriver.chrome.options import Options
from functions import *
# Space-separated list of pages to crawl.
urls = 'https://google.com https://youtube.com'

# Chrome content-setting names that all receive the value 2.
# NOTE(review): 2 presumably means "block" in Chrome's content settings - confirm.
_CONTENT_SETTING_NAMES = (
    'images', 'popups', 'geolocation',
    'notifications', 'auto_select_certificate', 'fullscreen',
    'mouselock', 'mixed_script', 'media_stream',
    'media_stream_mic', 'media_stream_camera', 'protocol_handlers',
    'ppapi_broker', 'automatic_downloads', 'midi_sysex',
    'push_messaging', 'ssl_cert_decisions', 'metro_switch_to_desktop',
    'protected_media_identifier', 'app_banner', 'site_engagement',
    'durable_storage',
)

options = Options()
prefs = {
    'profile.default_content_setting_values': dict.fromkeys(_CONTENT_SETTING_NAMES, 2),
}
print('Crawling process started')
options.add_experimental_option('prefs', prefs)

# One browser instance, created at import time and shared by everything below.
driver = webdriver.Chrome(
    executable_path='chromedriver.exe',
    options=options,
)
driver.set_page_load_timeout(50000)
def getinf(url_):
    """Navigate the shared module-level ``driver`` to ``url_`` and print its <title> tags.

    The page source is parsed with BeautifulSoup's ``html5lib`` parser.
    Every caller uses the same ``driver`` object, so concurrent calls all
    navigate one and the same browser.
    """
    driver.get(url_)
    html = driver.page_source
    document = BeautifulSoup(html, 'html5lib')
    titles = document.select('title')
    print(titles)
# Start one thread per whitespace-separated entry in ``urls``.
for url in urls.split():
    worker = th.Thread(target=getinf, args=(url,))
    worker.start()
当脚本运行时,标签页并没有像预期的那样(由各个线程)同时打开,而是逐个执行,并且最终只显示最后一个 url(https://youtube.com)的标题。当我尝试使用 multiprocessing 时,程序崩溃了很多次。我正在制作网络爬虫,而某些网站(例如 twitter)需要 JavaScript 才能显示内容,因此我也不能使用 requests 或 urllib。有什么解决方案吗?也欢迎推荐其他任何库。
尝试将 chromedriver 的创建放到线程执行的函数中。否则,您只有一个驱动程序,所有线程都在更改同一个驱动程序的 URL。请改为在每个线程中创建各自独立的 chromedriver。
注意:我没有尝试过代码,只是建议。
import threading as th
import time
import base64
import mysql.connector as mysql
import requests
from bs4 import BeautifulSoup
from seleniumwire import webdriver
from selenium.webdriver.chrome.options import Options
from functions import *
# Space-separated list of pages to crawl.
urls = 'https://google.com https://youtube.com'

# Chrome content-setting names that all receive the value 2.
# NOTE(review): 2 presumably means "block" in Chrome's content settings - confirm.
_CONTENT_SETTING_NAMES = (
    'images', 'popups', 'geolocation',
    'notifications', 'auto_select_certificate', 'fullscreen',
    'mouselock', 'mixed_script', 'media_stream',
    'media_stream_mic', 'media_stream_camera', 'protocol_handlers',
    'ppapi_broker', 'automatic_downloads', 'midi_sysex',
    'push_messaging', 'ssl_cert_decisions', 'metro_switch_to_desktop',
    'protected_media_identifier', 'app_banner', 'site_engagement',
    'durable_storage',
)

# Options object handed to each Chrome driver that is created later.
options = Options()
prefs = {
    'profile.default_content_setting_values': dict.fromkeys(_CONTENT_SETTING_NAMES, 2),
}
print('Crawling process started')
options.add_experimental_option('prefs', prefs)
def getinf(url_):
    """Load ``url_`` in a browser owned by this call and print its <title> tags.

    A separate Chrome driver is created on every call, so threads running
    this function concurrently do not navigate a shared browser. The page
    source is parsed with BeautifulSoup's ``html5lib`` parser.

    Args:
        url_: Address of the page to load.
    """
    driver = webdriver.Chrome(executable_path='chromedriver.exe', options=options)
    try:
        driver.set_page_load_timeout(50000)
        driver.get(url_)
        soup = BeautifulSoup(driver.page_source, 'html5lib')
        print(soup.select('title'))
    finally:
        # Always shut the browser down, even if loading or parsing fails;
        # otherwise every crawled URL leaves a Chrome/chromedriver process behind.
        driver.quit()
# Start one thread per whitespace-separated entry in ``urls``.
for url in urls.split():
    worker = th.Thread(target=getinf, args=(url,))
    worker.start()