I'm using web scraping for my project, and it works fine on Windows. After deploying it on Ubuntu, my script works fine on its first run, but every run after that fails with the error
selenium.common.exceptions.WebDriverException: Message: unknown error: unable to discover open pages
Usually when this happens, the script produces no output for about 1 minute 30 seconds before returning the error. Any help would be greatly appreciated!
My code:
import selenium
from bs4 import BeautifulSoup, NavigableString
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

for i in range(5):
    URLS = []
    URLS.append('https://website.com/')
    for i in range(1):
        # Fresh headless Chrome session for each batch of URLs.
        options = webdriver.ChromeOptions()
        options.add_argument("--no-sandbox")
        options.add_argument("--headless")
        options.add_argument("start-maximized")
        options.add_argument("window-size=1900,1080")
        options.add_argument("disable-gpu")
        options.add_argument("--disable-software-rasterizer")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument('--remote-debugging-port=9222')
        options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36')
        driver = webdriver.Chrome(executable_path="./drivers/chromedriver", options=options)
        driver.set_page_load_timeout(2)
        for url in URLS:
            try:
                driver.get(url)
                innerHTML = driver.page_source
                soup = BeautifulSoup(innerHTML, "html.parser")
                for i in soup.select("#ELEMENT"):
                    ELEMENT = i.text
                    print(ELEMENT)
                driver.close()
                driver.quit()
            except:
                pass
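One detail worth flagging in the script above: the bare except swallows every failure, including the timeout from set_page_load_timeout(2), so the driver is never quit on error and a stale headless Chrome can keep holding the fixed --remote-debugging-port=9222, which is consistent with "unable to discover open pages" on the next launch. A minimal sketch of a lifecycle that always releases the browser (the tempfile-based throwaway --user-data-dir is an illustrative addition, not part of the original script):

import tempfile

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--headless")
options.add_argument("--disable-dev-shm-usage")
# A throwaway profile avoids clashing with lock files left by a previous run.
options.add_argument(f"--user-data-dir={tempfile.mkdtemp()}")

driver = webdriver.Chrome(executable_path="./drivers/chromedriver", options=options)
try:
    driver.get('https://website.com/')
    print(driver.page_source[:200])
finally:
    # quit() runs even when get() times out, so no stale Chrome is left
    # holding the debugging port or profile locks for the next run.
    driver.quit()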
If you want to scrape restricted or blocked websites, you must use a random user agent. You can check the code snippet below; I hope it works for you.
import random
import selenium
from bs4 import BeautifulSoup, NavigableString
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

for i in range(5):
    URLS = []
    URLS.append('https://google.com/')
    for i in range(1):
        user_agent_list = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.5; rv:90.0) Gecko/20100101 Firefox/90.0',
            'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
        ]
        # Pick a different user agent for each browser session.
        user_agent = random.choice(user_agent_list)
        browser_options = webdriver.ChromeOptions()
        browser_options.add_argument("--no-sandbox")
        browser_options.add_argument("--headless")
        browser_options.add_argument("start-maximized")
        browser_options.add_argument("window-size=1900,1080")
        browser_options.add_argument("disable-gpu")
        browser_options.add_argument("--disable-software-rasterizer")
        browser_options.add_argument("--disable-dev-shm-usage")
        browser_options.add_argument(f'user-agent={user_agent}')
        # --verbose logging to test.log helps diagnose chromedriver startup failures.
        web_driver = webdriver.Chrome(options=browser_options, service_args=["--verbose", "--log-path=test.log"])
        for url in URLS:
            try:
                web_driver.get(url)
                innerHTML = web_driver.page_source
                soup = BeautifulSoup(innerHTML, "html.parser")
                for i in soup.select("body"):
                    ELEMENT = i.text
                    print(ELEMENT)
                web_driver.close()
                web_driver.quit()
            except:
                pass
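To confirm the override actually reaches the page, you can read the user agent back from the browser itself; a quick sanity check, assuming a chromedriver discoverable on PATH (the placeholder agent string below is only for illustration):

from selenium import webdriver

browser_options = webdriver.ChromeOptions()
browser_options.add_argument("--no-sandbox")
browser_options.add_argument("--headless")
browser_options.add_argument("user-agent=Mozilla/5.0 (placeholder test agent)")

web_driver = webdriver.Chrome(options=browser_options)
try:
    # navigator.userAgent is what sites see from JavaScript; it should
    # echo the string passed via the user-agent command-line switch.
    print(web_driver.execute_script("return navigator.userAgent"))
finally:
    web_driver.quit()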
Please let me know if it doesn't work.