在 Python 中使用 Selenium 下载 PDF

问题描述 投票:0回答:1

我正在尝试通过单击下载按钮从浏览器的 PDF 查看器下载以下 PDF。

image

我分别使用 ID、CSS_SELECTOR 和 XPATH 定位器尝试了以下代码，但似乎都不起作用。可能是什么问题？有人可以帮我解决这个问题吗？

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait


# Question's original (non-working) attempt: open the PDF in Chrome and click
# the viewer's download button.
download_url = "https://www.npci.org.in/PDF/nach/circular/2015-16/Circular-No.135.pdf"

# Start Chrome with default options; ChromeDriverManager().install() supplies
# the driver path passed to the Service.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

driver.get(download_url)
# Wait up to 30 seconds for an element matching the CSS selector `#icon`.
# NOTE(review): the download button presumably lives inside the PDF viewer's
# own (shadow) DOM rather than the top-level document, in which case this
# wait would time out -- confirm against the rendered page.
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#icon')))
# NOTE(review): By.ID is given '#icon' (CSS selector syntax); an ID locator
# normally takes the bare id value ('icon') -- verify against Selenium docs.
driver.find_element(By.ID, '#icon').click()

# Fixed 5-second pause before the browser is closed; nothing here checks
# whether a download actually finished.
time.sleep(5)
driver.quit()
python python-3.x selenium-webdriver
1个回答
0
投票

启动下载后，您需要让 WebDriver 保持运行，直到下载完成。但是，Selenium 没有内置的方法来等待下载完成。

以下脚本会将 PDF 文件下载到指定的文件夹路径。这是通过打开 Chrome 的下载管理器并监控最近一次下载的状态来实现的。

当状态变为“已完成”后，脚本会再等待几秒钟，以确保浏览器已将下载的文件重命名为最终的文件名。

import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def download_pdf_to_custom_path(download_url, download_folder):
    """Download the PDF at ``download_url`` into ``download_folder`` via Chrome.

    Chrome is started with preferences that send downloads to
    ``download_folder`` without prompting. After the URL is opened, the
    ``chrome://downloads`` page is polled until the most recent download
    leaves the "in progress" state.

    Args:
        download_url: URL of the PDF to download.
        download_folder: Directory (as a string) where Chrome should save
            the file.

    Returns:
        None. A message is printed if no download was registered or if the
        download ended in a state other than "complete".
    """
    # Numeric `state` values of an entry on chrome://downloads, as observed
    # by the original author: 0 while downloading, 2 once complete.
    # NOTE(review): these may differ between Chrome versions -- confirm.
    state_in_progress = 0
    state_complete = 2

    chrome_options = Options()
    chrome_options.add_experimental_option(
        "prefs",
        {
            "download.default_directory": download_folder,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            # Per the preference name: hand PDFs to the download manager
            # instead of opening them in the built-in viewer.
            "plugins.always_open_pdf_externally": True,
        },
    )

    # JavaScript returning the list of entries shown by the downloads manager.
    download_items_script = (
        "return document.querySelector('downloads-manager')"
        ".shadowRoot.getElementById('downloadsList').items;"
    )

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(download_url)
        # Give some time for the download to begin.
        time.sleep(5)
        # Open the Downloads Manager page and read its entries.
        driver.get("chrome://downloads")
        items = driver.execute_script(download_items_script)

        # The most recent download is the first item in the list; poll its
        # `state` once per second while it is still in progress.
        while items and items[0]["state"] == state_in_progress:
            time.sleep(1)
            items = driver.execute_script(download_items_script)

        if not items:
            # Nothing was registered by the downloads manager, so there is
            # no state to inspect (previously this raised IndexError).
            print("Something went wrong. No download was started.")
        elif items[0]["state"] == state_complete:
            # Give the browser some time to rename the downloaded file.
            time.sleep(2)
        else:
            # The download did not complete successfully.
            print(f"Something went wrong. {items[0]['state']}")
    finally:
        # Always close the browser, even if a step above raised.
        driver.quit()


if __name__ == "__main__":
    from pathlib import Path

    download_url = "https://www.npci.org.in/PDF/nach/circular/2015-16/Circular-No.135.pdf"
    # Save the PDF next to this script. resolve() makes the folder absolute:
    # on Python versions before 3.9 `__file__` of the main script may be a
    # relative path, and the folder is handed to Chrome as its download
    # directory (presumably expected to be absolute -- confirm).
    folder = str(Path(__file__).resolve().parent)
    download_pdf_to_custom_path(download_url, folder)
© www.soinside.com 2019 - 2024. All rights reserved.