在 Python 中对网页进行截图的更好方法

问题描述 投票:0回答:1

问题描述

目前正在做一个项目,需要我浏览一个网址并截取网页的屏幕截图。

查看各种资源后,我发现了 3 种方法。我将提到我当前使用的所有 3 种方法。

方法 - 1:PhantomJS

from selenium import webdriver
import time
import sys

# Method 1: time a single screenshot taken through the PhantomJS driver.
# The target host is read from the first command-line argument.
# print() is called with a single argument so the snippet runs on both
# Python 2 and Python 3.
print('Without Headless')
_start = time.time()
br = webdriver.PhantomJS()
try:
    br.get('http://' + sys.argv[1])
    br.save_screenshot('screenshot-phantom.png')
finally:
    # The original wrote `br.quit` without parentheses, which never calls
    # the method. Quit in `finally` so the driver is shut down even when
    # loading the page or saving the screenshot fails.
    br.quit()
_end = time.time()
print('Total time for non-headless {}'.format(_end - _start))

方法 - 2:无头浏览器(Headless Chrome)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Method 2: time a single screenshot taken with headless Chrome.
# The target host is read from the first command-line argument.
# print() is called with a single argument so the snippet runs on both
# Python 2 and Python 3.
print('Headless')
_start = time.time()
options = Options()
options.add_argument("--headless")  # Runs Chrome in headless mode.
options.add_argument('--no-sandbox')  # Bypass OS security model
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")
# NOTE(review): `chrome_options=` and `executable_path=` are kept as written;
# newer Selenium releases are understood to prefer `options=` -- confirm
# against the installed version before changing them.
driver = webdriver.Chrome(chrome_options=options, executable_path='/usr/bin/chromedriver')
try:
    driver.get('http://' + sys.argv[1])
    driver.save_screenshot('screenshot-headless.png')
finally:
    # Always shut the driver down, even when the page load or the
    # screenshot fails, so no browser process is left behind.
    driver.quit()
_end = time.time()
print('Total time for headless {}'.format(_end - _start))

方法 - 3:PyQT

import argparse
import sys
import logging
import sys
import time
import os

import urlparse
from selenium import webdriver
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *

class Screenshot(QWebView):
    """QWebView that loads a URL and saves a rendering of the page to a file.

    Logging goes through a module-level ``_logger`` that is not defined in
    this snippet and must be provided by the surrounding module.
    """

    def __init__(self):
        # The QApplication is created before the widget itself is initialised.
        self.app = QApplication(sys.argv)
        QWebView.__init__(self)
        self._loaded = False
        self.loadFinished.connect(self._loadFinished)

    def capture(self, url, output_file):
        """Load ``url`` and save a rendering of the page as ``output_file``.

        ``http://`` is prepended when ``url`` has no http/https scheme. The
        image is written into a ``data`` directory located next to this
        source file. Any exception is logged rather than raised.
        """
        _logger.info('Received url {}'.format(url))
        _start = time.time()
        try:
            # Add a scheme only when one is missing. The original compared
            # url[0:3] with 'http' and url[0:4] with 'https'; both slices are
            # one character too short, so the test was always False and
            # 'http://' was prepended even to complete URLs.
            if not url.lower().startswith(('http://', 'https://')):
                url = 'http://' + url
            self.url = url
            self.load(QUrl(url))
            self.wait_load(url)
            # Resize the viewport to the full size of the page contents.
            frame = self.page().mainFrame()
            self.page().setViewportSize(frame.contentsSize())
            # Render the frame into an image of that size.
            image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
            painter = QPainter(image)
            frame.render(painter)
            painter.end()
            _logger.info('Saving screenshot {} for {}'.format(output_file,url))
            target = os.path.join(os.path.dirname(os.path.realpath(__file__)),'data',output_file)
            # save() reports failure through its boolean result (for example
            # when the target directory is missing), so check it.
            if not image.save(target):
                _logger.error('Could not save screenshot {} for {}'.format(target,url))
        except Exception as e:
            _logger.error('Error in capturing screenshot {} - {}'.format(url,e))
        _end = time.time()
        _logger.info('Time took for processing url {} - {}'.format(url,_end - _start))

    def wait_load(self,url,delay=1,retry_count=60):
        """Process application events until the page loads or retries run out.

        Sleeps ``delay`` seconds between attempts, for at most ``retry_count``
        attempts, then resets the loaded flag for the next capture.
        """
        while not self._loaded and retry_count:
            _logger.info('wait_load for url {} retry_count {}'.format(url,retry_count))
            self.app.processEvents()
            if self._loaded:
                # The load finished while processing events; skip the sleep.
                break
            time.sleep(delay)
            retry_count -= 1
        if not self._loaded:
            # Report expiry only when the page really did not finish loading.
            _logger.info('wait_load for url {} expired'.format(url))
        self._loaded = False

    def _loadFinished(self, result):
        """Slot connected to ``loadFinished``; marks the page as loaded."""
        self._loaded = True

面临的问题:

这 3 种方法在使用时,都会因为这样或那样的错误而卡住。我已经在 Stack Overflow 上就其中一个错误提过问题。因此,在这 3 种用 Python 对网页截图的方法中,哪一种效率高且适合大规模部署?

python python-3.x google-chrome phantomjs headless
1个回答
1
投票

取自https://gist.github.com/fabtho/13e4a2e7cfbfde671b8fa81bbe9359fb并用Python 3重写

此方法在技术上可行,但效果不太好,因为许多网站都会有 cookie 接受弹出窗口,这些弹出窗口会出现在每个屏幕截图中。因此根据您要截图的网站,您可能希望在开始截图过程之前先使用 selenium 删除这些弹出窗口。

from selenium import webdriver
from PIL import Image
from io import BytesIO

# Capture a full-page screenshot by scrolling through the page, taking one
# viewport-sized screenshot per step and stitching the slices together.
verbose = 1

browser = webdriver.Chrome(executable_path='C:/yourpath/chromedriver.exe')
slices = []
try:
    browser.get('http://stackoverflow.com/questions/37906704/taking-a-whole-page-screenshot-with-selenium-marionette-in-python')

    # from here http://stackoverflow.com/questions/1145850/how-to-get-height-of-entire-document-with-javascript
    js = 'return Math.max( document.body.scrollHeight, document.body.offsetHeight,  document.documentElement.clientHeight,  document.documentElement.scrollHeight,  document.documentElement.offsetHeight);'

    scrollheight = browser.execute_script(js)
    # Height of the visible viewport, in the same units as scrollheight.
    viewport_height = browser.execute_script('return window.innerHeight;')

    if verbose > 0:
        print(scrollheight)

    offset = 0
    while offset < scrollheight:
        if verbose > 0:
            print(offset)

        browser.execute_script("window.scrollTo(0, %s);" % offset)
        # The browser clamps the scroll position at the bottom of the page,
        # so ask where the viewport really is instead of trusting `offset`.
        # Pasting the last slice at `offset` would duplicate content.
        top = browser.execute_script('return window.pageYOffset;')
        img = Image.open(BytesIO(browser.get_screenshot_as_png()))
        slices.append((top, img))
        # Advance by one viewport; fall back to the image height if the
        # viewport height could not be read, so the loop always progresses.
        offset += viewport_height if viewport_height else img.size[1]

        if verbose > 0:
            browser.get_screenshot_as_file('%s/screen_%s.png' % ('/tmp', offset))
            print(scrollheight)
finally:
    # Stitching below does not need the browser; always shut it down.
    browser.quit()

if not slices:
    raise RuntimeError('The page reported no height; nothing was captured')

# Screenshot pixels per page unit (differs from 1 on high-DPI displays).
first_img = slices[0][1]
scale = first_img.size[1] / float(viewport_height) if viewport_height else 1.0

screenshot = Image.new('RGB', (first_img.size[0], int(round(scrollheight * scale))))
for top, img in slices:
    # Each slice goes at its real scroll position; the clamped final slice
    # overlaps the previous one instead of extending the canvas.
    screenshot.paste(img, (0, int(round(top * scale))))

screenshot.save('screenshot.png')

这是一种可能也有帮助的替代方法。

from selenium import webdriver
import time

# Take a full-page screenshot with headless Chrome after trying to dismiss a
# cookie-consent dialog.

# Webdriver variables to open browser in headless mode
options = webdriver.ChromeOptions()
options.headless = True

# Sample URL. Google here used for ease of example
url = "https://www.google.com"

driver = webdriver.Chrome(options=options)
try:
    driver.get(url)
    # Give a certain amount of time for the page to load
    time.sleep(7)
    # Bypass cookie. The cookie button is here identified with an ID.
    # Modify as necessary for your needs.
    # NOTE(review): the find_element_by_* helpers are kept as written; they
    # are understood to be absent from recent Selenium 4 releases -- confirm
    # against the installed version.
    try:
        driver.find_element_by_css_selector('#L2AGLb').click()
        print("Bypassed stage one of cookie consent")  # Add additional stages if necessary for various cookie consent forms
    except Exception:
        # Best effort: a missing button must not abort the screenshot.
        # `except Exception` (not a bare `except:`) lets KeyboardInterrupt
        # and SystemExit propagate.
        print("Couldn't find cookie consent")
    time.sleep(10)

    # Take picture
    def S(X):
        """Return the page's scroll dimension named by X ('Width'/'Height')."""
        return driver.execute_script('return document.body.parentNode.scroll' + X)

    driver.set_window_size(S('Width'), S('Height'))  # May need manual adjustment
    driver.find_element_by_tag_name('body').screenshot("output_image.png")
finally:
    # Quit driver, even when a step above raised
    driver.quit()
© www.soinside.com 2019 - 2024. All rights reserved.