Method 1:
This link explains how to use Python to download webpage content and save it to a local hard drive: [Download webpage using python]
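For reference, a minimal sketch of that approach using only the standard library; the URL and output filename here are placeholders, not taken from the linked answer:

# Minimal sketch: fetch one page and save its HTML to disk (Python 3).
# The URL and output filename are placeholders.
import urllib.request

url = "https://en.wikipedia.org/wiki/Python_%28programming_language%29"
with urllib.request.urlopen(url) as response:
    html = response.read()

with open("page.html", "wb") as f:  # write raw bytes to avoid encoding issues
    f.write(html)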
Method 2:
You can use PyWebCopy, a web scraping and archiving tool written in Python. It can archive any online website together with its assets (CSS, JS, and images) for offline reading, storage, or any other purpose. More details here...
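A minimal sketch of how PyWebCopy is typically invoked; the exact keyword arguments have changed between pywebcopy versions, so treat these as assumptions and check the library's documentation:

# Hedged PyWebCopy sketch; argument names vary across versions,
# so verify against the version you have installed.
from pywebcopy import save_webpage

save_webpage(
    url="https://example.com/",         # placeholder URL
    project_folder="/path/to/archive",  # placeholder output folder
    project_name="my_site",
    bypass_robots=True,   # archive even if robots.txt disallows (use responsibly)
    open_in_browser=False,
)

The companion function save_website (same style of arguments) crawls the whole site rather than a single page.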
Method 3:
The following implementation (Python 2) lets you fetch the HTML sub-pages of a site, and it can be developed further to fetch the other files you need. The depth variable sets the maximum number of levels of sub-pages to parse. Note that this crawler only collects and opens HTML links; downloading a page's other files (CSS, JS, and images) requires an extension such as the one sketched at the end of this answer.

import urllib2
from BeautifulSoup import *
from urlparse import urljoin

def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url

pagelist = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=2)
print urls
Python 3 version, 2019. This may save someone some time:

#!/usr/bin/env python
import urllib.request as urllib2
from bs4 import *
from urllib.parse import urljoin

def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print("Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read(), "html.parser")  # name a parser to avoid a bs4 warning
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url

pagelist = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=1)
print(urls)
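If you also want the non-HTML files mentioned above (CSS, JS, images), one possible extension is to collect link/script/img URLs from each fetched page and save them to disk. This is a hedged sketch, not part of the crawler above; the function name save_assets, the output folder, and the tag/attribute list are illustrative assumptions:

# Hedged sketch: download a single page's assets (CSS, JS, images).
# save_assets and the "assets" folder name are illustrative, not from the answer above.
import os
import urllib.request
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def save_assets(page_url, out_dir="assets"):
    os.makedirs(out_dir, exist_ok=True)
    html = urllib.request.urlopen(page_url).read()
    soup = BeautifulSoup(html, "html.parser")
    # Tag/attribute pairs that commonly reference external assets.
    targets = [("link", "href"), ("script", "src"), ("img", "src")]
    for tag, attr in targets:
        for node in soup.find_all(tag):
            if not node.get(attr):
                continue
            asset_url = urljoin(page_url, node[attr])  # resolve relative URLs
            name = os.path.basename(urlparse(asset_url).path) or "index"
            try:
                urllib.request.urlretrieve(asset_url, os.path.join(out_dir, name))
            except Exception as e:
                print("Could not fetch %s: %s" % (asset_url, e))

save_assets("https://en.wikipedia.org/wiki/Python_%28programming_language%29")

Calling save_assets on each URL returned by crawl would give you a rough offline copy, though flattening every asset into one folder can cause name collisions; a production tool like PyWebCopy (Method 2) handles that properly.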