main.py
from flask import Flask, render_template, request, redirect
from scrap import Scrap, Scrap_web3, Scrap_remoteok
app = Flask("Job Scrapper")
db = {}
@app.route("/")
def index():
return render_template("index.html")
@app.route("/search")
def search():
keyword = request.args.get("keyword")
if keyword is None:
return redirect("/")
if keyword in db:
jobs = db[keyword]
else:
all_jobs = []
berlin_script = Scrap(keyword)
berlin_script.find_job()
all_jobs.extend(berlin_script.results)
web3_script = Scrap_web3(keyword)
web3_script.find_job()
all_jobs.extend(web3_script.results)
remoteok_script = Scrap_remoteok(keyword)
remoteok_script.find_job()
all_jobs.extend(remoteok_script.results)
db[keyword] = all_jobs
jobs = all_jobs
return render_template("search.html", keyword=keyword, jobs=jobs)
if __name__ == "__main__":
app.run(debug=True)
scrap.py
import requests
from bs4 import BeautifulSoup
class Scrap:
def __init__(self, keyword):
self.keyword = keyword
self.url_list = [
f"https://berlinstartupjobs.com/skill-areas/{self.keyword}",
f"https://web3.career/{self.keyword}-jobs",
f"https://weworkremotely.com/remote-jobs/search?utf8=%E2%9C%93&term={self.keyword}",
]
self.results = []
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
def find_job(self):
response = requests.get(self.url_list[0], headers=self.headers)
soup = BeautifulSoup(response.content, "html.parser")
jobs = soup.find("ul", class_="jobs-list-items").find_all("li")
for job in jobs:
title = job.find("h4", class_="bjs-jlid__h").text
company = job.find("a", class_="bjs-jlid__b").text
link = job.find("h4", class_="bjs-jlid__h").find("a")["href"]
job_db = {
"Title": title,
"Company": company,
"URL": link,
}
self.results.append(job_db)
class Scrap_web3(Scrap):
def __init__(self, keyword):
super().__init__(keyword)
def find_job(self):
response = requests.get(self.url_list[1], headers=self.headers)
soup = BeautifulSoup(response.content, "html.parser")
jobs = soup.find("tbody", class_="tbody").find_all("tr")
for index, job in enumerate(jobs):
if index == 4:
continue
title = job.find(
"h2", class_="fs-6 fs-md-5 fw-bold my-primary").text
company = job.find(
"td", class_="job-location-mobile").find("h3").text
link = f'https: // web3.career{job.find("td", class_="job-location-mobile").find("a")["href"]}'
job_db = {
"Title": title,
"Company": company,
"URL": link,
}
self.results.append(job_db)
class Scrap_remoteok(Scrap):
def __init__(self, keyword):
super().__init__(keyword)
def find_job(self):
response = requests.get(self.url_list[2], headers=self.headers)
soup = BeautifulSoup(response.content, "html.parser")
jobs = soup.find_all("li", class_="feature")
for job in jobs:
title = job.find("span", class_="title").text
company = job.find("span", class_="company").text
link = f"https://weworkremotely.com{job.find(lambda tag: tag.name == 'a' and 'remote-jobs' in tag.get('href', ''))['href']}"
job_db = {
"Title": title,
"Company": company,
"URL": link,
}
self.results.append(job_db)
`当我尝试使用 Flask 启动时,我不知道为什么这段代码会出现这个问题。 我已经检查过 bs4 可以访问这些页面,并且当我在 scrap.py 上打印每一页时 一切都很好。
当我单独运行这段代码时,完全没有问题,但是当我使用 Flask 将其作为 Web 应用程序运行时,同样的错误不断发生。我简直无法理解为什么。
如果bs4或者代码本身有问题,那么运行.py文件应该也会导致错误,但看起来根本没有问题。 `
我尝试启动每个 scrap.py 的代码(没问题) 检查 HTML(没问题)
AttributeError: 'NoneType' object has no attribute 'find_all'
错误意味着在调用obj.find_all(...)
时,obj是None。代码中的这些 obj 值是 soup.find(...)
调用的结果,如果文档中不存在此类 DOM 元素(HTML 标签),该调用将返回 None。
修复方法:
而不是从互联网下载
response.content
,而是将 HTML 文档作为硬编码常量字符串传递。那么无论您在哪里运行程序,行为都将是确定性的。
修改您的 Python 代码,以便
soup.find(...)
能够找到匹配的 DOM 元素。
向您的 Python 代码中添加类似
if soup.find(...) is None:
的条件,并使其在未找到指定的 DOM 元素时执行一些对您有用的操作。