from flask import Flask, render_template
from bs4 import BeautifulSoup
import requests
import pandas as pd
app = Flask(__name__)
@app.route("/")
def job_scraper():
url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800¤tJobId=3477751834&position=7&pageNum=1"
url_request_01 = requests.get(url01)
soup = BeautifulSoup(url_request_01.text, 'html.parser')
job_title_pull = soup.find_all(class_="base-search-card__title")
job_subtitle_pull = soup.find_all(class_="base-search-card__subtitle")
job_location_pull = soup.find_all(class_="job-search-card__location")
job_date_pull = soup.find_all(class_="job-search-card__listdate")
job_links = [job_link_pull['href'] for job_link_pull in soup.find_all('a', href=True, class_="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]")]
job_title_data = [title.text.strip() for title in job_title_pull]
job_subtitle_data = [subtitle.text.strip() for subtitle in job_subtitle_pull]
job_location_data = [location.text.strip() for location in job_location_pull]
job_date_data = [date.text.strip() for date in job_date_pull]
data = {'title': job_title_data,
'subtitle': job_subtitle_data,
'location': job_location_data,
'date': job_date_data,
'link': job_links}
df = pd.DataFrame(data)
return render_template('index.html', table_html=df.to_html(classes='table table-striped', index=False))
if __name__ == '__main__':
app.run(host="0.0.0.0", port=5001)
这是我的html代码
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Your Web App</title>
</head>
<body>
<h1>Your Data</h1>
{{ table_html | safe }}
</body>
</html>
这是错误
applemacbookproa2289@APPLEs-MacBook-Pro-2 web-scraping-01 % python3 scraping.py
* Serving Flask app 'scraping'
* Debug mode: off
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
* Running on all addresses (0.0.0.0)
* Running on http://127.0.0.1:5001
* Running on http://192.168.4.53:5001
Press CTRL+C to quit
[2024-01-12 10:03:36,006] ERROR in app: Exception on / [GET]
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 1455, in wsgi_app
response = self.full_dispatch_request()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 869, in full_dispatch_request
rv = self.handle_user_exception(e)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 867, in full_dispatch_request
rv = self.dispatch_request()
^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 852, in dispatch_request
return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/applemacbookproa2289/Desktop/web-scraping-01/scraping.py", line 33, in job_scraper
df = pd.DataFrame(data)
^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py", line 733, in __init__
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 503, in dict_to_mgr
return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 114, in arrays_to_mgr
index = _extract_index(arrays)
^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 677, in _extract_index
raise ValueError("All arrays must be of the same length")
ValueError: All arrays must be of the same length
192.168.4.53 - - [12/Jan/2024 10:03:36] "GET / HTTP/1.1" 500 -
[2024-01-12 10:04:22,869] ERROR in app: Exception on / [GET]
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 1455, in wsgi_app
response = self.full_dispatch_request()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 869, in full_dispatch_request
rv = self.handle_user_exception(e)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 867, in full_dispatch_request
rv = self.dispatch_request()
^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 852, in dispatch_request
return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/applemacbookproa2289/Desktop/web-scraping-01/scraping.py", line 33, in job_scraper
df = pd.DataFrame(data)
^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py", line 733, in __init__
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 503, in dict_to_mgr
return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 114, in arrays_to_mgr
index = _extract_index(arrays)
^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 677, in _extract_index
raise ValueError("All arrays must be of the same length")
ValueError: All arrays must be of the same length
192.168.4.53 - - [12/Jan/2024 10:04:22] "GET / HTTP/1.1" 500 -
我已经尝试了一堆随机解决方案,但我不知道出了什么问题,你能指出我正确的方向吗,我只是测试一下,一旦我能够将废弃的数据获取到 html,我的下一步是接受用户输入的职位名称和位置来提取与此类似的数据。
jobtitle = input("Enter job title: ")
location = input("Enter job location: ")
url01 = "https://www.linkedin.com/jobs/search?keywords="+jobtitle+"&location="+location+"&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"
问题似乎并不是每个工作都有
class="job-search-card__listdate"
的元素,所以你需要检查一下:
import pandas as pd
import requests
from bs4 import BeautifulSoup
url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800¤tJobId=3477751834&position=7&pageNum=1"
url_request_01 = requests.get(url01)
soup = BeautifulSoup(url_request_01.text, "html.parser")
job_title_pull = soup.find_all(class_="base-search-card__title")
job_subtitle_pull = soup.find_all(class_="base-search-card__subtitle")
job_location_pull = soup.find_all(class_="job-search-card__location")
job_date_pull = soup.find_all(class_="job-search-card__listdate")
job_links = [
job_link_pull["href"]
for job_link_pull in soup.find_all(
"a",
href=True,
class_="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]",
)
]
job_title_data = [title.text.strip() for title in job_title_pull]
job_subtitle_data = [subtitle.text.strip() for subtitle in job_subtitle_pull]
job_location_data = [location.text.strip() for location in job_location_pull]
job_date_data = [
d.text.strip() if (d := job.select_one(".job-search-card__listdate")) else None
for job in soup.select(".job-search-card")
]
data = {
"title": job_title_data,
"subtitle": job_subtitle_data,
"location": job_location_data,
"date": job_date_data,
"link": job_links,
}
df = pd.DataFrame(data)
print(df)
打印:
title subtitle location date link
0 Junior/Entry Level Software Developer(Remote) SynergisticIT Las Vegas, NV 1 month ago https://www.linkedin.com/jobs/view/junior-entry-level-software-developer-remote-at-synergisticit-3767588842?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=gBXODO%2FGuULT55Q6dJzufw%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card
1 Junior Software Development Engineer Team Remotely Inc: Talent Solution Reimagined! Las Vegas, NV None https://www.linkedin.com/jobs/view/junior-software-development-engineer-at-team-remotely-inc-talent-solution-reimagined%21-3805052007?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=FXvU%2FHLIKDYVFT6Pjv8miA%3D%3D&position=2&pageNum=0&trk=public_jobs_jserp-result_search-card
2 Backend Developer(Entry Level) SynergisticIT Las Vegas, NV 3 weeks ago https://www.linkedin.com/jobs/view/backend-developer-entry-level-at-synergisticit-3784401022?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=BzXsFKHN1CCduvAj2TWNNA%3D%3D&position=3&pageNum=0&trk=public_jobs_jserp-result_search-card
3 Junior Software Engineer SynergisticIT Las Vegas, NV 1 month ago https://www.linkedin.com/jobs/view/junior-software-engineer-at-synergisticit-3767595083?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=v6IbHxai%2Bjh7sUI6kl0PLA%3D%3D&position=4&pageNum=0&trk=public_jobs_jserp-result_search-card
4 Data Scientist(Rermote) - Junior/Entry Level SynergisticIT Las Vegas, NV 1 week ago https://www.linkedin.com/jobs/view/data-scientist-rermote-junior-entry-level-at-synergisticit-3792903797?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=s%2BTFVT3a5IxYwJ4UGrhHdg%3D%3D&position=5&pageNum=0&trk=public_jobs_jserp-result_search-card
...
我能够解决这个问题,这是我的解决方案
from flask import Flask, render_template
from bs4 import BeautifulSoup
import requests
import pandas as pd
app = Flask(__name__)
@app.route("/")
def job_scraper():
url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800¤tJobId=3477751834&position=7&pageNum=1"
url_request_01 = requests.get(url01)
soup = BeautifulSoup(url_request_01.text, 'html.parser')
job_title_pull = soup.find_all(class_="base-search-card__title")
job_subtitle_pull = soup.find_all(class_="base-search-card__subtitle")
job_location_pull = soup.find_all(class_="job-search-card__location")
job_date_pull = soup.find_all(class_="job-search-card__listdate")
job_links = [job_link_pull['href'] for job_link_pull in soup.find_all('a', href=True, class_="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]")]
job_title_data = [title.text.strip() for title in job_title_pull]
job_subtitle_data = [subtitle.text.strip() for subtitle in job_subtitle_pull]
job_location_data = [location.text.strip() for location in job_location_pull]
job_date_data = [
d.text.strip() if (d := job.select_one(".job-search-card__listdate")) else None
for job in soup.select(".job-search-card")]
data = {'title': job_title_data,
'subtitle': job_subtitle_data,
'location': job_location_data,
'link': job_date_data,
'date': job_date_data}
df = pd.DataFrame(data)
df_html = df.to_html(table_id="table")
return render_template('index.html', df=df, df_html = df_html)
app.run(host="0.0.0.0", port=5001)
这是 HTML 代码,用于循环遍历在 python 中创建的数据框
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Your Web App</title>
</head>
<body>
<table>
<tr>
<th>Title</th> <!-- Close the th tag properly -->
</tr>
{% for cell in df.title %}
<tr>
<td> {{ cell }} </td>
</tr>
{% endfor %}
</table>
<h1>Your Data1</h1>
</body>
</html>