My web-scraping Flask app keeps getting an Internal Server Error

from flask import Flask, render_template
from bs4 import BeautifulSoup
import requests
import pandas as pd

app = Flask(__name__)

@app.route("/")
def job_scraper():
    url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800&currentJobId=3477751834&position=7&pageNum=1"
    
    url_request_01 = requests.get(url01)
    soup = BeautifulSoup(url_request_01.text, 'html.parser')
    
    job_title_pull = soup.find_all(class_="base-search-card__title")
    job_subtitle_pull = soup.find_all(class_="base-search-card__subtitle")
    job_location_pull = soup.find_all(class_="job-search-card__location")
    job_date_pull = soup.find_all(class_="job-search-card__listdate")
    job_links = [job_link_pull['href'] for job_link_pull in soup.find_all('a', href=True, class_="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]")]
    
    job_title_data = [title.text.strip() for title in job_title_pull]
    job_subtitle_data = [subtitle.text.strip() for subtitle in job_subtitle_pull]
    job_location_data = [location.text.strip() for location in job_location_pull]
    job_date_data = [date.text.strip() for date in job_date_pull]

    data = {'title': job_title_data,
            'subtitle': job_subtitle_data,
            'location': job_location_data,
            'date': job_date_data,
            'link': job_links}

    
    df = pd.DataFrame(data)
    return render_template('index.html', table_html=df.to_html(classes='table table-striped', index=False))

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5001)

Here is my HTML code:

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Your Web App</title>
</head>
<body>
    <h1>Your Data</h1>
    {{ table_html | safe }}
</body>
</html>


Here is the error:

applemacbookproa2289@APPLEs-MacBook-Pro-2 web-scraping-01 % python3 scraping.py
 * Serving Flask app 'scraping'
 * Debug mode: off
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://192.168.4.53:5001
Press CTRL+C to quit
[2024-01-12 10:03:36,006] ERROR in app: Exception on / [GET]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 1455, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 869, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 867, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 852, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/applemacbookproa2289/Desktop/web-scraping-01/scraping.py", line 33, in job_scraper
    df = pd.DataFrame(data)
         ^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py", line 733, in __init__
    mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 503, in dict_to_mgr
    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 114, in arrays_to_mgr
    index = _extract_index(arrays)
            ^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 677, in _extract_index
    raise ValueError("All arrays must be of the same length")
ValueError: All arrays must be of the same length
192.168.4.53 - - [12/Jan/2024 10:03:36] "GET / HTTP/1.1" 500 -
(The same traceback repeats for every subsequent request.)

I've tried a bunch of random fixes, but I can't figure out what's wrong. Can you point me in the right direction? For now I'm just testing; once I can get the scraped data into the HTML, my next step is to accept a user-supplied job title and location and pull data with something like this:

jobtitle = input("Enter job title: ")
location = input("Enter job location: ")

url01 = "https://www.linkedin.com/jobs/search?keywords="+jobtitle+"&location="+location+"&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"
2 Answers

The problem seems to be that not every job has an element with class="job-search-card__listdate", so you need to check for that:

import pandas as pd
import requests
from bs4 import BeautifulSoup


url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800&currentJobId=3477751834&position=7&pageNum=1"

url_request_01 = requests.get(url01)
soup = BeautifulSoup(url_request_01.text, "html.parser")

job_title_pull = soup.find_all(class_="base-search-card__title")
job_subtitle_pull = soup.find_all(class_="base-search-card__subtitle")
job_location_pull = soup.find_all(class_="job-search-card__location")
job_date_pull = soup.find_all(class_="job-search-card__listdate")
job_links = [
    job_link_pull["href"]
    for job_link_pull in soup.find_all(
        "a",
        href=True,
        class_="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]",
    )
]

job_title_data = [title.text.strip() for title in job_title_pull]
job_subtitle_data = [subtitle.text.strip() for subtitle in job_subtitle_pull]
job_location_data = [location.text.strip() for location in job_location_pull]

job_date_data = [
    d.text.strip() if (d := job.select_one(".job-search-card__listdate")) else None
    for job in soup.select(".job-search-card")
]

data = {
    "title": job_title_data,
    "subtitle": job_subtitle_data,
    "location": job_location_data,
    "date": job_date_data,
    "link": job_links,
}

df = pd.DataFrame(data)
print(df)

Prints:

                                                                     title                                        subtitle       location          date                                                                                                                                                                                                                                                                              link
0                            Junior/Entry Level Software Developer(Remote)                                   SynergisticIT  Las Vegas, NV   1 month ago                            https://www.linkedin.com/jobs/view/junior-entry-level-software-developer-remote-at-synergisticit-3767588842?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=gBXODO%2FGuULT55Q6dJzufw%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card
1                                     Junior Software Development Engineer  Team Remotely Inc: Talent Solution Reimagined!  Las Vegas, NV          None  https://www.linkedin.com/jobs/view/junior-software-development-engineer-at-team-remotely-inc-talent-solution-reimagined%21-3805052007?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=FXvU%2FHLIKDYVFT6Pjv8miA%3D%3D&position=2&pageNum=0&trk=public_jobs_jserp-result_search-card
2                                           Backend Developer(Entry Level)                                   SynergisticIT  Las Vegas, NV   3 weeks ago                                             https://www.linkedin.com/jobs/view/backend-developer-entry-level-at-synergisticit-3784401022?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=BzXsFKHN1CCduvAj2TWNNA%3D%3D&position=3&pageNum=0&trk=public_jobs_jserp-result_search-card
3                                                 Junior Software Engineer                                   SynergisticIT  Las Vegas, NV   1 month ago                                                https://www.linkedin.com/jobs/view/junior-software-engineer-at-synergisticit-3767595083?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=v6IbHxai%2Bjh7sUI6kl0PLA%3D%3D&position=4&pageNum=0&trk=public_jobs_jserp-result_search-card
4                             Data Scientist(Rermote) - Junior/Entry Level                                   SynergisticIT  Las Vegas, NV    1 week ago                               https://www.linkedin.com/jobs/view/data-scientist-rermote-junior-entry-level-at-synergisticit-3792903797?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=s%2BTFVT3a5IxYwJ4UGrhHdg%3D%3D&position=5&pageNum=0&trk=public_jobs_jserp-result_search-card

...
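
A more defensive variant of the same idea is to extract every field per card, so the column lengths can never drift apart. Here is a minimal sketch, assuming each result card carries the job-search-card class used above and that the link anchor with class base-card__full-link sits inside each card; the text_or_none helper is my own, not part of the original code:

import pandas as pd
import requests
from bs4 import BeautifulSoup

url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800&currentJobId=3477751834&position=7&pageNum=1"
soup = BeautifulSoup(requests.get(url01).text, "html.parser")

def text_or_none(card, selector):
    # Return the stripped text of the first match, or None if this card lacks it
    node = card.select_one(selector)
    return node.text.strip() if node else None

rows = []
for card in soup.select(".job-search-card"):
    link = card.select_one("a.base-card__full-link")
    rows.append({
        "title": text_or_none(card, ".base-search-card__title"),
        "subtitle": text_or_none(card, ".base-search-card__subtitle"),
        "location": text_or_none(card, ".job-search-card__location"),
        "date": text_or_none(card, ".job-search-card__listdate"),
        "link": link["href"] if link else None,
    })

df = pd.DataFrame(rows)  # one dict per row, so all columns always agree in length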


I was able to solve this; here is my solution:

from flask import Flask, render_template
from bs4 import BeautifulSoup
import requests
import pandas as pd

app = Flask(__name__)

@app.route("/")
def job_scraper():
    url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800&currentJobId=3477751834&position=7&pageNum=1"

    url_request_01 = requests.get(url01)
    soup = BeautifulSoup(url_request_01.text, 'html.parser')

    job_title_pull = soup.find_all(class_="base-search-card__title")
    job_subtitle_pull = soup.find_all(class_="base-search-card__subtitle")
    job_location_pull = soup.find_all(class_="job-search-card__location")
    job_links = [job_link_pull['href'] for job_link_pull in soup.find_all('a', href=True, class_="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]")]

    job_title_data = [title.text.strip() for title in job_title_pull]
    job_subtitle_data = [subtitle.text.strip() for subtitle in job_subtitle_pull]
    job_location_data = [location.text.strip() for location in job_location_pull]
    # Fall back to None when a card has no listdate element, so all columns stay the same length
    job_date_data = [
        d.text.strip() if (d := job.select_one(".job-search-card__listdate")) else None
        for job in soup.select(".job-search-card")]

    data = {'title': job_title_data,
            'subtitle': job_subtitle_data,
            'location': job_location_data,
            'date': job_date_data,
            'link': job_links}
    df = pd.DataFrame(data)
    df_html = df.to_html(table_id="table")

    return render_template('index.html', df=df, df_html=df_html)

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5001)

Here is the HTML code, which loops through the DataFrame created in Python:

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Your Web App</title>
</head>
<body>
    <table>
        <tr>
            <th>Title</th>
        </tr>
        {% for cell in df.title %}
        <tr>
            <td> {{ cell }} </td>
        </tr>
        {% endfor %}
    </table>
    <h1>Your Data1</h1>
</body>
</html>
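
One loose end in that template: job_scraper passes df_html to render_template, but the template never uses it. If you want the whole table rather than only the title column, a minimal tweak (assuming the same index.html) is to render it through Jinja's safe filter, as the original question's template already did:

<body>
    <h1>Your Data</h1>
    {{ df_html | safe }}
</body>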