from bs4 import BeautifulSoup
from zenrows import ZenRowsClient
import uuid

base_url = 'https://wd1.myworkdaysite.com'
client = ZenRowsClient("")  # ZenRows API key goes here
url = "https://wd1.myworkdaysite.com/en-US/recruiting/abinbev/USA"

# Render the JavaScript-driven listing page and parse it
response = client.get(url, params={"js_render": "true", "wait": 3000})
soup = BeautifulSoup(response.content, 'html.parser')

jobs_listings = soup.find_all('li', class_='css-1q2dra3')
for jobs_listing in jobs_listings:
    try:
        job_title = jobs_listing.find('h3').find('a').text
        city_name = jobs_listing.find('dd', class_='css-129m7dg').text.strip()
        job_link = jobs_listing.find('h3').find('a')['href']
        date_posted_section = jobs_listing.find(
            'div', {'data-automation-id': 'postedOn'})
        date_posted = date_posted_section.find(
            'dd', class_='css-129m7dg').text.strip()
        if date_posted != 'Posted Yesterday':
            # Fetch the individual job page for its full description
            job_page_url = base_url + job_link
            job_page_response = client.get(
                job_page_url, params={"js_render": "true", "wait": 3000})
            job_page_soup = BeautifulSoup(job_page_response.content, 'html.parser')
            description_section = job_page_soup.find(
                'div', {'data-automation-id': 'jobPostingDescription', 'class': 'css-oplht1'})
            job = {
                "id": str(uuid.uuid4()),
                "title": job_title,
                "job_location": city_name,
                "employment_type": None,  # not extracted anywhere above
                "description": str(description_section),
            }
    except AttributeError:
        continue  # a listing was missing one of the expected elements
This is my code for scraping https://wd1.myworkdaysite.com/en-US/recruiting/abinbev/USA. I can scrape the first page, but not the other pages. As we can see, the site has pagination, but when I navigate to page 2, 3, and so on, the web URL stays the same, so how do I scrape the data on the other pages? I don't want to use Selenium for this. I'm using ZenRows with BeautifulSoup, but I can't get past page 1.
The URL shown in the browser stays the same, but the pagination data is fetched by POSTing to this URL: https://wd1.myworkdaysite.com/wday/cxs/abinbev/USA/jobs

The good news is that this makes your job much easier. You can send JSON and receive JSON, with no HTML or BeautifulSoup involved. The only tricky part is working out when the last page has been read, because the site wraps around and sends repeated postings. So you have to keep track of the jobs you've seen and stop once they repeat:
import requests

LIMIT = 20
all_job_postings = []
paths_seen = []
url = 'https://wd1.myworkdaysite.com/wday/cxs/abinbev/USA/jobs'
offset = 0
while True:
    # The same endpoint serves every page; only the offset changes.
    response = requests.post(
        url,
        json={"appliedFacets": {}, "limit": LIMIT, "offset": offset, "searchText": ""})
    job_postings = response.json()['jobPostings']
    # The only sign that we've processed all pages is that the jobs start repeating.
    new_paths = [posting['externalPath'] for posting in job_postings]
    if any(new_path in paths_seen for new_path in new_paths):
        break
    paths_seen.extend(new_paths)
    all_job_postings.extend(job_postings)
    offset += len(job_postings)

print(len(all_job_postings))
# 155
Each job posting looks like this:
{'title': 'Director Retail Sales - On Premise', 'externalPath': '/job/St-Louis-Missouri/Director-Retail-Sales---On-Premise_30056661', 'locationsText': '3 Locations', 'postedOn': 'Posted Yesterday', 'bulletFields': ['30056661']}
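The title, locationsText, and postedOn fields give you directly what your original loop was scraping out of the HTML. If you still need the full description, you can feed externalPath back into your existing ZenRows/BeautifulSoup flow. Here is a minimal sketch, assuming the HTML job page lives at the career-site URL plus externalPath (which matches the href structure your original code followed); the jobPostingDescription selector is taken from your snippet:

import uuid
from bs4 import BeautifulSoup
from zenrows import ZenRowsClient

site_url = 'https://wd1.myworkdaysite.com/en-US/recruiting/abinbev/USA'
client = ZenRowsClient("")  # ZenRows API key goes here

jobs = []
for posting in all_job_postings:
    if posting['postedOn'] == 'Posted Yesterday':
        continue  # same filter as in your original loop
    # Assumption: job page URL = career-site URL + externalPath
    job_page_url = site_url + posting['externalPath']
    job_page_response = client.get(
        job_page_url, params={"js_render": "true", "wait": 3000})
    job_page_soup = BeautifulSoup(job_page_response.content, 'html.parser')
    description_section = job_page_soup.find(
        'div', {'data-automation-id': 'jobPostingDescription'})
    jobs.append({
        "id": str(uuid.uuid4()),
        "title": posting['title'],
        "job_location": posting['locationsText'],
        "description": str(description_section),
    })

This way the JSON endpoint handles the pagination, and BeautifulSoup is only needed for the per-job description pages.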