目前,我正在使用 LinkedIn scraper 来收集有关职位发布的数据。它工作得很好,除了它没有收集工作技能,而且我不知道如何修复它。我相信这个错误是因为技能是在可切换菜单中找到的,但我不知道如何访问它。
打印详细信息时,除了技能之外,所有内容都已存在。当前代码如下:
def __init__(self):
    """Log in to LinkedIn with each configured account and prepare the
    per-session request headers for the Voyager job-posting API.

    NOTE(review): the WebFullJobPosting-65 decoration does not appear to
    include the posting's skills section — skills seem to be served by a
    separate Voyager endpoint; confirm before relying on this payload.
    """
    # Consecutive-failure counter; get_job_details aborts once it exceeds 10.
    self.error_count = 0
    self.job_details_link = "https://www.linkedin.com/voyager/api/jobs/jobPostings/{}?decorationId=com.linkedin.voyager.deco.jobs.web.shared.WebFullJobPosting-65"
    emails, passwords = get_logins('details')
    self.emails = emails
    # One authenticated session per account; requests are rotated across them.
    self.sessions = []
    for account_email, account_password in zip(emails, passwords):
        self.sessions.append(create_session(account_email, account_password))
    self.session_index = 0
    # JSON paths used downstream to extract fields from API responses.
    self.variable_paths = pd.read_csv('json_paths/data_variables.csv')
    # Build one header dict per session: the cookie jar and CSRF token
    # differ for every login, everything else is shared boilerplate.
    self.headers = []
    for sess in self.sessions:
        cookie_string = "; ".join(f"{key}={value}" for key, value in sess.cookies.items())
        self.headers.append({
            'Authority': 'www.linkedin.com',
            'Method': 'GET',
            'Path': '/voyager/api/search/hits?decorationId=com.linkedin.voyager.deco.jserp.WebJobSearchHitWithSalary-25&count=25&filters=List(sortBy-%3EDD,resultType-%3EJOBS)&origin=JOB_SEARCH_PAGE_JOB_FILTER&q=jserpFilters&queryContext=List(primaryHitType-%3EJOBS,spellCorrectionEnabled-%3Etrue)&start=0&topNRequestedFlavors=List(HIDDEN_GEM,IN_NETWORK,SCHOOL_RECRUIT,COMPANY_RECRUIT,SALARY,JOB_SEEKER_QUALIFIED,PRE_SCREENING_QUESTIONS,SKILL_ASSESSMENTS,ACTIVELY_HIRING_COMPANY,TOP_APPLICANT)',
            'Scheme': 'https',
            'Accept': 'application/vnd.linkedin.normalized+json+2.1',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9',
            'Cookie': cookie_string,
            # CSRF token must match the JSESSIONID cookie (quotes stripped).
            'Csrf-Token': sess.cookies.get('JSESSIONID').strip('"'),
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
            'X-Li-Track': '{"clientVersion":"1.13.5589","mpVersion":"1.13.5589","osName":"web","timezoneOffset":-7,"timezone":"Spain/Madrid","deviceFormFactor":"DESKTOP","mpName":"voyager-web","displayDensity":1,"displayWidth":360,"displayHeight":800}'
        })
def get_job_details(self, job_ids):
    """Fetch the full job-posting payload for each job id, rotating
    through the authenticated sessions.

    Args:
        job_ids: iterable of LinkedIn job-posting ids.

    Returns:
        dict mapping each job_id to its parsed JSON payload, or -1 when
        the request timed out or returned a non-200 status.

    Raises:
        Exception: after more than 10 consecutive failed requests.
    """
    job_details = {}
    for job_id in job_ids:
        error = False
        # BUGFIX: `details` was previously unbound after a timeout, so the
        # status-code check below crashed with UnboundLocalError. Keep it
        # None on timeout and guard the check.
        details = None
        try:
            details = self.sessions[self.session_index].get(
                self.job_details_link.format(job_id),
                headers=self.headers[self.session_index],
            )
        except requests.exceptions.Timeout:
            print('Timeout for job {}'.format(job_id))
            job_details[job_id] = -1
            error = True
        if details is not None and details.status_code != 200:
            job_details[job_id] = -1
            print('Status code {} for job {} with account {}\nText: {}'.format(details.status_code, job_id, self.emails[self.session_index], details.text))
            error = True
        if error:
            self.error_count += 1
            if self.error_count > 10:
                raise Exception('Too many errors')
        else:
            # Success: reset the consecutive-failure counter.
            self.error_count = 0
            job_details[job_id] = details.json()
            print('Job {} done'.format(job_id))
        # Rotate to the next account and pause briefly to avoid rate limits.
        self.session_index = (self.session_index + 1) % len(self.sessions)
        time.sleep(.3)
    return job_details
这是我的第一个问题,如果写得不正确,很抱歉。谢谢你。
你找到问题的解决办法了吗?