Proxycurl API does not return data correctly

Problem description

First of all, sorry for the long message, but I have a problem that is blocking me from moving forward with my project. Let me quickly explain the workflow: the user enters a search query -> the query is run as a LinkedIn search -> the users' profile URLs are scraped (with a function run over the number of result pages) -> those users are looked up with Proxycurl (https://nubela.co/proxycurl/docs#people-api-person-lookup-endpoint) -> their information is scraped with a function -> stored in my database -> the experiences of the scraped users are read -> the Proxycurl API is queried again, this time for the companies -> the company information is fetched and stored in the database -> the company's employees are searched (https://nubela.co/proxycurl/docs#company-api-employee-search-api-endpoint) -> the CTO's URL is grabbed -> the CTO is looked up in the Contact API (personal email lookup endpoint) to scrape their contact details -> everything is stored in the database.

So I manage to grab the URLs and look the users up in the API, but I never manage to retrieve the "extra" information with my code, while I can get it for the same profiles in Postman; the same goes for personal_email, personal_contact_number and github_profile_id. I then manage to get the company data, but it is the same problem again: even though I include them in my code, I cannot retrieve the "extra" information, the "funding_data" or the "acquisitions".
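For reference, the person lookup step looks roughly like the snippet below when isolated from the rest of the script. The API key and profile URL are placeholders, and the response field names checked at the end are assumptions based on the Proxycurl docs, so adjust them to whatever your Postman response actually shows:

```python
import requests

API_KEY = "YOUR_PROXYCURL_API_KEY"                          # placeholder
PROFILE_URL = "https://www.linkedin.com/in/some-profile/"   # placeholder

# Person Profile Endpoint, same endpoint and optional parameters as in the full script below
response = requests.get(
    "https://nubela.co/proxycurl/api/v2/linkedin",
    headers={"Authorization": "Bearer " + API_KEY},
    params={
        "url": PROFILE_URL,
        "use_cache": "if-present",
        "fallback_to_cache": "on-error",
        "extra": "include",
        "personal_email": "include",
        "personal_contact_number": "include",
        "github_profile_id": "include",
    },
    timeout=30,
)
response.raise_for_status()
profile = response.json()

# These enrichment fields are optional in the response, so check for them explicitly
for field in ("extra", "personal_emails", "personal_numbers", "github_profile_id"):
    print(field, "->", profile.get(field))
```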

I really don't know what is wrong with my code (I assume something is, since everything works in Postman). Could I get some help here? Thanks for your time! (Full code below.)

```python
from telnetlib import EC
import requests
from datetime import datetime
import json
import re
from cgitb import text
import selenium
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup, NavigableString, Tag
from time import sleep
from time import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import csv
import firebase_admin
from firebase_admin import credentials
from firebase_admin import db
import openpyxl

cred = credentials.Certificate(r"C:\Users\radia\Downloads\st-londres-2-firebase-adminsdk-7eowq-786e799875.json")
firebase_admin.initialize_app(cred, {
    'databaseURL': 'https://st-londres-2-default-rtdb.firebaseio.com/'
})

print('- Importation des packages')

# Task 1: webdriver configuration
driver = webdriver.Chrome(ChromeDriverManager().install())

# Task 1.1: Open Chrome and access LinkedIn
sleep(2)
url = 'https://www.linkedin.com/login'
driver.get(url)
print('Initialisation du chrome driver')
sleep(2)

# Task 1.2: Import username and password
credential = open(r"C:\Users\radia\OneDrive\Bureau\credentials.txt")
line = credential.readlines()
username = line[0]
password = line[1]
print('Importation des id')
sleep(2)

# Task 1.2: Key in login credentials
email_field = driver.find_element(By.ID, 'username')
email_field.send_keys(username)
print('Email ok')
sleep(3)

password_field = driver.find_element(By.NAME, 'session_password')
password_field.send_keys(password)
print('Mdp ok')
sleep(2)

# Task 1.2: Click the Login button
signin_field = driver.find_element(By.XPATH, '//*[@id="organic-div"]/form/div[3]/button')
signin_field.click()
sleep(3)
print('- Task A: Connexion à Linkedin')

# Task B: run the search query typed by the user
search_field = driver.find_element(By.XPATH, '//*[@id="global-nav-typeahead"]/input')
search_query = input('Type of profile to scrape ')
search_field.send_keys(search_query)
search_field.send_keys(Keys.RETURN)
print('TASK B OK')
sleep(10)

try:
    driver.find_element(By.XPATH, "//*[@id='search-reusables__filters-bar']/ul/li[2]/button").click()
except selenium.common.exceptions.NoSuchElementException:
    print("Element not found")


def GetURL():
    # Function to grab the LinkedIn profile URLs on the current results page
    page_source = BeautifulSoup(driver.page_source, features='lxml')
    a_elements = page_source.find_all('a', {'class': "app-aware-link"})
    all_urls = []
    for element in a_elements:
        url = element.get('href')
        all_urls.append(url)
    return all_urls


## Pagination
sleep(2)
input_page = int(input('Nombre de pages à scraper: '))
URLs_all_page = []
for page in range(input_page):
    URLs_one_page = GetURL()
    sleep(2)
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')  # scroll to the end of the page
    sleep(3)
    next_button = driver.find_element(By.XPATH, '//button[contains(@class, "artdeco-pagination__button--next") and .//li-icon]')
    driver.execute_script("arguments[0].click();", next_button)
    sleep(2)
    if URLs_one_page is not None:
        URLs_all_page = URLs_all_page + URLs_one_page
        print(URLs_all_page)
    else:
        print('variable stores a None value')
    sleep(2)

print(URLs_all_page)
sleep(1)


def get_profile_info(url):
    # Person Profile Endpoint call for a single LinkedIn profile URL
    api_endpoint = 'https://nubela.co/proxycurl/api/v2/linkedin'
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    headers = {'Authorization': 'Bearer ' + api_key}
    params = {
        'url': url,
        'fallback_to_cache': 'on-error',
        'use_cache': 'if-present',
        'skills': 'include',
        'inferred_salary': 'include',
        'personal_email': 'include',
        'personal_contact_number': 'include',
        'twitter_profile_id': 'include',
        'facebook_profile_id': 'include',
        'github_profile_id': 'include',
        'extra': 'include',
    }
    try:
        response = requests.get(api_endpoint, headers=headers, params=params)
        if response.status_code != 404:
            data_profile = response.json()
            return data_profile
        else:
            return None
    except requests.exceptions.RequestException as e:
        print(e)
        return None


def get_company_info(url):
    # Company Profile Endpoint call for a single company URL
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    headers = {'Authorization': 'Bearer ' + api_key}
    api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company'
    params = {
        'resolve_numeric_id': 'true',
        'categories': 'include',
        'funding_data': 'include',
        'extra': 'include',
        'exit_data': 'include',
        'acquisitions': 'include',
        'url': 'include',
        'use_cache': 'if-present',
    }
    try:
        # NOTE: only {'url': url} is sent here; the params dict built above
        # (funding_data, acquisitions, exit_data, extra, ...) is never passed to the request
        response = requests.get(api_endpoint, params={'url': url}, headers=headers)
        if response.status_code == 404:
            print("Company not found for URL:", url)
            return None
        else:
            data_company = response.json()
            print(data_company)
            if 'extra' in data_company:
                print("Extra information found:", data_company['extra'])
            else:
                print("No extra information found in JSON response.")
            return data_company
    except requests.exceptions.RequestException as e:
        print(e)
        return None


def get_company_employee_url(company_linkedin_profile_url):
    # Employee Search Endpoint: look for a CTO among the company's employees
    api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company/employee/search/'
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    header_dic = {'Authorization': 'Bearer ' + api_key}
    params = {
        'page_size': '10',
        'linkedin_company_profile_url': company_linkedin_profile_url,
        'keyword_regex': '[Cc][Tt][Oo]',
        'enrich_profiles': 'enrich',
        'resolve_numeric_id': 'false',
    }
    response = requests.get(api_endpoint, params=params, headers=header_dic)
    print(response.status_code)
    print(response.text)
    if response.status_code == 404:
        print("No employees found for URL:", url)
        return None
    else:
        data_employees = response.json()
        # NOTE: this checks an 'employees' key but prints and iterates 'employee_search_results'
        if 'employees' in data_employees:
            print("Employees found:", data_employees['employee_search_results'])
        else:
            print("No employees found in JSON response.")
        # Return and store profile_url from data_employees
        # NOTE: the profile URLs are only printed; the function never returns them
        for employee in data_employees['employee_search_results']:
            profile_url = employee['profile_url']
            print(profile_url)


def get_company_employee_info(profile_url):
    # Personal Contact Number Endpoint
    # NOTE: the profile_url argument is ignored (a hard-coded test profile is queried)
    # and the response is never returned
    api_endpoint = 'https://nubela.co/proxycurl/api/contact-api/personal-contact'
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    header_dic = {'Authorization': 'Bearer ' + api_key}
    params = {
        'linkedin_profile_url': 'https://linkedin.com/in/test-phone-number',
    }
    response = requests.get(api_endpoint, params=params, headers=header_dic)


# Initialize visited URLs + data_list
visited_urls = []
for url in URLs_all_page:
    if url in visited_urls:
        print("Profile already exists in the database for URL:", url)
        continue
    data = get_profile_info(url)
    if data and "error" in data:
        print(data["error"])
    if not data or "experiences" not in data:
        continue
    data["search_query"] = search_query  # Add the search_query to the data
    db.reference('profiles').push(data)  # Store data in the candidates table
    visited_urls.append(url)
    print("Profile data and search query successfully added to the candidates table for URL:", url)

    for item in data['experiences']:
        company_name = str(item['company'])
        company_name_push = re.sub(r'[^a-zA-Z0-9]', '', company_name)  # Strip characters that are illegal in db keys
        company_linkedin_profile_url = item['company_linkedin_profile_url']
        company_description = item['description']
        company_data = get_company_info(company_linkedin_profile_url)
        if company_name_push:
            filtered_company = db.reference('companies/' + company_name_push).get()
        else:
            continue
        if filtered_company is None:
            db.reference('companies').push({
                'company_name': company_name_push,
                'company_linkedin_profile_url': company_linkedin_profile_url,
                'company_description': company_description,
                'company_data': company_data
            })
            print("Company data successfully added for URL:", company_linkedin_profile_url)
        else:
            print("Company already exists in the database for URL:", company_linkedin_profile_url)

        experiences = {
            'candidate_name': data['full_name'],
            'title': item['title'],
            'company': item['company'],
            'location': item['location'],
            'start_date': item['starts_at'],
            'end_date': item['ends_at'],
            'description': item['description'],
        }
        db.reference('experiences').push(experiences)

        company_employee_url = get_company_employee_url(company_linkedin_profile_url)
        company_employee_data = get_company_employee_info(company_employee_url)
        if company_employee_data:
            db.reference('company_employees/' + company_name_push).push(company_employee_data)
            print("Company employee data successfully added for company:", company_name)
        else:
            print("No data found for company employees for company:", company_name)
```
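One thing worth checking in `get_company_info` (see the NOTE comment above): the `params` dictionary with `funding_data`, `acquisitions`, `exit_data` and `extra` is built but the request only sends `{'url': url}`, so those include-flags never reach the API, and `'url': 'include'` in that dict was presumably meant to be the company URL itself. A minimal sketch of the same call with the full parameter dict actually forwarded (whether the fields then come back still depends on Proxycurl having the data):

```python
def get_company_info_with_params(url, api_key):
    # Company Profile Endpoint call that forwards all the optional include-parameters
    api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company'
    headers = {'Authorization': 'Bearer ' + api_key}
    params = {
        'url': url,                    # the company profile URL itself, not the string 'include'
        'use_cache': 'if-present',
        'categories': 'include',
        'funding_data': 'include',
        'exit_data': 'include',
        'acquisitions': 'include',
        'extra': 'include',
    }
    try:
        response = requests.get(api_endpoint, headers=headers, params=params, timeout=30)
        if response.status_code != 200:
            print("Company lookup failed:", response.status_code, response.text)
            return None
        return response.json()
    except requests.exceptions.RequestException as e:
        print(e)
        return None
```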

python firebase api rest screen-scraping

1 Answer

The gist of it is that the Proxycurl API returns the extra information on a best-effort basis: if it finds no result for a field, that field is simply not returned.
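In practice that means the calling code should not assume those keys exist in the JSON. A small sketch of reading them defensively (field names taken from the question's script; whether they appear depends on Proxycurl finding the data):

```python
# Sketch: read best-effort fields without assuming they are present in the response
def optional_enrichment(profile):
    return {
        'extra': profile.get('extra'),                          # None when Proxycurl returned nothing
        'github_profile_id': profile.get('github_profile_id'),
    }
```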
