首先,很抱歉这条消息比较长,但我遇到了一个阻碍项目推进的问题。先让我快速解释一下工作流程:用户输入搜索查询 -> 使用此查询在 LinkedIn 中进行搜索 -> 抓取用户的 URL(数量取决于要抓取的页数)-> 在 Proxycurl 中查询这些用户(https://nubela.co/proxycurl/docs#people-api-person-lookup-endpoint)-> 用一个函数抓取他们的信息 -> 将这些信息存储到我的数据库中 -> 获取被抓取用户的工作经历(experiences)信息 -> 再次调用 Proxycurl API 进行查询,但这次是针对公司 -> 获取公司信息并将其存储到数据库中 -> 搜索该公司的员工信息(https://nubela.co/proxycurl/docs#company-api-employee-search-api-endpoint)-> 抓取 CTO 的 URL -> 调用联系人 API 抓取 CTO 的信息(https://nubela.co/proxycurl/docs#contact-api-personal-email-lookup-endpoint)-> 将所有内容存储到数据库中。 好的,目前我已经能够获取 URL 并在 API 中查询用户,但我的代码始终无法获取“额外”(extra)信息,而在 Postman 中查询同一个个人资料时却可以获取;personal_email、personal_contact_number、github_profile_id 也是同样的情况。 之后我成功获取了公司数据,但仍然存在同样的问题:即使我在代码中加入了相应的参数,也无法获取“额外”(extra)信息、“资金数据”(funding_data)或“收购”(acquisitions)信息。
我真的不知道我的代码哪里出了问题(我推测问题出在代码上,因为在 Postman 中一切正常)。希望能在这里得到一些帮助,感谢您的宝贵时间!(完整代码如下)
from telnetlib import EC
import requests
from datetime import datetime
import json
import re
from cgitb import text
import selenium
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup, NavigableString, Tag
from time import sleep
from time import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import csv
import firebase_admin
from firebase_admin import credentials
from firebase_admin import db
import openpyxl
import requests
# --- Firebase setup ---------------------------------------------------------
# Authenticate with a service-account key file and bind the default app to the
# Realtime Database used by the rest of the script.
cred = credentials.Certificate(r"C:\Users\radia\Downloads\st-londres-2-firebase-adminsdk-7eowq-786e799875.json")
firebase_admin.initialize_app(cred, {
    'databaseURL': 'https://st-londres-2-default-rtdb.firebaseio.com/'
})
print('- Importation des packages')

# --- Task 1: webdriver configuration ----------------------------------------
driver = webdriver.Chrome(ChromeDriverManager().install())

# Task 1.1: open Chrome and load the LinkedIn login page.
sleep(2)
url = 'https://www.linkedin.com/login'
driver.get(url)
print('Initialisation du chrome driver')
sleep(2)

# Task 1.2: read the username (first line) and password (second line).
# The file is closed as soon as it has been read. splitlines() drops the line
# terminators: readlines() keeps the trailing "\n", and typing that character
# into the e-mail field with send_keys() acts like pressing Enter, which can
# submit the form before the password has been entered.
with open(r"C:\Users\radia\OneDrive\Bureau\credentials.txt") as credential:
    line = credential.read().splitlines()
username = line[0]
password = line[1]
print('Importation des id')
sleep(2)

# Task 1.3: key in the login credentials.
email_field = driver.find_element(By.ID, 'username')
email_field.send_keys(username)
print('Email ok')
sleep(3)
password_field = driver.find_element(By.NAME, 'session_password')
password_field.send_keys(password)
print('Mdp ok')
sleep(2)

# Task 1.4: click the login button.
signin_field = driver.find_element(By.XPATH, '//*[@id="organic-div"]/form/div[3]/button')
signin_field.click()
sleep(3)
print('- Task A: Connexion à Linkedin')

# --- Task B: run the user's search query -------------------------------------
search_field = driver.find_element(By.XPATH, '//*[@id="global-nav-typeahead"]/input')
search_query = input('Type of profile to scrape ')
search_field.send_keys(search_query)
search_field.send_keys(Keys.RETURN)
print('TASK B OK')
sleep(10)

# Click the second entry of the search filter bar; if the button is not on the
# page the script carries on with the unfiltered results.
try:
    driver.find_element(By.XPATH, "//*[@id='search-reusables__filters-bar']/ul/li[2]/button").click()
except selenium.common.exceptions.NoSuchElementException:
    print("Element not found")
def GetURL():  # function to grab linkedin urls
    """Collect the link targets shown on the page the driver currently displays.

    Parses ``driver.page_source`` (the module-level Selenium driver) and reads
    the ``href`` of every ``<a class="app-aware-link">`` element.

    Returns:
        list: The hrefs in document order, each one only once. Anchors
        without an ``href`` attribute are skipped.
    """
    page_source = BeautifulSoup(driver.page_source, features='lxml')
    a_elements = page_source.find_all('a', {'class': "app-aware-link"})
    all_urls = []
    seen = set()
    for element in a_elements:
        url = element.get('href')
        # .get() returns None when the attribute is missing; a None or a
        # repeated link would only trigger a useless lookup later on.
        if not url or url in seen:
            continue
        seen.add(url)
        all_urls.append(url)
    return all_urls
## Pagination
# Walk through the requested number of result pages and accumulate every URL
# returned by GetURL() into URLs_all_page.
sleep(2)
input_page = int(input('Nombre de pages à scraper: '))
URLs_all_page = []
for page in range(input_page):
    URLs_one_page = GetURL()
    # Store this page's URLs before navigating, so they are kept even when
    # there is no further page to move to.
    if URLs_one_page is not None:
        URLs_all_page = URLs_all_page + URLs_one_page
        print(URLs_all_page)
    else:
        print('variable stores a None value')
    sleep(2)
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')  # scrolling to the end of the page
    sleep(3)
    try:
        next_button = driver.find_element(By.XPATH, '//button[contains(@class, "artdeco-pagination__button--next") and .//li-icon]')
    except NoSuchElementException:
        # No "next" button on this page: stop paginating instead of crashing
        # and losing everything collected so far.
        print('No next page button found, stopping pagination')
        break
    # Click through JavaScript rather than WebElement.click().
    driver.execute_script("arguments[0].click();", next_button)
    sleep(2)
sleep(2)
print(URLs_all_page)
sleep(1)
def get_profile_info(url):  # function to make api calls for users
    """Fetch a person profile from the Proxycurl person-profile endpoint.

    Args:
        url: LinkedIn profile URL to look up.

    Returns:
        The decoded JSON body for any response other than HTTP 404 (this may
        be an error payload; the caller checks for an ``"error"`` key), or
        ``None`` when ``url`` is empty, the profile is not found (404), the
        request fails, or the body is not valid JSON.
    """
    if not url:
        # Nothing to look up; avoid a pointless API call.
        return None
    api_endpoint = 'https://nubela.co/proxycurl/api/v2/linkedin'
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration instead.
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    headers = {'Authorization': 'Bearer ' + api_key}
    params = {
        'url': url,
        'fallback_to_cache': 'on-error',
        'use_cache': 'if-present',
        'skills': 'include',
        'inferred_salary': 'include',
        'personal_email': 'include',
        'personal_contact_number': 'include',
        'twitter_profile_id': 'include',
        'facebook_profile_id': 'include',
        'github_profile_id': 'include',
        'extra': 'include',
    }
    try:
        # Without a timeout a stalled connection would block the script
        # forever. NOTE(review): 60 s is an assumed upper bound - tune it.
        response = requests.get(api_endpoint, headers=headers, params=params, timeout=60)
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    if response.status_code == 404:
        return None
    try:
        return response.json()
    except ValueError as e:
        # The body was not JSON (e.g. an HTML error page).
        print(e)
        return None
def get_company_info(url):  # function to make api calls for companies
    """Fetch a company profile from the Proxycurl company endpoint.

    The optional sections (categories, funding data, extra, exit data,
    acquisitions) are requested explicitly through query parameters.

    Args:
        url: LinkedIn company profile URL to look up.

    Returns:
        The decoded JSON body for any response other than HTTP 404, or
        ``None`` when ``url`` is empty, the company is not found (404), the
        request fails, or the body is not valid JSON.
    """
    if not url:
        print("Company not found for URL:", url)
        return None
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration instead.
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    headers = {'Authorization': 'Bearer ' + api_key}
    api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company'
    params = {
        # The company URL goes in the same dict as the "include" flags so
        # that the whole set is actually sent with the request.
        'url': url,
        'resolve_numeric_id': 'true',
        'categories': 'include',
        'funding_data': 'include',
        'extra': 'include',
        'exit_data': 'include',
        'acquisitions': 'include',
        'use_cache': 'if-present',
    }
    try:
        # NOTE(review): 60 s is an assumed upper bound - tune it.
        response = requests.get(api_endpoint, params=params, headers=headers, timeout=60)
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    if response.status_code == 404:
        print("Company not found for URL:", url)
        return None
    try:
        data_company = response.json()
    except ValueError as e:
        # The body was not JSON (e.g. an HTML error page).
        print(e)
        return None
    print(data_company)
    if 'extra' in data_company:
        print("Extra information found:", data_company['extra'])
    else:
        print("No extra information found in JSON response.")
    return data_company
def get_company_employee_url(company_linkedin_profile_url):
    """Look up a company's CTO through the Proxycurl employee-search endpoint.

    Searches the employees of the given company with the keyword regex
    ``[Cc][Tt][Oo]``.

    Args:
        company_linkedin_profile_url: LinkedIn URL of the company to search.

    Returns:
        The ``profile_url`` of the first matching employee, or ``None`` when
        the company URL is empty, the request fails, the API answers 404, or
        the response lists no employee with a profile URL.
    """
    if not company_linkedin_profile_url:
        return None
    api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company/employee/search/'
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration instead.
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    header_dic = {'Authorization': 'Bearer ' + api_key}
    params = {
        'page_size': '10',
        'linkedin_company_profile_url': company_linkedin_profile_url,
        'keyword_regex': '[Cc][Tt][Oo]',
        'enrich_profiles': 'enrich',
        'resolve_numeric_id': 'false',
    }
    try:
        # NOTE(review): 60 s is an assumed upper bound - tune it.
        response = requests.get(api_endpoint,
                                params=params,
                                headers=header_dic,
                                timeout=60)
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    print(response.status_code)
    print(response.text)
    if response.status_code == 404:
        print("No employees found for URL:", company_linkedin_profile_url)
        return None
    try:
        data_employees = response.json()
    except ValueError as e:
        # The body was not JSON (e.g. an HTML error page).
        print(e)
        return None
    employees = []
    if isinstance(data_employees, dict):
        # NOTE(review): the result list is expected under 'employees'; the
        # key 'employee_search_results' used previously is kept as a
        # fallback - confirm against the API documentation.
        employees = (data_employees.get('employees')
                     or data_employees.get('employee_search_results')
                     or [])
    if not employees:
        print("No employees found in JSON response.")
        return None
    print("Employees found:", employees)
    # Return the first usable profile URL so the caller can pass it on.
    for employee in employees:
        profile_url = employee.get('profile_url')
        if profile_url:
            print(profile_url)
            return profile_url
    return None
def get_company_employee_info(profile_url):
    """Fetch contact details for a profile from the Proxycurl personal-contact endpoint.

    Args:
        profile_url: LinkedIn profile URL of the person to look up.

    Returns:
        The decoded JSON body on HTTP 200, or ``None`` when ``profile_url``
        is empty, the request fails, the API answers with another status, or
        the body is not valid JSON.
    """
    if not profile_url:
        # The employee search found nobody, so there is nothing to look up.
        return None
    api_endpoint = 'https://nubela.co/proxycurl/api/contact-api/personal-contact'
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration instead.
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    header_dic = {'Authorization': 'Bearer ' + api_key}
    params = {
        # Query the profile that was passed in, not a fixed sample profile.
        'linkedin_profile_url': profile_url,
    }
    try:
        # NOTE(review): 60 s is an assumed upper bound - tune it.
        response = requests.get(api_endpoint,
                                params=params,
                                headers=header_dic,
                                timeout=60)
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    if response.status_code != 200:
        # The caller stores whatever is returned, so error payloads are not
        # handed back.
        print("Contact lookup failed with status", response.status_code, "for URL:", profile_url)
        return None
    try:
        return response.json()
    except ValueError as e:
        # The body was not JSON (e.g. an HTML error page).
        print(e)
        return None
# Main pipeline: for every scraped URL, fetch the profile, store it, then for
# each listed experience store the company, the experience itself and the
# contact details of the company's CTO (when one is found).
# Initialize visited URLs + data_list
visited_urls = []
for url in URLs_all_page:
    if url in visited_urls:
        print("Profile already exists in the database for URL:", url)
        continue
    data = get_profile_info(url)
    if data and "error" in data:
        print(data["error"])
    if not data or "experiences" not in data:
        continue
    data["search_query"] = search_query  # Add the search_query to the data
    db.reference('profiles').push(data)  # Store data in the candidates table
    visited_urls.append(url)
    print("Profile data and search query successfully added to the candidates table for URL:", url)
    # 'experiences' may be present but null; treat that as no experience.
    for item in data['experiences'] or []:
        company_name = str(item['company'])
        # Keep only characters that are legal in a database key.
        company_name_push = re.sub(r'[^a-zA-Z0-9]', '', company_name)
        if not company_name_push:
            # No usable key for this company; skip before spending API calls.
            continue
        company_linkedin_profile_url = item['company_linkedin_profile_url']
        company_description = item['description']
        company_ref = db.reference('companies/' + company_name_push)
        filtered_company = company_ref.get()
        if filtered_company is None:
            # Only query the company API for companies not stored yet.
            company_data = get_company_info(company_linkedin_profile_url)
            # Write under the same key the existence check reads. push()
            # stores under an auto-generated key, so the check above would
            # never find the record and every run would add a duplicate.
            company_ref.set({
                'company_name': company_name_push,
                'company_linkedin_profile_url': company_linkedin_profile_url,
                'company_description': company_description,
                'company_data': company_data
            })
            print("Company data successfully added for URL:", company_linkedin_profile_url)
        else:
            print("Company already exists in the database for URL:", company_linkedin_profile_url)
        experiences = {
            'candidate_name': data['full_name'],
            'title': item['title'],
            'company': item['company'],
            'location': item['location'],
            'start_date': item['starts_at'],
            'end_date': item['ends_at'],
            'description': item['description'],
        }
        db.reference('experiences').push(experiences)
        company_employee_data = None
        if company_linkedin_profile_url:
            # The employee search needs a company URL; without one there is
            # nothing to query.
            company_employee_url = get_company_employee_url(company_linkedin_profile_url)
            company_employee_data = get_company_employee_info(company_employee_url)
        if company_employee_data:
            db.reference('company_employees/' + company_name_push).push(company_employee_data)
            print("Company employee data successfully added for company:", company_name)
        else:
            print("No data found for company employees for company:", company_name)
要点是:Proxycurl API 只会以“尽力而为”(best-effort)的方式返回额外信息。如果查不到结果,相应的字段就不会被返回。