Proxycurl api 没有正确返回数据

首先，很抱歉这条消息比较长，但我遇到了一个阻碍项目推进的问题。先简单说明一下工作流程：用户输入搜索查询 -> 使用该查询在 LinkedIn 中搜索 -> 抓取用户的 URL（按指定的页数翻页）-> 在 Proxycurl 中查询这些用户（https://nubela.co/proxycurl/docs#people-api-person-lookup-endpoint）-> 通过函数抓取他们的信息 -> 将其存入我的数据库 -> 获取被抓取用户的工作经历信息 -> 再次调用 Proxycurl API，但这次查询的是公司 -> 获取公司信息并存入数据库 -> 搜索该公司员工的信息（https://nubela.co/proxycurl/docs#company-api-employee-search-api-endpoint）-> 抓取 CTO 的 URL -> 在 Contact API 中查询以获取 CTO 的联系信息（https://nubela.co/proxycurl/docs#contact-api-personal-email-lookup-endpoint）-> 将所有内容存入数据库。目前我已经能够获取 URL 并在 API 中查询用户，但我的代码始终拿不到“extra”（额外）信息，而同一个个人资料在 Postman 中却可以获取到；personal_email、personal_contact_number、github_profile_id 也是同样的情况。之后我成功获取了公司数据，但问题依旧：即使我在代码中请求了这些字段，也无法取回“extra”、“funding_data”（融资数据）或“acquisitions”（收购）信息。

我真的不知道我的代码哪里出了问题（我猜是代码的问题，因为在 Postman 中一切正常）。希望能在这里得到一些帮助，感谢您抽出宝贵时间！（完整代码见下）

from telnetlib import EC
import requests
from datetime import datetime
import json
import re 
from cgitb import text
import selenium
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup, NavigableString, Tag 
from time import sleep
from time import time 
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import csv
import firebase_admin
from firebase_admin import credentials
from firebase_admin import db
import openpyxl
import requests


# Load the service-account key and initialise the Firebase app with the
# Realtime Database URL used by the rest of the script.
service_account_key = r"C:\Users\radia\Downloads\st-londres-2-firebase-adminsdk-7eowq-786e799875.json"
cred = credentials.Certificate(service_account_key)
firebase_admin.initialize_app(
    cred,
    {'databaseURL': 'https://st-londres-2-default-rtdb.firebaseio.com/'},
)


print('- Importation des packages')
# Task 1: webdriver configuration
# NOTE(review): recent Selenium 4 releases may expect a Service object rather
# than a positional driver path -- confirm against the installed version.
driver = webdriver.Chrome(ChromeDriverManager().install())
# Task 1.1: Open Chrome and Access Linkedin
sleep(2)
url = 'https://www.linkedin.com/login'
driver.get(url)
print('Initialisation du chrome driver')
sleep(2)

# Task 1.2: Import username and password
# The file is read as: line 1 = username, line 2 = password.
# Line terminators are removed because send_keys() types a trailing newline
# as Enter, which can submit the login form before the password is entered.
# The context manager closes the file once the lines are read.
with open(r"C:\Users\radia\OneDrive\Bureau\credentials.txt") as credential:
    line = [entry.rstrip('\r\n') for entry in credential]
username = line[0]
password = line[1]
print('Importation des id')
sleep(2)

# Task 1.2: Key in login credentials
email_field = driver.find_element(By.ID, 'username')
email_field.send_keys(username)
print('Email ok')
sleep(3)

password_field = driver.find_element(By.NAME, 'session_password')
password_field.send_keys(password)
print('Mdp ok')
sleep(2)

# Task 1.2: Click the Login button
signin_field = driver.find_element(By.XPATH, '//*[@id="organic-div"]/form/div[3]/button')
signin_field.click()
sleep(3)

print('- Task A: Connexion à Linkedin')

# Task B: type the user's query into the global search box and submit it.
search_field = driver.find_element(By.XPATH, '//*[@id="global-nav-typeahead"]/input')

search_query = input('Type of profile to scrape ')

search_field.send_keys(search_query)

search_field.send_keys(Keys.RETURN)

print('TASK B OK')
sleep(10)
# Click the second button of the search filter bar; a missing button is
# reported and the script continues with the unfiltered results.
try:
    driver.find_element(By.XPATH, "//*[@id='search-reusables__filters-bar']/ul/li[2]/button").click()

except selenium.common.exceptions.NoSuchElementException:
    print("Element not found")


def GetURL(): #function to grab linkedin urls
    """Return the ``href`` of every ``a.app-aware-link`` anchor on the current page.

    The page currently loaded in the global ``driver`` is parsed with
    BeautifulSoup (lxml parser). The result keeps document order, one entry
    per matching anchor.
    """
    soup = BeautifulSoup(driver.page_source, features='lxml')
    anchors = soup.find_all('a', {'class': "app-aware-link"})
    return [anchor.get('href') for anchor in anchors]

##Pagination
# Walk through the requested number of result pages, collecting the links of
# each page before moving on to the next one.
sleep(2)
input_page = int(input('Nombre de pages à scraper: '))
URLs_all_page = []
for page in range(input_page):
    # GetURL() always returns a list, so it can be appended unconditionally.
    URLs_one_page = GetURL()
    URLs_all_page.extend(URLs_one_page)
    print(URLs_all_page)
    sleep(2)
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);') #scrolling to the end of the page
    sleep(3)
    try:
        next_button = driver.find_element(By.XPATH, '//button[contains(@class, "artdeco-pagination__button--next") and .//li-icon]')
    except NoSuchElementException:
        # No "next" button on this page: stop here and keep the URLs already
        # collected instead of crashing and losing them.
        print('No next page button found, stopping pagination')
        break
    # Click through JavaScript rather than through a native WebDriver click.
    driver.execute_script("arguments[0].click();", next_button)
    sleep(2)
sleep(1)
    
def get_profile_info(url): # function to make api calls for users
    """Query the Proxycurl person endpoint for one LinkedIn profile URL.

    Args:
        url: LinkedIn profile URL to look up.

    Returns:
        The decoded JSON body for any non-404 response, or ``None`` when the
        API answers 404 or the request raises a ``RequestException``.
    """
    token = 'SDrD73S2fXlvCMdFDExEaw'
    endpoint = 'https://nubela.co/proxycurl/api/v2/linkedin'
    auth = {'Authorization': 'Bearer ' + token}

    # Lookup target and cache policy first, then every optional section that
    # is requested with the same 'include' value.
    query = {
        'url': url,
        'fallback_to_cache': 'on-error',
        'use_cache': 'if-present',
    }
    for section in (
        'skills',
        'inferred_salary',
        'personal_email',
        'personal_contact_number',
        'twitter_profile_id',
        'facebook_profile_id',
        'github_profile_id',
        'extra',
    ):
        query[section] = 'include'

    try:
        reply = requests.get(endpoint, headers=auth, params=query)
        if reply.status_code == 404:
            return None
        return reply.json()
    except requests.exceptions.RequestException as err:
        print (err)
        return None

def get_company_info(url): #function to make api calls for companies
    """Fetch a company profile from the Proxycurl company endpoint.

    Args:
        url: LinkedIn URL of the company to look up.

    Returns:
        The decoded JSON response, or ``None`` when ``url`` is empty, the API
        answers 404, or the request raises a ``RequestException``.
    """
    if not url:
        # Callers may pass an empty/None URL (presumably for experiences
        # without a company page -- TODO confirm); nothing to look up then.
        print("No company URL provided, skipping company lookup")
        return None

    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from the environment instead.
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    headers = {'Authorization': 'Bearer ' + api_key}
    api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company'
    # The optional sections are only requested if they are actually sent in
    # the query string together with the company URL. Previously only the URL
    # was sent, so 'extra', 'funding_data' and 'acquisitions' were never asked for.
    params = {
        'url': url,
        'resolve_numeric_id': 'true',
        'categories': 'include',
        'funding_data': 'include',
        'extra': 'include',
        'exit_data': 'include',
        'acquisitions': 'include',
        'use_cache': 'if-present',
    }
    try:
        response = requests.get(api_endpoint, params=params, headers=headers)
        if response.status_code == 404:
            print("Company not found for URL:", url)
            return None

        data_company = response.json()
        print(data_company)
        if 'extra' in data_company:
            print("Extra information found:", data_company['extra'])
        else:
            print("No extra information found in JSON response.")
        return data_company

    except requests.exceptions.RequestException as e:
        print (e)
        return None

def get_company_employee_url(company_linkedin_profile_url):
    """Search a company's employees for a CTO and return that profile URL.

    Args:
        company_linkedin_profile_url: LinkedIn URL of the company to search.

    Returns:
        The ``profile_url`` of the first matching employee, or ``None`` when
        no company URL is given, the API answers 404, the request fails, or
        the response lists no employee.
    """
    if not company_linkedin_profile_url:
        print("No company URL provided, skipping employee search")
        return None

    api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company/employee/search/'
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    header_dic = {'Authorization': 'Bearer ' + api_key}
    params = {
        'page_size': '10',
        'linkedin_company_profile_url': company_linkedin_profile_url,
        'keyword_regex': '[Cc][Tt][Oo]',
        'enrich_profiles': 'enrich',
        'resolve_numeric_id': 'false',
    }
    try:
        response = requests.get(api_endpoint,
                                params=params,
                                headers=header_dic)
        print(response.status_code)
        print(response.text)
        if response.status_code == 404:
            # Report the company URL that was searched, not an unrelated global.
            print("No employees found for URL:", company_linkedin_profile_url)
            return None
        data_employees = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # Same policy as the other API helpers: report and return None.
        print (e)
        return None

    # NOTE(review): the documented response key appears to be 'employees';
    # the earlier code read 'employee_search_results'. Both are accepted
    # here -- confirm against the API reference.
    employees = []
    if isinstance(data_employees, dict):
        employees = (data_employees.get('employees')
                     or data_employees.get('employee_search_results')
                     or [])
    if not employees:
        print("No employees found in JSON response.")
        return None

    print("Employees found:", employees)
    profile_urls = [employee['profile_url']
                    for employee in employees
                    if employee.get('profile_url')]
    for profile_url in profile_urls:
        print(profile_url)
    # The caller feeds the result to a single-profile lookup, so hand back
    # the first match.
    return profile_urls[0] if profile_urls else None
    
def get_company_employee_info(profile_url):
    """Look up personal contact data for one LinkedIn profile.

    Args:
        profile_url: LinkedIn profile URL of the person to look up.

    Returns:
        The decoded JSON response on HTTP 200, otherwise ``None`` (no URL
        given, non-200 status, request failure, or an undecodable body).
    """
    if not profile_url:
        # Nothing to look up, e.g. when no matching employee was found.
        return None

    api_endpoint = 'https://nubela.co/proxycurl/api/contact-api/personal-contact'
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    header_dic = {'Authorization': 'Bearer ' + api_key}
    params = {
        # Query the profile that was passed in rather than a fixed test URL.
        'linkedin_profile_url': profile_url,
    }
    try:
        response = requests.get(api_endpoint,
                                params=params,
                                headers=header_dic)
        if response.status_code != 200:
            # The caller stores any truthy result, so error payloads must not
            # be returned as data.
            print("Contact lookup failed for URL:", profile_url, response.status_code)
            return None
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print (e)
        return None
# Main ingestion loop: for every scraped URL, fetch the profile, store it,
# then walk its experiences to store company, experience and employee data.

# URLs whose profile was fetched and stored during this run.
visited_urls = []

for url in URLs_all_page:
    # Skip URLs already processed in this run (the scraped list may repeat links).
    if url in visited_urls:
        print("Profile already exists in the database for URL:", url)
        continue
    data = get_profile_info(url)
    # Report an API-level error carried in the response body, if any.
    if data and "error" in data:
        print(data["error"])
    # Nothing usable without an 'experiences' entry: move on to the next URL.
    if not data or "experiences" not in data:
        continue
    data["search_query"] = search_query  # Tag the profile with the query that found it
    db.reference('profiles').push(data)  # Store the profile under the 'profiles' node
    
    visited_urls.append(url)
    print("Profile data and search query successfully added to the candidates table for URL:", url)

    for item in data['experiences']:
        company_name = str(item['company'])
        # Keep only ASCII letters and digits so the name can be used inside a database path.
        company_name_push = re.sub(r'[^a-zA-Z0-9]', '', company_name) # Error handling when pushing code to db, replacement of illegal values 
        company_linkedin_profile_url = item['company_linkedin_profile_url']
        company_description = item['description']
        # NOTE(review): the company lookup runs before the name/existence
        # checks below, so it is performed even for entries that are skipped.
        company_data = get_company_info(company_linkedin_profile_url)
        if company_name_push:
            # NOTE(review): this reads 'companies/<name>' while the insert
            # below pushes under 'companies'; if push() stores records under a
            # generated key, this lookup will not see them -- confirm.
            filtered_company = db.reference('companies/'+ company_name_push).get()
        else:
            # The sanitised name is empty: skip this experience entirely.
            continue

        if filtered_company is None:
            db.reference('companies').push({
                'company_name': company_name_push,
                'company_linkedin_profile_url': company_linkedin_profile_url, 
                'company_description': company_description,
                'company_data': company_data
            })
            print("Company data successfully added for URL:", company_linkedin_profile_url)
        else:
            print("Company already exists in the database for URL:", company_linkedin_profile_url)

        # Flattened experience record linking the candidate to this position.
        experiences = {
            'candidate_name': data['full_name'], 
            'title': item['title'], 
            'company': item['company'], 
            'location': item['location'],
            'start_date': item['starts_at'],
            'end_date': item['ends_at'],
            'description': item['description'],
        }
        db.reference('experiences').push(experiences) 
        
        # Employee search for the company, then a contact lookup on its result.
        company_employee_url = get_company_employee_url(company_linkedin_profile_url)
        company_employee_data = get_company_employee_info(company_employee_url)

        if company_employee_data:
            db.reference('company_employees/' + company_name_push).push(company_employee_data)
            print("Company employee data successfully added for company:", company_name)
        else:
            print("No data found for company employees for company:", company_name)

要点是：Proxycurl API 只会尽力（best effort）返回这些额外信息；如果查不到结果，这些字段就不会被返回。

问题描述投票：0回答：1

1个回答

最新问题

Proxycurl api 没有正确返回数据

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1