Scraping data from a list of URLs in REST API format


For some context, I have a list of gene names that I have converted into UniProt REST API query URLs. I have written Python code that requests each query and extracts the first result, but now I am trying to speed the program up by requesting multiple URLs at once. Is this possible? (I have already tried the concurrent.futures module, but it does not seem to improve the speed much, since, as I understand it, it is not "true multithreading".)

Here is the code:

#Packages
import pandas as pd
import requests as req
import io
from IPython.display import display
import warnings
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
from tqdm import tqdm
import time
import datetime

#Warning silencer
warnings.filterwarnings("ignore", message="Workbook contains no default style")

#Grabbing the raw file

file_path = 'FindAllMarkers.xlsx'

raw_df = pd.read_excel(file_path)

# Display the top 10 rows of raw_df
print(raw_df.head(10))

#Split into a dictionary containing each cluster
print(f"Number of clusters: {raw_df['cluster'].nunique()} (including 0)")

clusterdfs_dict = {}
# Iterate over unique cluster numbers
for cluster_num in raw_df['cluster'].unique():
    # Create a DataFrame for each cluster number
    clusterdfs_dict[cluster_num] = raw_df[raw_df['cluster'] == cluster_num]

#Creating a dataframe with all the unique genes and other relevant info
unique_genes = raw_df['gene'].unique()
gene_info = pd.DataFrame()
total_genes = len(unique_genes)
processed_genes = 0
#Function for retrieving the search query result of a gene
## 9606 is human
skipped_genes = []

#Generate links
def get_UniPQuery_link(gene_name, tax_id='9606', file_format='xlsx', max_retries=3):
    url = f"https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cft_intramem%2Ccc_subcellular_location%2Cft_topo_dom%2Cft_transmem%2Ccc_function%2Clit_doi_id%2Clit_pubmed_id&format={file_format}&size=500&query=%28%28gene%3A{gene_name}%29+AND+%28taxonomy_id%3A{tax_id}%29%29"
    return url
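# The "fields" parameter selects which UniProt columns are returned; the query string is
# the percent-encoded form of ((gene:<gene_name>) AND (taxonomy_id:<tax_id>)), where
# %28/%29 are parentheses and %3A is ':'.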


#Code to generate Uniprot rest links
UP_links = []
for x in unique_genes:
    UP_links.append(get_UniPQuery_link(x))


start_time = time.time()
start_date_time = datetime.datetime.now()

skipped_genes = []
final_dataframe = pd.DataFrame()

def process_link(link):
    response = req.get(link)
    if response.status_code == 200:
        content = response.content
        excel_data = io.BytesIO(content)
        raw_file_df = pd.read_excel(excel_data)
        
        if raw_file_df.empty:
            skipped_genes.append(link)
            print(f"Empty Excel file for link: {link}. Skipping.")
        else:
            first_row = raw_file_df.iloc[[0]]
            return first_row
    else:
        skipped_genes.append(link)
        print(f"Failed to retrieve Excel file for link: {link}. Skipping.")

def process_links_subset(links_subset, progress):
    results = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_link, link): link for link in links_subset}
        for future in concurrent.futures.as_completed(futures):
            first_row = future.result()
            if first_row is not None:
                results.append(first_row)
            progress.update(1)  # Update progress bar
    
    return results

def process_all_links(links):
    results = []
    subset_size = (len(links) + 7) // 8  # Calculate size of each subset
    with tqdm(total=len(links)) as progress:
        for i in range(0, len(links), subset_size):
            subset = links[i:i+subset_size]
            subset_results = process_links_subset(subset, progress)
            results.extend(subset_results)
    
    return results
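# process_all_links walks the links in 8 roughly equal chunks; each chunk gets its own
# ThreadPoolExecutor, while the shared tqdm bar tracks progress across all links.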

# Process all links in 8 subsets concurrently
processed_data = process_all_links(UP_links)

# DataFrame.append was removed in pandas 2.x; concatenate the collected rows instead
if processed_data:
    final_dataframe = pd.concat(processed_data, ignore_index=True)

print("Skipped genes:", skipped_genes)
gene_info = final_dataframe.copy()

gene_info.to_excel('./gene_info.xlsx', index=False, engine='openpyxl')
# Get the end system time
end_date_time = datetime.datetime.now()
end_time = time.time()
elapsed_time = end_time - start_time
# Convert total seconds to hours, minutes, and seconds
hours = int(elapsed_time // 3600)
minutes = int((elapsed_time % 3600) // 60)
seconds = int(elapsed_time % 60)
# Display the start and end system date and time
print(f"Start Date and Time: {start_date_time}")
print(f"End Date and Time: {end_date_time}")
print(f"Execution time: {hours} hours, {minutes} minutes, {seconds} seconds")
print(f"These are the skipped genes: {skipped_genes}")
1 Answer

Here is an example of how to use multiprocessing.Pool to request multiple URLs at the same time, convert each result to a dataframe, and concatenate the dataframes together as a final step:

import warnings
from io import BytesIO
from multiprocessing import Pool

import pandas as pd
import requests

warnings.simplefilter(
    "ignore"
)  # ignore "Workbook contains no default style, apply openpyxl's default"


def get_UniPQuery_link(gene_name, tax_id="9606", file_format="xlsx", max_retries=3):
    url = f"https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cft_intramem%2Ccc_subcellular_location%2Cft_topo_dom%2Cft_transmem%2Ccc_function%2Clit_doi_id%2Clit_pubmed_id&format={file_format}&size=500&query=%28%28gene%3A{gene_name}%29+AND+%28taxonomy_id%3A{tax_id}%29%29"
    return url


def get_result(name):
    url = get_UniPQuery_link(name)

    data = requests.get(url).content
    df = pd.read_excel(BytesIO(data))

    return name, df


if __name__ == "__main__":
    # sample gene names:
    gene_names = ["aaa", "bbb", "ccc", "ddd", "eee", "fff"] * 2

    all_dfs = []
    with Pool(processes=4) as pool:
        for result in pool.imap_unordered(get_result, gene_names, chunksize=4):
            name, df = result
            print(name)
            print(df)
            print()
            all_dfs.append(df)

    final_df = pd.concat(all_dfs)
    print(final_df)
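As a note on the design choice: these requests are network-bound, and CPython releases the GIL while a thread waits on I/O, so a single concurrent.futures.ThreadPoolExecutor can also give a real speed-up without spawning processes. Below is a minimal sketch under that assumption; it reuses the get_UniPQuery_link helper defined above, and the max_workers value and sample gene names are placeholders rather than recommendations:

from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import pandas as pd
import requests


def fetch_first_row(url):
    # Download one query result and keep only its first row; return None on failure.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return None
    df = pd.read_excel(BytesIO(response.content))
    return None if df.empty else df.iloc[[0]]


if __name__ == "__main__":
    gene_names = ["aaa", "bbb", "ccc"]  # placeholder names, as in the sample above
    urls = [get_UniPQuery_link(name) for name in gene_names]  # helper defined above

    # executor.map preserves input order and runs up to max_workers requests at once
    with ThreadPoolExecutor(max_workers=8) as executor:
        rows = list(executor.map(fetch_first_row, urls))

    kept = [r for r in rows if r is not None]
    final_df = pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()
    print(final_df)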