使用 Python BeautifulSoup 和 Selenium 从人类微生物组计划 (HMP) 网站抓取动态表格数据时遇到的问题

Question

我正在使用 Python（BeautifulSoup 和 Selenium）从 HMP 网站的动态表格中抓取“File UUID”列的数据。我能够从该动态表格中提取出其他所有列的数据，唯独我需要的这一列没有出现，原因不明。下面是我正在运行的 Python 代码。请告诉我问题可能出在哪里，或者是否有更好的方法来获取这些数据。

from bs4 import BeautifulSoup
from selenium import webdriver
import time

import numpy as np
import pandas as pd

# Script overview: open the HMP portal query page in Chrome through Selenium,
# parse the rendered HTML with BeautifulSoup, copy the table whose id is
# 'files-table' into a pandas DataFrame, print it, and shut the browser down.

# establishing connection to hmp main website and parsing hmp data table information
url = 'https://portal.hmpdacc.org/query/f?query=file.matrix_type%20in%20%5B%22wgs_community%22,%2216s_community%22%5D%20and%20sample.body_site%20in%20%5B%22feces%22%5D&filters=%7B%22op%22:%22and%22,%22content%22:%5B%7B%22op%22:%22in%22,%22content%22:%7B%22field%22:%22file.matrix_type%22,%22value%22:%5B%22wgs_community%22,%2216s_community%22%5D%7D%7D,%7B%22op%22:%22in%22,%22content%22:%7B%22field%22:%22sample.body_site%22,%22value%22:%5B%22feces%22%5D%7D%7D%5D%7D#:~:text=Samples%20(3%2C452)-,Files%20(5%2C181),-files'
browser = webdriver.Chrome()
browser.get(url)
# Fixed 3-second pause before the page source is read.
# NOTE(review): presumably this gives the page's JavaScript time to build the
# table; a fixed sleep does not guarantee that rendering has finished.
time.sleep(3)
html = browser.page_source
hmp_parsed_page = BeautifulSoup(html, "lxml")
# find() yields None when no <table id="files-table"> is present in the parsed
# HTML; the loops below would then fail on the find_all() calls.
hmp_files_table = hmp_parsed_page.find('table', id='files-table')

# gathering hmp meta datatable column headers
hmp_metadata_fields = []
for th in hmp_files_table.find_all('th'):
    # Text content of each <th> cell becomes one column name.
    col_header = th.text
    hmp_metadata_fields.append(col_header)

# creating dataframe of hmp information scraped
hmp_metadata_df = pd.DataFrame(columns = hmp_metadata_fields)

# appending hmp row data to dataframe
# The first <tr> is skipped -- presumably it is the header row (TODO confirm).
for tr in hmp_files_table.find_all('tr')[1:]:
    row_data = tr.find_all('td')
    row = [data_point.text for data_point in row_data]
    # Adds the row under the next integer label; the number of <td> cells has
    # to match the number of header columns collected above.
    hmp_metadata_df.loc[len(hmp_metadata_df.index)] = row

# dropping unneeded columns
# The two columns are selected by position (the first and second column).
hmp_metadata_df = hmp_metadata_df.drop(hmp_metadata_df.columns[[0,1]], axis = 1)

# adding hmp indicator column to the dataframe
# NOTE: assigning a new column this way places it last, not at the front.
hmp_metadata_df['Data Source'] = 'HMP'
print(hmp_metadata_df)

# closing hmp website connection
# close() closes the current browser window; quit() ends the WebDriver session.
browser.close()
browser.quit()

我已经尝试了各种不同的网页抓取方式，但都没能成功从 HMP 抓取到这张表的数据。我期望的输出是一个包含网站上显示的所有列和行的表格，但由于某种原因，“File UUID”这一列没有显示出来。当我使用浏览器的“检查”（Inspect）功能查看表中的每个元素时，可以看到“File UUID”位于“files-table”之下。

<th title="File UUID" ng-repeat="h in tsc.headings" ng-class="{
              'sortable': h['sortable'],
              'sort-asc': tsc.tableParams.sorting()[h['sortable']]=='asc',
              'sort-desc': tsc.tableParams.sorting()[h['sortable']]=='desc'
            }" ng-click="tsc.sortByCol(h, $event)" ng-if="h.show" class="header ng-scope sortable" role="button" tabindex="0" style=""><div class="ng-table-header " ng-class="{'sort-indicator': tsc.tableParams.defaultSettings.sortingIndicator == 'div'}"><span data-cell="tsc.getHeaderCell(h)" data-data="data" data-paging="paging" ng-class="{'sort-indicator': tsc.tableParams.defaultSettings.sortingIndicator == 'span'}" class="ng-isolate-scope sort-indicator">File UUID</span></div></th>

Answer 1

你可以直接使用他们的 Ajax API 来获取数据（我认为 UUID 就是 `id` 列）：

import requests
import pandas as pd
from bs4 import BeautifulSoup

# Fetch the file listing from the HMP portal's JSON API in fixed-size pages
# and combine the pages into one DataFrame, instead of scraping the rendered
# HTML table.
url = "https://portal.hmpdacc.org/api/files"

# Rows requested per call and total number of rows to collect.
# Increase TOTAL_ROWS to fetch further pages.
PAGE_SIZE = 20
TOTAL_ROWS = 40

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 30

params = {
    "fields": "file_format,file_type,file_annotation_pipeline,file_matrix_type",
    "filters": '{"op":"and","content":[{"op":"in","content":{"field":"file.matrix_type","value":["wgs_community","16s_community"]}},{"op":"in","content":{"field":"sample.body_site","value":["feces"]}}]}',
    "from": 0,
    "save": "",
    "size": str(PAGE_SIZE),
    "sort": "file_id:asc",
}

all_dfs = []
for offset in range(0, TOTAL_ROWS, PAGE_SIZE):
    # Build the per-page query without mutating the shared `params` dict.
    page_params = {**params, "from": offset}
    response = requests.get(url, params=page_params, timeout=REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of a JSON/KeyError further down.
    response.raise_for_status()
    hits = response.json()["data"]["hits"]
    # Each hit carries the file's metadata under its "file" key.
    all_dfs.append(pd.DataFrame([hit["file"] for hit in hits]))

df = pd.concat(all_dfs).reset_index(drop=True)
print(df.tail())

输出：

                 format_doc        study  ver organism_type                         format    data_modality         node_type    size        subtype                                                                                                                    fasp  data_type    matrix_type abundance_type                                                                                                                            https                                id                               md5                                                                                                                        file_name access                                                                                                comment
35  http://biom-format.org/  prediabetes  NaN     bacterial  Biological Observation Matrix  marker sequence  abundance_matrix  196000  16s_community    fasp://aspera.ihmpdcc.org/t2d/genome/microbiome/16s/analysis/hmqcp/HMP2_J45372_1_ST_T0_B0_0120_ZY39SN0-02_APB4D.biom  abundance  16s_community      community    https://downloads.hmpdacc.org/ihmp/t2d/genome/microbiome/16s/analysis/hmqcp/HMP2_J45372_1_ST_T0_B0_0120_ZY39SN0-02_APB4D.biom  76612bd9a41885add4f6b0b7683a65da  70600351056001048c1d42d7268cc6b7    https://downloads.hmpdacc.org/ihmp/t2d/genome/microbiome/16s/analysis/hmqcp/HMP2_J45372_1_ST_T0_B0_0120_ZY39SN0-02_APB4D.biom   open    Qiime output upload from DCC for HMP2_J45372_1_ST_T0_B0_0120_ZY39SN0-02_APB4D.clean.dehost.fastq.gz
36  http://biom-format.org/  prediabetes  NaN     bacterial  Biological Observation Matrix  marker sequence  abundance_matrix  196000  16s_community  fasp://aspera.ihmpdcc.org/t2d/genome/microbiome/16s/analysis/hmqcp/HMP2_J45281_1_ST_T0_B0_0120_ZRB0F6P-6021_APATM.biom  abundance  16s_community      community  https://downloads.hmpdacc.org/ihmp/t2d/genome/microbiome/16s/analysis/hmqcp/HMP2_J45281_1_ST_T0_B0_0120_ZRB0F6P-6021_APATM.biom  76612bd9a41885add4f6b0b76836df9b  39643700bd4bcf040064c12f1d2b644c  https://downloads.hmpdacc.org/ihmp/t2d/genome/microbiome/16s/analysis/hmqcp/HMP2_J45281_1_ST_T0_B0_0120_ZRB0F6P-6021_APATM.biom   open  Qiime output upload from DCC for HMP2_J45281_1_ST_T0_B0_0120_ZRB0F6P-6021_APATM.clean.dehost.fastq.gz
37  http://biom-format.org/  prediabetes  NaN     bacterial  Biological Observation Matrix  marker sequence  abundance_matrix   81000  16s_community    fasp://aspera.ihmpdcc.org/t2d/genome/microbiome/16s/analysis/hmqcp/HMP2_J04182_1_ST_T0_B0_0122_ZN0JE53-04_AAH7B.biom  abundance  16s_community      community    https://downloads.hmpdacc.org/ihmp/t2d/genome/microbiome/16s/analysis/hmqcp/HMP2_J04182_1_ST_T0_B0_0122_ZN0JE53-04_AAH7B.biom  6cca313bce90a4392c3d5cf23fdb7ca8  7a33c9809cb98fac4e89aa2d3c151597    https://downloads.hmpdacc.org/ihmp/t2d/genome/microbiome/16s/analysis/hmqcp/HMP2_J04182_1_ST_T0_B0_0122_ZN0JE53-04_AAH7B.biom   open    Qiime output upload from DCC for HMP2_J04182_1_ST_T0_B0_0122_ZN0JE53-04_AAH7B.clean.dehost.fastq.gz
38  http://biom-format.org/  prediabetes  NaN     bacterial  Biological Observation Matrix  marker sequence  abundance_matrix  204000  16s_community                                       fasp://aspera.ihmpdcc.org/t2d/genome/microbiome/16s/analysis/hmqcp/otu_table.biom  abundance  16s_community      community                                       https://downloads.hmpdacc.org/ihmp/t2d/genome/microbiome/16s/analysis/hmqcp/otu_table.biom  76612bd9a41885add4f6b0b7681567ac  7a33c9809cb98fac4e89aa2d3c151597                                       https://downloads.hmpdacc.org/ihmp/t2d/genome/microbiome/16s/analysis/hmqcp/otu_table.biom   open    Qiime output upload from DCC for HMP2_J04182_1_ST_T0_B0_0122_ZN0JE53-04_AAH7B.clean.dehost.fastq.gz
39  http://biom-format.org/  prediabetes  NaN     bacterial  Biological Observation Matrix  marker sequence  abundance_matrix  120000  16s_community    fasp://aspera.ihmpdcc.org/t2d/genome/microbiome/16s/analysis/hmqcp/HMP2_J00840_1_ST_T0_B0_0120_ZLZNCLZ-01_AA31J.biom  abundance  16s_community      community    https://downloads.hmpdacc.org/ihmp/t2d/genome/microbiome/16s/analysis/hmqcp/HMP2_J00840_1_ST_T0_B0_0120_ZLZNCLZ-01_AA31J.biom  6cca313bce90a4392c3d5cf23fdafbcc  9757b64815cbfee3ba188e80b69a023e    https://downloads.hmpdacc.org/ihmp/t2d/genome/microbiome/16s/analysis/hmqcp/HMP2_J00840_1_ST_T0_B0_0120_ZLZNCLZ-01_AA31J.biom   open    Qiime output upload from DCC for HMP2_J00840_1_ST_T0_B0_0120_ZLZNCLZ-01_AA31J.clean.dehost.fastq.gz

使用 Python BeautifulSoup 和 Selenium 从人类微生物组计划 (HMP) 网站抓取动态表格数据时遇到的问题

问题描述投票：0回答：1

1个回答

最新问题

使用 Python BeautifulSoup 和 Selenium 从人类微生物组计划 (HMP) 网站抓取动态表格数据时遇到的问题

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1