分页网页抓取

问题描述 投票:0回答:1

我正在尝试从一个网站上抓取测试数据,但在提取所有页面的数据时遇到了困难。我检查了页面源码中的分页代码,但我的代码仍然只返回第一页的数据。谁能帮我看看我的代码中缺少了什么?

我正在使用的代码:

import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def navigate_to_next_page():
    """Click the pagination "next" link on the current page.

    Waits up to 60 seconds for the ``li.pagination-next a`` element to become
    clickable and clicks it, using the module-level ``driver``.

    Returns:
        True if the link was found and clicked; False if waiting for or
        clicking it raised an exception (the calling loop treats False as
        "stop paginating").
    """
    try:
        next_button = WebDriverWait(driver, 60).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'li.pagination-next a'))
        )
        next_button.click()
        return True
    except Exception:
        # Still best-effort, but catching Exception instead of using a bare
        # ``except:`` lets KeyboardInterrupt/SystemExit propagate, so Ctrl+C
        # during the 60 s wait is no longer silently turned into "no next page".
        return False

def extract_test_data():
    """Record URL, name and price for every test listed on the current page.

    Waits up to 60 seconds for the ``div.product`` elements to be present,
    then appends one ``[url, name, price]`` row per element to the
    module-level ``all_test_data`` list.
    """
    wait = WebDriverWait(driver, 60)
    product_locator = (By.CSS_SELECTOR, "div.product")
    products = wait.until(EC.presence_of_all_elements_located(product_locator))

    for product in products:
        # The anchor carries both the visible test name and the link target.
        anchor = product.find_element(By.CSS_SELECTOR, "a.text-theme-colored")
        name = anchor.text.strip()
        url = anchor.get_attribute("href")

        price_element = product.find_element(By.CSS_SELECTOR, "span.amount")
        price = price_element.text.strip()

        # Row order matches the CSV header: URL, name, price.
        all_test_data.append([url, name, price])

base_url = "https://www.tenetdiagnostics.in/book/tests?type=p"

# Run Chrome without a visible window.
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

# Rows of [url, name, price]; extract_test_data() appends to this list.
all_test_data = []

csv_file = "tenet_test_data.csv"

try:
    driver.get(base_url)

    # Scrape the current page, then advance; stop once no "next" link can be
    # clicked.
    while True:
        extract_test_data()
        if not navigate_to_next_page():
            break

    with open(csv_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Test URL", "Test Name", "Test Price"])  # Write header
        writer.writerows(all_test_data)

    print("Test data saved to", csv_file)
finally:
    # Always shut the browser down, even if scraping or writing the CSV
    # fails; otherwise the headless Chrome process is left running.
    driver.quit()

这段代码给了我想要的结果,但仅限于第一页。我想从所有页面中提取数据。

提前致谢!

python web-scraping beautifulsoup ngx-pagination
1个回答
0
投票

您可以使用他们的分页 Ajax API 来加载数据,例如:

import pandas as pd
import requests

api_url = "https://infinity.tenetdiagnostics.in/GetServices"

# Form fields sent with every request. "from" and "to" are placeholders here:
# the loop below overwrites them before each request is sent.
payload = {
    "test_name": "",
    "test_type": "",
    "condition_id": "",
    "speciality_id": "",
    "category_type": "p",
    "sort_by": "",
    "sort_order": "",
    "organ_id": "",
    "city_name": "Hyderabad",
    "from": "10",
    "to": "18",
    "is_popular": "",
}

all_data = []
for i in range(3):  # <-- increase number of pages here
    print("Page", i)

    # Request a window of 10 rows per iteration.
    payload["from"] = i * 10
    payload["to"] = (i + 1) * 10

    # Without a timeout requests may wait forever on a stalled connection;
    # raise_for_status() turns an HTTP error response into an exception
    # instead of failing later while decoding or indexing the body.
    response = requests.post(api_url, data=payload, timeout=30)
    response.raise_for_status()
    data = response.json()
    all_data.extend(data["data"])

df = pd.DataFrame(all_data)
print(df)

打印:

    ROW  tid                                                   test_name test_code                                            test_other_names                                service_url  test_price  test_discpr  test_finalpr type parameter_count specimen  is_home_collection website_order test_cdn is_popular department_name   location  gender
0     1    1                            1, 25-Dihydroxy Vitamin D, Serum   TNT0001                            1, 25-Dihydroxy Vitamin D, Serum                   vitamin-d-1-25-dihydroxy      2190.0            0        2190.0    0               2     None                   1             1  REGULAR          N       Pathology  Hyderabad       4
1     2    2                   17 - Hydroxycorticosteroids, 24 Hrs Urine   TNT0002                   17 - Hydroxycorticosteroids, 24 Hrs Urine                  17-hydroxycorticosteroids      4100.0            0        4100.0    0               0     None                   2             1  REGULAR          N       Pathology  Hyderabad       4
2     3    3                    17 - Hydroxyprogesterone (17-OHP), Serum   TNT0003                    17 - Hydroxyprogesterone (17-OHP), Serum                     17-hydroxyprogesterone      1610.0            0        1610.0    0               1     None                   1             1  REGULAR          N       Pathology  Hyderabad       4
3     4    4                               17 Ketosteroids, 24 Hrs Urine   TNT0004                               17 Ketosteroids, 24 Hrs Urine                            17-ketosteroids      3570.0            0        3570.0    0               2     None                   2             1  REGULAR          N       Pathology  Hyderabad       4
4     5    5                               25 - Hydroxy Vitamin D, Serum   TNT0005                               25 - Hydroxy Vitamin D, Serum                       25-hydroxy-vitamin-d      1990.0            0        1990.0    0               2     None                   1             1  REGULAR          N       Pathology  Hyderabad       4
5     6    6       5 - HIAA (5-Hydroxy Indole Acetic Acid), 24 Hrs Urine   TNT0006       5 - HIAA (5-Hydroxy Indole Acetic Acid), 24 Hrs Urine                                     5-hiaa      4200.0            0        4200.0    0               4     None                   2             1  REGULAR          N       Pathology  Hyderabad       4

...
© www.soinside.com 2019 - 2024. All rights reserved.