我正在尝试从网站上抓取测试数据,但在提取所有页面的数据时遇到了困难。我检查了网页源代码中的分页部分,但我的代码仍然只返回第一页的数据。谁能帮我看看我的代码中缺少了什么?
我正在使用的代码:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def navigate_to_next_page():
    """Click the pagination "next" link if it becomes clickable.

    Waits up to 60 seconds on the module-level ``driver`` for the element
    matching ``li.pagination-next a`` and clicks it.

    Returns:
        bool: True when the link was located and clicked; False when
        Selenium raised an error while waiting for or clicking the link
        (for example, the wait timed out).
    """
    # Function-scope import keeps this block self-contained; selenium is
    # already a dependency of this script.
    from selenium.common.exceptions import WebDriverException

    try:
        next_button = WebDriverWait(driver, 60).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'li.pagination-next a'))
        )
        next_button.click()
    except WebDriverException:
        # Only Selenium failures are treated as "cannot advance". A bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit and
        # programming errors such as NameError, hiding the real cause.
        return False
    return True
def extract_test_data():
    """Record URL, name and price for each test listed on the current page.

    Every entry is appended to the module-level ``all_test_data`` list as
    ``[url, name, price]``. The function returns nothing.
    """
    # Wait (up to 60 s) until elements matching ``div.product`` are present.
    wait = WebDriverWait(driver, 60)
    cards = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.product"))
    )

    for card in cards:
        # The anchor carries both the visible name and the link target.
        anchor = card.find_element(By.CSS_SELECTOR, "a.text-theme-colored")
        name = anchor.text.strip()
        url = anchor.get_attribute("href")

        price_node = card.find_element(By.CSS_SELECTOR, "span.amount")
        price = price_node.text.strip()

        # Column order matches the CSV header: URL, name, price.
        all_test_data.append([url, name, price])
# Scrape every results page with headless Chrome, then dump the rows to CSV.
base_url = "https://www.tenetdiagnostics.in/book/tests?type=p"
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
all_test_data = []  # filled in place by extract_test_data()
try:
    driver.get(base_url)
    # Extract the current page, then advance until no next page is reachable.
    while True:
        extract_test_data()
        if not navigate_to_next_page():
            break
finally:
    # Always shut the browser down, even if scraping raised, so that no
    # headless Chrome process is left running.
    driver.quit()

csv_file = "tenet_test_data.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Test URL", "Test Name", "Test Price"])  # Write header
    writer.writerows(all_test_data)
print("Test data saved to", csv_file)
这段代码给了我想要的结果,但仅限于第一页。我想从所有页面中提取数据。
提前致谢!
您可以使用他们的分页 Ajax API 来加载数据,例如:
import pandas as pd
import requests
# Fetch paginated results straight from the site's Ajax endpoint and load
# them into a DataFrame.
api_url = "https://infinity.tenetdiagnostics.in/GetServices"
payload = {
    "test_name": "",
    "test_type": "",
    "condition_id": "",
    "speciality_id": "",
    "category_type": "p",
    "sort_by": "",
    "sort_order": "",
    "organ_id": "",
    "city_name": "Hyderabad",
    # "from"/"to" are overwritten for every page inside the loop below.
    "from": "10",
    "to": "18",
    "is_popular": "",
}
all_data = []
for i in range(3):  # <-- increase number of pages here
    print("Page", i)
    payload["from"] = i * 10
    payload["to"] = (i + 1) * 10
    # Without a timeout, requests may wait forever on an unresponsive server.
    response = requests.post(api_url, data=payload, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error body.
    response.raise_for_status()
    data = response.json()
    all_data.extend(data["data"])
df = pd.DataFrame(all_data)
print(df)
打印:
ROW tid test_name test_code test_other_names service_url test_price test_discpr test_finalpr type parameter_count specimen is_home_collection website_order test_cdn is_popular department_name location gender
0 1 1 1, 25-Dihydroxy Vitamin D, Serum TNT0001 1, 25-Dihydroxy Vitamin D, Serum vitamin-d-1-25-dihydroxy 2190.0 0 2190.0 0 2 None 1 1 REGULAR N Pathology Hyderabad 4
1 2 2 17 - Hydroxycorticosteroids, 24 Hrs Urine TNT0002 17 - Hydroxycorticosteroids, 24 Hrs Urine 17-hydroxycorticosteroids 4100.0 0 4100.0 0 0 None 2 1 REGULAR N Pathology Hyderabad 4
2 3 3 17 - Hydroxyprogesterone (17-OHP), Serum TNT0003 17 - Hydroxyprogesterone (17-OHP), Serum 17-hydroxyprogesterone 1610.0 0 1610.0 0 1 None 1 1 REGULAR N Pathology Hyderabad 4
3 4 4 17 Ketosteroids, 24 Hrs Urine TNT0004 17 Ketosteroids, 24 Hrs Urine 17-ketosteroids 3570.0 0 3570.0 0 2 None 2 1 REGULAR N Pathology Hyderabad 4
4 5 5 25 - Hydroxy Vitamin D, Serum TNT0005 25 - Hydroxy Vitamin D, Serum 25-hydroxy-vitamin-d 1990.0 0 1990.0 0 2 None 1 1 REGULAR N Pathology Hyderabad 4
5 6 6 5 - HIAA (5-Hydroxy Indole Acetic Acid), 24 Hrs Urine TNT0006 5 - HIAA (5-Hydroxy Indole Acetic Acid), 24 Hrs Urine 5-hiaa 4200.0 0 4200.0 0 4 None 2 1 REGULAR N Pathology Hyderabad 4
...