这是我的代码,你可以看到我正在使用 playwright 和 selectolax 来抓取网站。每当我执行脚本时,脚本都会从网站上的表中提取数据,直到 aria 行索引 29,然后执行成功停止,没有显示任何错误,但我希望脚本执行到 aria 行索引 2509
from playwright.sync_api import sync_playwright
from selectolax.parser import HTMLParser
import time
import pandas as pd
def extract_full_body_html(url):
TIMEOUT = 30000 # Reduced timeout to prevent long waits
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
# Maximize the window
page.set_viewport_size({'width': 1920, 'height': 1080})
page.goto(url, wait_until='networkidle')
# Wait for the initial dynamic content to load
page.wait_for_selector('div[role="gridcell"]', timeout=TIMEOUT) # Adjusted selector
# Scroll down and periodically check for new content
def load_more_content():
last_row_index = 0
while True:
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
time.sleep(10) # Wait for the page to load more content
# Check for new elements based on the aria-rowindex attribute
new_last_row_index = int(page.evaluate('''() => {
const rows = document.querySelectorAll('div[role="gridcell"][aria-rowindex]');
return rows[rows.length - 1].getAttribute("aria-rowindex");
}'''))
if new_last_row_index <= last_row_index:
break # No new data loaded, stop the process
last_row_index = new_last_row_index
# Small delay to ensure all data is loaded for the new rows
time.sleep(2)
load_more_content()
return page.inner_html('body')
def extraction(html):
tree = HTMLParser(html)
data = []
# Adjust the range if you expect more or fewer rows
for i in range(1, 2510): # Extract data up to aria row index 2509
row_selector = f'div[role="gridcell"][aria-rowindex="{i}"]'
company_div = tree.css_first(f'{row_selector}[aria-colindex="1"]')
if company_div is None:
break # Exit if no more rows are found
# Extracting data for each column in the row
row_data = {
'Company': company_div.text(deep=True, separator=' '),
'Emails': tree.css_first(f'{row_selector}[aria-colindex="2"]').text(deep=True, separator=' '),
'Addresses': tree.css_first(f'{row_selector}[aria-colindex="3"]').text(deep=True, separator=' '),
'Urls': tree.css_first(f'{row_selector}[aria-colindex="4"]').text(deep=True, separator=' '),
'Description': tree.css_first(f'{row_selector}[aria-colindex="5"]').text(deep=True, separator=' '),
'Stage': tree.css_first(f'{row_selector}[aria-colindex="6"]').text(deep=True, separator=' '),
'Number of Portfolio Organizations': tree.css_first(f'{row_selector}[aria-colindex="7"]').text(deep=True, separator=' '),
'Number of Investments': tree.css_first(f'{row_selector}[aria-colindex="8"]').text(deep=True, separator=' '),
'Accelerator Duration (in weeks)': tree.css_first(f'{row_selector}[aria-colindex="9"]').text(deep=True, separator=' '),
'Number of Exits': tree.css_first(f'{row_selector}[aria-colindex="10"]').text(deep=True, separator=' '),
'Linkedin': tree.css_first(f'{row_selector}[aria-colindex="11"]').text(deep=True, separator=' '),
'Founders': tree.css_first(f'{row_selector}[aria-colindex="12"]').text(deep=True, separator=' '),
'Twitter': tree.css_first(f'{row_selector}[aria-colindex="13"]').text(deep=True, separator=' ')
}
data.append(row_data)
return data
if __name__ == '__main__':
url = 'https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo'
html = extract_full_body_html(url)
data = extraction(html)
df = pd.DataFrame(data)
df.to_excel('output.xlsx', index=False)
在我的脚本中,我认为页面的 html 内容不完全可供抓取,或者当脚本进一步执行时,页面的 html 未加载或可见以进行抓取。
我认为这或多或少是你想要的:
import time
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch(headless=False)
context = browser.new_context()
page = context.new_page()
page.goto('https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo')
#We make click into the table (Otherwise we can not make scroll)
page.locator("//div[@data-testid='contact-table']").click()
# We make scroll till the end of the page
for i in range(5): # make the range as long as needed
page.mouse.wheel(0, 150000)
time.sleep(1)
# We get the aria-rowindex of the last row of the table
print(page.locator("//div[@role='row'][last()]").get_attribute('aria-rowindex'))
num_rows = page.locator("//div[@role='row'][last()]").get_attribute('aria-rowindex')
# We make scrill again till the top of the page again
for i in range(5): # make the range as long as needed
page.mouse.wheel(0, -150000)
time.sleep(1)
# We iterate to take all the data using the num of rows we previously took
for i in range(1, int(num_rows)+1):
page.locator(f"//div[@class='c-klyBnI c-klyBnI-inIPuL-css']/div[@aria-rowindex='{i}']").scroll_into_view_if_needed()
company = page.locator(f"//div[@class='c-klyBnI c-klyBnI-inIPuL-css']/div[@aria-rowindex='{i}']//span[2]").inner_text()
email = page.locator(f"//div[@role='row' and @aria-rowindex='{i}']//div[@aria-colindex='2']/span").inner_text()
print(f"{i} - {company} - {email}")
time.sleep(10)
我在代码中留下了一些注释来解释代码的作用。
基本上,正如你所说,页面是由Javascript加载的,所以我认为关键是获取最后一行,然后逐行滚动,直到我们获得所有数据。
我刚刚提取了几行,但我认为您应该很容易获取其余的行。
祝你好运!