I'm having a problem scraping a website: my script only extracts data up to aria-rowindex 29, but I need it to extract up to aria-rowindex 2509


Here is my code; as you can see, I'm using Playwright and selectolax to scrape the site. Whenever I run the script, it extracts data from the table on the page up to aria-rowindex 29 and then finishes successfully without showing any error, but I need it to keep going up to aria-rowindex 2509.

from playwright.sync_api import sync_playwright
from selectolax.parser import HTMLParser
import time
import pandas as pd


def extract_full_body_html(url):
    TIMEOUT = 30000  # Reduced timeout to prevent long waits

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        # Use a large viewport (stands in for maximizing the window in headless mode)
        page.set_viewport_size({'width': 1920, 'height': 1080})

        page.goto(url, wait_until='networkidle')

        # Wait for the initial dynamic content to load
        page.wait_for_selector('div[role="gridcell"]', timeout=TIMEOUT)  # Adjusted selector

        # Scroll down and periodically check for new content
        def load_more_content():
            last_row_index = 0
            while True:
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                time.sleep(10)  # Wait for the page to load more content

                # Check for new elements based on the aria-rowindex attribute
                new_last_row_index = int(page.evaluate('''() => {
                    const rows = document.querySelectorAll('div[role="gridcell"][aria-rowindex]');
                    return rows[rows.length - 1].getAttribute("aria-rowindex");
                }'''))

                if new_last_row_index <= last_row_index:
                    break  # No new data loaded, stop the process
                last_row_index = new_last_row_index

                # Small delay to ensure all data is loaded for the new rows
                time.sleep(2)

        load_more_content()

        return page.inner_html('body')

def extraction(html):
    tree = HTMLParser(html)
    data = []

    # Adjust the range if you expect more or fewer rows
    for i in range(1, 2510):  # Extract data up to aria row index 2509
        row_selector = f'div[role="gridcell"][aria-rowindex="{i}"]'
        company_div = tree.css_first(f'{row_selector}[aria-colindex="1"]')
        if company_div is None:
            break  # Exit if no more rows are found

        # Extracting data for each column in the row
        row_data = {
            'Company': company_div.text(deep=True, separator=' '),
            'Emails': tree.css_first(f'{row_selector}[aria-colindex="2"]').text(deep=True, separator=' '),
            'Addresses': tree.css_first(f'{row_selector}[aria-colindex="3"]').text(deep=True, separator=' '),
            'Urls': tree.css_first(f'{row_selector}[aria-colindex="4"]').text(deep=True, separator=' '),
            'Description': tree.css_first(f'{row_selector}[aria-colindex="5"]').text(deep=True, separator=' '),
            'Stage': tree.css_first(f'{row_selector}[aria-colindex="6"]').text(deep=True, separator=' '),
            'Number of Portfolio Organizations': tree.css_first(f'{row_selector}[aria-colindex="7"]').text(deep=True, separator=' '),
            'Number of Investments': tree.css_first(f'{row_selector}[aria-colindex="8"]').text(deep=True, separator=' '),
            'Accelerator Duration (in weeks)': tree.css_first(f'{row_selector}[aria-colindex="9"]').text(deep=True, separator=' '),
            'Number of Exits': tree.css_first(f'{row_selector}[aria-colindex="10"]').text(deep=True, separator=' '),
            'Linkedin': tree.css_first(f'{row_selector}[aria-colindex="11"]').text(deep=True, separator=' '),
            'Founders': tree.css_first(f'{row_selector}[aria-colindex="12"]').text(deep=True, separator=' '),
            'Twitter': tree.css_first(f'{row_selector}[aria-colindex="13"]').text(deep=True, separator=' ')

        }
        data.append(row_data)

    return data

if __name__ == '__main__':
    url = 'https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo'
    html = extract_full_body_html(url)
    data = extraction(html)
    df = pd.DataFrame(data)
    df.to_excel('output.xlsx', index=False)

In my script, I think the page's HTML content is never fully available for scraping, or that by the time the script gets further along, the rest of the page's HTML has not been loaded or is not visible to scrape.
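
A quick way to check this hypothesis is to count how many aria-rowindex values are actually attached to the DOM after scrolling once to the bottom. This is only a rough diagnostic sketch reusing the URL and selectors from my script above:

from playwright.sync_api import sync_playwright

# Rough diagnostic: after one scroll to the bottom, how many rows are actually
# present in the DOM, and what is the highest aria-rowindex?
# The URL and selectors are the same ones used in the script above.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo',
              wait_until='networkidle')
    page.wait_for_selector('div[role="gridcell"][aria-rowindex]', timeout=30000)
    page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    page.wait_for_timeout(5000)
    indexes = page.evaluate('''() => {
        const rows = document.querySelectorAll('div[role="gridcell"][aria-rowindex]');
        return Array.from(rows).map(r => Number(r.getAttribute('aria-rowindex')));
    }''')
    print('distinct rows in DOM:', len(set(indexes)), '| max aria-rowindex:', max(indexes))
    browser.close()

If the maximum stays far below 2509 no matter how long the script waits, the table is most likely virtualized and only renders the rows currently in view.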

python html web-scraping playwright playwright-python
1 Answer

I think this is more or less what you're after:

import time
from playwright.sync_api import sync_playwright


with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    context = browser.new_context()
    page = context.new_page()
    page.goto('https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo')

    # Click inside the table first (otherwise the mouse wheel won't scroll it)
    page.locator("//div[@data-testid='contact-table']").click()

    # Scroll all the way to the bottom of the table
    for i in range(5):  # make the range as long as needed
        page.mouse.wheel(0, 150000)
        time.sleep(1)

    # Get the aria-rowindex of the last row of the table
    num_rows = page.locator("//div[@role='row'][last()]").get_attribute('aria-rowindex')
    print(num_rows)

    # Scroll back up to the top of the page
    for i in range(5):  # make the range as long as needed
        page.mouse.wheel(0, -150000)
        time.sleep(1)

    # Iterate over every row, using the row count we just read
    for i in range(1, int(num_rows)+1):
        page.locator(f"//div[@class='c-klyBnI c-klyBnI-inIPuL-css']/div[@aria-rowindex='{i}']").scroll_into_view_if_needed()
        company = page.locator(f"//div[@class='c-klyBnI c-klyBnI-inIPuL-css']/div[@aria-rowindex='{i}']//span[2]").inner_text()
        email = page.locator(f"//div[@role='row' and  @aria-rowindex='{i}']//div[@aria-colindex='2']/span").inner_text()
        print(f"{i} - {company} - {email}")
    time.sleep(10)

I've left some comments in the code explaining what it does.

Basically, as you said, the page is loaded by JavaScript, so I think the key is to get the last row first and then scroll row by row until we've collected all the data.

I've only extracted a couple of fields here, but I think you should be able to grab the rest easily.
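
If it helps, here is a rough sketch of how that same row-by-row loop could be extended to read every column and write the Excel file from your original script. The column names are copied from your extraction() function, and the per-cell aria-colindex lookup inside each role='row' element is an assumption based on your selectors, so it may need adjusting:

import time
import pandas as pd
from playwright.sync_api import sync_playwright

# Column names copied from the question's extraction() function.
COLUMNS = ['Company', 'Emails', 'Addresses', 'Urls', 'Description', 'Stage',
           'Number of Portfolio Organizations', 'Number of Investments',
           'Accelerator Duration (in weeks)', 'Number of Exits',
           'Linkedin', 'Founders', 'Twitter']

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_context().new_page()
    page.goto('https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo')

    # Click inside the table and scroll to the bottom to learn the total row count.
    page.locator("//div[@data-testid='contact-table']").click()
    for _ in range(5):
        page.mouse.wheel(0, 150000)
        time.sleep(1)
    num_rows = int(page.locator("//div[@role='row'][last()]").get_attribute('aria-rowindex'))

    # Scroll back to the top, then walk the rows one by one; scrolling each row
    # into view forces the virtualized grid to render it before we read it.
    for _ in range(5):
        page.mouse.wheel(0, -150000)
        time.sleep(1)

    records = []
    for i in range(1, num_rows + 1):
        row = page.locator(f"//div[@role='row' and @aria-rowindex='{i}']")
        row.scroll_into_view_if_needed()
        record = {}
        for col, name in enumerate(COLUMNS, start=1):
            # Assumed cell layout: one div per column carrying aria-colindex.
            cell = row.locator(f"xpath=.//div[@aria-colindex='{col}']")
            record[name] = cell.first.inner_text() if cell.count() else ''
        records.append(record)

    pd.DataFrame(records).to_excel('output.xlsx', index=False)
    browser.close()

Because Playwright locator actions auto-wait for their element, the long per-row sleeps from your original script shouldn't be needed here.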

Good luck!
