如何从网站抓取过滤内容

问题描述 投票:0回答:1

我想按 Innovfest x Elevating Founders(初创/规模化)类别进行筛选,并从这个页面抓取该类别下所有参展商的名称和展位号:https://asiatechxsg.com/exhibitors/

当我尝试添加过滤器时,出现错误

ElementClickInterceptedException          Traceback (most recent call last)
Cell In[6], line 36
     34 # Locate and click the dropdown to reveal the checkboxes
     35 dropdown = wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@data-hook='filter-dropdown']")))
---> 36 dropdown.click()
     37 time.sleep(2)  # Allow the dropdown to open
     39 # Get all the filter checkboxes

File c:\Users\twbsguser009\AppData\Local\Programs\Python\Python312\Lib\site-packages\selenium\webdriver\remote\webelement.py:94, in WebElement.click(self)
     92 def click(self) -> None:
     93     """Clicks the element."""
---> 94     self._execute(Command.CLICK_ELEMENT)

File c:\Users\twbsguser009\AppData\Local\Programs\Python\Python312\Lib\site-packages\selenium\webdriver\remote\webelement.py:395, in WebElement._execute(self, command, params)
    393     params = {}
    394 params["id"] = self._id
--> 395 return self._parent.execute(command, params)

File c:\Users\twbsguser009\AppData\Local\Programs\Python\Python312\Lib\site-packages\selenium\webdriver\remote\webdriver.py:347, in WebDriver.execute(self, driver_command, params)
    345 response = self.command_executor.execute(driver_command, params)
    346 if response:
--> 347     self.error_handler.check_response(response)
    348     response["value"] = self._unwrap_value(response.get("value", None))
    349     return response
...
    (No symbol) [0x00007FF7A3483592]
    (No symbol) [0x00007FF7A3472F9F]
    BaseThreadInitThunk [0x00007FFF26B1257D+29]
    RtlUserThreadStart [0x00007FFF27FEAA48+40]

代码

# Script overview: open the exhibitors page in Chrome, switch into the embedded
# iframe, scroll until the last exhibitor image is present, then toggle each
# filter checkbox in turn and collect the exhibitor names shown for it.
# NOTE(review): the imports this snippet relies on (logging, time, webdriver,
# By, WebDriverWait, EC) are not shown here -- presumably they precede it.

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# Initialize WebDriver
driver = webdriver.Chrome()
driver.maximize_window()
driver.get("https://asiatechxsg.com/exhibitors/")

# Wait for the iframe to load and switch to it
# NOTE(review): the class name looks build-generated and may change between
# site deployments -- confirm before relying on it.
wait = WebDriverWait(driver, 10)
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CLASS_NAME, "IframeModule_iframe__JCvXg")))

# Scroll to the bottom of the page to ensure all elements are loaded
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)  # Allow some time for the page to load new content

# Scroll until the last specific element is found
# NOTE: this loop has no timeout or iteration cap; it never exits if an image
# with this exact alt text does not appear in the page.
match = False
while not match:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # Allow some time for the page to load new content
    last_element = driver.find_elements(By.XPATH, "//img[@alt='Singapore Centre for Social Enterprise, raiSE Ltd']")
    if len(last_element) >= 1:
        match = True

# Locate and click the dropdown to reveal the checkboxes
# NOTE: the page has been scrolled to the bottom by this point. The traceback
# in the question reports ElementClickInterceptedException on this click().
dropdown = wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@data-hook='filter-dropdown']")))
dropdown.click()
time.sleep(2)  # Allow the dropdown to open

# Get all the filter checkboxes
# If nothing matches, the list is empty and the loop below is skipped.
checkboxes = driver.find_elements(By.XPATH, "//input[@type='checkbox' and contains(@data-hook, 'exhibitor-filter')]")

# Names collected across all filters; duplicates are not removed.
exhibitor_names = []

# Loop through each checkbox to apply the filter and gather names
# NOTE: every checkbox is toggled, so names for all filter values are
# gathered, not only those of a single category.
for checkbox in checkboxes:
    # Check the checkbox
    checkbox.click()
    time.sleep(2)  # Allow some time for the filter to apply

    # Get the filtered company names
    # The XPath compares the whole class attribute, so it matches only elements
    # whose class string is exactly this value.
    filtered_names = wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//span[@class='sc-a13c392f-0 sc-85df61db-4 feepZZ eErKol']")))
    logger.info(f"No. of sponsors and exhibitors after applying filter= {len(filtered_names)}")
    
    # Append the names to the list
    for name in filtered_names:
        exhibitor_names.append(name.text)

    # Uncheck the checkbox to reset the filter
    checkbox.click()
    time.sleep(2)  # Allow some time for the filter to reset

# Close the WebDriver
# NOTE: this is only reached when nothing above raises; on an exception the
# browser session is left open.
driver.quit()

# Print or log all collected exhibitor names
logger.info(f"Total exhibitors collected: {len(exhibitor_names)}")
for name in exhibitor_names:
    print(name)

这就是我目前的代码。我怎样才能让它只抓取筛选后的参展商?先谢谢了

python selenium-webdriver web-scraping
1个回答
0
投票

我建议直接模拟页面所调用的后端 API(此处为 GraphQL 接口)来获取参展商信息:

import requests

# Script overview: POST a single GraphQL operation to the event backend and
# print the name and type of every exhibitor that matches the selected filter
# values.

# GraphQL endpoint that serves the exhibitor list.
api_url = "https://attend.informatechevents.virtual.informatech.com/api/graphql"

# requests applies no timeout by default, so without this a stalled
# connection would block the script indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# The request body is a list holding one operation. The trailing comments give
# the decoded form of each base64 identifier.
payload = [
    {
        # NOTE(review): presumably a persisted-query hash that names a query
        # stored on the server; it may stop working if the site changes.
        "extensions": {
            "persistedQuery": {
                "sha256Hash": "a717703fa8924575e04c9968ef2f441781e9cb8e2d5ca62d9ca9742bd04eac93",
                "version": 1,
            }
        },
        "operationName": "EventExhibitorListViewConnectionQuery",
        "variables": {
            "eventId": "RXZlbnRfMTc5MDkyMQ==",  # Event_1790921
            "selectedFilters": [
                {
                    "mustEventFiltersIn": [
                        {
                            "filterId": "RmllbGREZWZpbml0aW9uXzMwNTU5NA==",  # FieldDefinition_305594
                            "values": [
                                "RmllbGRWYWx1ZV8yMzI1MDQ5Mg==",  # FieldValue_23250492 "InnovFest x Elevating Founders Asia Silver Sponsors"
                                "RmllbGRWYWx1ZV8yMzYxMjEzNA==",  # FieldValue_23612134 "InnovFest x Elevating Founders Knowledge Partner"
                                "RmllbGRWYWx1ZV8yMzI1MDQ5Ng==",  # FieldValue_23250496 "InnovFest x Elevating Founders Institutes of Higher Learning"
                                "RmllbGRWYWx1ZV8yMzI1MDQ5NQ==",  # FieldValue_23250495 "InnovFest x Elevating Founders Asia Bronze Sponsors"
                            ],
                        }
                    ]
                }
            ],
            "viewId": "RXZlbnRWaWV3Xzc2MDczMA==",  # EventView_760730
            "withEvent": True,
        },
    }
]

response = requests.post(api_url, json=payload, timeout=REQUEST_TIMEOUT_SECONDS)
# Surface HTTP 4xx/5xx as an exception instead of trying to parse an error page.
response.raise_for_status()
data = response.json()

# Debugging aid: uncomment to inspect the raw response.
# print(data)

# The response mirrors the request: a list with one result per operation.
result = data[0]
if not result.get("data"):
    # A result without "data" would otherwise fail below with an opaque
    # TypeError; report whatever error detail the server sent instead.
    raise RuntimeError(f"GraphQL query returned no data: {result.get('errors')}")

for e in result["data"]["view"]["exhibitors"]["nodes"]:
    print(e["name"])
    print(e["type"])
    print("-" * 80)

打印:

Republic Polytechnic
InnovFest x Elevating Founders Institutes of Higher Learning
--------------------------------------------------------------------------------
Singapore University of Social Sciences
InnovFest x Elevating Founders Institutes of Higher Learning
--------------------------------------------------------------------------------
Temasek Polytechnic
InnovFest x Elevating Founders Institutes of Higher Learning
--------------------------------------------------------------------------------
Femtech
InnovFest x Elevating Founders Knowledge Partner
--------------------------------------------------------------------------------
IIPCC
InnovFest x Elevating Founders Knowledge Partner
--------------------------------------------------------------------------------
IPOS
InnovFest x Elevating Founders Knowledge Partner
--------------------------------------------------------------------------------
Singapore Centre for Social Enterprise, raiSE Ltd
InnovFest x Elevating Founders Knowledge Partner
--------------------------------------------------------------------------------
Hamilton, Brook, Smith & Reynolds, P.C.
InnovFest x Elevating Founders Asia Silver Sponsors
--------------------------------------------------------------------------------
Krislab, SIA Digital Innovation Lab
InnovFest x Elevating Founders Asia Silver Sponsors
--------------------------------------------------------------------------------
Marks & Clerk Singapore LLP
InnovFest x Elevating Founders Asia Silver Sponsors
--------------------------------------------------------------------------------
Singapore Science Park Limited
InnovFest x Elevating Founders Asia Silver Sponsors
--------------------------------------------------------------------------------
CHINA SINDA IP
InnovFest x Elevating Founders Asia Bronze Sponsors
--------------------------------------------------------------------------------
© www.soinside.com 2019 - 2024. All rights reserved.