不可靠的 Selenium 脚本

问题描述 投票:0回答:1

我正在编写一个 Selenium 脚本来搜索 https://ssllc.com/ 。该代码似乎不可靠,只是有时有效。

from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options as ChromeOptions

# Script overview: for every search query, start a fresh Chrome session, open
# the site's landing page, type the query into the search box, wait for the
# hit list to appear and print its text.

# Define the base URL of the website
base_url = 'https://www.ssllc.com'

# CSS selector of the search input on the landing page
search_bar = '#page-top > div > div > div > div:nth-child(1) > input[type=text]'

# CSS selector of the list element that contains the search hits
items = '#gatsby-focus-wrapper > div.root > div > div > div > div.medium-8.columns.main-content > div.ais-Hits > ul '

# Define the list of search queries
search_queries = [
    'Unused+Sartorius+1000+Liter+BIOSTAT+CultiBag+STR+Single+Use+Bioreactor',
    '3+x+V5/XCell+Repigen+Next+Gen+ATF+controllers',
    'InSite+Integrity+Tester'
]

# Configure ChromeOptions
options = ChromeOptions()
options.add_argument('--headless')  # Optional: Run Chrome in headless mode for faster execution

# A single try/except wraps the whole loop, so the first failure (for example
# a wait timing out) ends the run and the remaining queries are not attempted.
try:
    for search_query in search_queries:
        # Initialize Chrome WebDriver for each search
        with Chrome(options=options) as driver:
            # Load the website
            driver.get(base_url)

            # Find the search input field and enter the query
            search_input = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, search_bar))
            )
            search_input.clear()
            search_input.send_keys(search_query)
            search_input.send_keys(Keys.RETURN)

            # Wait for the search results to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, items))
            )

            # Extract and print search results
            search_results = driver.find_elements(By.CSS_SELECTOR, items)
            if search_results:
                print(f"Search results for '{search_query}':")
                for result in search_results:
                    # Get the link to the search result
                    #result_link = result.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    #print(f"{result.text.strip()} - {result_link}")
                    print(result.text.strip())
                    print()
            else:
                print(f"No search results found for '{search_query}'")
except Exception as e:
    print("An error occurred:", e)

我尝试添加一个我知道网站上有的新项目,但它没有出现。例如,生物反应器没有出现,但完整性测试仪却出现了。它们都可以在网站上找到。该代码的运行结果也不稳定,有时根本不显示任何结果。我认为这是由于网站超载造成的。我的目标是能够搜索任意关键字,并列出所有相关结果。

python selenium-webdriver search automation
1个回答
0
投票

您的结果可能“不可靠”,因为该网站正在对您的请求进行限流。

所有这些警告都表明由于 API 过载而未返回结果。

我注意到,实际上您可以通过在 URL 中使用 `query` 参数直接转到搜索结果。这似乎比使用页面上的搜索框更可靠。

我还引入了一些延迟,因为你想避免受到限制。

from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options as ChromeOptions

import traceback
import time
import logging
import json

# Script overview: open one Chrome session, load the search-results URL for
# each query directly (instead of typing into the page's search box), collect
# the text of every hit and write everything to "search-results.json".

logging.basicConfig(
  level=logging.DEBUG,
  format='%(asctime)s [%(levelname)7s] %(message)s',
)
# Raise the threshold of the third-party loggers so the DEBUG output above
# only shows this script's own messages.
logging.getLogger("selenium").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

BASE_URL = "https://www.ssllc.com"

# Search terms, written with "+" between words so they can be appended to the
# query URL unchanged.
SEARCH_QUERIES = [
    "Unused+Sartorius+1000+Liter+BIOSTAT+CultiBag+STR+Single+Use+Bioreactor",
    "3+x+V5/XCell+Repigen+Next+Gen+ATF+controllers",
    "InSite+Integrity+Tester"
]

# Pause between consecutive searches, in seconds, to avoid being throttled.
DELAY_BETWEEN_SEARCHES = 60

# One entry per processed query: {"search": <query>, "results": [<hit text>, ...]}.
RESULTS = []

options = ChromeOptions()
# options.add_argument('--headless')

# Don't instantiate browser inside loop. Once is enough!
with Chrome(options=options) as driver:
    # Just open the base URL once too.
    driver.get(BASE_URL)
    try:
        for index, search_query in enumerate(SEARCH_QUERIES):
            # Sleep *between* searches only; there is nothing to wait for
            # after the last one.
            if index:
                time.sleep(DELAY_BETWEEN_SEARCHES)

            logging.info(f"🟦 Search term: {search_query}")

            query_url = BASE_URL+"/search/?query="+search_query
            driver.get(query_url)

            logging.debug("- Wait for results to load.")
            # Raises a timeout error after 10 s if the hit list never shows
            # up; that ends the loop via the except clause below.
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((
                By.CSS_SELECTOR,
                ".ais-Hits > ul.ais-Hits-list"
            )))

            logging.debug("- Extract results.")
            search_results = driver.find_elements(
                By.CSS_SELECTOR,
                ".ais-Hits > ul.ais-Hits-list > li"
            )

            if search_results:
                logging.info(f"✅ Search results ({len(search_results)} items).")
                results = [result.text.strip() for result in search_results]
            else:
                results = []
                logging.warning("🚨 No search results found.")

            RESULTS.append({
                "search": search_query,
                "results": results
            })

    except Exception as e:
        # logging.exception logs at ERROR level and appends the traceback.
        logging.exception("An error occurred:%s", e)

# Results gathered before any failure are still written out.
with open("search-results.json", "w", encoding="utf-8") as fid:
    json.dump(RESULTS, fid)

这是运行日志:

2024-02-24 08:06:47,910 [   INFO] 🟦 Search term: Unused+Sartorius+1000+Liter+BIOSTAT+CultiBag+STR+Single+Use+Bioreactor
2024-02-24 08:06:50,701 [  DEBUG] - Wait for results to load.
2024-02-24 08:06:50,721 [  DEBUG] - Extract results.
2024-02-24 08:06:50,740 [   INFO] ✅ Search results (4 items).
2024-02-24 08:08:10,910 [   INFO] 🟦 Search term: 3+x+V5/XCell+Repigen+Next+Gen+ATF+controllers
2024-02-24 08:08:12,979 [  DEBUG] - Wait for results to load.
2024-02-24 08:08:12,994 [  DEBUG] - Extract results.
2024-02-24 08:08:13,014 [WARNING] 🚨 No search results found.
2024-02-24 08:09:33,014 [   INFO] 🟦 Search term: InSite+Integrity+Tester
2024-02-24 08:09:35,355 [  DEBUG] - Wait for results to load.
2024-02-24 08:09:35,370 [  DEBUG] - Extract results.
2024-02-24 08:09:35,383 [   INFO] ✅ Search results (3 items).

我将结果写入 JSON 文件。

[
  {
    "search": 
"Unused+Sartorius+1000+Liter+BIOSTAT+CultiBag+STR+Single+Use+Bioreactor",
    "results": [
      "Unused Sartorius 1000 Liter BIOSTAT CultiBag STR Single Use 
Bioreactor\nManufacturer: Sartorius\nProduct Model #: 1000L Single-Use 
Bioreactor\nProduct Code: 337327\nUnused Sartorius 1000 Liter BIOSTAT CultiBag 
STR Single Use Bioreactor for sale. SSLLC offers a wide selection of used 
Bioreactors / Fermenters for your Used Lab Equipment needs.\nVIEW DETAILS",
      "Unused Sartorius 1000 Liter BIOSTAT CultiBag STR Single Use 
Bioreactor\nManufacturer: Sartorius\nProduct Model #: 1000L Single-Use 
Bioreactor\nProduct Code: 337326\nUnused Sartorius 1000 Liter BIOSTAT CultiBag 
STR Single Use Bioreactor for sale. SSLLC offers a wide selection of used 
Bioreactors / Fermenters for your Used Lab Equipment needs.\nVIEW DETAILS",
      "Unused Sartorius 1000 Liter BIOSTAT CultiBag STR Single Use 
Bioreactor\nManufacturer: Sartorius\nProduct Model #: 1000L Single-Use 
Bioreactor\nProduct Code: 337325\nUnused Sartorius 1000 Liter BIOSTAT CultiBag 
STR Single Use Bioreactor for sale. SSLLC offers a wide selection of used 
Bioreactors / Fermenters for your Used Lab Equipment needs.\nVIEW DETAILS",
      "Unused Sartorius 1000 Liter BIOSTAT CultiBag STR Single Use 
Bioreactor\nManufacturer: Sartorius\nProduct Model #: 1000L Single-Use 
Bioreactor\nProduct Code: 337329\nUnused Sartorius 1000 Liter BIOSTAT CultiBag 
STR Single Use Bioreactor for sale. SSLLC offers a wide selection of used 
Bioreactors / Fermenters for your Used Lab Equipment needs.\nVIEW DETAILS"
    ]
  },
  {
    "search": "3+x+V5/XCell+Repigen+Next+Gen+ATF+controllers",
    "results": []
  },
  {
    "search": "InSite+Integrity+Tester",
    "results": [
      "Unused Thermo Scientific inSITE Integrity Tester\nManufacturer: Thermo 
Fisher Scientific\nProduct Model #: 30-IN-1052 RG\nProduct Code: 332788\nUsed 
Unused Thermo Scientific 30-IN-1001 Filter Integrity Tester for sale. SSLLC 
offers a wide selection of used Analyzers for your Used Lab Equipment 
needs.\nVIEW DETAILS",
      "Unused Thermo Scientific inSITE Integrity Tester\nManufacturer: Thermo 
Fisher Scientific\nProduct Model #: 30-IN-1052 RG\nProduct Code: 332787\nUsed 
Unused Thermo Scientific 30-IN-1001 Filter Integrity Tester for sale. SSLLC 
offers a wide selection of used Analyzers for your Used Lab Equipment 
needs.\nVIEW DETAILS",
      "Unused Thermo Scientific inSITE Integrity Tester\nManufacturer: Thermo 
Fisher Scientific\nProduct Model #: 30-IN-1052 RG\nProduct Code: 332786\nUnused 
Thermo Scientific 30-IN-1001 Filter Integrity Tester for sale. SSLLC offers a 
wide selection of used Analyzers for your Used Lab Equipment needs.\nVIEW 
DETAILS"
    ]
  }
]
© www.soinside.com 2019 - 2024. All rights reserved.