Created a crawler that won't crawl more than 1 URL


I built a crawler that crawls my clients' websites to collect SEO data. The data is being collected, but it never crawls more than 1 URL, and I can't seem to find the error in my code.

from urllib.parse import urljoin, urlparse


def normalize_url(base_url, url):
    # Join the base_url with the url to handle relative URLs
    absolute_url = urljoin(base_url, url)
    parsed_url = urlparse(absolute_url)
    
    # Remove query strings and fragments from the URL
    normalized_url = parsed_url.scheme + "://" + parsed_url.hostname + parsed_url.path.rstrip("/")
    return normalized_url


def crawl_website(start_url, crawl_request_id, max_pages=100):
    visited_urls = {}
    queue = [normalize_url(start_url, start_url)]
    start_url_normalized = normalize_url(start_url, start_url)


    while queue and len(visited_urls) < max_pages:
        current_url = queue.pop(0)
        if current_url not in visited_urls:
            print(f"Crawling {current_url}")
        seo_data = extract_seo_data(current_url, crawl_request_id)
        print(f"Internal links for {current_url}: {seo_data['internal_links']}")

        if seo_data:
            visited_urls[current_url] = True
            print("Normalized internal links:")
            for link in seo_data["internal_links"]:
                normalized_link = normalize_url(current_url, link)  # Pass the current_url as base_url
                print(f"{link} -> {normalized_link}")
                if normalized_link not in visited_urls and normalized_link not in queue and normalized_link != start_url_normalized:
                    queue.append(normalized_link)

                print(f"Queue: {queue}")

                # Insert the main SEO data and get the page_id
                insert_page_data(connection, seo_data, crawl_request_id)
                page_id = connection.cursor().lastrowid

                # Insert the additional data using the new functions and set the page_id

                # Insert external links
                insert_external_links(connection, page_id, seo_data["external_links"])

                # Insert pagination data
                insert_pagination(connection, page_id, seo_data["pagination"])

                # Insert structured data
                if seo_data["has_structured_data"]:
                    insert_structured_data(connection, page_id, seo_data["structured_data"])

                # Insert images
                # insert_images(connection, page_id, seo_data["images"])

                # Insert internal links
                insert_internal_links(connection, page_id, seo_data["internal_links"])

                # Set the page_id in the visited_urls dictionary
                visited_urls[current_url] = page_id

    return list(visited_urls.keys())



if __name__ == "__main__":
    # Connect to the database
    connection = connect_to_database("localhost", "root", "", "laravel")

    start_url = "https://example.com"  # tests are not performed on this URL, of course
    max_pages = 10

    # Create a crawl request
    crawl_id = create_crawl_request(connection, start_url)
    print(f"Created crawl request with ID {crawl_id}")

    # Update the crawl request status to 'running'
    update_crawl_request(connection, crawl_id, None, 'running')

    try:
        # Crawl the website
        visited_urls = crawl_website(start_url, crawl_id, max_pages)

        # Print the visited URLs
        print(f"Visited URLs ({len(visited_urls)}):")
        for url in visited_urls:
            print(url)

        # Update the crawl request status to 'completed' and set the end_url
        end_url = visited_urls[-1] if visited_urls else None
        update_crawl_request(connection, crawl_id, end_url, 'completed')
    except Exception as e:
        print(f"Error during crawling: {e}")
        update_crawl_request(connection, crawl_id, None, 'failed')

    # Close the database connection
    connection.close()
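
For reference, the normalize_url helper is easy to sanity-check on its own (same code as in the crawler above, the example URLs here are made up). By itself it resolves relative links against the current page and strips query strings and fragments:

from urllib.parse import urljoin, urlparse

def normalize_url(base_url, url):
    # Same helper as in the crawler above
    absolute_url = urljoin(base_url, url)
    parsed_url = urlparse(absolute_url)
    return parsed_url.scheme + "://" + parsed_url.hostname + parsed_url.path.rstrip("/")

# Relative links resolve against the current page; query strings and fragments are dropped:
print(normalize_url("https://example.com/blog/", "/about?ref=nav#top"))  # https://example.com/about
print(normalize_url("https://example.com", "https://example.com/"))      # https://example.com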

I have tried printing out the values before and after, but I just can't find the logic error.

This is the printed output:

Connecting to the database...
Connected to the database.
Created crawl request with ID 13
Crawling https://example.com
Internal links found in https://example.com: set()
Internal links for https://example.com: 
Normalized internal links:
Visited URLs (1):
https://example.com

The normalized URLs never seem to get queued: the log shows the internal_links set coming back empty for the start page, so nothing is added to the queue and the script only crawls 1 URL instead of the collected ones.
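
extract_seo_data itself is not shown in the question. As a point of comparison, a minimal internal-link extractor along these lines (a sketch using requests and BeautifulSoup, which may differ from what extract_seo_data actually does) can be run against the same page to check whether the link-collection step is the part that comes back empty:

import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def extract_internal_links(page_url):
    # Collect every <a href> on the page that points to the same host
    host = urlparse(page_url).hostname
    response = requests.get(page_url, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")

    links = set()
    for anchor in soup.find_all("a", href=True):
        absolute = urljoin(page_url, anchor["href"])
        if urlparse(absolute).hostname == host:
            links.add(absolute)
    return links

print(extract_internal_links("https://example.com"))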

python python-3.x list dictionary web-crawler