I built a crawler that crawls my client's website to collect SEO data. The data is being collected, but it never crawls more than 1 URL, and I can't find the error in my code.
from urllib.parse import urljoin, urlparse

# connect_to_database, extract_seo_data, the insert_* helpers and the crawl request
# functions are defined elsewhere in the project and omitted here for brevity.

def normalize_url(base_url, url):
    # Join the base_url with the url to handle relative URLs
    absolute_url = urljoin(base_url, url)
    parsed_url = urlparse(absolute_url)
    # Remove query strings and fragments from the URL
    normalized_url = parsed_url.scheme + "://" + parsed_url.hostname + parsed_url.path.rstrip("/")
    return normalized_url
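
# For reference, this is how normalize_url is intended to behave; the values below are
# worked out by hand from the logic above, not captured program output:
#   normalize_url("https://example.com", "/about?ref=1#team")  -> "https://example.com/about"
#   normalize_url("https://example.com/blog/", "../contact/")  -> "https://example.com/contact"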

def crawl_website(start_url, crawl_request_id, max_pages=100):
    visited_urls = {}
    queue = [normalize_url(start_url, start_url)]
    start_url_normalized = normalize_url(start_url, start_url)

    while queue and len(visited_urls) < max_pages:
        current_url = queue.pop(0)
        if current_url not in visited_urls:
            print(f"Crawling {current_url}")
            seo_data = extract_seo_data(current_url, crawl_request_id)
            print(f"Internal links for {current_url}: {seo_data['internal_links']}")
            if seo_data:
                visited_urls[current_url] = True
                print("Normalized internal links:")
                for link in seo_data["internal_links"]:
                    normalized_link = normalize_url(current_url, link)  # Pass the current_url as base_url
                    print(f"{link} -> {normalized_link}")
                    if normalized_link not in visited_urls and normalized_link not in queue and normalized_link != start_url_normalized:
                        queue.append(normalized_link)
                print(f"Queue: {queue}")

                # Insert the main SEO data and get the page_id
                insert_page_data(connection, seo_data, crawl_request_id)
                page_id = connection.cursor().lastrowid

                # Insert the additional data using the new functions and set the page_id
                # Insert external links
                insert_external_links(connection, page_id, seo_data["external_links"])
                # Insert pagination data
                insert_pagination(connection, page_id, seo_data["pagination"])
                # Insert structured data
                if seo_data["has_structured_data"]:
                    insert_structured_data(connection, page_id, seo_data["structured_data"])
                # Insert images
                # insert_images(connection, page_id, seo_data["images"])
                # Insert internal links
                insert_internal_links(connection, page_id, seo_data["internal_links"])

                # Set the page_id in the visited_urls dictionary
                visited_urls[current_url] = page_id

    return list(visited_urls.keys())

if __name__ == "__main__":
    # Connect to the database
    connection = connect_to_database("localhost", "root", "", "laravel")
    start_url = "https://example.com"  # tests are not performed on this URL, of course
    max_pages = 10

    # Create a crawl request
    crawl_id = create_crawl_request(connection, start_url)
    print(f"Created crawl request with ID {crawl_id}")

    # Update the crawl request status to 'running'
    update_crawl_request(connection, crawl_id, None, 'running')

    try:
        # Crawl the website
        visited_urls = crawl_website(start_url, crawl_id, max_pages)

        # Print the visited URLs
        print(f"Visited URLs ({len(visited_urls)}):")
        for url in visited_urls:
            print(url)

        # Update the crawl request status to 'completed' and set the end_url
        end_url = visited_urls[-1] if visited_urls else None
        update_crawl_request(connection, crawl_id, end_url, 'completed')
    except Exception as e:
        print(f"Error during crawling: {e}")
        update_crawl_request(connection, crawl_id, None, 'failed')

    # Close the database connection
    connection.close()
I have tried printing the values before and after, but I just can't find the logic error. This is the printed output:
Connecting to the database...
Connected to the database.
Created crawl request with ID 13
Crawling https://example.com
Internal links found in https://example.com: set()
Internal links for https://example.com:
Normalized internal links:
Visited URLs (1):
https://example.com
It looks like the normalized URLs never get added, so the script only crawls 1 URL instead of following the links it collects.
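In case it is relevant: the internal-link collection inside extract_seo_data is meant to follow roughly the pattern below. This is only a simplified sketch for illustration; the function name collect_internal_links, the requests/BeautifulSoup setup and the variable names are placeholders, not the exact code from my project.

from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

def collect_internal_links(page_url):
    # Fetch the page and parse all anchor tags
    response = requests.get(page_url, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")

    internal_links = set()
    page_host = urlparse(page_url).hostname
    for anchor in soup.find_all("a", href=True):
        href = urljoin(page_url, anchor["href"])
        # Keep only links that stay on the same host as the page being crawled
        if urlparse(href).hostname == page_host:
            internal_links.add(href)
    return internal_links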