我正在构建一个网络爬虫,用来爬取 Beatport 前 100 名榜单。我遇到的问题是:有些元素可以成功定位,但另一些元素会报错。
def scrape_beatport():
    """Open the Beatport Top 100 page with a randomized user agent and
    locate the page root plus the track-list wrapper element.

    Fixes over the original:
    - ``fin_elements`` was a typo; the WebDriver API method is
      ``find_elements``, so the original raised ``AttributeError``.
    - In a CSS selector, spaces between class names select *descendant*
      elements; to match one element carrying several classes the names
      must be chained with dots.
    """
    user_agent = UserAgent().random
    chrome_options = Options()
    chrome_options.add_argument(f"user-agent={user_agent}")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get('https://www.beatport.com/top-100')
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div#__next'))
        )
        test1 = driver.find_elements(By.CSS_SELECTOR, 'div#__next')
        # Dot-chain the classes so all three must be on the SAME element.
        test2 = driver.find_elements(
            By.CSS_SELECTOR,
            'div.TracksList-style__Wrapper-sc-3fb03d50-8.bWXYsy.row',
        )
    finally:
        driver.quit()


if __name__ == '__main__':
    scrape_beatport()
这是我的代码。test1 能被找到,它是 body 标签里的一个 div。test2 位于 HTML 结构更深的位置,外面包着好几层 div;当我尝试获取它时,程序抛出了如下错误:
Traceback (most recent call last):
File "/Users/just/Documents/python/yt_test.py", line 84, in <module>
scrape_beatport()
^^^^^^^^^^^^^^^^^
File "/Users/just/Documents/python/yt_test.py", line 71, in scrape_beatport
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.__next div.MainLayout-style__MainWrapper-sc-9f30c253-0 div div.MainLayout-style__Main-sc-9f30c253-1 main.MainLayout-style__MainContent-sc-9f30c253-2 div.TracksList-style__Wrapper-sc-3fb03d50-8')))
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/selenium/webdriver/support/wait.py", line 105, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
Stacktrace:
0 chromedriver 0x0000000105272940 chromedriver + 4368704
1 chromedriver 0x000000010526add4 chromedriver + 4337108
2 chromedriver 0x0000000104e8ec04 chromedriver + 289796
3 chromedriver 0x0000000104ed0e00 chromedriver + 560640
4 chromedriver 0x0000000104f095ec chromedriver + 792044
5 chromedriver 0x0000000104ec5ab4 chromedriver + 514740
6 chromedriver 0x0000000104ec650c chromedriver + 517388
7 chromedriver 0x0000000105236e5c chromedriver + 4124252
8 chromedriver 0x000000010523bc4c chromedriver + 4144204
9 chromedriver 0x000000010521c824 chromedriver + 4016164
10 chromedriver 0x000000010523c57c chromedriver + 4146556
11 chromedriver 0x000000010520e2d8 chromedriver + 3957464
12 chromedriver 0x000000010525bec4 chromedriver + 4275908
13 chromedriver 0x000000010525c040 chromedriver + 4276288
14 chromedriver 0x000000010526aa34 chromedriver + 4336180
15 libsystem_pthread.dylib 0x000000018a0bd034 _pthread_start + 136
16 libsystem_pthread.dylib 0x000000018a0b7e3c thread_start + 8
由于 HTML 内容很多,这里就不贴了,请自行打开 Beatport Top 100 页面查看其 HTML 结构。
非常感谢!
您的代码在等待元素出现时超时了,根本原因是定位器(locator)写得不正确,所以该元素永远不会被匹配到。
我已经更新了下面的代码,并且它可以正常工作,会提取出所有曲目的标题。如果您想获取每行里的更多信息,请确保从 row 开始搜索,例如使用 row.find_element(),而不是 driver.find_element()。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def scrape_beatport():
    """Print the number of tracks and every track title from the
    Beatport Top 100 page.

    Waits up to 10 seconds for all track rows to become visible (the
    page renders client-side), then reads each row's title element.
    The random user-agent / Options setup from the question is not
    needed for the fix, so a plain Chrome driver is used here.
    """
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.beatport.com/top-100')
        waiter = WebDriverWait(driver, 10)
        # data-testid attributes are stable, unlike the hashed CSS class names.
        row_locator = (By.CSS_SELECTOR, "div[data-testid='tracks-table-row']")
        track_rows = waiter.until(EC.visibility_of_all_elements_located(row_locator))
        print(len(track_rows))
        # Search relative to each row so every title belongs to its own track.
        titles = [
            track_row.find_element(By.CSS_SELECTOR, "div[data-testid='marquee-parent']").text
            for track_row in track_rows
        ]
        for track_title in titles:
            print(track_title)
    finally:
        # Always shut the browser down, even if the wait times out.
        driver.quit()


if __name__ == '__main__':
    scrape_beatport()
输出
100
Waiting For Tonight Extended Mix
Go Back feat. Julia Church Original Mix
Last Night Anyma x Layton Giordani Extended Remix
It's Not Right But It's Okay Extended
...
补充说明:在 CSS 选择器里,类名之间的空格表示“后代元素”,所以不能把多个类名用空格直接传给 CSS_SELECTOR;要么用点号把类名连起来(例如 'div.TracksList-style__Wrapper-sc-3fb03d50-8.bWXYsy.row'),要么改用 XPATH 按 class 属性的完整字面值匹配:
这样写应该有效:
# XPath's @class comparison matches the attribute's full literal value,
# so the space-separated class string works here (unlike a CSS selector,
# where spaces denote descendant elements).
test2 = driver.find_elements(By.XPATH, '//div[@class="TracksList-style__Wrapper-sc-3fb03d50-8 bWXYsy row"]')