我正在尝试使用 find_elements 遍历每个帖子,第一次点击总是有效,但第二次点击总是失败,但我需要页面来遍历每个帖子。
我试过添加 Javascript 点击,但没有用,只是不知道如何继续。 它真正应该做的是浏览每篇文章,我也试过添加一个 WebDriverWait 脚本,或者只是在第二次点击前添加 10 秒,但仍然给我同样的错误。
import json
import re
import time

import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
# specify the URL you want to scrape
url = 'https://www.depop.com/_realvintage/'

chrome_options = Options()
# Keep the browser window open after the script finishes.
chrome_options.add_experimental_option("detach", True)
# NOTE(review): positional driver path + chrome_options= is the Selenium 3 calling
# convention; Selenium 4 removed both — confirm the installed Selenium version.
driver = webdriver.Chrome("D:\\Selenium_python2\\chromedriver.exe", chrome_options=chrome_options)
# driver.maximize_window()

# game plan for this:
# scroll down till encounters first sold listing ////
# then stop scrolling ////
# count all unsold listings, maybe count all elements until you get to
# the total title number, then parse them all
total_titles = 0
ovr_listings = 0

driver.get(url)
time.sleep(1)
# Dismiss the consent/banner button so it cannot intercept later clicks.
driver.find_element('xpath', '//*[@id="__next"]/div/div[2]/div[2]/button[2]').click()
time.sleep(1.5)
current_url = driver.current_url
# Removed: a window.scrollTo() issued BEFORE driver.get() (no page loaded, no-op),
# and an unused requests/BeautifulSoup fetch of current_url (bs4 was never
# imported, so that line raised NameError; the soup was never read anyway).
#need a new code so that when the first sold liting appears on the page, the scroll_down_script() ends
def scroll_down_script():
    """Scroll the profile page and click 'load more' until at least three
    'Sold' labels are visible, then stop.

    The page grows as listings load, so both the scroll and the button lookup
    are redone on every pass — a WebElement cached before a DOM update goes
    stale and the original single lookup crashed on the second iteration.
    """
    while True:
        time.sleep(1)
        # Scroll down to the bottom of the (current) document.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        try:
            # Re-locate the button each pass: a stale reference raises here.
            load_more_button = driver.find_element('xpath', '//*[contains(@class, "sc-gFGZVQ ewexUW")]')
            driver.execute_script("arguments[0].scrollIntoView(true)", load_more_button)
            time.sleep(1)
            load_more_button.click()
        except Exception:
            # Button missing/stale/obscured this pass — keep scrolling and retry.
            pass
        time.sleep(3)
        # Stop once enough sold listings have scrolled into the DOM.
        # find_elements returns [] rather than raising, so no try/except needed.
        sold_text = driver.find_elements('xpath', '//*[contains(text(), "Sold")]')
        if len(sold_text) >= 3:
            all_post = driver.find_elements('xpath', '//*[contains(@class, "styles__PrimaryProductImage-sc-__dbpyge-1 jUQFmU")]')
            print(len(sold_text))
            print(len(all_post))
            print('sold_listing found, stopping function')
            break
# Index of the next tile to visit; shared with title_parsing for log messages.
count = 0

def each_post():
    """Open every listing tile on the profile page in turn and parse it.

    Navigating into a listing and coming back (title_parsing calls
    driver.back()) invalidates every previously-found WebElement, so the
    tiles are re-located at the START of each iteration. The original code
    refreshed the list right after clicking — i.e. while still on the
    product page — which left the second click operating on a stale element.
    """
    global count
    print("starting item count " + str(count))
    time.sleep(3)
    print('reading all posts')
    tile_xpath = '//*[contains(@class, "styles__PrimaryProductImage-sc-__dbpyge-1 jUQFmU")]'
    total = len(driver.find_elements('xpath', tile_xpath))
    print('length of posts')
    print(total - count)
    while count < total:
        print('start post ' + str(count))
        # Re-find the tiles now that we are back on the profile page.
        posts = driver.find_elements('xpath', tile_xpath)
        if count >= len(posts):
            # Page rendered fewer tiles than expected — nothing left to click.
            break
        target = posts[count]
        try:
            driver.execute_script("arguments[0].scrollIntoView(true)", target)
        except Exception:
            pass
        print('clicking good')
        time.sleep(2)
        try:
            target.click()
            print('click 1')
        except Exception:
            # Fall back to a JS click when something overlays the tile.
            driver.execute_script("arguments[0].click();", target)
            print('click 2')
        time.sleep(2)
        title_parsing()  # parses the listing, then driver.back()s to the profile
        count += 1
all_titles = []
def hashtag_parser(word):
global overall_title
ind_title = []
time.sleep(2)
try:
wordd = word.split("PLEASE READ CAREFULLY!! Some items may have unlisted markings, but for the most part all marking would be listed. These clothes are often old and used, so tend to not be in pristine condition. Most items that are listed here will by default be used, unless stated otherwise. Items have not been washed so we also highly recommend that you wash before putting it on. All items have been handpicked by us. Thank you for choosing to support us at RealVintage._ and if there is anything we can do to make your experience better feel free to send us a message! We also give large discounts if purchasing stuff in bulk.")
except:
wordd = word
for letter in wordd:
if letter != '#' or ',' or '/n' or "PLEASE":
ind_title.append(letter)
else:
overall_title = ''.join(ind_title)
print(overall_title)
if overall_title not in all_titles:
all_titles.append(overall_title)
break
title =''.join(ind_title)
item_title = title.split('#')[0].strip()
print(item_title)
def _print_cell(cells, index, label):
    """Print cells[index].text, or a 'No <label> for <count>' fallback when
    the cell is missing or unreadable."""
    try:
        print(cells[index].text)
    except Exception:
        print('No ' + label + ' for ' + str(count))


def title_parsing():
    """Parse the currently-open listing page (title, price, images, size,
    condition, style, color, category, likes), print each field, then
    driver.back() to the profile page.

    Field positions in the attribute tables vary with how many cells the
    listing has, hence the length checks before indexing.
    """
    global ovr_listings
    global count
    print('start title parsing')
    time.sleep(2)

    # Product metadata is embedded as JSON-LD in a <script> tag.
    script_tag = driver.find_element('css selector', "[data-testid='meta-schema__json-ld']")
    data = json.loads(script_tag.get_attribute("innerHTML"))

    # Title (description holds the title + hashtags + boilerplate).
    hashtag_parser(data["description"])
    # Price
    print(data['offers']['price'])
    # Images
    images = data['image']
    print(images)
    print(len(images))

    # Size / brand / condition table — cell meaning depends on cell count.
    size_condition = driver.find_elements('xpath', '//*[contains(@class, "TableCell-sc-__sc-12y8so1-0 bWenjz")]')
    if len(size_condition) >= 3:
        _print_cell(size_condition, 0, 'size')
        _print_cell(size_condition, 1, 'brand')
        _print_cell(size_condition, 2, 'condition')
    else:
        _print_cell(size_condition, 0, 'size')
        _print_cell(size_condition, 1, 'condition')

    # Styles (comma-separated in one cell).
    style_color = driver.find_elements('xpath', '//*[contains(@class, "TableCell-sc-__sc-12y8so1-0 kRbwCZ")]')
    try:
        for style_name in style_color[0].text.split(', '):
            print(style_name)
    except Exception:
        print('no style for ' + str(count))
    # Colors (up to two, comma-separated).
    try:
        colors = style_color[1].text.split(", ")
        if len(colors) >= 2:
            print(colors[0])
            print(colors[1])
        else:
            print('only 1 color')
    except Exception:
        print('no color for ' + str(count))

    # Breadcrumb categories; which indices hold them depends on depth.
    category = driver.find_elements('css selector', 'span[itemprop="name"]')
    # Need index 5, so require at least 6 entries (the original checked >= 5,
    # an off-by-one that always fell into the except branch at exactly 5).
    if len(category) >= 6:
        _print_cell(category, 5, 'specific category')
        _print_cell(category, 4, 'category')
    else:
        _print_cell(category, 3, 'specific category')
        _print_cell(category, 2, 'category')

    # Likes: scrape the rendered page text for "<n> likes".
    all_text = driver.find_element('xpath', "/html/body").text
    match = re.search(r'\b\d{1,3}\b(?=\s+likes)', all_text)
    if match:
        print(int(match.group()))
    else:
        print("No likes found.")

    # Return to the profile page so the caller can click the next tile.
    driver.back()
    time.sleep(3)
    print('end title parsing')
# Entry point: optionally grow the page first, then walk every listing.
# scroll_down_script()
each_post()
# NOTE(review): this re-clicks the same banner/button XPath as during setup —
# presumably dismissing a dialog that reappears after navigation; confirm.
driver.find_element('xpath', '//*[@id="__next"]/div/div[2]/div[2]/button[2]').click()
time.sleep(1.5)