我一直在尽力寻找合适的 XPath 来抓取点赞数、日期、描述等信息,但每次程序都什么也没有返回。我尝试过更改 XPath,例如:
likes_xpath = '//*[@id="react-root"]/section/main/div/div[1]/article/div[3]/section[2]/div/div/button/span'
然后改为:
likes_xpath = '//div[@class="likes"]/section/div/div/span/a/span'
转念一想,这似乎是错误的做法。我不明白如何编写简短的 XPath,使程序能够长期正常工作并且更加可靠,也找不出提取此类内容的正确方法。此时我甚至怀疑是它周围的代码有问题:
def __lcondition(self, link):
    """Tell whether ``link`` points at a post page.

    Args:
        link: Element-like object exposing ``get_attribute('href')``.

    Returns:
        True when the element's ``href`` contains ``'.com/p/'``; False
        otherwise, including when the element has no ``href`` at all.
    """
    href = link.get_attribute('href')
    # Selenium's get_attribute() returns None for an absent attribute, and
    # `'...' in None` raises TypeError, so treat that as "not a post link".
    return href is not None and '.com/p/' in href
def __get_user(self):
    """Return the user segment taken from the post header's profile link.

    Finds the anchor addressed by the header XPath and returns the
    second-to-last ``/``-separated piece of its ``href``.
    NOTE(review): presumably the href ends with ``/<user>/`` - confirm
    against the live page.
    """
    # Built with implicit concatenation rather than a backslash continuation
    # inside the literal, so source indentation can never leak into the XPath.
    user_xpath = (
        '//*[@id="react-root"]'
        '/section/main/div/div[1]/article/header/div[2]/div[1]/div[1]/span/a'
    )
    # The locator constant is By.XPATH; the previous spelling (By.XAPTH)
    # raised AttributeError before any lookup was attempted.
    profile_link = self.__driver.find_element(By.XPATH, user_xpath)
    return profile_link.get_attribute('href').split('/')[-2]
def filter_links(self, links):
    """Keep only the elements whose ``href`` points at a post page.

    Args:
        links: Iterable of element-like objects exposing
            ``get_attribute('href')``.

    Returns:
        A list with the elements whose ``href`` contains ``'.com/p/'``,
        in their original order. Elements whose ``href`` cannot be read
        or tested are skipped after logging a warning.
    """
    post_links = []
    for link in links:
        # Only the lookup/containment test can fail, so it is the only thing
        # guarded. `except Exception` (instead of a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate.
        try:
            is_post = '.com/p/' in link.get_attribute('href')
        except Exception:
            logger.warning("A https://www.instagram.com/p/ link was not found")
            continue
        if is_post:
            post_links.append(link)
    return post_links
def __get_subtitles(self):
    """Return the text of the element addressed by the subtitles XPath.

    The element's ``innerHTML`` is read and its markup is stripped with
    BeautifulSoup, leaving only the text content.
    NOTE(review): presumably this element holds the post caption - confirm.
    """
    subtitles_xpath = '/html/body/div[1]/section/main/div/div[1]/article/div[3]/div[1]/ul/div/li/div/div/div[2]/span'
    element = self.__driver.find_element(By.XPATH, subtitles_xpath)
    inner_html = element.get_attribute('innerHTML')
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines.
    return BeautifulSoup(inner_html, 'html.parser').get_text()
def __get_image_description(self) -> str:
    """Return the description found in the second ``<img>``'s ``alt`` text.

    Returns:
        The text following ``"Image may contain: "`` in the ``alt``
        attribute of the second ``img`` element on the page, or ``''`` when
        there is no such element, the attribute is missing or empty, or the
        marker is absent (a warning is logged in that last case).
    """
    images = self.__driver.find_elements(By.TAG_NAME, 'img')
    # Indexing [1] on a page with fewer than two images would raise
    # IndexError; there is simply no description to report then.
    if len(images) < 2:
        return ''
    alt_text = images[1].get_attribute('alt')
    # A missing attribute comes back as None; treat it like an empty one.
    if not alt_text:
        return ''
    parts = alt_text.split("Image may contain: ")
    if len(parts) >= 2:
        # Marker present: the original code labels this the "image" case.
        return parts[1]
    # Marker absent: the original code labels this the "video" case.
    logger.warning("Description is not available")
    return ''
def __get_likes(self):
    """Return the ``innerHTML`` of the element matched by the likes XPath."""
    like_counter = self.__driver.find_element(
        By.XPATH,
        '//div[@class="likes"]/section/div/div/span/a/span',
    )
    return like_counter.get_attribute('innerHTML')
def __get_views(self):
    """Return the ``innerHTML`` of the element matched by the views XPath."""
    view_counter = self.__driver.find_element(
        By.XPATH,
        '/html/body/div[1]/section/main/div/div[1]/article/div[3]/section[2]/div/span/span',
    )
    return view_counter.get_attribute('innerHTML')
def __get_date(self):
    """Return the ``datetime`` attribute of the element matched by the date XPath."""
    # NOTE(review): this path looks for a <time> nested inside another
    # <time class="_aaqe">; verify it against the live page markup.
    timestamp = self.__driver.find_element(
        By.XPATH,
        '//time[@class="_aaqe"]/div/a/span/time',
    )
    return timestamp.get_attribute('datetime')
def __get_image_data(self, link):
    """Collect the fields of an image post into a dict.

    Args:
        link: Value stored unchanged under the ``'link'`` key.

    Returns:
        A dict with the keys ``date``, ``type`` (always ``'image'``),
        ``user``, ``subtitles``, ``image_description``, ``likes`` and
        ``link``; or None when no date could be read.
    """
    date = self.__get_date()
    # Truthiness covers both an empty string and None (a missing attribute);
    # calling len() on None would raise TypeError instead of returning None.
    if not date:
        return None
    # Entries are evaluated top to bottom, so the lookups run in the same
    # order as before. The views field is not collected for image posts.
    return {
        'date': date,
        'type': 'image',
        'user': self.__get_user(),
        'subtitles': self.__get_subtitles(),
        'image_description': self.__get_image_description(),
        'likes': self.__get_likes(),
        'link': link,
    }
def __get_video_data(self, link):
    """Collect the fields of a video post into a dict.

    Args:
        link: Value stored unchanged under the ``'link'`` key.

    Returns:
        A dict with the keys ``date``, ``type`` (always ``'video'``),
        ``user``, ``subtitles``, ``likes`` and ``link``; or None when no
        date could be read.
    """
    date = self.__get_date()
    # Truthiness covers both an empty string and None (a missing attribute);
    # calling len() on None would raise TypeError instead of returning None.
    if not date:
        return None
    # Entries are evaluated top to bottom, so the lookups run in the same
    # order as before. NOTE: the views lookup is currently disabled here.
    return {
        'date': date,
        'type': 'video',
        'user': self.__get_user(),
        'subtitles': self.__get_subtitles(),
        'likes': self.__get_likes(),
        'link': link,
    }
有人可以帮我找到这种情况下的错误吗?
有时 XPath 很长,但我们往往可以根据需要将其缩短(缩短时仍要对照 HTML 的层次结构)。以上面提供的 XPath 为例:
likes_xpath = '//*[@id="react-root"]/section/main/div/div[1]/article/div[3]/section[2]/div/div/button/span'
我们也可以这样做:
likes_xpath = '//*[@id="react-root"]//article/div[3]/section[2]//div/button/span'
为了验证 XPath 是否正确,有许多浏览器扩展可以使用。我个人使用 XPath Helper,目前最受欢迎的是 SelectorsHub,你可以自行选择。