下面的代码可以提取该网站的所有链接:
from bs4 import BeautifulSoup
import requests
# Download the listing page. The timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() surfaces HTTP
# errors instead of silently parsing an error page.
r = requests.get(
    "https://www.drishtiias.com/current-affairs-news-analysis-editorials",
    timeout=30,
)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

# Every anchor element in the document, wherever it appears.
links = soup.find_all("a")
for link in links:
    # link.get("href") is None for anchors without an href attribute, and
    # link.string is None when the anchor has more than one child node.
    print("Link:", link.get("href"), "Text:", link.string)
但我想从网站的某些部分提取链接。
我试过这个
# Selects only <a> tags whose own class attribute contains "box-hide";
# it does not match <a> tags that are merely nested inside an element
# carrying that class.
links = soup.find_all("a", class_='box-hide')
但它不起作用
我需要的 href 位于以下 XPath 路径对应的 <a> 标签中:
/html/body/section[1]/div[2]/div/article/div[1]/div[1]/div/div/ul/li[2]/a
另外,有没有办法将这些链接导出到文本文件,并在提取到一定数量的链接后停止循环?
谢谢
以下代码将帮助您仅提取 .box-hide 类中的链接,并将它们保存在“links.txt”文件中。祝您准备顺利!!
import undetected_chromedriver as uc
from bs4 import BeautifulSoup as bs
from selenium.webdriver.common.by import By
import time
def write_links_to_file(links, filename, limit=None):
    """Write links to a text file, one per line.

    The file is created or truncated and written as UTF-8.

    Args:
        links: Iterable of link strings. ``None`` entries (for example
            anchors that had no ``href`` attribute) are skipped.
        filename: Path of the file to write.
        limit: Optional maximum number of links to write. ``None`` (the
            default) writes every link.
    """
    written = 0
    # Explicit encoding so the output does not depend on the platform default.
    with open(filename, "w", encoding="utf-8") as file:
        for link in links:
            if limit is not None and written >= limit:
                break
            if link is None:
                # Nothing to write; concatenating None with "\n" would raise.
                continue
            file.write(f"{link}\n")
            written += 1
page_link = "https://www.drishtiias.com/current-affairs-news-analysis-editorials"
filename = "links.txt"
# Set to an integer to keep only the first N links; None keeps them all.
max_links = None

options = uc.ChromeOptions()
options.add_argument("--headless")
driver = uc.Chrome(options=options)
try:
    driver.get(page_link)
    # Fixed pause before reading the page source -- presumably to let the
    # page finish rendering its dynamic content.
    time.sleep(5)
    soup = bs(driver.page_source, "lxml")
finally:
    # quit() ends the whole browser session (a preceding close() is
    # redundant); running it in finally avoids leaking a Chrome process
    # when loading or parsing fails.
    driver.quit()

# Anchors nested inside elements with class "box-hide". The [href] filter
# drops anchors without an href so the list never contains None.
a_tags = soup.select(".box-hide a[href]")
links = [a_tag["href"] for a_tag in a_tags]
# Slicing with None is a no-op, so the default keeps every link.
links = links[:max_links]

write_links_to_file(links, filename)