我正在尝试从 https://www.manta.com/ 网站上抓取餐厅名称和电话号码。我使用 selenium 来自动化整个任务,因为该网站本质上是动态的,并且在运行 python 代码时我得到了以下内容 “权限策略标头错误:未启用原始试验控制功能:‘兴趣群组’。”并得到空列表作为输出。
你们能帮我指出我错在哪里,并推荐我一些可以学习如何抓取动态网站的书籍或网站吗?
这是我的Python代码供您参考。
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
# Path to the ChromeDriver *executable*, as a raw string.
# The original Service('C:\webdrivers') had two problems: '\w' is an
# (invalid) escape sequence inside a normal string literal, and the path
# must point at chromedriver.exe itself, not at the folder containing it.
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
# Set Chrome options for running in headless mode (no visible window).
chrome_options = Options()
chrome_options.add_argument('--headless')
# Create a new ChromeDriver instance
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
# URL of the Manta search-results page you want to scrape
url = "https://www.manta.com/search?search=Restaurants&context=unknown&search_source=nav&city=Dallas&state=Texas&country=United%20States&pt=32.7936%2C-96.7662&device=desktop&screenResolution=1280x720"
# Navigate to the website and give the dynamic content time to render.
driver.get(url)
time.sleep(5)
# Find all the restaurant shop cards.  The original XPath ended in
# div[2], which matches at most ONE element; the trailing un-indexed
# /div matches every card inside the results container.
listings = driver.find_elements(By.XPATH, "/html/body/main/div[3]/div[1]/div[1]/div")
# Print each card's visible text instead of the opaque WebElement reprs.
for listing in listings:
    print(listing.text)
# Close the driver and quit the browser
driver.quit()
这是我在 vscode 中遇到的错误
DevTools listening on ws://127.0.0.1:49483/devtools/browser/88be036f-53d8-4f6a-b9ff-5b103ad5e6ff
[0525/152704.760:INFO:CONSOLE(0)] "Error with Permissions-Policy header: Origin trial controlled feature not enabled: 'interest-cohort'.",
source: (0)
[0525/152705.055:INFO:CONSOLE(0)] "Error with Permissions-Policy header: Origin trial controlled feature not enabled: 'interest-cohort'.",
source: (0)
[]
这是因为该网站使用 Cloudflare 进行 Bot 检测,并阻止你的 selenium 驱动的 Chrome 实例加载页面。
我已更新你的定位器以获取所有商店卡片并打印其文本,并改用 undetected_chromedriver——它不会触发 Cloudflare 等反机器人服务,并且会自动下载驱动程序二进制文件并对其进行修补。
完整代码
import time
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
# Launch an undetected Chrome instance in (new) headless mode.
uc_options = uc.ChromeOptions()
uc_options.add_argument("--headless=new")
driver = uc.Chrome(options=uc_options)

# Manta search-results page for restaurants in Dallas, TX.
url = (
    "https://www.manta.com/search?search=Restaurants&context=unknown"
    "&search_source=nav&city=Dallas&state=Texas&country=United%20States"
    "&pt=32.7936%2C-96.7662&device=desktop&screenResolution=1280x720"
)

# Load the page, then pause so the dynamic listings can render.
driver.get(url)
time.sleep(5)

# Grab every shop card under the results container and dump its text.
cards = driver.find_elements(By.XPATH, "/html/body/main/div[3]/div[1]/div[1]/div")
for card in cards:
    print(card.text)

# Shut the browser down.
driver.quit()
打印
QualitY Restaurants Dallas
1130 S Bowen Rd
Dallas, TX
(336) 536-4955
Visit Website
CLAIMED
Categori...
也帮帮我吧
import os
import requests
from bs4 import BeautifulSoup
import moviepy.editor as mymovie
import random
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import chromedriver_autoinstaller
## Configure Chrome: headless (no GUI) plus flags that keep it stable
## when /dev/shm is small or sandboxing is unavailable.
chrome_options = webdriver.ChromeOptions()
for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)

# Download (if necessary) and register a chromedriver binary that
# matches the locally installed Chrome version.
chromedriver_autoinstaller.install()

# Search page for portrait-orientation "motivational" stock videos.
url = "https://www.pexels.com/search/videos/motivational/?orientation=portrait"
# Function to get video links
def get_video_links(url, chrome_options):
    """Open *url* in Chrome and return the ``src`` of every ``<video>``.

    Parameters
    ----------
    url : str
        Page to scrape.
    chrome_options : selenium.webdriver.ChromeOptions
        Options used to launch the browser.

    Returns
    -------
    list of str
        One URL per ``<video>`` tag that has a ``<source src=...>`` child.
    """
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always release the browser, even if the page load raises --
        # the original version leaked the Chrome process on error.
        driver.quit()
    video_links = []
    for video in soup.find_all("video"):
        source = video.find("source")
        # Skip <video> tags without a <source> child, and <source> tags
        # without a src attribute (source["src"] would raise KeyError).
        if source and source.get("src"):
            video_links.append(source["src"])
    return video_links
# Function to download and edit videos
def download_and_edit_videos(video_links):
    """Download each video, overlay a randomly chosen song, and save it.

    Parameters
    ----------
    video_links : list of str
        Direct URLs of the videos to download.

    Side effects: creates ``songs/`` and ``videos/`` directories, writes
    the raw downloads into the current directory, and writes the edited
    clips to ``videos/vid<i>.mp4``.  Prompts on stdin for the number of
    songs available under ``songs/audio<n>.mp3``.
    """
    songs = input("How many songs you have? ")
    if not os.path.exists("songs"):
        os.makedirs("songs")
    if not os.path.exists("videos"):
        os.makedirs("videos")
    i = 1
    for link in video_links:
        # Derive a filename from the URL path, dropping any query string.
        fn = link.split('/')[-1].split("?")[0]
        print("Downloading video: %s" % fn)
        r = requests.get(link, stream=True)
        # Fail loudly on HTTP errors instead of silently writing an HTML
        # error page into the .mp4 file (the original skipped this check).
        r.raise_for_status()
        with open(fn, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
        print("%s downloaded!" % fn)
        # Pick one of the user's songs at random.
        random_song = random.randint(1, int(songs))
        audio_file = f"songs/audio{random_song}.mp3"
        if not os.path.exists(audio_file):
            print(f"Audio file {audio_file} not found.")
            continue
        print(f"Using audio file: {audio_file}")
        clip = mymovie.VideoFileClip(fn)
        try:
            # Trim/extend the song to exactly the video's duration.
            audioclip = mymovie.AudioFileClip(audio_file).set_duration(clip.duration)
            try:
                new_audioclip = mymovie.CompositeAudioClip([audioclip])
                final_clip = clip.set_audio(new_audioclip)
                output_file = f"videos/vid{i}.mp4"
                print(f"Writing edited video to: {output_file}")
                final_clip.write_videofile(output_file, fps=60)
                print(f"{fn} has been edited and saved as {output_file}\n")
            finally:
                audioclip.close()
        finally:
            # Close the ffmpeg readers so subprocesses and file handles
            # are released (the original leaked one pair per video).
            clip.close()
        i += 1
if __name__ == "__main__":
    # Ask how many videos to process; the original prompted for this
    # number but then ignored it and downloaded every link found.
    vids = input("How many videos you want to download? ")
    video_links = get_video_links(url, chrome_options)
    download_and_edit_videos(video_links[:int(vids)])