我是 Python 新手,需要解析网站上所有的录制(recording)URL。我尝试了下面的程序,它找不到录制链接,却能打印出网页中的其他链接。我不了解网站设计,也尝试过使用人工智能工具和 Stack Overflow,但在任何地方都找不到解决方案。您能否指出我在这里犯的错误,或者告诉我应该用什么其他方式来解析它?
我通过“检查元素”在网页中找到的示例录制 URL:
这是我尝试过的代码片段:
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent header; parse_page passes this dict to requests.get.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    ),
}
def parse_page(url):
    """Fetch *url* and print the ``href`` of every anchor in the returned HTML.

    Only the markup contained in the HTTP response is inspected; no scripts
    are executed, so links that a page inserts afterwards are not seen.

    Args:
        url: Address of the page to download.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    # A timeout keeps the call from blocking forever on an unresponsive host.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # href=True restricts the search to <a> tags that carry an href attribute.
    for anchor in soup.find_all('a', href=True):
        print(anchor['href'])
# Search page address; the webinar_recording filter, paging and sort settings
# are all carried in the URL fragment (the part after '#').
base_url = (
    'https://www.vector.com/int/en/search/'
    '#type=%5B%22webinar_recording%22%5D&page=1&pageSize=50&sort=date&order=desc'
)
parse_page(base_url)
您在页面上看到的数据是由 JavaScript 另外发起请求加载的,并不包含在页面的初始 HTML 中,所以 BeautifulSoup 看不到它。要模拟这个请求,您可以尝试:
import requests
def _terms_agg(field):
    """Return a terms aggregation on *field*, keys ascending, size 1000."""
    return {"terms": {"field": field, "order": {"_key": "asc"}, "size": 1000}}


def _absent_or_in_range(field, bound):
    """Return a clause matching when *field* is absent or satisfies *bound*."""
    return {
        "bool": {
            "should": [
                {"bool": {"must_not": {"exists": {"field": field}}}},
                {"range": {field: bound}},
            ]
        }
    }


def _download_type_is(kind):
    """Return a match clause on ``downloadType`` for *kind*."""
    return {"match": {"downloadType": {"query": kind}}}


# Aggregation name -> document field it is computed over (order is preserved
# in the generated request body).
_AGG_FIELDS = {
    "categories": "downloadType",
    "content_type": "type",
    "file_type": "fileType",
    "languages": "categoryFileLanguageDefaultLang",
    "products": "categoryProductDefaultLang",
    "standards": "categoryStandardDefaultLang",
    "topics": "categoryTopicDefaultLang",
}

# downloadType values listed under must_not in the query.
_EXCLUDED_DOWNLOAD_TYPES = ("demos", "software", "drivers", "freeware", "service")

# Restrict results to documents whose type is "webinar_recording".
_type_filter = {
    "match": {
        "type": {"boost": 1, "operator": "AND", "query": "webinar_recording"}
    }
}

_bool_query = {
    "filter": [_type_filter],
    "must": [
        # endtime missing or not yet passed; starttime missing or already reached.
        _absent_or_in_range("endtime", {"gte": "now"}),
        _absent_or_in_range("starttime", {"lte": "now"}),
    ],
    "must_not": [
        {"term": {"type": {"value": "marketingitems"}}},
        *(_download_type_is(kind) for kind in _EXCLUDED_DOWNLOAD_TYPES),
    ],
    "should": [],
}

# JSON body posted to the search endpoint.
payload = {
    "aggs": {name: _terms_agg(field) for name, field in _AGG_FIELDS.items()},
    "explain": False,
    "from": 0,
    "query": {
        "function_score": {
            "boost_mode": "multiply",
            "functions": [{"filter": {"match": {"type": "products"}}, "weight": 50}],
            "query": {"bool": _bool_query},
            "score_mode": "first",
        }
    },
    "size": 50,
    "sort": [{"sortdate": "desc"}],
    "suggest": {
        "didYouMean": {
            "phrase": {
                "direct_generator": [
                    {"field": "didYouMean.trigram", "suggest_mode": "always"}
                ],
                "field": "didYouMean.trigram",
                "gram_size": 3,
                "size": 5,
            },
            "text": "",
        }
    },
}
# Endpoint that receives the search body; this is queried directly instead of
# scraping the rendered search page.
api_url = "https://search.vector.com/int-en/_search/"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
}

# Send the payload as a JSON body. The timeout prevents an indefinite hang on
# an unresponsive server, and raise_for_status surfaces HTTP errors before the
# response body is decoded.
response = requests.post(api_url, headers=headers, json=payload, timeout=30)
response.raise_for_status()
data = response.json()

# Each hit keeps its document fields under "_source"; print title and stream link.
for hit in data["hits"]["hits"]:
    source = hit["_source"]
    print(source["title"])
    print(source["streamingUrl"])
    print("-" * 80)
打印:
Simplify the Simulation, Testing and Measurement of 10BASE-T1S Networks With CANoe/CANalyzer
https://vector-group.webex.com/vector-group/ldr.php?RCID=0ce68a6a7132fb032088f53a6b5cd4b2
--------------------------------------------------------------------------------
Remote Diagnostics and Flashing
https://vector-group.webex.com/vector-group/ldr.php?RCID=7307e0a9000c63ad7dce5523ec058af2
--------------------------------------------------------------------------------
Maintain ODX-based Diagnostic Data Easily, Quickly and Effectively with ODXStudio
https://vector-group.webex.com/vector-group/ldr.php?RCID=4c3f1a4d894482e9899b2c0a64f1914b
--------------------------------------------------------------------------------
...