我正在尝试使用RegEx在JSON数据中查找列表。这是我的代码:
import requests
from bs4 import BeautifulSoup
import re
import json
# Download the raw HTML of the hotel review page.
source = requests.get('https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS').text
soup = BeautifulSoup(source, 'lxml')
# Capture the object literal assigned to pageManifest inside the inline
# script; group(1) is everything from the first "{" to the last "}" before "};".
pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
# Locate the <script> element whose text matches the pattern.
script = soup.find("script", text=pattern)
dictData = pattern.search(script.text).group(1)
# Parse the captured text; from here on jsonData is a Python object,
# no longer a JSON string.
jsonData = json.loads(dictData)
pattern2 = re.compile(r'^\"[0-9]*\":{\"data\":{\"locations\":(.*)},')
# NOTE(review): str(jsonData) is the repr of the parsed Python object, which
# begins with "{", while pattern2 is anchored with ^ to a double-quoted numeric
# key at the very start of the string. search() therefore returns None and
# .group(1) raises AttributeError. Index into jsonData (or walk it
# recursively) instead of running a regex over its string form.
data_list = pattern2.search(str(jsonData)).group(1)
print(data_list)
我使用正则表达式 `pattern2 = re.compile(r'^\"[0-9]*\":{\"data\":{\"locations\":(.*)},')` 来查找 `locations` 的值(一个列表),但运行时报错:`AttributeError: 'NoneType' object has no attribute 'group'`。
我要查找的JSON数据部分看起来像这样:
"3960485871": {
"data": {
"locations": [
{
"detail": {
"hotel": {
"aliases": [
{
"id": 1099146,
"locale": "de",
"score": 390000,
"text": "hotel coronado"
},
{
"id": 1261196,
"locale": "es",
"score": 260000,
"text": "hotel coronado"
},
{
"id": 261321,
"locale": null,
"score": 112500,
"text": "coronado hotel z\u00fcrich"
}
],
"details": {
"numRooms": 40
}
},
"priceRange": {
"maximum": 212,
"minimum": 133
}
},
"formerName": null,
"locationId": 228146,
"neighborhoods": [],
"parents": [
{
"locationId": 188113,
"name": "Z\u00fcrich",
"placeType": "MUNICIPALITY"
},
{
"locationId": 188111,
"name": "Kanton Z\u00fcrich",
"placeType": "CANTON"
},
{
"locationId": 188045,
"name": "Schweiz",
"placeType": "COUNTRY"
},
{
"locationId": 4,
"name": "Europa",
"placeType": "CONTINENT"
},
{
"locationId": 1,
"name": "Welt",
"placeType": null
}
]
}
]
}
},
可以试试下面的方法(原始输入数据太大,这里用精简后的数据来演示):
# Trimmed-down sample with the same nesting as the real payload:
# a numeric-string key -> "data" -> "locations" -> list of location dicts.
jsonData = {
    "3960485871": {
        "data": {
            "locations": [
                {
                    "detail": {},
                    "formerName": None,
                    "locationId": 228146,
                    "neighborhoods": [],
                    "parents": [],
                },
            ],
        },
    },
}
def find_recursive(data, type_):
    """Return the first object of type ``type_`` found inside ``data``.

    The search is depth-first and stops at the first match. ``data`` itself
    is tested before its contents, so passing an object that already is a
    ``type_`` returns it unchanged.

    Args:
        data: The object to search. Lists and tuples are searched element by
            element, dicts are searched through their values (keys are not
            inspected). Any other object is treated as a leaf.
        type_: A type, or tuple of types, as accepted by ``isinstance``.

    Returns:
        The first matching object in traversal order, or ``None`` when
        nothing matches. Because ``None`` is the "not found" marker, a
        search for ``type(None)`` cannot tell a hit from a miss.
    """
    # A direct hit ends the search; containers of the wanted type are
    # returned whole rather than descended into.
    if isinstance(data, type_):
        return data

    # Choose what to descend into. Extend this chain to support further
    # container types.
    if isinstance(data, (list, tuple)):
        children = data
    elif isinstance(data, dict):
        children = data.values()
    else:
        # Leaf that is not of the wanted type: nothing to find here.
        return None

    for child in children:
        found = find_recursive(child, type_)
        # Compare against None explicitly so falsy matches (0, "", [])
        # are still returned.
        if found is not None:
            return found

    # Every child was searched without success.
    return None
# Search the sample structure for the first list it contains. The return
# value is not stored here; assign it to a name (or run this in an
# interactive session) to inspect it.
find_recursive(jsonData, list)
用法:`find_recursive(DATA, TYPE)`,其中 `DATA` 是嵌套容器,`TYPE` 是要查找的 Python 类型。该函数目前支持把列表、元组和字典作为嵌套容器;如果需要支持其他容器(例如集合),只需按同样的方式遍历其元素,并在递归结果不为 `None` 时返回该结果即可。