我有以下 JSON 文件,我想使用 python 处理它(即从某些字段获取数据并用它进行一些计算和分析)。
JSON 文件:
{
"object1":{
"field1.1":"value1",
"field1.2":"value2",
"field1.3":"value3",
"field1.4":"value4",
"field1.5":"value5",
"field1.6":"value6",
"field1.7":"",
"field1.8":"",
"field1.9":"",
"field1.10":""
},
"object2":[
{
"field2.1":0,
"field2.2":"value2",
"field2.3":"value3",
"field2.4":"20"
}
],
"object3":{
"field3.1":0,
"field3.2":0,
"field3.3":"value5"
},
"object4":[
{
"field4.1.1":"value1",
"field4.1.2":"10",
"field4.1.3":128,
"field4.1.4":0
},
{
"field4.1.1":"value1",
"field4.1.2":"1400",
"field4.1.3":"value5",
"field4.1.4":1
}
],
"object5":[
{
"field5.1":0,
"field5.2":"value1",
"field5.3":"value7",
"field5.4":"6"
}
]
}
如何使用 Python Pandas 读取这个 JSON 文件?我尝试执行 pd.read_json(json_path),但无论我把 orient 参数设成哪种取值("split"、"index" 等),总是收到错误。
您可以定义一个函数,把数据中所有嵌套字段展开(扁平化):
def flatten_nested_json_df(df):
    """Recursively flatten every list- and dict-valued column of *df*.

    Dict columns are expanded into prefixed scalar columns
    (``col`` -> ``col.key``); list columns are exploded into one row per
    element.  The process repeats until no fully-nested column remains.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells may contain nested dicts or lists
        (e.g. the result of ``pd.json_normalize`` on raw JSON).

    Returns
    -------
    pandas.DataFrame
        Fully flattened frame.  Original row positions are preserved in
        the ``index`` column added by ``reset_index``; the frame's index
        may contain duplicates after exploding.
    """
    df = df.reset_index()

    def _nested_columns(frame, cols, kind):
        # A column counts as nested only when *every* cell holds `kind`.
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1;
        # fall back gracefully on older versions.
        sub = frame[list(cols)]
        cellwise = getattr(sub, "map", sub.applymap)
        mask = (cellwise(type) == kind).all()
        return mask[mask].index.tolist()

    list_columns = _nested_columns(df, df.columns, list)
    dict_columns = _nested_columns(df, df.columns, dict)

    while list_columns or dict_columns:
        new_columns = []

        for col in dict_columns:
            # Expand each dict into its own prefixed scalar columns.
            expanded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
            expanded.index = df.index
            df = pd.concat([df.drop(columns=[col]), expanded], axis=1)
            new_columns.extend(expanded.columns)

        for col in list_columns:
            # DataFrame.explode pairs each row only with its *own* list
            # elements.  (The original drop + join on a duplicated index
            # produced a cartesian product, duplicating records whenever
            # several list columns were exploded — visible as the repeated
            # identical rows in the sample output.)
            df = df.explode(col)
            new_columns.append(col)

        # Only the freshly created columns can still contain nesting.
        list_columns = _nested_columns(df, new_columns, list)
        dict_columns = _nested_columns(df, new_columns, dict)

    return df
然后这样做:
# `data` is the JSON document already parsed into a Python dict
# (e.g. via json.load); json_normalize flattens the top-level dicts.
results = pd.json_normalize(data)  # data is your json
df = pd.DataFrame(results)
outdf = flatten_nested_json_df(df)
导致
index object1.field1.1 object1.field1.2 object1.field1.3 object1.field1.4 \
0 0 value1 value2 value3 value4
0 0 value1 value2 value3 value4
0 0 value1 value2 value3 value4
0 0 value1 value2 value3 value4
object1.field1.5 object1.field1.6 object1.field1.7 object1.field1.8 \
0 value5 value6
0 value5 value6
0 value5 value6
0 value5 value6
object1.field1.9 ... object2.field2.3 object2.field2.4 \
0 ... value3 20
0 ... value3 20
0 ... value3 20
0 ... value3 20
object4.field4.1.1 object4.field4.1.2 object4.field4.1.3 \
0 value1 10 128
0 value1 10 128
0 value1 1400 value5
0 value1 1400 value5
object4.field4.1.4 object5.field5.1 object5.field5.2 object5.field5.3 \
0 0 0 value1 value7
0 0 0 value1 value7
0 1 0 value1 value7
0 1 0 value1 value7
object5.field5.4
0 6
0 6
0 6
0 6
[4 rows x 26 columns]
from rich import print
# Sample document from the question, inlined as a Python dict so the
# script is self-contained (no file I/O needed).
data = {
    "object1": {
        "field1.1": "value1",
        "field1.2": "value2",
        "field1.3": "value3",
        "field1.4": "value4",
        "field1.5": "value5",
        "field1.6": "value6",
        "field1.7": "",
        "field1.8": "",
        "field1.9": "",
        "field1.10": "",
    },
    "object2": [
        {"field2.1": 0, "field2.2": "value2", "field2.3": "value3", "field2.4": "20"}
    ],
    "object3": {"field3.1": 0, "field3.2": 0, "field3.3": "value5"},
    "object4": [
        {
            "field4.1.1": "value1",
            "field4.1.2": "10",
            "field4.1.3": 128,
            "field4.1.4": 0,
        },
        {
            "field4.1.1": "value1",
            "field4.1.2": "1400",
            "field4.1.3": "value5",
            "field4.1.4": 1,
        },
    ],
    "object5": [
        {"field5.1": 0, "field5.2": "value1", "field5.3": "value7", "field5.4": "6"}
    ],
}
"""
Mental data model (design in human language)
I consider that each element in the lists we see in the data is a record
of observations on some objectX that are recorded at the same time.
I speculate this type of user question: "have any objects ever had field x
and field y as negative at the same time?"
To answer my user's question I need each row in our table to denote a
record of observations.
"""
# Accumulator for observation records, built by make_table().
# NOTE(review): mutable module-level state — prefer keeping this local
# to make_table so repeated calls cannot interact.
table_of_records = []
def normalize(data):
    """Return a copy of *data* where every top-level dict value is wrapped
    in a single-element list, so all object values share the exact same
    normal structure (a list of record dicts).

    Ideally you would formalize your mental model as an explicit Pydantic
    model instead of relying on this ad-hoc normalization.
    """
    # isinstance is the idiomatic type check; `type(v) == dict` would
    # miss dict subclasses (e.g. OrderedDict).
    return {k: [v] if isinstance(v, dict) else v for k, v in data.items()}
# Debug aid: show the normalized structure (runs at import time).
print(normalize(data))
def get_fields(data):
    """Return the set of all field names appearing in any record.

    Before we can make a table of observation records we need a consistent
    set of field names for each record; notice how this function requires
    the data to be normalized, which it does itself via normalize().
    """
    ndata = normalize(data)
    # Removed the leftover debug `print(record)` — it spammed stdout for
    # every record on every call.
    return {
        field
        for records in ndata.values()
        for record in records
        for field in record
    }
# Debug aid: show the consolidated field-name set (runs at import time).
print(get_fields(data))
def make_table(data):
    """Build a sparse table of observation records, one row per record.

    Each row carries the full, consistent field set; fields absent from a
    record are filled with None.  Returns the module-level
    ``table_of_records`` list.
    """
    ndata = normalize(data)
    fields = get_fields(ndata)
    # Rebuild from scratch: the original kept appending to the
    # module-level list, so calling make_table() twice (as the script's
    # __main__ section does) duplicated every row.
    table_of_records.clear()
    for records in ndata.values():
        for record in records:
            table_of_records.append(
                {field: record.get(field) for field in fields}
            )
    return table_of_records
if __name__ == "__main__":
# we transformed dense raw data into a sparse table for easy querying
print(make_table(data))
from pandas import DataFrame
df = DataFrame.from_records(make_table(data))
print(df)