Handling non-normalized JSON with Pandas

Question · Votes: 0 · Answers: 2

I have the following JSON file and want to process it with Python (i.e. pull data from certain fields and run some calculations and analysis on it).

The JSON file:

{
    "object1":{
       "field1.1":"value1",
       "field1.2":"value2",
       "field1.3":"value3",
       "field1.4":"value4",
       "field1.5":"value5",
       "field1.6":"value6",
       "field1.7":"",
       "field1.8":"",
       "field1.9":"",
       "field1.10":""
    },
    "object2":[
       {
          "field2.1":0,
          "field2.2":"value2",
          "field2.3":"value3",
          "field2.4":"20"
       }
    ],
    "object3":{
       "field3.1":0,
       "field3.2":0,
       "field3.3":"value5"
    },
    "object4":[
       {
          "field4.1.1":"value1",
          "field4.1.2":"10",
          "field4.1.3":128,
          "field4.1.4":0
       },
       {
          "field4.1.1":"value1",
          "field4.1.2":"1400",
          "field4.1.3":"value5",
          "field4.1.4":1
       }
    ],
    "object5":[
       {
          "field5.1":0,
          "field5.2":"value1",
          "field5.3":"value7",
          "field5.4":"6"
       }
    ]
 }

How can I read this JSON file with Python/pandas? I tried pd.read_json(json_path), but I always get an error, no matter which JSON orient I use (split, index, etc.).
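
For reference, the file itself parses fine with the standard json module; pd.read_json presumably trips over the mixed shape (some top-level values are objects, others arrays of objects), so no single orient fits. A minimal sketch of getting the data into Python first, assuming the file is saved as data.json:

import json
import pandas as pd

with open("data.json", encoding="utf-8") as f:
    data = json.load(f)  # ordinary nested dict/list structure

# json_normalize flattens the nested dicts (object1, object3) into dotted columns;
# the array-valued keys (object2, object4, object5) stay behind as list columns
df = pd.json_normalize(data)
print(df.columns.tolist())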

python json pandas
2 Answers
1 vote

You can define a function that un-nests all of the nested fields in the data:

import pandas as pd


def flatten_nested_json_df(df):
    """Repeatedly normalize dict columns and explode list columns until none remain."""
    df = df.reset_index()

    # columns in which every cell is a list / a dict
    # (applymap still works, but is deprecated since pandas 2.1 in favour of DataFrame.map)
    s = (df.applymap(type) == list).all()
    list_columns = s[s].index.tolist()

    s = (df.applymap(type) == dict).all()
    dict_columns = s[s].index.tolist()

    while len(list_columns) > 0 or len(dict_columns) > 0:
        new_columns = []

        for col in dict_columns:
            # expand a dict column into one column per key, prefixed with the parent name
            horiz_exploded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
            horiz_exploded.index = df.index
            df = pd.concat([df, horiz_exploded], axis=1).drop(columns=[col])
            new_columns.extend(horiz_exploded.columns)

        for col in list_columns:
            # expand a list column into one row per element
            df = df.drop(columns=[col]).join(df[col].explode().to_frame())
            new_columns.append(col)

        # the freshly created columns may themselves still hold lists or dicts
        s = (df[new_columns].applymap(type) == list).all()
        list_columns = s[s].index.tolist()

        s = (df[new_columns].applymap(type) == dict).all()
        dict_columns = s[s].index.tolist()

    return df

Then do the following:

results = pd.json_normalize(data)  # data is the dict parsed from your JSON file
df = pd.DataFrame(results)         # json_normalize already returns a DataFrame, so this copy is optional

outdf = flatten_nested_json_df(df)

which results in:

index object1.field1.1 object1.field1.2 object1.field1.3 object1.field1.4  \
0      0           value1           value2           value3           value4   
0      0           value1           value2           value3           value4   
0      0           value1           value2           value3           value4   
0      0           value1           value2           value3           value4   

  object1.field1.5 object1.field1.6 object1.field1.7 object1.field1.8  \
0           value5           value6                                     
0           value5           value6                                     
0           value5           value6                                     
0           value5           value6                                     

  object1.field1.9  ... object2.field2.3  object2.field2.4  \
0                   ...           value3                20   
0                   ...           value3                20   
0                   ...           value3                20   
0                   ...           value3                20   

   object4.field4.1.1 object4.field4.1.2  object4.field4.1.3  \
0              value1                 10                 128   
0              value1                 10                 128   
0              value1               1400              value5   
0              value1               1400              value5   

  object4.field4.1.4 object5.field5.1 object5.field5.2 object5.field5.3  \
0                  0                0           value1           value7   
0                  0                0           value1           value7   
0                  1                0           value1           value7   
0                  1                0           value1           value7   

  object5.field5.4  
0                6  
0                6  
0                6  
0                6  

[4 rows x 26 columns]
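
Note that data in the snippet above is the dict parsed from the file (e.g. via json.load), not a file path. Once the frame is flat, the calculations the question mentions become ordinary column operations. A small sketch, with example columns picked from this data set and a numeric conversion because several fields arrive as strings:

import pandas as pd

outdf["object2.field2.4"] = pd.to_numeric(outdf["object2.field2.4"], errors="coerce")
outdf["object4.field4.1.2"] = pd.to_numeric(outdf["object4.field4.1.2"], errors="coerce")
print(outdf[["object2.field2.4", "object4.field4.1.2"]].describe())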

0 votes
from rich import print  # optional: drop-in pretty printing; the built-in print works too

data = {
    "object1": {
        "field1.1": "value1",
        "field1.2": "value2",
        "field1.3": "value3",
        "field1.4": "value4",
        "field1.5": "value5",
        "field1.6": "value6",
        "field1.7": "",
        "field1.8": "",
        "field1.9": "",
        "field1.10": "",
    },
    "object2": [
        {"field2.1": 0, "field2.2": "value2", "field2.3": "value3", "field2.4": "20"}
    ],
    "object3": {"field3.1": 0, "field3.2": 0, "field3.3": "value5"},
    "object4": [
        {
            "field4.1.1": "value1",
            "field4.1.2": "10",
            "field4.1.3": 128,
            "field4.1.4": 0,
        },
        {
            "field4.1.1": "value1",
            "field4.1.2": "1400",
            "field4.1.3": "value5",
            "field4.1.4": 1,
        },
    ],
    "object5": [
        {"field5.1": 0, "field5.2": "value1", "field5.3": "value7", "field5.4": "6"}
    ],
}

"""
Mental data model (design in human language)

I consider that each element in the lists we see in the data is a record
of observations on some objectX that are recorded at the same time.

I speculate this type of user question: "have any objects ever had field x
and field y as negative at the same time?"


To answer my user's question I need each row in our table to denote a
record of observations.
"""

table_of_records = []


def normalize(data):
    """
    Wrap each bare dict in a one-element list so that every top-level
    value has the same shape: a list of records.

    Ideally you would formalize this mental model as an explicit Pydantic model.
    """
    normalized = {}
    for k, v in data.items():
        if isinstance(v, dict):
            normalized[k] = [v]
        else:
            normalized[k] = v
    return normalized


print(normalize(data))


def get_fields(data):
    """
    before we can make a table of observation records we need to have a consistent set of field names for each record

    notice how this function requires the data to be normalized.
    """
    ndata = normalize(data)
    fields = set()
    for object_id, records in ndata.items():
        for record in records:
            print(record)
            for field, value in record.items():
                fields.add(field)
    return fields


print(get_fields(data))


def make_table(data):
    ndata = normalize(data)
    fields = get_fields(ndata)
    for object_id, records in ndata.items():
        for record in records:
            table_of_records.append(
                {field: record.get(field, None) for field in fields}
            )
    return table_of_records


if __name__ == "__main__":
    # we transformed dense raw data into a sparse table for easy querying
    records = make_table(data)  # call once: make_table appends to the module-level list,
                                # so a second call would duplicate every row
    print(records)

    from pandas import DataFrame

    df = DataFrame.from_records(records)
    print(df)
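
With the sparse table built, the speculated question from the docstring ("have any objects ever had field x and field y negative at the same time?") reduces to a boolean filter. A sketch, continuing from the df built above; the two column names are only examples from this sample data:

import pandas as pd

# coerce the mixed-type columns to numbers; anything non-numeric becomes NaN
numeric = df[["field2.1", "field3.1"]].apply(pd.to_numeric, errors="coerce")
both_negative = df[(numeric["field2.1"] < 0) & (numeric["field3.1"] < 0)]
print(both_negative)  # empty here: the sample data contains no negative values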

