从(稀疏)JSON获取可预测的Pandas DataFrame

问题描述 投票:0回答:1

我正在从API获取JSON。该API会省略null值(值为null的属性不会通过网络传输),因此数据可能是稀疏的。这些属性混合了字符串、数字、布尔值、Unix时间戳、ISO 8601时间戳和ISO 8601持续时间。

下面是包含所有数据类型的示例JSON(以Python列表/字典表示):

        # Example payload in which both records carry all seven fields
        # (name, lastname, value, unix_ts, iso_ts, iso_dur, bool_val).
        data_full = [
            {'name': 'alice', 'lastname': 'foo', 'value': 1.11, 'unix_ts': 1591848156000, 'iso_ts': '2020-05-17T12:33:44Z',
             'iso_dur': 'PT1H11M', 'bool_val': True},
            {'name': 'clair', 'lastname': 'bar', 'value': 3.33, 'unix_ts': 1591648156000, 'iso_ts': '2020-03-17T12:33:44Z',
             'iso_dur': 'PT3H33M', 'bool_val': True},
        ]

稀疏数据可能在某些行或所有行上缺少字段,API结果也可能完全为空。例如:

        # First record is complete; the second one only has 'name'.
        some_fields_missing_in_some_rows = [
            {'name': 'alice', 'lastname': 'foo', 'value': 1.23, 'unix_ts': 1591848156000,
             'iso_ts': '2020-05-17T12:33:44Z',
             'iso_dur': 'PT1H11M', 'bool_val': True},
            {'name': 'clair', }
        ]
        # Every record only has 'name'; all other fields are absent everywhere.
        some_fields_missing_in_all_rows = [
            {'name': 'alice'},
            {'name': 'clair'}
        ]
        # Completely empty result: no records at all.
        no_data = []

我使用json_normalize将其转换为Pandas DataFrame。为了让后续处理的结果可预测,我希望在所有稀疏情况下,输出的dtype都与数据完整时相同,并在缺失的位置插入正确的NA。我很难得到类型正确的缺失值(np.nan或其他)。

下面自包含的测试用例展示了这个问题(换句话说,如果4个测试全部通过,我认为它就符合我的预期)。

一个明显的问题是如何创建类型为str的空列并用NaN填充。欢迎任何反馈意见。

import datetime
from typing import List, Tuple
from unittest import TestCase

import isodate
import numpy as np
import pandas as pd


class TestDFNormalization(TestCase):
    """Exercise extract_df with complete, sparse, field-less and empty input.

    Each test feeds one JSON-like list to extract_df, prints the resulting
    dtypes and frame, and then checks both the column dtypes and the number
    of rows that survive a per-column dropna.
    """

    def test_full_fields(self):
        """Both records carry every field."""
        records = [
            {'name': 'alice', 'lastname': 'foo', 'value': 1.11,
             'unix_ts': 1591848156000, 'iso_ts': '2020-05-17T12:33:44Z',
             'iso_dur': 'PT1H11M', 'bool_val': True},
            {'name': 'clair', 'lastname': 'bar', 'value': 3.33,
             'unix_ts': 1591648156000, 'iso_ts': '2020-03-17T12:33:44Z',
             'iso_dur': 'PT3H33M', 'bool_val': True},
        ]
        # no NaN, so all rows (=2) kept
        self._extract_and_verify(records, 2)

    def test_sparse_fields(self):
        """Only the first record is complete; the second has just 'name'."""
        records = [
            {'name': 'alice', 'lastname': 'foo', 'value': 1.23,
             'unix_ts': 1591848156000, 'iso_ts': '2020-05-17T12:33:44Z',
             'iso_dur': 'PT1H11M', 'bool_val': True},
            {'name': 'clair', },
        ]
        # some NaN, only 1 row kept
        self._extract_and_verify(records, 1)

    def test_lacking_fields(self):
        """All records have only 'name'."""
        records = [
            {'name': 'alice'},
            {'name': 'clair'},
        ]
        # all NaN, no rows
        self._extract_and_verify(records, 0)

    def test_no_data(self):
        """The input list is empty."""
        # no rows
        self._extract_and_verify([], 0)

    def _extract_and_verify(self, records, rows_kept):
        """Run extract_df on records, print the result and apply both checks."""
        df = extract_df(js=records)
        print(df.dtypes)
        print(df)
        self.assert_dtypes_conform(df)
        self.assert_correct_NaNs(df, rows_kept)

    def assert_dtypes_conform(self, df: pd.DataFrame) -> None:
        """Assert that each expected column of df has its expected dtype."""
        expected_by_column = (
            ('name', "object"),
            ('lastname', "object"),
            ('value', "float"),
            ('unix_ts', "datetime64[ns, UTC]"),
            ('iso_ts', "datetime64[ns, UTC]"),
            ('iso_dur', "timedelta64[ns]"),
            ('bool_val', "boolean"),
        )
        for column, expected in expected_by_column:
            self.assertEqual(expected, df[column].dtype)

    def assert_correct_NaNs(self, df: pd.DataFrame, expectedNumRowsAfterDropNA: int) -> None:
        """Assert the row count left after dropping NA values, column by column.

        'name' is not checked here; every other expected column is.
        """
        for column in ('lastname', 'value', 'unix_ts', 'iso_ts', 'iso_dur', 'bool_val'):
            remaining = df.dropna(subset=[column])
            self.assertEqual(expectedNumRowsAfterDropNA, len(remaining.index))


def extract_df(js: List) -> pd.DataFrame:
    """Turn a list of (possibly sparse) JSON records into a DataFrame.

    The records are flattened with pd.json_normalize, any expected column
    that is absent is added, the timestamp and duration columns are parsed
    in place, and 'bool_val' is cast to the 'boolean' dtype.

    :param js: list of dict records; may be empty.
    :return: the resulting DataFrame.
    """
    frame = pd.json_normalize(js)
    create_cols_if_absent(
        df=frame,
        expected_cols=('name', 'lastname', 'value', 'unix_ts', 'iso_ts', 'iso_dur', 'bool_val'),
    )
    # NOTE: 'name', 'lastname' and 'value' are deliberately left uncast here;
    # the corresponding astype_per_column calls are disabled in this version.
    column_parsers = (
        (parse_unix_ms, 'unix_ts'),
        (parse_iso, 'iso_ts'),
        (parse_dur, 'iso_dur'),
    )
    for parser, col in column_parsers:
        parser(df=frame, column=col)
    astype_per_column(df=frame, column='bool_val', dtype='boolean')
    return frame


def create_cols_if_absent(df: pd.DataFrame, expected_cols: Tuple) -> None:
    """Add every column named in expected_cols that df does not have yet.

    New columns are filled with np.nan; existing columns are left untouched.
    The frame is modified in place and nothing is returned.

    :param df: frame to extend.
    :param expected_cols: column names that must exist afterwards.
    """
    # Lazy generator: membership is checked right before each insertion.
    absent = (name for name in expected_cols if name not in df.columns)
    for name in absent:
        df[name] = np.nan  # or None or pd.NA or np.nan ?


def parse_unix_ms(df, column):
    """Replace df[column] with UTC timestamps parsed from epoch milliseconds.

    The values are interpreted by pd.to_datetime as milliseconds counted
    from the Unix origin. The frame is modified in place.
    """
    raw_millis = df[column]
    df[column] = pd.to_datetime(raw_millis, utc=True, origin='unix', unit='ms')


def parse_iso(df, column):
    """Replace df[column] with the result of pd.to_datetime(..., utc=True).

    Intended for the ISO 8601 timestamp strings of the 'iso_ts' column.
    The frame is modified in place.
    """
    timestamps = pd.to_datetime(df[column], utc=True)
    df[column] = timestamps


def parse_iso_duration(durationstring: str) -> datetime.timedelta:
    """Parse one duration string with isodate.parse_duration.

    Falsy values (such as None or '') and values for which pd.isna is true
    are not parsed; None is returned for them instead.
    """
    is_missing = not durationstring or pd.isna(durationstring)
    return None if is_missing else isodate.parse_duration(durationstring)


def parse_dur(df, column) -> None:
    """Replace df[column] with timedeltas built from its duration strings.

    Each value is first run through parse_iso_duration; the result is then
    handed to pd.to_timedelta. The frame is modified in place.
    """
    # Durations are parsed element-wise before the to_timedelta conversion.
    parsed = df[column].apply(parse_iso_duration)
    df[column] = pd.to_timedelta(parsed)


def astype_per_column(df: pd.DataFrame, column: str, dtype) -> None:
    """Cast a single column of df to dtype, in place.

    :param df: frame holding the column.
    :param column: name of the column to convert.
    :param dtype: anything accepted by Series.astype.
    """
    converted = df[column].astype(dtype)
    df[column] = converted
            

我正在从API获取JSON。该API省略了空值(不通过网络发送为空的属性),因此可以稀疏数据。这些属性包含字符串,数字,布尔值,...

python python-3.x pandas dataframe missing-data
1个回答
0
投票

好吧。应该在astype调用中使用新的(pandas >= 1.0)字符串类型StringDtype。

def extract_df(js: List) -> pd.DataFrame:
    """Turn a list of (possibly sparse) JSON records into a typed DataFrame.

    The records are flattened with pd.json_normalize, any expected column
    that is absent is added, and then every column is converted:
    'name' and 'lastname' to 'string', 'value' to 'float', the timestamp
    and duration columns through their parsers, 'bool_val' to 'boolean'.

    :param js: list of dict records; may be empty.
    :return: the resulting DataFrame.
    """
    frame = pd.json_normalize(js)
    create_cols_if_absent(
        df=frame,
        expected_cols=('name', 'lastname', 'value', 'unix_ts', 'iso_ts', 'iso_dur', 'bool_val'),
    )
    # Plain casts first, in the same order as the columns are listed.
    plain_casts = (('name', 'string'), ('lastname', 'string'), ('value', 'float'))
    for col, target in plain_casts:
        astype_per_column(df=frame, column=col, dtype=target)
    parse_unix_ms(df=frame, column='unix_ts')
    parse_iso(df=frame, column='iso_ts')
    parse_dur(df=frame, column='iso_dur')
    astype_per_column(df=frame, column='bool_val', dtype='boolean')
    return frame
© www.soinside.com 2019 - 2024. All rights reserved.