我正在从API获取JSON。该API会省略null值(值为null的属性不会通过网络发送),因此数据可能是稀疏的。这些属性包含字符串、数字、布尔值、unix时间戳、ISO8601时间戳和ISO8601持续时间的混合。
这是带有所有数据类型的示例JSON(表示为Python列表/字典):
data_full = [ {'name': 'alice', 'lastname': 'foo', 'value': 1.11, 'unix_ts': 1591848156000, 'iso_ts': '2020-05-17T12:33:44Z', 'iso_dur': 'PT1H11M', 'bool_val': True}, {'name': 'clair', 'lastname': 'bar', 'value': 3.33, 'unix_ts': 1591648156000, 'iso_ts': '2020-03-17T12:33:44Z', 'iso_dur': 'PT3H33M', 'bool_val': True}, ]
稀疏数据可能在任何行或所有行上缺少字段,或者API结果也可能完全为空。示例:
some_fields_missing_in_some_rows = [ {'name': 'alice', 'lastname': 'foo', 'value': 1.23, 'unix_ts': 1591848156000, 'iso_ts': '2020-05-17T12:33:44Z', 'iso_dur': 'PT1H11M', 'bool_val': True}, {'name': 'clair', } ] some_fields_missing_in_all_rows = [ {'name': 'alice'}, {'name': 'clair'} ] no_data = []
我使用 json_normalize 将其转换为Pandas DataFrame。为了便于后续的可预测处理,我希望在所有稀疏情况下,输出的dtype都与数据完整时一致,并在缺失的位置插入正确的NA值。我很难为缺失值选对类型(np.nan或其他)。
下面完全自包含的测试用例展示了这个问题(也就是说,如果这4个测试都通过,我相信它就符合我的期望):
一个明显的子问题是:如何创建并用NaN填充一个类型为str的空列。欢迎任何反馈意见。
import datetime
from typing import List, Tuple
from unittest import TestCase

import numpy as np
import pandas as pd


class TestDFNormalization(TestCase):
    """extract_df must produce the same dtypes for full, sparse and empty
    payloads, with real missing markers (NaN/NaT/pd.NA) where data is absent.
    """

    def test_full_fields(self):
        jsList = [
            {'name': 'alice', 'lastname': 'foo', 'value': 1.11,
             'unix_ts': 1591848156000, 'iso_ts': '2020-05-17T12:33:44Z',
             'iso_dur': 'PT1H11M', 'bool_val': True},
            {'name': 'clair', 'lastname': 'bar', 'value': 3.33,
             'unix_ts': 1591648156000, 'iso_ts': '2020-03-17T12:33:44Z',
             'iso_dur': 'PT3H33M', 'bool_val': True},
        ]
        df = extract_df(js=jsList)
        print(df.dtypes)
        print(df)
        self.assert_dtypes_conform(df)
        self.assert_correct_NaNs(df, 2)  # no NaN, so all rows (=2) kept

    def test_sparse_fields(self):
        some_fields_missing_in_some_rows = [
            {'name': 'alice', 'lastname': 'foo', 'value': 1.23,
             'unix_ts': 1591848156000, 'iso_ts': '2020-05-17T12:33:44Z',
             'iso_dur': 'PT1H11M', 'bool_val': True},
            {'name': 'clair', },
        ]
        df = extract_df(js=some_fields_missing_in_some_rows)
        print(df.dtypes)
        print(df)
        self.assert_dtypes_conform(df)
        self.assert_correct_NaNs(df, 1)  # some NaN, only 1 row kept

    def test_lacking_fields(self):
        some_fields_missing_in_all_rows = [{'name': 'alice'}, {'name': 'clair'}]
        df = extract_df(js=some_fields_missing_in_all_rows)
        print(df.dtypes)
        print(df)
        self.assert_dtypes_conform(df)
        self.assert_correct_NaNs(df, 0)  # all NaN, no rows

    def test_no_data(self):
        no_data = []
        df = extract_df(js=no_data)
        print(df.dtypes)
        print(df)
        self.assert_dtypes_conform(df)
        self.assert_correct_NaNs(df, 0)  # no rows

    def assert_dtypes_conform(self, df: pd.DataFrame) -> None:
        # The dtypes must match the full-data case regardless of sparseness.
        self.assertEqual("object", df['name'].dtype)
        self.assertEqual("object", df['lastname'].dtype)
        self.assertEqual("float", df['value'].dtype)
        self.assertEqual("datetime64[ns, UTC]", df['unix_ts'].dtype)
        self.assertEqual("datetime64[ns, UTC]", df['iso_ts'].dtype)
        self.assertEqual("timedelta64[ns]", df['iso_dur'].dtype)
        self.assertEqual("boolean", df['bool_val'].dtype)

    def assert_correct_NaNs(self, df: pd.DataFrame,
                            expectedNumRowsAfterDropNA: int) -> None:
        # dropna() must recognize the missing marker in every column.
        for col in ('lastname', 'value', 'unix_ts',
                    'iso_ts', 'iso_dur', 'bool_val'):
            self.assertEqual(expectedNumRowsAfterDropNA,
                             len(df.dropna(subset=[col]).index))


def extract_df(js: List) -> pd.DataFrame:
    """Normalize a (possibly sparse or empty) JSON list into a DataFrame
    with a fixed column set and stable per-column dtypes.
    """
    df = pd.json_normalize(js)
    create_cols_if_absent(df=df,
                          expected_cols=('name', 'lastname', 'value', 'unix_ts',
                                         'iso_ts', 'iso_dur', 'bool_val'))
    # 'object' keeps NaN as a genuine missing value; astype('str') would
    # turn NaN into the literal string 'nan' and defeat dropna().
    astype_per_column(df=df, column='name', dtype='object')
    astype_per_column(df=df, column='lastname', dtype='object')
    astype_per_column(df=df, column='value', dtype='float')
    parse_unix_ms(df=df, column='unix_ts')
    parse_iso(df=df, column='iso_ts')
    parse_dur(df=df, column='iso_dur')
    astype_per_column(df=df, column='bool_val', dtype='boolean')
    return df


def create_cols_if_absent(df: pd.DataFrame, expected_cols: Tuple) -> None:
    """Add each missing expected column as all-NaN; the per-column
    conversions in extract_df then establish the final dtype."""
    for col in expected_cols:
        if col not in df.columns:
            df[col] = np.nan


def parse_unix_ms(df: pd.DataFrame, column: str) -> None:
    """Epoch milliseconds -> datetime64[ns, UTC] in place (NaN -> NaT)."""
    df[column] = pd.to_datetime(df[column], unit='ms', origin='unix', utc=True)


def parse_iso(df: pd.DataFrame, column: str) -> None:
    """ISO8601 timestamp strings -> datetime64[ns, UTC] in place (NaN -> NaT)."""
    df[column] = pd.to_datetime(df[column], utc=True)


def parse_iso_duration(durationstring) -> datetime.timedelta:
    """Parse one ISO8601 duration string; missing/empty -> None (becomes NaT).

    pandas natively understands ISO8601 durations ('PT1H11M') via
    pd.Timedelta, so the third-party isodate package is not needed.
    """
    if pd.isna(durationstring) or not durationstring:
        return None
    return pd.Timedelta(durationstring)


def parse_dur(df: pd.DataFrame, column: str) -> None:
    """ISO8601 duration strings -> timedelta64[ns] in place (NaN -> NaT)."""
    df[column] = pd.to_timedelta(df[column].apply(parse_iso_duration))


def astype_per_column(df: pd.DataFrame, column: str, dtype) -> None:
    """Cast a single column in place."""
    df[column] = df[column].astype(dtype)
我正在从API获取JSON。该API省略空值(值为null的属性不会通过网络发送),因此数据可能是稀疏的。这些属性包含字符串、数字、布尔值等...
哦,太好了。应该在astype调用中使用新的(pandas >= 1.0)字符串类型StringDtype('string'):
def extract_df(js: List) -> pd.DataFrame:
    """Normalize the raw JSON list into a DataFrame with a fixed column
    set and stable dtypes, using the pandas >= 1.0 'string' extension
    dtype for the text columns."""
    df = pd.json_normalize(js)
    create_cols_if_absent(df=df,
                          expected_cols=('name', 'lastname', 'value', 'unix_ts',
                                         'iso_ts', 'iso_dur', 'bool_val'))
    # The dedicated 'string' dtype keeps missing values as <NA> instead of
    # the literal 'nan' string that astype('str') would produce.
    for text_col in ('name', 'lastname'):
        astype_per_column(df=df, column=text_col, dtype='string')
    astype_per_column(df=df, column='value', dtype='float')
    parse_unix_ms(df=df, column='unix_ts')
    parse_iso(df=df, column='iso_ts')
    parse_dur(df=df, column='iso_dur')
    astype_per_column(df=df, column='bool_val', dtype='boolean')
    return df