pandas 数据框内的 Numpy 数组未转换为列表

问题描述 投票:0回答:0

我正在尝试将包含 numpy 数组的数据框写入镶木地板文件。这适用于列表,但不适用于 numpy 数组。使用 numpy 数组时,出现以下错误:

('Can only convert 1-dimensional array values', 'Conversion failed for column 101 with type object')
。 为了存储 numpy 数组,我尝试将它们转换为列表,但它们似乎仍然是 numpy 数组。

我不想展平这些数组并将它们保留为二维列表,这在第一个代码示例中有效。

带有有效列表的代码:

import dask.dataframe as dd
import pandas as pd
from dask import delayed
from dask.diagnostics import ProgressBar
import numpy as np

def dict_to_dataframe(dict, all_columns):
    df = pd.DataFrame.from_dict(dict)

    # Add missing columns and sort columns   
    missing_columns = list(set(all_columns).difference(df.columns)) 
    df = df.reindex(columns=sorted([*df.columns.tolist(), *missing_columns]))

    # Set new column type to object
    df[missing_columns] = df[missing_columns].astype(object)

    return df


data_a = {
    101: [[[1], [2]], [[5], [6]]],
    110: [[[9], [10]], [[13], [14]]],
}
data_b = {
    105: [[[17], [18]], [[21], [22]]],
    130: [[[25], [26]], [[29], [30]]],
}


all_columns = [101, 105, 110, 130, 140]
parts = [delayed(dict_to_dataframe)(fn, all_columns) for fn in [data_a, data_b]]
types = pd.DataFrame(columns=all_columns, dtype=object)
ddf_result = dd.from_delayed(parts, meta=types)

# Convert column names to strings
ddf_result.columns = ddf_result.columns.map(str)

print("Write to file")
file_path = "test.parquet"
# print(ddf_result.compute())
with ProgressBar():
    ddf_result.compute().reset_index(drop=True).to_parquet(file_path) #.to_hdf(file_path, key='speeds', format='table')

written = dd.read_parquet(file_path)

带有不起作用的 numpy 数组的代码:

import dask.dataframe as dd
import pandas as pd
from dask import delayed
from dask.diagnostics import ProgressBar
import numpy as np

def dict_to_dataframe(dict, all_columns):
    df = pd.DataFrame.from_dict(dict)

    # Add missing columns and sort columns   
    missing_columns = list(set(all_columns).difference(df.columns)) 
    df = df.reindex(columns=sorted([*df.columns.tolist(), *missing_columns]))

    # Set new column type to object
    # df[missing_columns] = df[missing_columns].astype(object)

    print('type before')
    print(type(df[101][0]))
    df = df.apply(lambda x: x.tolist())
    print('type after')
    print(type(df[101][0]))

    return df.astype(object)


data_a = {
    101: [np.array([[4.21275084, 4.21275084, 9.16020628, 8.27254411, 1.38404369],
 [4.18106017, 4.18106017, 9.2148525,  8.15240762, 5.77259211],
 [2.60123054, 2.60123054, 8.91083957, 7.81762064, 3.5770306 ],
 [7.4244178,  7.4244178,  7.8176024, 4.16825841,        np.nan],
 [7.88384761, 7.88384761,        np.nan, 2.54106066,        np.nan],
 [8.06353919, 8.06353919,        np.nan, 1.83942825,        np.nan]]), np.array([[5], [6]])],
    110: [np.array([[9], [10]]), np.array([[13], [14]])],
}
data_b = {
    105: [np.array([[17], [18]]), np.array([[21], [22]])],
    130: [np.array([[25], [26]]), np.array([[29], [30]])],
}


all_columns = [101, 105, 110, 130, 140]
parts = [delayed(dict_to_dataframe)(fn, all_columns) for fn in [data_a, data_b]]
types = pd.DataFrame(columns=all_columns, dtype=object)
ddf_result = dd.from_delayed(parts, meta=types)

# Convert column names to strings
ddf_result.columns = ddf_result.columns.map(str)

print("Write to file")
file_path = "test.parquet"
with ProgressBar():
    ddf_result.compute().reset_index(drop=True).to_parquet(file_path)
    
written = dd.read_parquet(file_path)

如何将 numpy 数组转换为列表以将它们写入镶木地板?

python pandas numpy dask dask-delayed
© www.soinside.com 2019 - 2024. All rights reserved.