我正在尝试将包含 numpy 数组的数据框写入镶木地板文件。这适用于列表,但不适用于 numpy 数组。使用 numpy 数组时,出现以下错误:
('Can only convert 1-dimensional array values', 'Conversion failed for column 101 with type object')
。
为了存储 numpy 数组,我尝试将它们转换为列表,但它们似乎仍然是 numpy 数组。
我不想展平这些数组并将它们保留为二维列表,这在第一个代码示例中有效。
带有有效列表的代码:
import dask.dataframe as dd
import pandas as pd
from dask import delayed
from dask.diagnostics import ProgressBar
import numpy as np
def dict_to_dataframe(dict, all_columns):
df = pd.DataFrame.from_dict(dict)
# Add missing columns and sort columns
missing_columns = list(set(all_columns).difference(df.columns))
df = df.reindex(columns=sorted([*df.columns.tolist(), *missing_columns]))
# Set new column type to object
df[missing_columns] = df[missing_columns].astype(object)
return df
data_a = {
101: [[[1], [2]], [[5], [6]]],
110: [[[9], [10]], [[13], [14]]],
}
data_b = {
105: [[[17], [18]], [[21], [22]]],
130: [[[25], [26]], [[29], [30]]],
}
all_columns = [101, 105, 110, 130, 140]
parts = [delayed(dict_to_dataframe)(fn, all_columns) for fn in [data_a, data_b]]
types = pd.DataFrame(columns=all_columns, dtype=object)
ddf_result = dd.from_delayed(parts, meta=types)
# Convert column names to strings
ddf_result.columns = ddf_result.columns.map(str)
print("Write to file")
file_path = "test.parquet"
# print(ddf_result.compute())
with ProgressBar():
ddf_result.compute().reset_index(drop=True).to_parquet(file_path) #.to_hdf(file_path, key='speeds', format='table')
written = dd.read_parquet(file_path)
带有不起作用的 numpy 数组的代码:
import dask.dataframe as dd
import pandas as pd
from dask import delayed
from dask.diagnostics import ProgressBar
import numpy as np
def dict_to_dataframe(dict, all_columns):
df = pd.DataFrame.from_dict(dict)
# Add missing columns and sort columns
missing_columns = list(set(all_columns).difference(df.columns))
df = df.reindex(columns=sorted([*df.columns.tolist(), *missing_columns]))
# Set new column type to object
# df[missing_columns] = df[missing_columns].astype(object)
print('type before')
print(type(df[101][0]))
df = df.apply(lambda x: x.tolist())
print('type after')
print(type(df[101][0]))
return df.astype(object)
data_a = {
101: [np.array([[4.21275084, 4.21275084, 9.16020628, 8.27254411, 1.38404369],
[4.18106017, 4.18106017, 9.2148525, 8.15240762, 5.77259211],
[2.60123054, 2.60123054, 8.91083957, 7.81762064, 3.5770306 ],
[7.4244178, 7.4244178, 7.8176024, 4.16825841, np.nan],
[7.88384761, 7.88384761, np.nan, 2.54106066, np.nan],
[8.06353919, 8.06353919, np.nan, 1.83942825, np.nan]]), np.array([[5], [6]])],
110: [np.array([[9], [10]]), np.array([[13], [14]])],
}
data_b = {
105: [np.array([[17], [18]]), np.array([[21], [22]])],
130: [np.array([[25], [26]]), np.array([[29], [30]])],
}
all_columns = [101, 105, 110, 130, 140]
parts = [delayed(dict_to_dataframe)(fn, all_columns) for fn in [data_a, data_b]]
types = pd.DataFrame(columns=all_columns, dtype=object)
ddf_result = dd.from_delayed(parts, meta=types)
# Convert column names to strings
ddf_result.columns = ddf_result.columns.map(str)
print("Write to file")
file_path = "test.parquet"
with ProgressBar():
ddf_result.compute().reset_index(drop=True).to_parquet(file_path)
written = dd.read_parquet(file_path)
如何将 numpy 数组转换为列表以将它们写入镶木地板?