我想用 dask 代替 pandas,但不知道该怎么做,有人可以帮忙吗?这是代码:
import numpy as np
import pandas as pd

newfilename = "test.xlsx"
cols = ['A', 'B', 'C', 'D']


def build_sheets(df):
    """Split *df* at its all-NaN rows and aggregate every section.

    Each fully empty row acts as a separator.  Within a section, rows are
    grouped by ``A``, ``B`` and ``D`` and the ``C`` values of a group are
    joined into one comma-separated string.

    Args:
        df: Frame holding the columns ``A``, ``B``, ``C`` and ``D``.

    Returns:
        A list with one aggregated frame per section, columns ordered
        ``A, B, C, D``.  Sections that are empty (e.g. between two
        consecutive blank rows) are kept so sheet numbering stays stable.
    """
    # Work with row *positions* instead of index labels, and slice with
    # ``iloc`` rather than ``np.split``: NumPy's split goes through the
    # deprecated ``DataFrame.swapaxes`` and expects positions anyway.
    blank_rows = np.flatnonzero(df.isnull().all(axis=1).to_numpy())
    bounds = [0, *blank_rows, len(df)]

    sheets = []
    for start, stop in zip(bounds, bounds[1:]):
        # Every section but the first starts on its separator row;
        # dropna removes it.
        section = df.iloc[start:stop].dropna(how='all')
        joined = (
            section.groupby(['A', 'B', 'D'])['C']
            .apply(lambda x: ','.join(map(str, x)))
            .reset_index()
        )
        # reset_index leaves the columns as A, B, D, C; restore the order.
        sheets.append(joined.reindex(columns=['A', 'B', 'C', 'D']))
    return sheets


def main():
    """Read ``old_test.xlsx`` and write one sheet per section to *newfilename*."""
    df = pd.read_excel(
        "old_test.xlsx",
        header=1,
        names=cols,
        usecols=cols,
        converters={'A': int, 'B': int, 'C': int},
    )
    # The context manager saves and closes the workbook even when a write fails.
    with pd.ExcelWriter(newfilename, engine="xlsxwriter") as writer:
        for i, sheet in enumerate(build_sheets(df)):
            sheet.to_excel(writer, sheet_name='Sheet{}'.format(i), index=False)


if __name__ == "__main__":
    main()
在这里,您可以使用 dask 并行读取 Excel 文件,然后调用 compute() 得到计算结果。
可参考 dask 文档中关于 compute() 的说明。
import dask.dataframe as dd
import pandas as pd
import numpy as np
newfilename = "test.xlsx"
cols = ['A', 'B', 'C', 'D']


def load_frame(path):
    """Read the workbook at *path* through dask and return a pandas frame.

    ``dask.dataframe`` offers no ``read_excel``, so the pandas reader is
    wrapped in a ``dask.delayed`` task instead.  A single workbook yields a
    single task; several such tasks (one per file) could be handed to
    ``dd.from_delayed`` to load many workbooks in parallel.
    """
    from dask import delayed

    lazy_frame = delayed(pd.read_excel)(
        path,
        header=1,
        names=cols,
        usecols=cols,
        converters={'A': int, 'B': int, 'C': int},
    )
    # Splitting on blank rows below depends on row order, so materialise
    # the result as an ordinary pandas DataFrame first.
    return lazy_frame.compute()


def split_on_blank_rows(frame):
    """Return the sections of *frame* delimited by its all-NaN rows.

    The separator rows themselves are dropped.  A section that ends up
    empty (for example between two consecutive blank rows) is still
    returned, so the number and order of sections is unchanged.
    """
    # Positional offsets + iloc avoid np.split, which relies on the
    # deprecated DataFrame.swapaxes and expects positions, not labels.
    blank_positions = np.flatnonzero(frame.isnull().all(axis=1).to_numpy())
    edges = [0, *blank_positions, len(frame)]
    return [
        frame.iloc[lo:hi].dropna(how='all')
        for lo, hi in zip(edges, edges[1:])
    ]


def join_c_per_group(part):
    """Group *part* by A, B and D and join each group's C values with commas."""
    return (
        part.groupby(['A', 'B', 'D'])['C']
        .apply(lambda values: ','.join(map(str, values)))
        .reset_index()
        # reset_index yields A, B, D, C; put the columns back in order.
        .reindex(columns=cols)
    )


def main():
    """Convert ``old_test.xlsx`` into *newfilename*, one sheet per section."""
    df = load_frame("old_test.xlsx")
    df_list = [join_c_per_group(part) for part in split_on_blank_rows(df)]
    # The context manager saves and closes the workbook even when a write fails.
    with pd.ExcelWriter(newfilename, engine="xlsxwriter") as writer:
        for i, part in enumerate(df_list):
            part.to_excel(writer, sheet_name=f'Sheet{i}', index=False)


if __name__ == "__main__":
    main()