import numpy as np
import pandas as pd
# Sample table: one row per date, one column per person; each cell holds an
# item name, and '' marks an empty cell.
data = [
    ['4/04/2023', 'Floors', 'Tables', 'Roof', 'Paint', 'Paint'],
    ['4/05/2023', 'Roof', 'Floors', 'Tables', 'Roof', 'Paint'],
    ['4/06/2023', 'Paint', '', 'Floors', 'Tables', 'Roof'],
    ['4/07/2023 ', 'Roof', 'Paint', '', 'Floors', 'Tables'],
]
df = pd.DataFrame(data, columns=['Date', 'Jim', 'Bob', 'Ed', 'James', 'Joe'])
# Long form: one row per (Date, person). melt() puts the person's name in
# 'variable' and the item in 'Items'. dropna() only removes NaN, so the ''
# cells are still present after this step.
df2 = df.melt(id_vars="Date", value_name='Items').dropna().reset_index(drop=True)
# NOTE: this is the failing call the question is about. 'Paint' appears twice
# on 4/04/2023 and 'Roof' twice on 4/05/2023, so (Date, Items) is not unique
# and pivot() raises
# "ValueError: Index contains duplicate entries, cannot reshape".
df3 = df2.pivot(index="Date", columns='Items', values='variable')
df3 = df3[["Floors", "Tables", "Roof", "Paint"]]  # change the column order
print(df)
print(df3)
我希望 Pandas 检测重复项并在有重复项的地方创建一个额外的列,而不是抛出以下错误:
raise ValueError("Index contains duplicate entries, cannot reshape")
ValueError: Index contains duplicate entries, cannot reshape
您可以使用:
# Long form, one row per (Date, person), with the empty cells turned into NaN
# and dropped.
long_df = (
    df.replace('', np.nan)
    .melt(id_vars="Date", value_name='Items')
    .dropna()
    .reset_index(drop=True)
)
# Running count of each item within its date: 1 for the first occurrence,
# 2 for the second, ... This makes every (Date, Items, n) triple unique.
long_df = long_df.assign(n=long_df.groupby(['Date', 'Items']).cumcount().add(1))
# One column per (item, occurrence) pair; the cells hold the person's name.
wide_df = long_df.pivot(index="Date", columns=['Items', 'n'], values='variable')
# Collapse the two-level column labels into "<item>_<n>".
wide_df = wide_df.set_axis(wide_df.columns.map(lambda x: f'{x[0]}_{x[1]}'), axis=1)
# Left as a bare expression so an interactive session displays the result.
wide_df.fillna('').reset_index()
输出:
        Date  Floors_1  Roof_1  Paint_1  Tables_1  Roof_2  Paint_2
0  4/04/2023       Jim      Ed    James       Bob              Joe
1  4/05/2023       Bob     Jim      Joe        Ed   James
2  4/06/2023        Ed     Joe      Jim     James
3  4/07/2023     James     Jim      Bob       Joe
# Melt to long form (one row per Date/person) and discard the empty cells.
long_form = df.melt('Date').query("value != ''")
# Occurrence counter per (Date, value) pair, stored as text so it can be
# joined onto the item name later.
occurrence = long_form.groupby(['Date', 'value']).cumcount().add(1).astype('str')
# Pivot back to wide form; the columns become (value, idx) pairs.
s = long_form.assign(idx=occurrence).pivot(
    index='Date', columns=['value', 'idx'], values='variable'
)
# Join each (value, idx) pair into a single "value_idx" column label.
s.columns = s.columns.map('_'.join)
Floors_1 Roof_1 Paint_1 Tables_1 Roof_2 Paint_2
Date
4/04/2023 Jim Ed James Bob NaN Joe
4/05/2023 Bob Jim Joe Ed James NaN
4/06/2023 Ed Joe Jim James NaN NaN
4/07/2023 James Jim Bob Joe NaN NaN
如果您按照期望输出所示为重复条目添加后缀,则可以使用 `pivot`:
def add_suffix(x):
    """Return ``x['value']`` with repeats inside a date made unique.

    The first occurrence of a value within a ``Date`` is left unchanged;
    the k-th occurrence (k >= 2) gets ``"-<k>"`` appended, e.g. ``Paint``,
    ``Paint-2``, ``Paint-3``.

    Args:
        x: DataFrame with at least the columns ``Date`` and ``value``.

    Returns:
        A Series aligned with ``x`` holding the relabelled values.
    """
    occurrence = x.groupby(['Date', 'value']).cumcount().add(1)
    # Blank out the suffix with a mask on the counter rather than with a
    # textual replace of '-1': a substring replace would also corrupt
    # '-10', '-11', ... (turning them into '0', '1', ...).
    suffix = ('-' + occurrence.astype(str)).where(occurrence > 1, '')
    return x['value'] + suffix
# Long form with the empty cells converted to NaN and dropped.
long_form = df.melt('Date').replace('', np.nan).dropna()
out = (
    # Relabel repeated items via add_suffix so (Date, value) is unique.
    long_form.assign(value=add_suffix)
    .pivot(index='Date', columns='value', values='variable')
    .fillna('')  # blank instead of NaN where a date has no such item
    .reset_index()  # turn Date back into an ordinary column
    .rename_axis(columns=None)  # remove the 'value' name from the column axis
)
输出:
>>> out
       Date  Floors  Paint  Paint-2  Roof  Roof-2  Tables
0  4/4/2023     Jim  James      Joe    Ed             Bob
1  4/5/2023     Bob    Joe           Jim   James      Ed
2  4/6/2023      Ed    Jim           Joe           James
3  4/7/2023   James    Bob           Jim             Joe