我有一个具有多维相关性的大型数据框,包含以下列:
日期,测量 1,测量 2,...测量 10,变体
我先用下面这个函数统计了最近两个月内每个变体每月出现的次数,然后计算了其平均值。
def window_s_prüfpunkte(df, config_col, date_col, data_cols):
    """Count rows per calendar month for each configuration and average them.

    Args:
        df: DataFrame with one row per measurement.
        config_col: Name of the column identifying the configuration.
        date_col: Name of the column holding the measurement date; its
            values are parsed with ``pd.to_datetime``.
        data_cols: List of measurement column names whose non-null entries
            are counted.

    Returns:
        Dict mapping each configuration value to a dict with the keys
        ``'monthly_count'`` (DataFrame with ``year``, ``month`` and one
        count column per entry of ``data_cols``) and ``'quantity_average'``
        (mean monthly count over all ``data_cols``, rounded).
    """
    results = {}
    for config in df[config_col].unique():
        config_df = df[df[config_col] == config]
        # Keep the parsed dates in their own variable. Rebinding ``date_col``
        # would replace the column *name* with a Series, so the lookup would
        # fail for every configuration after the first one.
        dates = pd.to_datetime(config_df[date_col])
        monthly_count = (
            config_df.groupby([dates.dt.year, dates.dt.month])
            .count()
            .rename_axis(['year', 'month'])[data_cols]
            .reset_index()
        )
        # Mean of the per-column mean monthly counts.
        quantity_average = round(monthly_count[data_cols].mean().mean())
        results[config] = {'monthly_count': monthly_count,
                           'quantity_average': quantity_average}
    return results
# Compute the monthly-count statistics separately for every configuration.
# NOTE(review): each call already returns a dict keyed by configuration, so
# the statistics end up at window_results[config][config] - confirm that this
# nesting is what downstream code expects.
unique_configs = FSC3_Z_df_12M['Configuration Summary'].unique()
window_results = {}
for config in unique_configs:
    is_current_config = FSC3_Z_df_12M['Configuration Summary'] == config
    config_df = FSC3_Z_df_12M[is_current_config]
    window_results[config] = window_s_prüfpunkte(
        config_df, 'Configuration Summary', 'Datum LP', COLS_Delta_Z
    )
现在我想把每个变体的平均值作为窗口大小,计算每个测量列的移动平均值。
def _resolve_window_size(entry, config, data_col):
    """Return the integer rolling-window size stored in ``entry``."""
    # Unwrap nested dicts until a plain number is left.
    while isinstance(entry, dict):
        if 'quantity_average' in entry:
            entry = entry['quantity_average']
        elif data_col in entry:
            entry = entry[data_col]
        elif config in entry:
            entry = entry[config]
        else:
            raise KeyError(
                f'no window size for config {config!r}, column {data_col!r}'
            )
    return int(entry)


def moving_average_by_config(df, config_col, date_col, data_cols, window_results):
    """Add a rolling mean per measurement column, computed per configuration.

    Args:
        df: DataFrame with one row per measurement.
        config_col: Name of the column identifying the configuration.
        date_col: Name of the date column. Currently unused: the rolling
            mean runs over the rows in their existing order.
        data_cols: Measurement column names to average.
        window_results: Dict keyed by configuration. Each value may be the
            window size itself, a dict holding it under
            ``'quantity_average'``, a dict keyed by measurement column, or
            such a dict wrapped once more under the configuration key.

    Returns:
        A new DataFrame with the rows grouped by configuration (in order of
        first appearance) and one ``<column>_MA`` column per entry of
        ``data_cols``. ``df`` itself is not modified.

    Raises:
        KeyError: If no window size can be found for a configuration.
    """
    results = []
    for config in df[config_col].unique():
        # Copy the filtered rows so the new columns are written to an
        # independent frame rather than to a slice of ``df``.
        config_df = df[df[config_col] == config].copy()
        config_windows = window_results[config]
        for data_col in data_cols:
            # A dict cannot be used as ``window``; extract the number first.
            window = _resolve_window_size(config_windows, config, data_col)
            config_df[f'{data_col}_MA'] = (
                config_df[data_col].rolling(window=window).mean()
            )
        results.append(config_df)
    if not results:
        # pd.concat raises on an empty list; return an empty frame that
        # still carries the expected *_MA columns.
        empty = df.copy()
        for data_col in data_cols:
            empty[f'{data_col}_MA'] = pd.Series(index=empty.index, dtype='float64')
        return empty
    return pd.concat(results)
不幸的是,这种方法行不通
我建议采用类似下面的做法:
# Build a synthetic example frame: for every day in the range, each variant
# gets as many rows as there are measurement columns, all filled with
# standard-normal random values.
dates = pd.date_range('2022-01-01', '2022-02-28', freq='D')
variants = ['A', 'B', 'C']
data_cols = ['Measurement 1', 'Measurement 2', 'Measurement 3']

rows_per_day = len(variants) * len(data_cols)
n_rows = len(dates) * rows_per_day

df = pd.DataFrame({
    'Date': np.repeat(dates, rows_per_day),
    'Variant': np.tile(np.repeat(variants, len(data_cols)), len(dates)),
    # One independent random draw per measurement column, in column order.
    **{col: np.random.randn(n_rows) for col in data_cols},
})
df['Date'] = pd.to_datetime(df['Date'])
def window_s_prüfpunkte(df, config_col, date_col_name, data_cols):
    """Derive one rolling-window size per configuration from monthly counts.

    For every distinct value in ``config_col`` the rows are grouped by
    calendar year and month of ``date_col_name``, the non-null entries of
    ``data_cols`` are counted per month, and the mean of the per-column mean
    counts is rounded.

    Args:
        df: DataFrame with one row per measurement.
        config_col: Name of the column identifying the configuration.
        date_col_name: Name of the date column (parsed with
            ``pd.to_datetime``).
        data_cols: List of measurement column names.

    Returns:
        Dict mapping each configuration to a dict that assigns the same
        rounded average to every entry of ``data_cols``.
    """
    window_by_config = {}
    for variant in df[config_col].unique():
        subset = df[df[config_col] == variant]
        parsed_dates = pd.to_datetime(subset[date_col_name])

        per_month = subset.groupby([parsed_dates.dt.year, parsed_dates.dt.month])
        counts = per_month.count().rename_axis(['year', 'month'])
        monthly_count = counts[data_cols].reset_index()

        column_means = monthly_count[data_cols].mean()
        quantity_average = round(column_means.mean())

        # Every measurement column shares the same window size.
        window_by_config[variant] = dict.fromkeys(data_cols, quantity_average)
    return window_by_config
def moving_average_by_config(df, config_col, date_col, data_cols, window_sizes):
    """Add a rolling mean per measurement column, computed per configuration.

    Args:
        df: DataFrame with one row per measurement.
        config_col: Name of the column identifying the configuration.
        date_col: Name of the date column. Currently unused: the rolling
            mean runs over the rows in their existing order.
        data_cols: Measurement column names to average.
        window_sizes: Nested dict ``{config: {data_col: window}}`` giving
            the rolling-window size for each configuration and column.

    Returns:
        A new DataFrame with the rows grouped by configuration (in order of
        first appearance) and one ``<column>_MA`` column per entry of
        ``data_cols``. ``df`` itself is not modified.
    """
    results = []
    for config in df[config_col].unique():
        # Copy the filtered rows so the new columns are written to an
        # independent frame rather than to a slice of ``df``.
        config_df = df[df[config_col] == config].copy()
        for data_col in data_cols:
            config_window_size = window_sizes[config][data_col]
            config_df[f'{data_col}_MA'] = (
                config_df[data_col].rolling(window=config_window_size).mean()
            )
        results.append(config_df)
    if not results:
        # pd.concat raises on an empty list; return an empty frame that
        # still carries the expected *_MA columns.
        empty = df.copy()
        for data_col in data_cols:
            empty[f'{data_col}_MA'] = pd.Series(index=empty.index, dtype='float64')
        return empty
    return pd.concat(results)
# First derive the window size per variant, then use those sizes as the
# rolling windows for the moving averages.
measurement_cols = ['Measurement 1', 'Measurement 2', 'Measurement 3']
window_sizes = window_s_prüfpunkte(df, 'Variant', 'Date', measurement_cols)
moving_average_df = moving_average_by_config(
    df, 'Variant', 'Date', measurement_cols, window_sizes
)
初始数据框是
Date Variant Measurement 1 Measurement 2 Measurement 3
0 2022-01-01 A -0.598400 0.186690 0.412284
1 2022-01-01 A 1.304156 -0.242278 -0.357972
2 2022-01-01 A -0.689837 -0.583031 0.987450
3 2022-01-01 B 0.096719 1.300683 0.681955
4 2022-01-01 B -0.034100 -0.540658 0.878540
.. ... ... ... ... ...
526 2022-02-28 B -1.311980 0.702785 0.427267
527 2022-02-28 B -1.322088 -1.358726 -0.429957
528 2022-02-28 C 0.753643 -0.811415 0.347925
529 2022-02-28 C 0.889842 0.086922 -1.129692
530 2022-02-28 C -0.424379 -1.005491 -0.362359
输出是
Date Variant Measurement 1 Measurement 2 Measurement 3 \
0 2022-01-01 A -0.598400 0.186690 0.412284
1 2022-01-01 A 1.304156 -0.242278 -0.357972
2 2022-01-01 A -0.689837 -0.583031 0.987450
9 2022-01-02 A 0.640133 -1.602435 0.477864
10 2022-01-02 A 0.489324 -0.585138 0.722814
.. ... ... ... ... ...
520 2022-02-27 C -0.293254 -0.555126 0.414370
521 2022-02-27 C -0.428985 1.110329 -0.242367
528 2022-02-28 C 0.753643 -0.811415 0.347925
529 2022-02-28 C 0.889842 0.086922 -1.129692
530 2022-02-28 C -0.424379 -1.005491 -0.362359
Measurement 1_MA Measurement 2_MA Measurement 3_MA
0 NaN NaN NaN
1 NaN NaN NaN
2 NaN NaN NaN
9 NaN NaN NaN
10 NaN NaN NaN
.. ... ... ...
520 -0.038969 0.105347 0.114437
521 -0.044342 0.100342 0.115158
528 -0.035779 0.089320 0.131298
529 -0.017788 0.085785 0.103497
530 -0.019763 0.059974 0.082724
[531 rows x 8 columns]