假设我在以下 pandas 数据帧格式中有 dataset,带有 non-standard timestamp column without datetime format 如下:
+--------+-----+
|TS_24hrs|count|
+--------+-----+
|0 |157 |
|1 |334 |
|2 |176 |
|3 |86 |
|4 |89 |
... ...
|270 |192 |
|271 |196 |
|270 |251 |
|273 |138 |
+--------+-----+
274 rows × 2 columns
在将数据拆分为training-set和test-set后,我已经应用了一些回归算法,得到如下结果:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('/content/U2996_24hrs_.csv', sep=",")
print(df.shape)
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.27, shuffle=False)
print(train.shape) #(200, 2)
print(test.shape) #(74, 2)
#visulize splitted data
train['count'].plot(label='Training-set')
test['count'].plot(label='Test-set')
plt.legend()
plt.show()
#Train and fit the model
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor().fit(train, train['count']) #X, y
rf.score(train, train['count']) #0.9998644192184375
# Use the forest's model to predict on the test-set
predictions = rf.predict(test)
#convert prediction result into dataframe for plot issue in ease
df_pre = pd.DataFrame({'TS_24hrs':test['TS_24hrs'], 'count_prediction':predictions})
# Calculate the mean absolute errors
from sklearn.metrics import mean_absolute_error
rf_mae = mean_absolute_error(test['count'], df_pre['count_prediction'])
print(train.shape) #(200, 2)
print(test.shape) #(74, 2)
print(df_pre.shape) #(74, 2)
#visulize forecast or prediction of used regressor model
train['count'].plot(label='Training-set')
test['count'].plot(label='Test-set')
df_pre['count_prediction'].plot(label=f'RF_forecast MAE={rf_mae:.2f}')
plt.legend()
plt.show()
根据这个answer我注意到:
如果您的数据已经根据时间排序那么只需使用
shuffle=False
train, test = train_test_split(newdf, test_size=0.3, shuffle=False)
到目前为止,我一直使用这种经典的拆分数据方法,但我想尝试这里总结的基于时间序列的拆分方法:
此外,根据我的调查(请参阅帖子末尾的参考资料),建议在应用回归模型之前使用交叉验证方法(K-Fold)。解释:时间序列中的交叉验证
我试过推荐 TimeSeriesSplit() 如下不成功:
#Try to split data with CV (K-Fold)
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(
n_splits=len(df['TS_24hrs'].unique()) - 1,
gap=0, # since data alraedy groupedby for 24hours to retrieve daily count there is no need to to have gap
#max_train_size=199,
#test_size=73,
)
#for train_idx, test_idx in tscv.split(df['TS_24hrs']):
# print('TRAIN: ', df.loc[df.index.isin(train_idx), 'TS_24hrs'].unique(),
# 'TEST: ', df.loc[df.index.isin(test_idx), 'TS_24hrs'].unique())
# Function modified from https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
import matplotlib
from matplotlib.patches import Patch
from matplotlib import pyplot as plt
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
def plot_cv_indices(cv, n_splits, X, y, date_col = None):
"""Create a sample plot for indices of a cross-validation object."""
fig, ax = plt.subplots(1, 1, figsize = (11, 7))
# Generate the training/testing visualizations for each CV split
for ii, (tr, tt) in enumerate(cv.split(X=X, y=y)):
# Fill in indices with the training/test groups
indices = np.array([np.nan] * len(X))
indices[tt] = 1
indices[tr] = 0
# Visualize the results
ax.scatter(range(len(indices)), [ii + .5] * len(indices),
c=indices, marker='_', lw=10, cmap=cmap_cv,
vmin=-.2, vmax=1.2)
# Formatting
yticklabels = list(range(n_splits))
if date_col is not None:
tick_locations = ax.get_xticks()
tick_dates = [" "] + date_col.iloc[list(tick_locations[1:-1])].astype(str).tolist() + [" "]
tick_locations_str = [str(int(i)) for i in tick_locations]
new_labels = ['\n\n'.join(x) for x in zip(list(tick_locations_str), tick_dates) ]
ax.set_xticks(tick_locations)
ax.set_xticklabels(new_labels)
ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
xlabel='Sample index', ylabel="CV iteration",
ylim=[n_splits+0.2, -.2])
ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))],
['Testing set', 'Training set'], loc=(1.02, .8))
ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
#Visualizing cross-validation behavior inspired from https://www.kaggle.com/code/tomwarrens/timeseriessplit-how-to-use-it/notebook
n_splits = len(df['TS_24hrs'].unique()) - 1
print(n_splits) # 273
X = df['TS_24hrs']
y = df['count']
plot_cv_indices(tscv, 273, X, y, date_col = df['TS_24hrs'] )
我也受到了Visualizing cross-validation behavior in scikit-learn try to visualizing the K-Fold CV but get error:
ValueError:FixedLocator 位置的数量 (275),通常来自对 set_ticks 的调用,与标签的数量 (273) 不匹配。
在正确分配回归变量(可视化分割数据)之前,我如何应用基于时间序列的交叉验证(CV)方法(例如,
TimeSeriesSplit()
,BlockingTimeSeriesSplit()
)来比较使用/不使用的预测结果CV处于数据拆分阶段?我找到了这个 workaround 但还没有达到预期的输出。任何帮助将不胜感激。
如果可以绘制以下用于理解和更好地对齐拆分数据的数字,则可能是预期输出的一部分:
参考资料: