假设我们对多个实体有多个时间序列观察,并且我们希望对单个模型执行超参数调整。
据我所知,在 scikit-learn 框架内执行此超参数调整操作没有一个简单的解决方案。存在使用 TimeSeriesSplit 对单个时间序列执行此操作的功能,但这不适用于多个实体。
我的问题是:如何在sklearn框架中使用面板数据进行超参数调优?
我建议使用 PanelSplit,一个用于面板数据的交叉验证器。它本质上是 TimeSeriesSplit 的包装器,接受与 TimeSeriesSplit 类似的参数(n_splits、gap、test_size),并在此基础上增加了对面板数据的支持。
PanelSplit 的工作原理如下:
在我的特定用例(即分类任务)中,我需要检查每个折叠的测试集是否包含多个类别,否则无法计算 ROC AUC。因此,我添加了一个额外的功能,可以省略测试集中只含单一类别的折叠。另外,为了便于排查问题,我还添加了一个绘图函数,用于将时间序列的划分可视化。
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
class PanelSplit:
    """Time-series cross-validator for panel data.

    The unique periods are split with ``TimeSeriesSplit``; every row whose
    period falls in a fold's train (or test) periods is then assigned to that
    fold's train (or test) set, so all entities observed in a period stay
    together.
    """

    def __init__(self, n_splits, gap, test_size, unique_periods, train_periods,
                 drop_one_class_folds=False, X=None, y=None, plot=False,
                 return_warning=True):
        """
        Build the period-level folds and count the usable splits.

        Parameters:
        - n_splits: Number of splits for TimeSeriesSplit
        - gap: Gap between train and test sets in TimeSeriesSplit
        - test_size: Size of the test set in TimeSeriesSplit
        - unique_periods: Sorted pandas Series of the unique periods (must
          support ``.iloc[...]`` and ``.values``)
        - train_periods: Period label of every row of the training data, in
          row order
        - drop_one_class_folds: Flag to drop folds with only one class in the test set
        - X: Input features (not needed to build the folds)
        - y: Target variable; when None the one-class check is skipped and
          every period-level fold is counted
        - plot: Flag to visualize time series splits
        - return_warning: Flag to print a warning about one-class folds that
          are kept
        """
        self.tss = TimeSeriesSplit(n_splits=n_splits, gap=gap, test_size=test_size)
        indices = self.tss.split(unique_periods)
        self.u_periods_cv = self.split_unique_periods(indices, unique_periods)
        self.all_periods = train_periods
        self.return_warning = return_warning
        self.drop_one_class_folds = drop_one_class_folds
        # Number of folds that survive the one-class filter; this is what
        # get_n_splits() reports and may be smaller than the requested n_splits.
        self.n_splits = self.split(X, y, return_n_splits=True)
        if plot:
            self.plot_time_series_splits(self.u_periods_cv)

    def split_unique_periods(self, indices, unique_periods):
        """
        Split unique periods into train/test sets based on TimeSeriesSplit indices.

        Parameters:
        - indices: Iterable of (train_index, test_index) positional index pairs
        - unique_periods: Pandas Series containing the unique periods

        Returns: List of (train_periods, test_periods) tuples of period values
        """
        return [
            (unique_periods.iloc[train_index].values,
             unique_periods.iloc[test_index].values)
            for train_index, test_index in indices
        ]

    def split(self, X=None, y=None, groups=None, return_n_splits=False):
        """
        Generate train/test indices based on unique periods and drop folds if specified.

        Fold membership is derived from ``self.all_periods`` alone, and the
        returned indices are positional (0-based row numbers) integer arrays,
        so they do not depend on the index labels of X or y.

        Parameters:
        - X: Input features (unused; accepted for scikit-learn compatibility)
        - y: Target variable, used only for the one-class check; may be None
        - groups: Group labels for the samples (unused)
        - return_n_splits: Flag to return the number of splits

        Returns: List of (train_indices, test_indices) tuples, or the number
        of splits when return_n_splits is True
        """
        period_labels = pd.Series(self.all_periods)
        folds = []
        for i, (train_periods, test_periods) in enumerate(self.u_periods_cv):
            train_indices = np.flatnonzero(period_labels.isin(train_periods).to_numpy())
            test_indices = np.flatnonzero(period_labels.isin(test_periods).to_numpy())
            if self.check_classes_in_test(y, test_indices, test_periods, i, return_n_splits):
                folds.append((train_indices, test_indices))
        # Kept as an attribute so the most recent folds can be inspected.
        self.all_indices = folds
        if return_n_splits:
            return len(folds)
        return folds

    def get_n_splits(self, X=None, y=None, groups=None):
        """
        Return the number of splits.

        Parameters:
        - X: Input features (unused)
        - y: Target variable (unused)
        - groups: Group labels for the samples (unused)

        Returns: Number of splits computed at construction time
        """
        return self.n_splits

    def check_classes_in_test(self, y, test_indices, test_periods, i, return_n_splits):
        """
        Check whether a fold's test set holds fewer than two classes and handle accordingly.

        Parameters:
        - y: Target variable; when None the check is skipped
        - test_indices: Positional indices of the test set
        - test_periods: Periods in the test set (used in messages only)
        - i: Fold index (used in messages only)
        - return_n_splits: Messages are printed only when this flag is set, so
          that they appear while counting splits and not on every later split

        Returns: Boolean indicating whether to append indices or not
        """
        if y is None:
            return True
        # Count distinct labels instead of summing them: an all-positive test
        # set is just as unusable for ROC AUC as an all-negative one.
        test_labels = pd.Series(np.asarray(y)[test_indices].ravel())
        if test_labels.nunique() >= 2:
            return True
        if self.drop_one_class_folds:
            if return_n_splits:
                print(f'Fold {i} has only one class in the test set (period {test_periods}). Omitting fold {i}.')
            return False
        if return_n_splits and self.return_warning:
            print(f'Warning: Fold {i} has only one class in the test set, so it cannot compute ROC AUC.\n'
                  f'The period for this test set is {test_periods}')
        return True

    def plot_time_series_splits(self, split_output):
        """
        Visualize time series splits using a scatter plot.

        Parameters:
        - split_output: List of (train_periods, test_periods) tuples
        """
        # Imported here so that matplotlib is only required when plotting.
        import matplotlib.pyplot as plt

        folds = len(split_output)
        fig, ax = plt.subplots()

        def to_plottable(an_array):
            # Periods written as YYYYMM integers become dates; anything else
            # is plotted as given.
            try:
                return pd.to_datetime(an_array.astype(str), format='%Y%m')
            except (ValueError, TypeError):
                return an_array

        for i, (train_index, test_index) in enumerate(split_output):
            ax.scatter(to_plottable(train_index), [i] * len(train_index), color='blue', marker='.', s=50)
            ax.scatter(to_plottable(test_index), [i] * len(test_index), color='red', marker='.', s=50)
        ax.set_xlabel('Periods')
        ax.set_ylabel('Folds')
        ax.set_title('Cross-Validation Splits')
        ax.grid(True)
        ax.set_yticks(range(folds))  # One tick per fold
        ax.set_yticklabels([f'{i}' for i in range(folds)])  # Label ticks with the fold number
        plt.show()
在进行超参数调整之前,请确保已重置 DataFrame 的索引,并删除特征变量和目标变量中的 NaN 值。此外,PanelSplit 假设 unique_periods 参数是一个 pandas Series,其中包含所有不重复且已按时间排序的周期。
以下是如何将其用作超参数调整的交叉验证器的演示:
from itertools import product

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Build a toy panel: two countries, each observed over the same ten periods.
countries = ['ESP', 'FRA']
periods = list(range(10))
df = pd.DataFrame(
    list(product(countries, periods)),
    columns=['country', 'period'],
)
# The first ten rows get target 1 and the last ten get target 0.
df['target'] = np.concatenate((np.repeat(1, 10), np.repeat(0, 10)))
df['a_feature'] = np.random.randn(20, 1)

features = df[['a_feature']]
target = df['target']

# Set up the panel cross-validator over the unique periods.
panel_split = PanelSplit(
    n_splits=3,
    gap=1,
    test_size=1,
    unique_periods=pd.Series(df['period'].unique()),
    train_periods=df['period'],
    X=features,
    y=target,
)

# Grid-search the tree depth, scoring each fold by ROC AUC.
param_grid = {'max_depth': [5, 15]}
param_search = GridSearchCV(
    RandomForestClassifier(),
    param_grid,
    scoring='roc_auc',
    cv=panel_split,
)
param_search.fit(features, target)