我一直在尝试将 TimeSeriesSplit 用于面板数据。我所说的面板数据是指某个群体(总体)的年度快照。我希望按年份对数据进行分割。该群体在不断变化,每年的样本数量并不相同,因此无法直接使用 TimeSeriesSplit。
基本上,我想得到如下的交叉验证(CV)方案:
我设法使用以下代码来做到这一点:
import pandas as pd, numpy as np
import seaborn as sns, matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
# Build a synthetic panel: one independent regression sample per year, with
# the number of rows growing by one each year (5 rows in 2010 ... 15 in 2020).
start_year = 2010
end_year = 2020
yearly_frames, yearly_targets = [], []
for year in np.arange(start_year, end_year + 1):
    features, target = make_regression(
        n_samples=5 + year - start_year,
        n_features=2,
        bias=100,
        noise=1,
        random_state=year,  # seeded by the year, so every run yields the same data
    )
    frame = pd.DataFrame(features).rename(columns={0: 'X1', 1: 'X2'})
    frame['year'] = year
    yearly_frames.append(frame)
    yearly_targets.append(pd.Series(target))
# NOTE: despite their names, these hold the full panel (all years), not a
# test split. Each year keeps its own 0..n-1 index, so index labels repeat
# across years after the concat.
X_test, y_test = pd.concat(yearly_frames), pd.concat(yearly_targets)
# Modelling: cross-validate over *years* rather than rows, because each year
# contributes a different number of rows and TimeSeriesSplit cannot be applied
# to the rows directly.
X = X_test
y = y_test
years = np.unique(X['year'])

model = DummyRegressor(strategy="mean")
metric = mean_squared_error
# Split the array of distinct years, not the rows; the resulting year sets are
# mapped back to row masks inside the loop.
cv = TimeSeriesSplit(n_splits=len(years) - 1)
years_folds = []
res = []
for i, (train_idx, test_idx) in enumerate(cv.split(years)):
    train_years, test_years = years[train_idx], years[test_idx]
    print(f"Fold {i}:")
    print(f" Train: index={train_years}")
    print(f" Test: index={test_years}")
    years_folds.append((train_years, test_years))

    # Boolean masks are applied positionally (as plain arrays) because the
    # row index may contain repeated labels.
    train_mask = X['year'].isin(train_years).to_numpy()
    test_mask = X['year'].isin(test_years).to_numpy()

    # Fold-local names: do not rebind X_test / y_test, which hold the full
    # dataset that X and y were taken from.
    X_fold_train, y_fold_train = X.loc[train_mask], y[train_mask]
    X_fold_test, y_fold_test = X.loc[test_mask], y[test_mask]

    model.fit(X_fold_train, y_fold_train)
    # scikit-learn metrics take (y_true, y_pred); the order matters for
    # asymmetric metrics even though MSE itself is symmetric.
    score = metric(y_fold_test, model.predict(X_fold_test))
    print(f' {score=:.3}')
    # Label the fold by the first year of its test set.
    res.append((test_years[0], score))

# NOTE(review): plot_year_folds is not defined in the visible code; it must be
# provided elsewhere before this line runs.
plot_year_folds(years_folds)
folds_res = pd.DataFrame(res, columns=['test_year', metric.__name__])
folds_res.plot.scatter(x='test_year', y=metric.__name__, title=f'{metric.__name__} over test_year');
注意:我使用虚拟数据集和虚拟模型是为了提供运行示例。这不是我帖子的主题。
正如您所注意到的,我必须使用一个技巧:我分割年份而不是数据。我想知道:是否有一种标准方法可以使用 sklearn cv 对象分割数据?目标是在 cross_val_score 函数中使用它。
如上所述,mlxtend 允许这样做。
import pandas as pd, numpy as np
import seaborn as sns, matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score
from mlxtend.evaluate.time_series import GroupTimeSeriesSplit, plot_splits
# Build a synthetic panel: one independent regression sample per year, with
# the number of rows growing by one each year (5 rows in 2010 ... 15 in 2020).
start_year = 2010
end_year = 2020
per_year = []
for year in np.arange(start_year, end_year + 1):
    features, target = make_regression(
        n_samples=5 + (year - start_year),
        n_features=2,
        bias=0,
        noise=1,
        random_state=year,  # seeded by the year, so every run yields the same data
    )
    frame = pd.DataFrame(features).rename(columns={0: 'X1', 1: 'X2'})
    frame['year'] = year  # later used as the grouping key for the splitter
    per_year.append((frame, pd.Series(target)))
# Unzip the (features, target) pairs and stack all years into one dataset.
feature_parts, target_parts = zip(*per_year)
X, y = pd.concat(list(feature_parts)), pd.concat(list(target_parts))
# Modelling: split by year with mlxtend's GroupTimeSeriesSplit so the
# splitter can be handed straight to cross_val_score.
model = DummyRegressor(strategy="mean")
metric = mean_squared_error
year_groups = X['year']
# The same keyword arguments configure both the splitter and the split plot.
cv_args = dict(
    test_size=1,
    n_splits=len(np.unique(year_groups)) - 1,
    window_type='expanding',
)
cv = GroupTimeSeriesSplit(**cv_args)
# NOTE(review): make_scorer(metric) uses the default greater_is_better=True,
# so the scores are raw MSE values (lower is better) — confirm this is what
# any downstream consumer expects.
scores = cross_val_score(
    model,
    X,
    y,
    cv=cv,
    groups=year_groups,
    scoring=make_scorer(metric),
)
plot_splits(X, y, year_groups, **cv_args)
它可以与 sklearn 的 cross_val_score 配合使用。它仍有一些缺点(增加了一个依赖项,绘图效果也有待改进),但满足了我的需求。