我正在尝试执行嵌套交叉验证,并同时使用
GroupShuffleSplit
类按组划分数据。但是,当我把自定义的交叉验证对象传给 GridSearchCV
时,遇到了“TypeError: cannot pickle 'generator' object”(无法 pickle 'generator' 对象)。据我所知,出现这个错误是因为 group_split.split(...)
返回的是一个生成器,而它无法在 cross_val_score
函数中使用。因此我想问,是否有一种简便的方法可以用 GroupShuffleSplit
进行嵌套交叉验证。
关于我的简化示例代码: 我有一个包含特征
X
、标签 y
和组标签 groups
的数据集。目标是执行嵌套交叉验证,其中内部循环和外部循环根据组标签分割数据。我想使用 GridSearchCV
进行超参数调整,使用 cross_val_score
评估性能。
import numpy as np
from sklearn.model_selection import GroupShuffleSplit, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Synthetic data set: 100 samples with 10 random features, binary labels,
# and one of 4 group ids per sample.
X = np.random.rand(100, 10)
y = np.random.randint(2, size=100)
groups = np.random.randint(4, size=100) # Example group labels
rf_classifier = RandomForestClassifier()
# Hyper-parameter grid explored by the inner (tuning) loop.
param_grid = {'n_estimators': [50, 100, 200]}
# Group-aware splitters for the inner (tuning) and outer (evaluation) loops.
inner_cv = GroupShuffleSplit(n_splits=5, test_size=0.2)
outer_cv = GroupShuffleSplit(n_splits=5, test_size=0.2)
# NOTE: `cv` receives the result of calling `.split(...)` (a generator), not
# the splitter object itself. The generator is built from the full X, y and
# groups.
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=inner_cv.split(X, y, groups=groups))
# This is the call that fails in the traceback shown after this snippet:
# cloning `grid_search` deep-copies its parameters and ends with
# "TypeError: cannot pickle 'generator' object".
nested_scores = cross_val_score(estimator=grid_search, X=X, y=y, cv=outer_cv.split(X, y, groups=groups))
这会产生以下错误堆栈跟踪:
---------------------------------------------------------------------------
Empty Traceback (most recent call last)
File c:\Anaconda3_x64\lib\site-packages\joblib\parallel.py:825, in Parallel.dispatch_one_batch(self, iterator)
824 try:
--> 825 tasks = self._ready_batches.get(block=False)
826 except queue.Empty:
827 # slice the iterator n_jobs * batchsize items at a time. If the
828 # slice returns less than that, then the current batchsize puts
(...)
831 # accordingly to distribute evenly the last items between all
832 # workers.
File c:\Anaconda3_x64\lib\queue.py:168, in Queue.get(self, block, timeout)
167 if not self._qsize():
--> 168 raise Empty
169 elif timeout is None:
Empty:
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
Cell In[29], line 16
13 outer_cv = GroupShuffleSplit(n_splits=5, test_size=0.2)
15 grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=inner_cv.split(X, y, groups=groups))
---> 16 nested_scores = cross_val_score(estimator=grid_search, X=X, y=y, cv=outer_cv.split(X, y, groups=groups))
18 print(nested_scores)
File c:\Anaconda3_x64\lib\site-packages\sklearn\model_selection\_validation.py:515, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
512 # To ensure multimetric format is not supported
513 scorer = check_scoring(estimator, scoring=scoring)
--> 515 cv_results = cross_validate(
516 estimator=estimator,
517 X=X,
518 y=y,
519 groups=groups,
520 scoring={"score": scorer},
521 cv=cv,
522 n_jobs=n_jobs,
523 verbose=verbose,
524 fit_params=fit_params,
525 pre_dispatch=pre_dispatch,
526 error_score=error_score,
527 )
528 return cv_results["test_score"]
File c:\Anaconda3_x64\lib\site-packages\sklearn\model_selection\_validation.py:266, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
263 # We clone the estimator to make sure that all the folds are
264 # independent, and that it is pickle-able.
265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 266 results = parallel(
267 delayed(_fit_and_score)(
268 clone(estimator),
269 X,
270 y,
271 scorers,
272 train,
273 test,
274 verbose,
275 None,
276 fit_params,
277 return_train_score=return_train_score,
278 return_times=True,
279 return_estimator=return_estimator,
280 error_score=error_score,
281 )
282 for train, test in cv.split(X, y, groups)
283 )
285 _warn_or_raise_about_fit_failures(results, error_score)
287 # For callabe scoring, the return type is only know after calling. If the
288 # return type is a dictionary, the error scores can now be inserted with
289 # the correct key.
File c:\Anaconda3_x64\lib\site-packages\sklearn\utils\parallel.py:63, in Parallel.__call__(self, iterable)
58 config = get_config()
59 iterable_with_config = (
60 (_with_config(delayed_func, config), args, kwargs)
61 for delayed_func, args, kwargs in iterable
62 )
---> 63 return super().__call__(iterable_with_config)
File c:\Anaconda3_x64\lib\site-packages\joblib\parallel.py:1048, in Parallel.__call__(self, iterable)
1039 try:
1040 # Only set self._iterating to True if at least a batch
1041 # was dispatched. In particular this covers the edge
(...)
1045 # was very quick and its callback already dispatched all the
1046 # remaining jobs.
1047 self._iterating = False
-> 1048 if self.dispatch_one_batch(iterator):
1049 self._iterating = self._original_iterator is not None
1051 while self.dispatch_one_batch(iterator):
File c:\Anaconda3_x64\lib\site-packages\joblib\parallel.py:836, in Parallel.dispatch_one_batch(self, iterator)
833 n_jobs = self._cached_effective_n_jobs
834 big_batch_size = batch_size * n_jobs
--> 836 islice = list(itertools.islice(iterator, big_batch_size))
837 if len(islice) == 0:
838 return False
File c:\Anaconda3_x64\lib\site-packages\sklearn\utils\parallel.py:59, in <genexpr>(.0)
54 # Capture the thread-local scikit-learn configuration at the time
55 # Parallel.__call__ is issued since the tasks can be dispatched
56 # in a different thread depending on the backend and on the value of
57 # pre_dispatch and n_jobs.
58 config = get_config()
---> 59 iterable_with_config = (
60 (_with_config(delayed_func, config), args, kwargs)
61 for delayed_func, args, kwargs in iterable
62 )
63 return super().__call__(iterable_with_config)
File c:\Anaconda3_x64\lib\site-packages\sklearn\model_selection\_validation.py:268, in <genexpr>(.0)
263 # We clone the estimator to make sure that all the folds are
264 # independent, and that it is pickle-able.
265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
266 results = parallel(
267 delayed(_fit_and_score)(
--> 268 clone(estimator),
269 X,
270 y,
271 scorers,
272 train,
273 test,
274 verbose,
275 None,
276 fit_params,
277 return_train_score=return_train_score,
278 return_times=True,
279 return_estimator=return_estimator,
280 error_score=error_score,
281 )
282 for train, test in cv.split(X, y, groups)
283 )
285 _warn_or_raise_about_fit_failures(results, error_score)
287 # For callabe scoring, the return type is only know after calling. If the
288 # return type is a dictionary, the error scores can now be inserted with
289 # the correct key.
File c:\Anaconda3_x64\lib\site-packages\sklearn\base.py:89, in clone(estimator, safe)
87 new_object_params = estimator.get_params(deep=False)
88 for name, param in new_object_params.items():
---> 89 new_object_params[name] = clone(param, safe=False)
90 new_object = klass(**new_object_params)
91 params_set = new_object.get_params(deep=False)
File c:\Anaconda3_x64\lib\site-packages\sklearn\base.py:70, in clone(estimator, safe)
68 elif not hasattr(estimator, "get_params") or isinstance(estimator, type):
69 if not safe:
---> 70 return copy.deepcopy(estimator)
71 else:
72 if isinstance(estimator, type):
File c:\Anaconda3_x64\lib\copy.py:161, in deepcopy(x, memo, _nil)
159 reductor = getattr(x, "__reduce_ex__", None)
160 if reductor is not None:
--> 161 rv = reductor(4)
162 else:
163 reductor = getattr(x, "__reduce__", None)
TypeError: cannot pickle 'generator' object
我不确定在 1.3 版本之前,如果不手写循环来代替
cross_val_score
,这是否可行。除了生成器问题之外,您的写法还要求网格搜索对象对全部
X
进行划分,但它实际上看不到全部 X(外层分割器已经先把数据划分过了)。
从 1.3 版本开始,scikit-learn 引入了元数据路由(metadata routing),它可以自动把
groups
传递给按组划分的分割器。这样就可以写成,例如:
# Pass the splitter objects themselves as `cv`, not the result of `.split(...)`.
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=inner_cv)
# `groups` is supplied once, through `params`.
# NOTE(review): metadata routing probably has to be switched on beforehand with
# sklearn.set_config(enable_metadata_routing=True), and the `params` argument
# may require a scikit-learn release newer than 1.3 -- confirm for your version.
nested_scores = cross_val_score(estimator=grid_search, X=X, y=y, cv=outer_cv, params={'groups': groups})
为了验证 groups 确实被路由到了内外两个分割器,下面是修改后的脚本:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, GridSearchCV, cross_val_score
from sklearn.base import BaseEstimator, ClassifierMixin
# Synthetic data: 100 samples x 10 random features. X is wrapped in a
# DataFrame so that `X.index` can be used to look up each row's group label.
X = pd.DataFrame(np.random.rand(100, 10))
# Binary labels, drawn after X so the order of random draws is unchanged.
y = np.random.randint(2, size=100)
# One of 4 group ids per sample.
groups = np.random.randint(4, size=100)  # Example group labels
class MyClassifier(ClassifierMixin, BaseEstimator):
    """Dummy classifier that reports which groups it is trained/tested on.

    It learns nothing: ``fit`` and ``predict`` only print the group labels of
    the rows they receive, so the printed output shows how the data was split.
    Both methods read the module-level ``groups`` array and expect ``X`` to be
    a DataFrame whose index gives positions into ``groups``.
    """

    def __init__(self, n_estimators=1):
        # Stored only so the grid search has a hyper-parameter to vary;
        # it is not used by fit or predict.
        self.n_estimators = n_estimators

    def fit(self, X, y):
        """Print the group labels of the training rows and return ``self``."""
        print("train: ", groups[X.index])
        return self

    def predict(self, X):
        """Print the group labels of the test rows; return random 0/1 labels."""
        print("test: ", groups[X.index])
        return np.random.randint(2, size=len(X))
# Use the group-printing dummy classifier in place of a real model.
rf_classifier = MyClassifier()
param_grid = {'n_estimators': [50, 100]}
# Two splits per level keep the printed output short.
inner_cv = GroupShuffleSplit(n_splits=2, test_size=0.33)
outer_cv = GroupShuffleSplit(n_splits=2, test_size=0.25)
# Splitter objects (not generators) are passed as `cv`; verbose=10 prints the
# per-fold progress lines.
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=inner_cv, verbose=10)
# NOTE(review): metadata routing probably has to be enabled beforehand with
# sklearn.set_config(enable_metadata_routing=True) -- confirm for your version.
nested_scores = cross_val_score(estimator=grid_search, X=X, y=y, cv=outer_cv, params={'groups': groups}, verbose=10)
# Fixed: the original printed the undefined name `nested_score` (NameError).
print(nested_scores)
外层划分把一个组放入测试集;随后内层划分从剩下的三个组中选一个作为测试集,其余两个作为训练集。这是我的输出:
[CV] START .....................................................................
Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2; 1/2] START n_estimators=50.............................................
train: [1 3 3 3 1 3 3 1 1 1 1 1 3 1 1 1 3 3 1 3 3 3 3 1 1 1 3 3 3 3 3 3 3 3 3 1 3
3 3 3 1 3 1 1 1 3 3 1 1 3 1 1 1 1 1 1]
test: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[CV 1/2; 1/2] END ..............n_estimators=50;, score=0.353 total time= 0.0s
[CV 2/2; 1/2] START n_estimators=50.............................................
train: [3 3 3 3 3 0 0 3 3 3 3 3 3 0 3 0 0 0 3 3 0 0 0 3 3 0 0 3 3 3 3 3 3 3 3 3 0
0 0 0 3 3 3 0 0 3]
test: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[CV 2/2; 1/2] END ..............n_estimators=50;, score=0.407 total time= 0.0s
[CV 1/2; 2/2] START n_estimators=100............................................
train: [1 3 3 3 1 3 3 1 1 1 1 1 3 1 1 1 3 3 1 3 3 3 3 1 1 1 3 3 3 3 3 3 3 3 3 1 3
3 3 3 1 3 1 1 1 3 3 1 1 3 1 1 1 1 1 1]
test: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[CV 1/2; 2/2] END .............n_estimators=100;, score=0.412 total time= 0.0s
[CV 2/2; 2/2] START n_estimators=100............................................
train: [3 3 3 3 3 0 0 3 3 3 3 3 3 0 3 0 0 0 3 3 0 0 0 3 3 0 0 3 3 3 3 3 3 3 3 3 0
0 0 0 3 3 3 0 0 3]
test: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[CV 2/2; 2/2] END .............n_estimators=100;, score=0.333 total time= 0.0s
train: [1 3 3 3 1 3 3 1 1 1 0 1 1 0 3 1 1 1 3 3 1 3 3 3 0 3 1 1 0 1 0 0 3 3 0 0 0
3 3 0 0 3 3 3 3 3 1 3 3 3 3 0 0 1 0 0 3 1 1 1 3 3 1 1 0 0 3 1 1 1 1 1 1]
test: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
[CV] END ................................ score: (test=0.481) total time= 0.0s
[CV] START .....................................................................
Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2; 1/2] START n_estimators=50.............................................
train: [3 3 3 3 2 3 2 2 2 3 2 3 2 3 2 2 2 3 3 2 3 3 2 2 2 3 3 2 2 3 3 2 2 2 3 3 3
3 3 3 3 2 3 3 2 2 3 2 2 2 2 3 3 2 3 2]
test: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[CV 1/2; 1/2] END ..............n_estimators=50;, score=0.588 total time= 0.0s
[CV 2/2; 1/2] START n_estimators=50.............................................
train: [3 3 3 3 2 3 2 2 2 3 2 3 2 3 2 2 2 3 3 2 3 3 2 2 2 3 3 2 2 3 3 2 2 2 3 3 3
3 3 3 3 2 3 3 2 2 3 2 2 2 2 3 3 2 3 2]
test: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[CV 2/2; 1/2] END ..............n_estimators=50;, score=0.588 total time= 0.0s
[CV 1/2; 2/2] START n_estimators=100............................................
train: [3 3 3 3 2 3 2 2 2 3 2 3 2 3 2 2 2 3 3 2 3 3 2 2 2 3 3 2 2 3 3 2 2 2 3 3 3
3 3 3 3 2 3 3 2 2 3 2 2 2 2 3 3 2 3 2]
test: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[CV 1/2; 2/2] END .............n_estimators=100;, score=0.647 total time= 0.0s
[CV 2/2; 2/2] START n_estimators=100............................................
train: [3 3 3 3 2 3 2 2 2 3 2 3 2 3 2 2 2 3 3 2 3 3 2 2 2 3 3 2 2 3 3 2 2 2 3 3 3
3 3 3 3 2 3 3 2 2 3 2 2 2 2 3 3 2 3 2]
test: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[CV 2/2; 2/2] END .............n_estimators=100;, score=0.471 total time= 0.0s
train: [3 3 3 3 2 3 2 2 0 2 0 3 2 3 2 3 2 2 2 3 3 2 3 0 3 2 2 2 0 0 0 3 3 2 0 0 2
0 3 3 0 0 2 2 2 3 3 3 3 3 3 3 2 3 3 2 2 0 0 0 0 3 2 2 2 2 3 3 2 0 0 3 2]
test: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[CV] END ................................ score: (test=0.593) total time= 0.0s
[0.48148148 0.59259259]
[Parallel(n_jobs=1)]: Done 1 tasks | elapsed: 0.0s