问题定义,持续挑战
这个新的 imputer_bayesian_ridge() 函数用于 Iterative Imputer 插补训练数据。发送数据帧训练数据,然后立即获取 numpy 数组变量的 data.values。这发送或传递具有许多特征和 Y 响应变量的训练数据。这项工作只是试图归因于一个单一的特征。
显然我的连续数据,Y 响应数据,即 price $$$$ 连续数据,在 cross_val_score(interative_imputer, data_array) 中不受支持。
那么关于如何使用 Y 响应变量中的连续数据以使用 Iterative Imputer 并满足对象“interativea_imputer”的 cross_val_score 有什么建议
为了支持目标类型,我应该将 Y 响应变量中的连续数据转换为二进制吗?不,因为这不是二元分类,所以多类更符合。那么当价格数据是响应变量时如何处理呢?
收到错误
ValueError:支持的目标类型是:('binary','multiclass')。取而代之的是“连续”。
代码
def imputer_regressor_bay_ridge(data, y):
data_array = data.values. ##looks OK
interative_imputer = IterativeImputer(BayesianRidge()). ## runs OK
interative_imputer_fit = interative_imputer.fit(data_array) ## runs OK
data_imputed = interative_imputer_fit.transform(data_array) ## runs OK
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1) ## runs OK
scores = cross_val_score(interative_imputer, data_array, y,
scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
return scores, data_imputed
数据样本
print(train_data.shape)
data_array = train_data.values
data_array
(1460, 250)
array([[-1.73086488, -0.20803433, -0.20714171, ..., -0.11785113,
0.4676514 , -0.30599503],
[-1.7284922 , 0.40989452, -0.09188637, ..., -0.11785113,
0.4676514 , -0.30599503],
[-1.72611953, -0.08444856, 0.07347998, ..., -0.11785113,
0.4676514 , -0.30599503],
...,
[ 1.72611953, -0.16683907, -0.14781027, ..., -0.11785113,
0.4676514 , -0.30599503],
[ 1.7284922 , -0.08444856, -0.08016039, ..., -0.11785113,
0.4676514 , -0.30599503],
[ 1.73086488, 0.20391824, -0.05811155, ..., -0.11785113,
0.4676514 , -0.30599503]])
y = train_data['ResponseY'].values
y.shape
(1460,)
array([ 0.34727322, 0.00728832, 0.53615372, ..., 1.07761115,
-0.48852299, -0.42084081])
值错误
显然我的连续数据,即价格 $ 数据,在 cross_val_score(interative_imputer, data_array on:
ValueError:支持的目标类型是:('binary','multiclass')。取而代之的是“连续”。
Empty Traceback (most recent call last)
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:820, in Parallel.dispatch_one_batch(self, iterator)
819 try:
--> 820 tasks = self._ready_batches.get(block=False)
821 except queue.Empty:
822 # slice the iterator n_jobs * batchsize items at a time. If the
823 # slice returns less than that, then the current batchsize puts
(...)
826 # accordingly to distribute evenly the last items between all
827 # workers.
File ~/opt/anaconda3/lib/python3.9/queue.py:168, in Queue.get(self, block, timeout)
167 if not self._qsize():
--> 168 raise Empty
169 elif timeout is None:
Empty:
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Cell In[27], line 5
3 #train_data, test_data = minmaxscaler(train_data, test_data) # alternate run for min-max scaler
4 columns, imputed_df = imputer_regressor(train_data)
----> 5 scores, data_imputed = imputer_regressor_bay_ridge(train_data, y)
7 misTrain = whichColumnsMissing(train_data)
8 misTest = whichColumnsMissing(test_data)
Cell In[24], line 110, in imputer_regressor_bay_ridge(data, y)
108 data_imputed = interative_imputer_fit.transform(data_array)
109 cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
--> 110 scores = cross_val_score(interative_imputer, data_array,
111 y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
113 return scores, data_imputed
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py:509, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
506 # To ensure multimetric format is not supported
507 scorer = check_scoring(estimator, scoring=scoring)
--> 509 cv_results = cross_validate(
510 estimator=estimator,
511 X=X,
512 y=y,
513 groups=groups,
514 scoring={"score": scorer},
515 cv=cv,
516 n_jobs=n_jobs,
517 verbose=verbose,
518 fit_params=fit_params,
519 pre_dispatch=pre_dispatch,
520 error_score=error_score,
521 )
522 return cv_results["test_score"]
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py:267, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
264 # We clone the estimator to make sure that all the folds are
265 # independent, and that it is pickle-able.
266 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 267 results = parallel(
268 delayed(_fit_and_score)(
269 clone(estimator),
270 X,
271 y,
272 scorers,
273 train,
274 test,
275 verbose,
276 None,
277 fit_params,
278 return_train_score=return_train_score,
279 return_times=True,
280 return_estimator=return_estimator,
281 error_score=error_score,
282 )
283 for train, test in cv.split(X, y, groups)
284 )
286 _warn_about_fit_failures(results, error_score)
288 # For callabe scoring, the return type is only know after calling. If the
289 # return type is a dictionary, the error scores can now be inserted with
290 # the correct key.
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:1041, in Parallel.__call__(self, iterable)
1032 try:
1033 # Only set self._iterating to True if at least a batch
1034 # was dispatched. In particular this covers the edge
(...)
1038 # was very quick and its callback already dispatched all the
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1044 while self.dispatch_one_batch(iterator):
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:831, in Parallel.dispatch_one_batch(self, iterator)
828 n_jobs = self._cached_effective_n_jobs
829 big_batch_size = batch_size * n_jobs
--> 831 islice = list(itertools.islice(iterator, big_batch_size))
832 if len(islice) == 0:
833 return False
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py:267, in <genexpr>(.0)
264 # We clone the estimator to make sure that all the folds are
265 # independent, and that it is pickle-able.
266 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 267 results = parallel(
268 delayed(_fit_and_score)(
269 clone(estimator),
270 X,
271 y,
272 scorers,
273 train,
274 test,
275 verbose,
276 None,
277 fit_params,
278 return_train_score=return_train_score,
279 return_times=True,
280 return_estimator=return_estimator,
281 error_score=error_score,
282 )
283 for train, test in cv.split(X, y, groups)
284 )
286 _warn_about_fit_failures(results, error_score)
288 # For callabe scoring, the return type is only know after calling. If the
289 # return type is a dictionary, the error scores can now be inserted with
290 # the correct key.
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_split.py:1411, in _RepeatedSplits.split(self, X, y, groups)
1409 for idx in range(n_repeats):
1410 cv = self.cv(random_state=rng, shuffle=True, **self.cvargs)
-> 1411 for train_index, test_index in cv.split(X, y, groups):
1412 yield train_index, test_index
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_split.py:340, in _BaseKFold.split(self, X, y, groups)
332 if self.n_splits > n_samples:
333 raise ValueError(
334 (
335 "Cannot have number of splits n_splits={0} greater"
336 " than the number of samples: n_samples={1}."
337 ).format(self.n_splits, n_samples)
338 )
--> 340 for train, test in super().split(X, y, groups):
341 yield train, test
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_split.py:86, in BaseCrossValidator.split(self, X, y, groups)
84 X, y, groups = indexable(X, y, groups)
85 indices = np.arange(_num_samples(X))
---> 86 for test_index in self._iter_test_masks(X, y, groups):
87 train_index = indices[np.logical_not(test_index)]
88 test_index = indices[test_index]
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_split.py:709, in StratifiedKFold._iter_test_masks(self, X, y, groups)
708 def _iter_test_masks(self, X, y=None, groups=None):
--> 709 test_folds = self._make_test_folds(X, y)
710 for i in range(self.n_splits):
711 yield test_folds == i
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_split.py:652, in StratifiedKFold._make_test_folds(self, X, y)
650 allowed_target_types = ("binary", "multiclass")
651 if type_of_target_y not in allowed_target_types:
--> 652 raise ValueError(
653 "Supported target types are: {}. Got {!r} instead.".format(
654 allowed_target_types, type_of_target_y
655 )
656 )
658 y = column_or_1d(y)
660 _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True)
ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous' instead.
scoring='accuracy'
的使用仅适用于二进制或多类目标。scoring=None
或其他一些足以满足连续目标的评分。参见regression('neg_mean_absolute_error', 'neg_mean_squared_error', ...)。