在
smoteboost.py
文件中给出以下 SMOTEBoost 类实现:
import numbers
import numpy as np
from collections import Counter
from sklearn.base import (clone,
is_regressor)
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble._forest import BaseForest
from sklearn.preprocessing import normalize
from sklearn.tree import BaseDecisionTree
from sklearn.utils import (check_random_state,
check_X_y,
check_array,
_safe_indexing)
from imblearn.utils import check_neighbors_object
from imblearn.over_sampling import SMOTE
__all__ = ['SMOTEBoost']
MAX_INT = np.iinfo(np.int32).max
class SMOTEBoost(AdaBoostClassifier):
def __init__(self,
k_neighbors=5,
base_estimator=None,
n_estimators=50,
learning_rate=1.,
sampling_strategy="auto",
algorithm='SAMME.R',
random_state=None,
n_jobs=1):
super(AdaBoostClassifier, self).__init__(
base_estimator=base_estimator,
n_estimators=n_estimators,
learning_rate=learning_rate,
random_state=random_state)
self.algorithm = algorithm
self.k_neighbors = k_neighbors
self.sampling_strategy = sampling_strategy
self.n_jobs=n_jobs
def _validate_estimator(self, default=AdaBoostClassifier()):
if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
raise ValueError("n_estimators must be an integer, "
"got {0}.".format(type(self.n_estimators)))
if self.n_estimators <= 0:
raise ValueError("n_estimators must be greater than zero, "
"got {0}.".format(self.n_estimators))
if self.base_estimator is not None:
base_estimator = clone(self.base_estimator)
else:
base_estimator = clone(default)
if isinstance(self.sampling_strategy, dict) and self.sampling_strategy != {}:
raise ValueError("'dict' type cannot be accepted for ratio in this class; "
"use alternative options")
self.nn_k_ = check_neighbors_object('k_neighbors',
self.k_neighbors,
additional_neighbor=1)
self.nn_k_.set_params(**{'n_jobs': self.n_jobs})
self.smote = SMOTE(sampling_strategy=self.sampling_strategy, k_neighbors=self.k_neighbors,
random_state=self.random_state)
self.base_estimator_ = base_estimator
def fit(self, X, y, sample_weight=None):
if self.algorithm not in ('SAMME', 'SAMME.R'):
raise ValueError("algorithm %s is not supported" % self.algorithm)
# Check parameters
if self.learning_rate <= 0:
raise ValueError("learning_rate must be greater than zero")
if (self.base_estimator is None or
isinstance(self.base_estimator, (BaseDecisionTree,
BaseForest))):
DTYPE = np.float64
dtype = DTYPE
accept_sparse = 'csc'
else:
dtype = None
accept_sparse = ['csr', 'csc']
X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype,
y_numeric=is_regressor(self))
if sample_weight is None:
# Initialize weights to 1 / n_samples
sample_weight = np.empty(X.shape[0], dtype=np.float64)
sample_weight[:] = 1. / X.shape[0]
else:
sample_weight = check_array(sample_weight, ensure_2d=False)
# Normalize existing weights
sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)
# Check that the sample weights sum is positive
if sample_weight.sum() <= 0:
raise ValueError(
"Attempting to fit with a non-positive "
"weighted number of samples.")
# Check parameters
self._validate_estimator()
# Clear any previous fit results
self.estimators_ = []
self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)
random_state = check_random_state(self.random_state)
for iboost in range(self.n_estimators):
# SMOTE step
target_stats = Counter(y)
min_class = min(target_stats, key=target_stats.get)
n_sample_majority = max(target_stats.values())
n_samples = n_sample_majority - target_stats[min_class]
target_class_indices = np.flatnonzero(y == min_class)
X_class = _safe_indexing(X, target_class_indices)
self.nn_k_.fit(X_class)
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
#smote._make_samples(X_class, y.dtype,
X_new, y_new = self.smote._make_samples(X_class, y.dtype, min_class, X_class,
nns, n_samples, 1.0)
# Normalize synthetic sample weights based on current training set.
sample_weight_syn = np.empty(X_new.shape[0], dtype=np.float64)
sample_weight_syn[:] = 1. / X.shape[0]
# Combine the original and synthetic samples.
X = np.vstack((X, X_new))
y = np.append(y, y_new)
# Combine the weights.
sample_weight = \
np.append(sample_weight, sample_weight_syn).reshape(-1, 1)
sample_weight = \
np.squeeze(normalize(sample_weight, axis=0, norm='l1'))
# Boosting step
sample_weight, estimator_weight, estimator_error = self._boost(
iboost,
X, y,
sample_weight,
random_state)
# Early termination
if sample_weight is None:
break
self.estimator_weights_[iboost] = estimator_weight
self.estimator_errors_[iboost] = estimator_error
# Stop if error is zero
if estimator_error == 0:
break
sample_weight_sum = np.sum(sample_weight)
# Stop if the sum of sample weights has become non-positive
if sample_weight_sum <= 0:
break
if iboost < self.n_estimators - 1:
# Normalize
sample_weight /= sample_weight_sum
return self
我正在尝试让它工作,但不知道如何修复。 重现:
from sklearn.datasets import make_classification
from smoteboost import SMOTEBoost
from sklearn.model_selection import train_test_split
X, y = make_classification(n_samples=1000, n_features=10, n_classes=5,
n_informative=4, weights=[0.22,0.03,0.16,0.51,0.05])
X_train,X_test,y_train,y_test=train_test_split(X,y)
smt = SMOTEBoost()
smt.fit(X_train, y_train)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "~/smoteboost.py", line 176, in fit
self._validate_estimator()
File "~/smoteboost.py", line 129, in _validate_estimator
self.base_estimator_ = base_estimator
AttributeError: can't set attribute
我理解错误消息表明
SMOTEBoost
对象没有属性 estimator_
。所以我尝试这样设置:
self.set_params(base_estimator_=self.base_estimator)
Error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "~/smoteboost.py", line 177, in fit
self._validate_estimator()
File "~/smoteboost.py", line 130, in _validate_estimator
self.set_params(base_estimator_=self.base_estimator) #self.set_params(base_estimator=self.base_estimator_)
File "~/venv/lib/python3.9/site-packages/sklearn/base.py", line 205, in set_params
raise ValueError(
ValueError: Invalid parameter 'base_estimator_' for estimator SMOTEBoost(). Valid parameters are: ['algorithm', 'base_estimator', 'k_neighbors', 'learning_rate', 'n_estimators', 'n_jobs', 'random_state', 'sampling_strategy'].
编辑
scikit-learn 版本:
import sklearn
sklearn.__version__
'1.2.2'
我不会分配或更改名称以下划线结尾的 scikit-learn 类的属性。似乎这些属性是生成的(在调用
.fit
方法后进行计算),以及粗略的只读属性(所谓的“估计属性”,根据 scikit-learn 文档。
此外,从 1.2 版本开始,base_estimator
已更名为
estimator
。所以使用后者。这建议你应该使用
self.set_params(estimator=estimator)
并将所有出现的 base_estimator
替换为简单的
estimator
,并且不分配给
estimator_
。