R 语言方面有两篇与此主题相关的帖子:《Lasso 回归模型中的固定回归量》和《固定效应 Lasso logit 模型》。
我正在编写一个使用 Lasso(套索)惩罚的特征选择模型。我的数据中包含一些季节性虚拟变量,它们在建模阶段不能被剔除。最简单的做法是:在对线性模型的系数施加收缩时,跳过这些固定特征的系数。你能帮我写一个自定义的 Lasso 估计器吗?它不会收缩固定特征的系数,并且可以在 sklearn Pipeline 中调用。
from sklearn.pipeline import Pipeline
# Two-step pipeline: standardise the features, then fit the custom Lasso.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),  # scaling step is optional
        ('lasso', LassoWithFixedFeatures(fixed_features_indices)),
    ]
)
我已经尝试了如下实现:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import Lasso
import numpy as np
# Attempted regressor: fits an ordinary Lasso on every column of X and then
# tries to adjust the coefficients so that the columns listed in
# fixed_features_indices are not shrunk. As written, fit() raises ValueError
# whenever at least one feature is fixed (see the NOTE comments in fit).
class LassoWithFixedFeatures(BaseEstimator, RegressorMixin):
# fixed_features_indices: column positions of the features to keep unshrunk.
# alpha: forwarded to Lasso(alpha=...) in fit().
def __init__(self, fixed_features_indices, alpha=1.0):
self.fixed_features_indices = fixed_features_indices
self.alpha = alpha
# Fit on X (indexed below as a 2-D array, X[:, idx]) and target y; returns self.
def fit(self, X, y):
# Fit Lasso model with regularized coefficients
# (this Lasso penalises ALL columns, including the fixed ones)
self.lasso = Lasso(alpha=self.alpha)
self.lasso.fit(X, y)
# Calculate the penalty term for fixed features
# (one entry per fixed feature, not one per column of X)
penalty_fixed = self.alpha * np.abs(self.lasso.coef_[self.fixed_features_indices])
# Set coefficients of fixed features to their original values
# (least squares of y on the fixed columns alone; no intercept column is added)
fixed_features_coefs = np.linalg.lstsq(X[:, self.fixed_features_indices], y, rcond=None)[0]
self.coef_ = np.zeros(X.shape[1])
self.coef_[self.fixed_features_indices] = fixed_features_coefs
# Calculate the penalty term for non-fixed features
penalty_non_fixed = np.zeros(X.shape[1])
penalty_non_fixed[self.fixed_features_indices] = 0 # Exclude fixed features from penalty
# NOTE(review): the boolean mask is True only for the non-fixed columns, but
# the right-hand side has one value per column of X, so this assignment
# raises "cannot assign 18 input values to the 11 output values".
penalty_non_fixed[~np.isin(np.arange(X.shape[1]), self.fixed_features_indices)] = self.alpha * np.abs(self.lasso.coef_)
# Update coefficients by considering penalties
# NOTE(review): penalty_fixed has len(fixed_features_indices) entries while the
# other operands have X.shape[1]; these shapes only line up if every feature is fixed.
self.coef_ += self.lasso.coef_ - penalty_non_fixed + penalty_fixed
return self
# Linear prediction X . coef_; no intercept term is added here.
def predict(self, X):
return np.dot(X, self.coef_)
总共有 18 个回归量,其中 7 个我保留在 `fixed_features` 中。我期待得到带有自定义收缩的套索回归,但得到了如下错误:
ValueError:
All the 500 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'
500 fits failed with the following error: Traceback (most recent call last): File "C:\Users\...py", line 732, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params) File "C:\..\base.py", line 1151, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\...py", line 420, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step) File "C:\Users\...5759.py", line 26, in fit
penalty_non_fixed[~np.isin(np.arange(X.shape[1]), self.fixed_features_indices)] = self.alpha * np.abs(self.lasso.coef_)
~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ValueError: NumPy boolean array indexing assignment cannot assign 18 input values to the 11 output values where the mask is true
我已经修复了索引问题,但无法对系数计算本身的正确性作出评价;我只是尽量把代码写得更清晰一些。
测试数据:
from sklearn.datasets import make_regression
import pandas as pd
# Synthetic regression problem: 500 samples, 17 features, all informative.
X, y = make_regression(n_samples=500, n_features=17, n_informative=17)

# Same data as a DataFrame, one named column per feature.
feature_names = [f'feat{i}' for i in range(17)]
X_df = pd.DataFrame(dict(zip(feature_names, X.T)))

# The first seven columns are the ones that must not be shrunk.
fixed_features_indices = np.arange(7)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Keep a handle on the estimator so its fitted attributes can be inspected.
lasso_ff = LassoWithFixedFeatures(fixed_features_indices)
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),  # scaling step is optional
        ('lasso', lasso_ff),
    ]
)

# Fit and predict once on the NumPy array ...
pipeline.fit(X, y)
pipeline.predict(X)

# ... and once on the equivalent DataFrame.
pipeline.fit(X_df, y)
pipeline.predict(X_df)
修改后的类:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import Lasso
import numpy as np
from sklearn.base import check_X_y, check_array, check_is_fitted
class LassoWithFixedFeatures(RegressorMixin, BaseEstimator):
    """Lasso regression that leaves a chosen set of features unpenalised.

    The estimator minimises::

        1 / (2 * n_samples) * ||y - b0 - X_fixed @ b_fixed - X_pen @ b_pen||^2
            + alpha * ||b_pen||_1

    i.e. the L1 penalty applies only to the non-fixed ("penalised") columns;
    the intercept and the fixed columns are estimated without shrinkage.

    The problem is solved by partialling out the unpenalised part: ``y`` and
    the penalised columns are replaced by their least-squares residuals on
    ``[1, X_fixed]``, an ordinary Lasso without intercept is fitted on those
    residuals, and the unpenalised coefficients are then recovered by least
    squares on what the penalised part leaves unexplained.

    Parameters
    ----------
    fixed_features_indices : array-like of int
        Column positions of the features that must not be shrunk.
    alpha : float, default=1.0
        L1 regularisation strength, passed on to ``Lasso``.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,)
        Coefficients for all columns, in the original column order.
    intercept_ : float
        Unpenalised intercept.
    lasso_ : Lasso or None
        Lasso fitted on the residualised penalised columns; ``None`` when
        every feature is fixed and there is nothing to penalise.
    """

    def __init__(self, fixed_features_indices, alpha=1.0):
        self.fixed_features_indices = fixed_features_indices
        self.alpha = alpha

    @staticmethod
    def _residualize(basis, target):
        """Return the least-squares residual of *target* on *basis*."""
        fitted = basis @ np.linalg.lstsq(basis, target, rcond=None)[0]
        return target - fitted

    def fit(self, X, y):
        """Fit the partially penalised model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        # Check and convert to numpy
        X, y = check_X_y(X, y)
        n_samples, n_features = X.shape

        # Boolean masks keep the two groups in original column order.
        fixed_mask = np.isin(np.arange(n_features), self.fixed_features_indices)
        penalized_mask = ~fixed_mask

        # Unpenalised design: intercept column plus the fixed features.
        unpenalized_design = np.column_stack(
            [np.ones(n_samples), X[:, fixed_mask]]
        )

        # Float array: an integer array would silently truncate coefficients.
        coef = np.zeros(n_features, dtype=float)

        if penalized_mask.any():
            X_pen = X[:, penalized_mask]
            # After projecting out the unpenalised part, the remaining
            # problem is a plain Lasso with the same alpha and no intercept.
            self.lasso_ = Lasso(alpha=self.alpha, fit_intercept=False).fit(
                self._residualize(unpenalized_design, X_pen),
                self._residualize(unpenalized_design, y),
            )
            coef[penalized_mask] = self.lasso_.coef_
            unexplained = y - X_pen @ self.lasso_.coef_
        else:
            # Every feature is fixed: the model reduces to least squares.
            self.lasso_ = None
            unexplained = y

        # Intercept and fixed coefficients: least squares on what is left.
        unpenalized_coef = np.linalg.lstsq(
            unpenalized_design, unexplained, rcond=None
        )[0]
        self.intercept_ = float(unpenalized_coef[0])
        coef[fixed_mask] = unpenalized_coef[1:]
        self.coef_ = coef
        return self

    def predict(self, X):
        """Predict targets for *X*.

        Returns
        -------
        ndarray of shape (n_samples,)
            ``X @ coef_ + intercept_``; 1-D, matching the shape of ``y``
            seen during ``fit``.
        """
        check_is_fitted(self)
        X = check_array(X)
        return X @ self.coef_ + self.intercept_