我正在使用sklearn 1.4.1,但随机森林仍然无法处理缺失值

问题描述 投票:0回答:1

我读到 scikit-learn 1.4 及以上版本中的随机森林算法应该能够处理 NaN。我已确认自己安装的是最新版本的 scikit-learn。

! pip install --upgrade scikit-learn

import sklearn
print(sklearn.__version__)

1.4.1

但是我仍然收到错误:

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

为什么?我是否需要导入其他东西?我很困惑。

编辑:

这是一个最小可复现示例,应该会触发我提到的错误:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Toy dataset: one categorical column holding a missing entry (None)
# alongside two binary integer columns.
data = dict(
    tipo_locazione=["A", "B", None, "A"],
    flg_polizza_caa=[1, 0, 1, 0],
    cl_bisogni_3=[0, 1, 1, 0],
)
df = pd.DataFrame(data)

def random_forest_model(variabili):
    """Fit a one-hot-encoding + random-forest pipeline on the module-level ``df``.

    ``variabili`` lists the ``df`` columns used as predictors; the target is
    always the ``cl_bisogni_3`` column cast to ``str``. Object-dtype predictors
    are one-hot encoded and every other column is passed through unchanged.
    Prints a confirmation once fitting succeeds; nothing is returned.
    """
    features = df[variabili]
    target = df['cl_bisogni_3'].astype(str)

    # Hold out 20% of the rows; only the training part is used below.
    X_train, _, y_train, _ = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Object-dtype columns are the ones treated as categorical.
    object_columns = features.select_dtypes(include=['object']).columns

    encoder = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
    column_prep = ColumnTransformer(
        transformers=[('cat', encoder, object_columns)],
        remainder='passthrough',
    )
    forest = RandomForestClassifier(random_state=42)

    # Chain preprocessing and classifier so both are fitted in one call.
    pipeline = Pipeline(steps=[('preprocessor', column_prep), ('classifier', forest)])
    pipeline.fit(X_train, y_train)

    print("Model trained successfully")

# Fit the model on a predictor set that includes the column holding a missing value
variables = ["tipo_locazione", "flg_polizza_caa"]
random_forest_model(variabili=variables)

编辑 2:我的错误回溯信息:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-75-1a77bdc05207> in <cell line: 65>()
     66     selected_variables = sample(variabili_ab, 2)  # Adjust number to be <= length of variabili_ab
     67 
---> 68     metrics = random_forest_model(selected_variables)
     69     results[tuple(selected_variables)] = metrics
     70     count_cicli -= 1

8 frames
<ipython-input-75-1a77bdc05207> in random_forest_model(variabili)
     47 
     48     # Fitting the model
---> 49     model.fit(X_train, y_train)
     50     y_pred = model.predict(X_test)
     51 

/usr/local/lib/python3.10/dist-packages/sklearn/base.py in wrapper(estimator, *args, **kwargs)
   1472                 )
   1473             ):
-> 1474                 return fit_method(estimator, *args, **kwargs)
   1475 
   1476         return wrapper

/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py in fit(self, X, y, **params)
    473             if self._final_estimator != "passthrough":
    474                 last_step_params = routed_params[self.steps[-1][0]]
--> 475                 self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    476 
    477         return self

/usr/local/lib/python3.10/dist-packages/sklearn/base.py in wrapper(estimator, *args, **kwargs)
   1472                 )
   1473             ):
-> 1474                 return fit_method(estimator, *args, **kwargs)
   1475 
   1476         return wrapper

/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py in fit(self, X, y, sample_weight)
    375         estimator = type(self.estimator)(criterion=self.criterion)
    376         missing_values_in_feature_mask = (
--> 377             estimator._compute_missing_values_in_feature_mask(
    378                 X, estimator_name=self.__class__.__name__
    379             )

/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py in _compute_missing_values_in_feature_mask(self, X, estimator_name)
    212 
    213         if not self._support_missing_values(X):
--> 214             assert_all_finite(X, **common_kwargs)
    215             return None
    216 

/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in assert_all_finite(X, allow_nan, estimator_name, input_name)
    214     Test failed: Array contains non-finite values.
    215     """
--> 216     _assert_all_finite(
    217         X.data if sp.issparse(X) else X,
    218         allow_nan=allow_nan,

/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
    124         return
    125 
--> 126     _assert_all_finite_element_wise(
    127         X,
    128         xp=xp,

/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in _assert_all_finite_element_wise(X, xp, allow_nan, msg_dtype, estimator_name, input_name)
    173                 "#estimators-that-handle-nan-values"
    174             )
--> 175         raise ValueError(msg_err)
    176 
    177 

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

print(pd.__version__)
print(np.__version__)

2.0.3
1.25.2
python-3.x scikit-learn google-colaboratory random-forest missing-data
1个回答
0
投票

我相信 pandas 数据框有一个

fillna
函数,可以用来填充缺失值(它返回新的对象,需要把结果赋值回去)。例如:

df["tipo_locazione"] = df["tipo_locazione"].fillna(-1)

然后照常使用模型

© www.soinside.com 2019 - 2024. All rights reserved.