如何在 pandas DataFrame 上使用 sklearn StandardScaler 而不缩放列名称?

问题描述 投票:0回答:1

我认为我把数据帧中的列名称也一起缩放了,这就是出现错误的原因。我该如何防止这种情况?我注意到 DataFrame 中的列名称从“Inflation”、“Homes”变成了 0 和 1,可能是因为我缩放了列名称。由于 StandardScaler 把我原来的数据帧变成了 numpy 数组,我随后又把该数组转换回数据帧。我需要它是数据帧,以便执行回归。谢谢:

import pandas as pd
df= pd.read_excel("C:/Users/ME/OneDrive/Desktop/simple regress.xlsx")
print(df)
from sklearn.preprocessing import StandardScaler

num_features = ['Inflation', 'Homes']
X = df[num_features]
std_scaler = StandardScaler() 
df_scaled = std_scaler.fit_transform(X)

df_scaled

# convert the array back to a dataframe
from pandas import DataFrame
df = DataFrame(df_scaled)


    0               1
0   -1.021149   -0.219107
1   -1.006189   -0.248034
2   -1.216238   -0.326069
3   -1.195059   -0.403671
4   -1.181612   -0.419835

   
import statsmodels.formula.api as smf
#Next use the ols function to create a regression model using that data in 
#the dataframe #called df  and fit the data of the dataframe into the model

result = smf.ols(formula='1 ~ 0', data=df).fit()
    

我得到以下错误:

ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_21632\2193012581.py in <module>
      2 #Next use the ols function to create a regression model using that data in
      3 #the dataframe #called df  and fit the data of the dataframe into the model
----> 4 result = smf.ols(formula='1 ~ 0', data=df).fit()
      5 
      6 # Print the parameters/coefficients in the regression equation

~\Anaconda3\lib\site-packages\statsmodels\base\model.py in from_formula(cls, formula, data, subset, drop_cols, *args, **kwargs)
    224                        'formula': formula,  # attach formula for unpckling
    225                        'design_info': design_info})
--> 226         mod = cls(endog, exog, *args, **kwargs)
    227         mod.formula = formula
    228         # since we got a dataframe, attach the original

~\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
    905             warnings.warn(msg, ValueWarning)
    906         super(OLS, self).__init__(endog, exog, missing=missing,
--> 907                                   hasconst=hasconst, **kwargs)
    908         if "weights" in self._init_keys:
    909             self._init_keys.remove("weights")

~\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in __init__(self, endog, exog, weights, missing, hasconst, **kwargs)
    732             weights = weights.squeeze()
    733         super(WLS, self).__init__(endog, exog, missing=missing,
--> 734                                   weights=weights, hasconst=hasconst, **kwargs)
    735         nobs = self.exog.shape[0]
    736         weights = self.weights

~\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in __init__(self, endog, exog, **kwargs)
    188     """
    189     def __init__(self, endog, exog, **kwargs):
--> 190         super(RegressionModel, self).__init__(endog, exog, **kwargs)
    191         self._data_attr.extend(['pinv_wexog', 'wendog', 'wexog', 'weights'])
    192 

~\Anaconda3\lib\site-packages\statsmodels\base\model.py in __init__(self, endog, exog, **kwargs)
    265 
    266     def __init__(self, endog, exog=None, **kwargs):
--> 267         super().__init__(endog, exog, **kwargs)
    268         self.initialize()
    269 

~\Anaconda3\lib\site-packages\statsmodels\base\model.py in __init__(self, endog, exog, **kwargs)
     91         hasconst = kwargs.pop('hasconst', None)
     92         self.data = self._handle_data(endog, exog, missing, hasconst,
---> 93                                       **kwargs)
     94         self.k_constant = self.data.k_constant
     95         self.exog = self.data.exog

~\Anaconda3\lib\site-packages\statsmodels\base\model.py in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
    130 
    131     def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
--> 132         data = handle_data(endog, exog, missing, hasconst, **kwargs)
    133         # kwargs arrays could have changed, easier to just attach here
    134         for key in kwargs:

~\Anaconda3\lib\site-packages\statsmodels\base\data.py in handle_data(endog, exog, missing, hasconst, **kwargs)
    699     klass = handle_data_class_factory(endog, exog)
    700     return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
--> 701                  **kwargs)

~\Anaconda3\lib\site-packages\statsmodels\base\data.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
     86         self.const_idx = None
     87         self.k_constant = 0
---> 88         self._handle_constant(hasconst)
     89         self._check_integrity()
     90         self._cache = {}

~\Anaconda3\lib\site-packages\statsmodels\base\data.py in _handle_constant(self, hasconst)
    176                             (np.ones(self.exog.shape[0]), self.exog))
    177                 rank_augm = np.linalg.matrix_rank(augmented_exog)
--> 178                 rank_orig = np.linalg.matrix_rank(self.exog)
    179                 self.k_constant = int(rank_orig == rank_augm)
    180                 self.const_idx = None

<__array_function__ internals> in matrix_rank(*args, **kwargs)

~\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in matrix_rank(M, tol, hermitian)
   1901     S = svd(M, compute_uv=False, hermitian=hermitian)
   1902     if tol is None:
-> 1903         tol = S.max(axis=-1, keepdims=True) * max(M.shape[-2:]) * finfo(S.dtype).eps
   1904     else:
   1905         tol = asarray(tol)[..., newaxis]

~\Anaconda3\lib\site-packages\numpy\core\_methods.py in _amax(a, axis, out, keepdims, initial, where)
     38 def _amax(a, axis=None, out=None, keepdims=False,
     39           initial=_NoValue, where=True):
---> 40     return umr_maximum(a, axis, None, out, keepdims, initial, where)
     41 
     42 def _amin(a, axis=None, out=None, keepdims=False,

ValueError: zero-size array to reduction operation maximum which has no identity 
python pandas scikit-learn statsmodels scaling
1个回答
0
投票

如果您使用的是较新版本的 scikit-learn(1.2 及以上),可以通过 set_output 设置转换器的输出类型。它看起来像这样:

    X = df[num_features]
    std_scaler = StandardScaler()
    std_scaler.set_output(transform='pandas')
    df_scaled = std_scaler.fit_transform(X)

df_scaled
将是一个 pandas 数据框,列名称也将保持不变,即“Inflation”和“Homes”。这样就可以在回归公式中直接使用列名,例如 formula='Homes ~ Inflation',而不是会被当作数字常量解析的 '1 ~ 0'。您可以在此处的 set_output API 中阅读更多相关信息。

© www.soinside.com 2019 - 2024. All rights reserved.