我认为我在缩放时把 DataFrame 的列名也一并缩放了,这就是出现错误的原因。我该如何防止这种情况?我注意到 DataFrame 中的列名从 “Inflation”、“Homes” 变成了 0 和 1,可能是因为我把列名也缩放了。由于 StandardScaler 把我原来的 DataFrame 变成了 NumPy 数组,我随后又把该数组转换回了 DataFrame。我需要它是 DataFrame,才能使用 sklearn 执行回归。谢谢:
import pandas as pd
# Load the spreadsheet into a DataFrame and print it for inspection.
df= pd.read_excel("C:/Users/ME/OneDrive/Desktop/simple regress.xlsx")
print(df)
from sklearn.preprocessing import StandardScaler
# Names of the columns to standardize.
num_features = ['Inflation', 'Homes']
X = df[num_features]
std_scaler = StandardScaler()
# As configured here, fit_transform hands back a NumPy array rather than a
# DataFrame, so the 'Inflation' / 'Homes' labels are not carried along.
df_scaled = std_scaler.fit_transform(X)
df_scaled
# Convert the array back to a DataFrame. No `columns=` argument is passed,
# so the new frame's columns are labeled 0 and 1 (see the printed output
# below). Note that this also rebinds `df`, replacing the original data.
from pandas import DataFrame
df = DataFrame(df_scaled)
0 1
0 -1.021149 -0.219107
1 -1.006189 -0.248034
2 -1.216238 -0.326069
3 -1.195059 -0.403671
4 -1.181612 -0.419835
import statsmodels.formula.api as smf
# Next, use the ols function to build a regression model from the data in
# the DataFrame called df, and fit the model to that data.
# NOTE(review): the formula is written with the integer column labels 0 and 1.
# The formula parser presumably reads a bare 1 / 0 as "intercept" /
# "no intercept" rather than as column names, which would leave the design
# matrix without any regressors -- confirm against the patsy formula docs.
result = smf.ols(formula='1 ~ 0', data=df).fit()
我得到了以下错误:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_21632\2193012581.py in <module>
2 #Next use the ols function to create a regression model using that data in
3 #the dataframe #called df and fit the data of the dataframe into the model
----> 4 result = smf.ols(formula='1 ~ 0', data=df).fit()
5
6 # Print the parameters/coefficients in the regression equation
~\Anaconda3\lib\site-packages\statsmodels\base\model.py in from_formula(cls, formula, data, subset, drop_cols, *args, **kwargs)
224 'formula': formula, # attach formula for unpckling
225 'design_info': design_info})
--> 226 mod = cls(endog, exog, *args, **kwargs)
227 mod.formula = formula
228 # since we got a dataframe, attach the original
~\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
905 warnings.warn(msg, ValueWarning)
906 super(OLS, self).__init__(endog, exog, missing=missing,
--> 907 hasconst=hasconst, **kwargs)
908 if "weights" in self._init_keys:
909 self._init_keys.remove("weights")
~\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in __init__(self, endog, exog, weights, missing, hasconst, **kwargs)
732 weights = weights.squeeze()
733 super(WLS, self).__init__(endog, exog, missing=missing,
--> 734 weights=weights, hasconst=hasconst, **kwargs)
735 nobs = self.exog.shape[0]
736 weights = self.weights
~\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in __init__(self, endog, exog, **kwargs)
188 """
189 def __init__(self, endog, exog, **kwargs):
--> 190 super(RegressionModel, self).__init__(endog, exog, **kwargs)
191 self._data_attr.extend(['pinv_wexog', 'wendog', 'wexog', 'weights'])
192
~\Anaconda3\lib\site-packages\statsmodels\base\model.py in __init__(self, endog, exog, **kwargs)
265
266 def __init__(self, endog, exog=None, **kwargs):
--> 267 super().__init__(endog, exog, **kwargs)
268 self.initialize()
269
~\Anaconda3\lib\site-packages\statsmodels\base\model.py in __init__(self, endog, exog, **kwargs)
91 hasconst = kwargs.pop('hasconst', None)
92 self.data = self._handle_data(endog, exog, missing, hasconst,
---> 93 **kwargs)
94 self.k_constant = self.data.k_constant
95 self.exog = self.data.exog
~\Anaconda3\lib\site-packages\statsmodels\base\model.py in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
130
131 def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
--> 132 data = handle_data(endog, exog, missing, hasconst, **kwargs)
133 # kwargs arrays could have changed, easier to just attach here
134 for key in kwargs:
~\Anaconda3\lib\site-packages\statsmodels\base\data.py in handle_data(endog, exog, missing, hasconst, **kwargs)
699 klass = handle_data_class_factory(endog, exog)
700 return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
--> 701 **kwargs)
~\Anaconda3\lib\site-packages\statsmodels\base\data.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
86 self.const_idx = None
87 self.k_constant = 0
---> 88 self._handle_constant(hasconst)
89 self._check_integrity()
90 self._cache = {}
~\Anaconda3\lib\site-packages\statsmodels\base\data.py in _handle_constant(self, hasconst)
176 (np.ones(self.exog.shape[0]), self.exog))
177 rank_augm = np.linalg.matrix_rank(augmented_exog)
--> 178 rank_orig = np.linalg.matrix_rank(self.exog)
179 self.k_constant = int(rank_orig == rank_augm)
180 self.const_idx = None
<__array_function__ internals> in matrix_rank(*args, **kwargs)
~\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in matrix_rank(M, tol, hermitian)
1901 S = svd(M, compute_uv=False, hermitian=hermitian)
1902 if tol is None:
-> 1903 tol = S.max(axis=-1, keepdims=True) * max(M.shape[-2:]) * finfo(S.dtype).eps
1904 else:
1905 tol = asarray(tol)[..., newaxis]
~\Anaconda3\lib\site-packages\numpy\core\_methods.py in _amax(a, axis, out, keepdims, initial, where)
38 def _amax(a, axis=None, out=None, keepdims=False,
39 initial=_NoValue, where=True):
---> 40 return umr_maximum(a, axis, None, out, keepdims, initial, where)
41
42 def _amin(a, axis=None, out=None, keepdims=False,
ValueError: zero-size array to reduction operation maximum which has no identity
如果您使用较新版本的 scikit-learn(1.2 及以上),可以通过 set_output 设置转换结果的输出类型。写法如下:
# Select the feature columns that should be standardized.
X = df[num_features]
# set_output returns the estimator itself, so it can be chained directly onto
# the constructor; transform='pandas' asks the scaler to return DataFrames.
std_scaler = StandardScaler().set_output(transform='pandas')
df_scaled = std_scaler.fit_transform(X)
这样 df_scaled 将是一个 pandas DataFrame,列名也保持不变,即 “Inflation” 和 “Homes”。您可以在 set_output API 文档中阅读更多相关信息。