我是第一次构建特征转换管道。我注意到,用 FeatureUnion 把多个管道组合在一起时返回的是 numpy 数组,而我更希望返回 pandas 数据框(DataFrame),以便能看到特征名称。
我创建了下面这个类,它会把特征以 pandas 数据框的形式返回,并且可以正常工作,但是这个自定义 FeatureUnion 类的运行时间是 sklearn 内置 FeatureUnion 类的两倍(即慢了一倍)。我该如何将其并行化,使运行时间与内置类持平?任何帮助都将不胜感激!
为了引入多进程并行,我尝试了下面的写法,但是出现了错误。
class PandasFeatureUnion(BaseEstimator, TransformerMixin):
    """Run several transformers on the same input and join their outputs.

    Unlike a union that stacks arrays, the per-transformer outputs are
    combined with ``pd.merge`` on the index, so the result keeps column
    names.  Each transformer's ``transform`` is therefore assumed to return
    a pandas object that ``pd.merge`` accepts -- TODO confirm for every
    transformer placed in the list.

    Parameters
    ----------
    transformer_list : list of (name, transformer) tuples
        Transformers to fit and apply; each must provide ``fit(X, y)`` and
        ``transform(X)``.
    n_jobs : int, default=cpu_count()
        Upper bound on the number of joblib workers.  It is capped at the
        number of transformers.
    """

    def __init__(self, transformer_list, n_jobs=cpu_count()):
        self.transformer_list = transformer_list
        # There is one task per transformer, so more workers would sit idle.
        self.n_jobs = min(n_jobs, len(transformer_list))

    def one_fit(self, transformer, X, y):
        """Fit a single transformer and return whatever its ``fit`` returns."""
        return transformer.fit(X, y)

    def one_transform(self, transformer, X):
        """Apply a single transformer to ``X`` and return its output."""
        return transformer.transform(X)

    def fit(self, X, y=None):
        """Fit every transformer on ``X`` (and ``y``) and keep the fitted ones.

        Returns
        -------
        self
        """
        fitted = Parallel(n_jobs=self.n_jobs)(
            delayed(self.one_fit)(trans, X, y)
            for _, trans in self.transformer_list)
        # With a process-based joblib backend each task works on a pickled
        # copy of its transformer, so the objects held in this process are
        # never fitted.  The fitted state only comes back through the return
        # value of ``one_fit``; store it, otherwise ``transform`` would run
        # on unfitted transformers.
        self.transformer_list = [
            # Keep the original object if a transformer's ``fit`` does not
            # return itself (returns None); there is nothing better to store.
            (name, original if result is None else result)
            for (name, original), result
            in zip(self.transformer_list, fitted)]
        return self

    def transform(self, X):
        """Transform ``X`` with every transformer and merge the results.

        Returns
        -------
        The outputs merged pairwise, left to right, on their indexes
        (``pd.merge`` default join, i.e. only index labels present in every
        output are kept).
        """
        Xts = Parallel(n_jobs=self.n_jobs)(
            delayed(self.one_transform)(trans, X)
            for _, trans in self.transformer_list)
        Xunion = reduce(
            lambda X1, X2: pd.merge(X1, X2,
                                    left_index=True, right_index=True),
            Xts)
        return Xunion
我得到的错误如下:
sklearn.externals.joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 382, in pandas._libs.hashtable.Float64HashTable.get_item
TypeError: must be real number, not NoneType
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 2657, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 134, in pandas._libs.index.IndexEngine.get_loc
KeyError: None
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 382, in pandas._libs.hashtable.Float64HashTable.get_item
TypeError: must be real number, not NoneType
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/sklearn/externals/joblib/externals/loky/process_executor.py", line 418, in _process_worker
r = call_item()
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/sklearn/externals/joblib/externals/loky/process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 567, in __call__
return self.func(*args, **kwargs)
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "/Users/rihun/PycharmProjects/cross_sell/code/cross_sell_features.py", line 386, in one_transform
def one_transform(self, transformer, X):
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/sklearn/pipeline.py", line 451, in _transform
Xt = transform.transform(Xt)
File "/Users/rihun/PycharmProjects/cross_sell/code/cross_sell_features.py", line 126, in transform
X = X.copy()
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/pandas/core/indexing.py", line 190, in __setitem__
self._setitem_with_indexer(indexer, value)
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/pandas/core/indexing.py", line 380, in _setitem_with_indexer
nindexer.append(labels.get_loc(key))
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/pandas/core/indexes/numeric.py", line 436, in get_loc
tolerance=tolerance)
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 134, in pandas._libs.index.IndexEngine.get_loc
KeyError: None
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/rihun/PycharmProjects/cross_sell/code/savm_features_pipeline.py", line 75, in <module>
data_tr = full_pipeline.fit_transform(data)
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/sklearn/pipeline.py", line 300, in fit_transform
return last_step.fit_transform(Xt, y, **fit_params)
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/sklearn/base.py", line 464, in fit_transform
return self.fit(X, **fit_params).transform(X)
File "/Users/rihun/PycharmProjects/cross_sell/code/cross_sell_features.py", line 397, in transform
for _, trans in self.transformer_list)
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 930, in __call__
self.retrieve()
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 833, in retrieve
self._output.extend(job.get(timeout=self.timeout))
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 521, in wrap_future_result
return future.result(timeout=timeout)
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/concurrent/futures/_base.py", line 425, in result
return self.__get_result()
File "/Users/rihun/anaconda3/envs/CrossSell/lib/python3.6/concurrent/futures/_base.py", line 384, in __get_result
raise self._exception
KeyError: None
我在这个 colab notebook 中重现了您的代码,在我这里似乎可以正常运行……不过,由于每个转换器(transformer)都有自己的内部状态,您应当按顺序应用它们,以确保数据的一致性……看起来您是在重新实现 sklearn.Pipeline……
如果您能提供一个可以复现该错误的示例,会很有帮助……