我正在使用 sklearn 的 ColumnTransformer 类来构建数据处理管道。在 ColumnTransformer
中,我为数据集创建了新列(由其他列组合而成),但在下一步中想使用这个刚刚创建的新列时,却收到"此列不存在"的错误。我知道它在原始数据集中不存在,但为什么我不能使用这个新列?或者如果可以,该怎么做?
代码如下:
def sum_name(function_transformer, feature_names_in):
    # NOTE(review): declares "relatives" as the output name, but sum_relatives
    # below creates a column called 'total_relatives' and returns the whole
    # frame — both the name and the column count conflict with this callback.
    return ["relatives"] # feature names out
def sum_relatives(X):
    # Adds total_relatives = SibSp + Parch, then returns the ENTIRE frame
    # (every input column plus the new one), not just the new column.
    X_copy = X.copy()
    X_copy['total_relatives'] = X_copy['SibSp'] + X_copy['Parch']
    return X_copy
def cat_travel_name(function_transformer, feature_names_in):
    # NOTE(review): returns "relatives", but categorize_travel creates a
    # column named 'traveling_category' — another name mismatch.
    return ["relatives"] # feature names out
def categorize_travel(X):
    # Buckets total_relatives: 0 -> 'A', 1..3 -> 'B', >=4 -> 'C',
    # anything else -> 'Unknown'. Requires a 'total_relatives' column,
    # which only exists if an earlier step already created it.
    X_copy = X.copy()
    conditions = [
        (X_copy['total_relatives'] == 0),
        (X_copy['total_relatives'] >= 1) & (X_copy['total_relatives'] <= 3),
        (X_copy['total_relatives'] >= 4)
    ]
    categories = ['A', 'B', 'C']
    X_copy['traveling_category'] = np.select(conditions, categories, default='Unknown')
    return X_copy
# Fix the category order so Pclass values 1/2/3 encode deterministically.
class_order = [[1, 2, 3]]
ord_pipeline = make_pipeline(
    OrdinalEncoder(categories=class_order)
)
def interval_name(function_transformer, feature_names_in):
    # NOTE(review): returns "interval", but age_transformer creates
    # 'age_interval' and returns the full frame — names and counts disagree.
    return ["interval"] # feature names out
def age_transformer(X):
    # Fills missing Age with the median age of the passenger's Pclass,
    # then bins Age into fixed intervals as a new 'age_interval' column.
    X_copy = X.copy()
    median_age_by_class = X_copy.groupby('Pclass')['Age'].median().reset_index()
    median_age_by_class.columns = ['Pclass', 'median_age']
    # Per-class fillna, one class at a time.
    for index, row in median_age_by_class.iterrows():
        class_value = row['Pclass']
        median_age = row['median_age']
        X_copy.loc[X_copy['Pclass'] == class_value, 'Age'] = X_copy.loc[X_copy['Pclass'] == class_value, 'Age'].fillna(median_age)
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 100]
    X_copy['age_interval'] = pd.cut(X_copy['Age'], bins=bins)
    return X_copy
def age_processor():
    # Factory returning a one-step pipeline around the age transformer.
    return make_pipeline(
        FunctionTransformer(age_transformer, feature_names_out=interval_name),
    )
# One-step pipeline wrapping the relatives-summing transformer.
total_relatives_pipeline = make_pipeline(
    FunctionTransformer(sum_relatives, feature_names_out=sum_name)
)
# One-step pipeline wrapping the travel-category transformer.
travel_category_pipeline = make_pipeline(
    FunctionTransformer(categorize_travel, feature_names_out=cat_travel_name)
)
# Categorical pipeline: impute with the most frequent value, then one-hot.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)
# Numerical pipeline: standardize to zero mean / unit variance.
num_pipeline = make_pipeline(
    StandardScaler()
)
# NOTE(review): every branch of a ColumnTransformer runs IN PARALLEL on the
# raw input, so 'total_relatives', 'traveling_category' and 'age_interval'
# (created inside sibling branches) do not exist when the column lists below
# are resolved — this is the source of the reported "column not in the
# dataset" error.
preprocessing = ColumnTransformer([
    ("ord", ord_pipeline, ['Pclass']),
    ("age_processing", age_processor(), ['Pclass', 'Age']),
    ("total_relatives", total_relatives_pipeline, ['SibSp', 'Parch']),
    ("travel_category", travel_category_pipeline, ['total_relatives']),
    ("cat", cat_pipeline, ['Sex', 'Embarked', 'traveling_category', 'age_interval']),
    ("num", num_pipeline, ['Fare']),
])
当我使用以下命令调用 fit_transform
方法时,此代码给出错误:total_relatives
不是数据集的列:
data_processed = preprocessing.fit_transform(titanic_data)
嗯,确实如此,
total_relatives
确实不是原始数据集的列,它是在ColumnTransformer
内动态创建的。如果我想使用“travel_category”步骤,是否需要再次创建它?我不能从上一步中恢复并使用它吗?
同样的事情将在下一步中发生,因为
traveling_category
和 age_interval
是在前面的步骤中创建的,并且不是来自原始数据集。
函数转换器实际返回的列与其
feature_names_out=
声明的名称不匹配:返回的特征数量和名称都与声明不一致。我首先确保每个函数转换器只返回它应该返回的那一列(或几列),并统一了名称。这样,feature_names_out=
中定义的特征数量和名称就与每个转换器实际返回的数据相匹配。
第二个问题是在
ColumnTransformer
中创建了新特征,又试图在同一个 ColumnTransformer
的其他分支里访问这些新列。这是行不通的,因为 ColumnTransformer 的各个分支是并行运行的。您需要做的是把多个 ColumnTransformer 按顺序串联起来,让后一个转换器能够访问前一个转换器创建的新列。我已对代码进行了此更改。
我做了一些其他的小改动,包括强制转换器返回
pandas
数据帧而不是 numpy
数组。
新的预处理器:
其输出列名称:
Index(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
'traveling_category_A', 'traveling_category_B', 'traveling_category_C',
'age_interval_(0, 10]', 'age_interval_(10, 20]',
'age_interval_(20, 30]', 'age_interval_(30, 40]',
'age_interval_(40, 50]', 'age_interval_(50, 60]',
'age_interval_(60, 70]', 'age_interval_(70, 100]', 'Pclass', 'Fare',
'PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'],
dtype='object')
代码:
import pandas as pd
# Load the Titanic training data (relative path; adjust to your layout).
titanic_data = pd.read_csv('../titanic.csv')
import numpy as np

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
# Make transformers return pandas DataFrames instead of numpy arrays, so
# column names stay available to the downstream ColumnTransformer stages.
set_config(transform_output='pandas')
#
# Sum relatives
#
def sum_name(function_transformer, feature_names_in):
    """Feature-name callback: the transformer emits one column, total_relatives."""
    return ["total_relatives"]
def sum_relatives(X):
    """Return a one-column frame: total_relatives = SibSp + Parch."""
    total = X['SibSp'] + X['Parch']
    return total.to_frame('total_relatives')
# One-step pipeline; feature_names_out keeps get_feature_names_out()
# consistent with the single column the transformer actually returns.
total_relatives_pipeline = make_pipeline(
    FunctionTransformer(sum_relatives, feature_names_out=sum_name)
)
#
# Categorize travel
#
def cat_travel_name(function_transformer, feature_names_in):
    """Feature-name callback: the transformer emits one column, traveling_category."""
    return ["traveling_category"]
def categorize_travel(X):
    """Bucket passengers by total_relatives: 0 -> 'A', 1-3 -> 'B', 4+ -> 'C'.

    Any value matching none of the conditions maps to 'Unknown'.
    Returns a one-column DataFrame named 'traveling_category'.
    """
    counts = X['total_relatives']
    labels = np.select(
        [counts == 0, counts.between(1, 3), counts >= 4],
        ['A', 'B', 'C'],
        default='Unknown',
    )
    return pd.DataFrame({'traveling_category': labels}, index=X.index)
# One-step pipeline wrapping the travel-category transformer.
travel_category_pipeline = make_pipeline(
    FunctionTransformer(categorize_travel, feature_names_out=cat_travel_name)
)
#
# Ordinal encoder
#
# Explicit category order: Pclass values 1/2/3 encode to 0.0/1.0/2.0.
class_order = [[1, 2, 3]]
ord_pipeline = make_pipeline(
    OrdinalEncoder(categories=class_order)
)
# Categorical pipeline: impute most-frequent, then one-hot encode.
# sparse_output=False yields a dense result (required when transformers
# are configured to output pandas DataFrames).
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
)
# Numerical pipeline for Fare: standardize to zero mean / unit variance.
fare_pipeline = make_pipeline(
    StandardScaler()
)
#
# Age transformer
#
def interval_name(function_transformer, feature_names_in):
    """Feature-name callback: the transformer emits one column, age_interval."""
    return ["age_interval"]
def age_transformer(X):
    """Impute missing Age with the per-Pclass median, then bin into intervals.

    Returns a one-column DataFrame 'age_interval' of pandas Interval
    categories (bins: 0-10, 10-20, ..., 70-100].
    """
    X_copy = X.copy()
    # Vectorized per-class median imputation: groupby().transform('median')
    # broadcasts each class's median back to its rows, replacing the
    # original iterrows() loop with identical results.
    X_copy['Age'] = X_copy['Age'].fillna(
        X_copy.groupby('Pclass')['Age'].transform('median')
    )
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 100]
    X_copy['age_interval'] = pd.cut(X_copy['Age'], bins=bins)
    return X_copy[['age_interval']]
def age_processor():
    """Build the one-step pipeline that imputes Age by class median and bins it."""
    age_step = FunctionTransformer(age_transformer, feature_names_out=interval_name)
    return make_pipeline(age_step)
#
# Column transformers
#
# Stage 1: operates on the raw dataset. Branches of a ColumnTransformer run
# in parallel, so the 'total_relatives' column created here is only visible
# to LATER chained transformers, never to sibling branches.
preprocessing_initial = ColumnTransformer([
    ("ord", ord_pipeline, ['Pclass']),
    ("age_processing", age_processor(), ['Pclass', 'Age']),
    ("num", fare_pipeline, ['Fare']),
    ("total_relatives", total_relatives_pipeline, ['SibSp', 'Parch'])],
    remainder='passthrough',
    verbose_feature_names_out=False
)
# Stage 2: consumes the 'total_relatives' column produced by stage 1;
# all other columns pass through untouched.
preprocessing_travel_category = ColumnTransformer(
    [("travel_category", travel_category_pipeline, ['total_relatives'])],
    remainder='passthrough',
    verbose_feature_names_out=False
)
# Stage 3: one-hot encodes the original categoricals plus the derived
# 'traveling_category' and 'age_interval' columns from earlier stages.
preprocessing_cat = ColumnTransformer(
    [("cat", cat_pipeline, ['Sex', 'Embarked', 'traveling_category', 'age_interval'])],
    remainder='passthrough',
    verbose_feature_names_out=False
)
# Final transformer: chain the stages sequentially so each ColumnTransformer
# can see the columns created by the previous one.
preprocessor = make_pipeline(
    preprocessing_initial,
    preprocessing_travel_category,
    preprocessing_cat
)
# Run. fit_transform fits the pipeline itself, so the previous separate
# preprocessor.fit(...) call was redundant (it fitted everything twice).
data_processed = preprocessor.fit_transform(titanic_data)
data_processed.columns