我正在使用 sklearn 的 ColumnTransformer 类来构建数据处理管道。在 ColumnTransformer
中,我为数据集创建了新列(由其他列组合而成),但在下一步中想使用这个刚刚创建的新列时,却收到"此列不存在"的错误。我知道它在原始数据集中不存在,但为什么我不能使用这个新列?或者如果可以,该怎么做?
代码如下:
def sum_name(function_transformer, feature_names_in):
    # NOTE(review): declares "relatives" as the output name, but sum_relatives
    # below creates a column called 'total_relatives' and returns the whole
    # frame — both the name and the column count conflict with this callback.
    return ["relatives"] # feature names out
def sum_relatives(X):
    # Adds total_relatives = SibSp + Parch, then returns the ENTIRE frame
    # (every input column plus the new one), not just the new column.
    X_copy = X.copy()
    X_copy['total_relatives'] = X_copy['SibSp'] + X_copy['Parch']
    return X_copy
def cat_travel_name(function_transformer, feature_names_in):
    # NOTE(review): returns "relatives", but categorize_travel creates a
    # column named 'traveling_category' — another name mismatch.
    return ["relatives"] # feature names out
def categorize_travel(X):
    # Buckets total_relatives: 0 -> 'A', 1..3 -> 'B', >=4 -> 'C',
    # anything else -> 'Unknown'. Requires a 'total_relatives' column,
    # which only exists if an earlier step already created it.
    X_copy = X.copy()
    conditions = [
        (X_copy['total_relatives'] == 0),
        (X_copy['total_relatives'] >= 1) & (X_copy['total_relatives'] <= 3),
        (X_copy['total_relatives'] >= 4)
    ]
    categories = ['A', 'B', 'C']
    X_copy['traveling_category'] = np.select(conditions, categories, default='Unknown')
    return X_copy
# Fix the category order so Pclass values 1/2/3 encode deterministically.
class_order = [[1, 2, 3]]
ord_pipeline = make_pipeline(
    OrdinalEncoder(categories=class_order)
)
def interval_name(function_transformer, feature_names_in):
    # NOTE(review): returns "interval", but age_transformer creates
    # 'age_interval' and returns the full frame — names and counts disagree.
    return ["interval"] # feature names out
def age_transformer(X):
    # Fills missing Age with the median age of the passenger's Pclass,
    # then bins Age into fixed intervals as a new 'age_interval' column.
    X_copy = X.copy()
    median_age_by_class = X_copy.groupby('Pclass')['Age'].median().reset_index()
    median_age_by_class.columns = ['Pclass', 'median_age']
    # Per-class fillna, one class at a time.
    for index, row in median_age_by_class.iterrows():
        class_value = row['Pclass']
        median_age = row['median_age']
        X_copy.loc[X_copy['Pclass'] == class_value, 'Age'] = X_copy.loc[X_copy['Pclass'] == class_value, 'Age'].fillna(median_age)
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 100]
    X_copy['age_interval'] = pd.cut(X_copy['Age'], bins=bins)
    return X_copy
def age_processor():
    # Factory returning a one-step pipeline around the age transformer.
    return make_pipeline(
        FunctionTransformer(age_transformer, feature_names_out=interval_name),
    )
# One-step pipeline wrapping the relatives-summing transformer.
total_relatives_pipeline = make_pipeline(
    FunctionTransformer(sum_relatives, feature_names_out=sum_name)
)
# One-step pipeline wrapping the travel-category transformer.
travel_category_pipeline = make_pipeline(
    FunctionTransformer(categorize_travel, feature_names_out=cat_travel_name)
)
# Categorical pipeline: impute with the most frequent value, then one-hot.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)
# Numerical pipeline: standardize to zero mean / unit variance.
num_pipeline = make_pipeline(
    StandardScaler()
)
# NOTE(review): every branch of a ColumnTransformer runs IN PARALLEL on the
# raw input, so 'total_relatives', 'traveling_category' and 'age_interval'
# (created inside sibling branches) do not exist when the column lists below
# are resolved — this is the source of the reported "column not in the
# dataset" error.
preprocessing = ColumnTransformer([
    ("ord", ord_pipeline, ['Pclass']),
    ("age_processing", age_processor(), ['Pclass', 'Age']),
    ("total_relatives", total_relatives_pipeline, ['SibSp', 'Parch']),
    ("travel_category", travel_category_pipeline, ['total_relatives']),
    ("cat", cat_pipeline, ['Sex', 'Embarked', 'traveling_category', 'age_interval']),
    ("num", num_pipeline, ['Fare']),
])
当我使用以下命令调用 fit_transform
方法时,此代码给出错误:total_relatives
不是数据集的列:
data_processed = preprocessing.fit_transform(titanic_data)
嗯,确实如此,
total_relatives
确实不是原始数据集的列,它是在ColumnTransformer
内动态创建的。如果我想使用“travel_category”步骤,是否需要再次创建它?我不能从上一步中恢复并使用它吗?
同样的事情将在下一步中发生,因为
traveling_category
和 age_interval
是在前面的步骤中创建的,并且不是来自原始数据集。
函数转换器实际返回的列与其
feature_names_out=
声明的名称不匹配:返回的特征数量和名称都与声明不一致。我首先确保每个函数转换器只返回它应该返回的那一列(或几列),并统一了名称。这样,feature_names_out=
中定义的特征数量和名称就与每个转换器实际返回的数据相匹配。
第二个问题是在
ColumnTransformer
中创建了新特征,又试图在同一个 ColumnTransformer
的其他分支里访问这些新列。这是行不通的,因为 ColumnTransformer 的各个分支是并行运行的。您需要做的是把多个 ColumnTransformer 按顺序串联起来,让后一个转换器能够访问前一个转换器创建的新列。我已对代码进行了此更改。
我做了一些其他的小改动,包括强制转换器返回
pandas
数据帧而不是 numpy
数组。
新的预处理器:
其输出列名称:
Index(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
'traveling_category_A', 'traveling_category_B', 'traveling_category_C',
'age_interval_(0, 10]', 'age_interval_(10, 20]',
'age_interval_(20, 30]', 'age_interval_(30, 40]',
'age_interval_(40, 50]', 'age_interval_(50, 60]',
'age_interval_(60, 70]', 'age_interval_(70, 100]', 'Pclass', 'Fare',
'PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'],
dtype='object')
代码:
import pandas as pd
# Load the Titanic training data (relative path; adjust to your layout).
titanic_data = pd.read_csv('../titanic.csv')
import numpy as np

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
# Make transformers return pandas DataFrames instead of numpy arrays, so
# column names stay available to the downstream ColumnTransformer stages.
set_config(transform_output='pandas')
#
# Sum relatives
#
def sum_name(function_transformer, feature_names_in):
    """Feature-name callback: the transformer emits one column, total_relatives."""
    return ["total_relatives"]
def sum_relatives(X):
    """Return a one-column frame: total_relatives = SibSp + Parch."""
    total = X['SibSp'] + X['Parch']
    return total.to_frame('total_relatives')
# One-step pipeline; feature_names_out keeps get_feature_names_out()
# consistent with the single column the transformer actually returns.
total_relatives_pipeline = make_pipeline(
    FunctionTransformer(sum_relatives, feature_names_out=sum_name)
)
#
# Categorize travel
#
def cat_travel_name(function_transformer, feature_names_in):
    """Feature-name callback: the transformer emits one column, traveling_category."""
    return ["traveling_category"]
def categorize_travel(X):
    """Bucket passengers by total_relatives: 0 -> 'A', 1-3 -> 'B', 4+ -> 'C'.

    Any value matching none of the conditions maps to 'Unknown'.
    Returns a one-column DataFrame named 'traveling_category'.
    """
    counts = X['total_relatives']
    labels = np.select(
        [counts == 0, counts.between(1, 3), counts >= 4],
        ['A', 'B', 'C'],
        default='Unknown',
    )
    return pd.DataFrame({'traveling_category': labels}, index=X.index)
# One-step pipeline wrapping the travel-category transformer.
travel_category_pipeline = make_pipeline(
    FunctionTransformer(categorize_travel, feature_names_out=cat_travel_name)
)
#
# Ordinal encoder
#
# Explicit category order: Pclass values 1/2/3 encode to 0.0/1.0/2.0.
class_order = [[1, 2, 3]]
ord_pipeline = make_pipeline(
    OrdinalEncoder(categories=class_order)
)
# Categorical pipeline: impute most-frequent, then one-hot encode.
# sparse_output=False yields a dense result (required when transformers
# are configured to output pandas DataFrames).
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
)
# Numerical pipeline for Fare: standardize to zero mean / unit variance.
fare_pipeline = make_pipeline(
    StandardScaler()
)
#
# Age transformer
#
def interval_name(function_transformer, feature_names_in):
    """Feature-name callback: the transformer emits one column, age_interval."""
    return ["age_interval"]
def age_transformer(X):
    """Impute missing Age with the per-Pclass median, then bin into intervals.

    Returns a one-column DataFrame 'age_interval' of pandas Interval
    categories (bins: 0-10, 10-20, ..., 70-100].
    """
    X_copy = X.copy()
    # Vectorized per-class median imputation: groupby().transform('median')
    # broadcasts each class's median back to its rows, replacing the
    # original iterrows() loop with identical results.
    X_copy['Age'] = X_copy['Age'].fillna(
        X_copy.groupby('Pclass')['Age'].transform('median')
    )
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 100]
    X_copy['age_interval'] = pd.cut(X_copy['Age'], bins=bins)
    return X_copy[['age_interval']]
def age_processor():
    """Build the one-step pipeline that imputes Age by class median and bins it."""
    age_step = FunctionTransformer(age_transformer, feature_names_out=interval_name)
    return make_pipeline(age_step)
#
# Column transformers
#
# Stage 1: operates on the raw dataset. Branches of a ColumnTransformer run
# in parallel, so the 'total_relatives' column created here is only visible
# to LATER chained transformers, never to sibling branches.
preprocessing_initial = ColumnTransformer([
    ("ord", ord_pipeline, ['Pclass']),
    ("age_processing", age_processor(), ['Pclass', 'Age']),
    ("num", fare_pipeline, ['Fare']),
    ("total_relatives", total_relatives_pipeline, ['SibSp', 'Parch'])],
    remainder='passthrough',
    verbose_feature_names_out=False
)
# Stage 2: consumes the 'total_relatives' column produced by stage 1;
# all other columns pass through untouched.
preprocessing_travel_category = ColumnTransformer(
    [("travel_category", travel_category_pipeline, ['total_relatives'])],
    remainder='passthrough',
    verbose_feature_names_out=False
)
# Stage 3: one-hot encodes the original categoricals plus the derived
# 'traveling_category' and 'age_interval' columns from earlier stages.
preprocessing_cat = ColumnTransformer(
    [("cat", cat_pipeline, ['Sex', 'Embarked', 'traveling_category', 'age_interval'])],
    remainder='passthrough',
    verbose_feature_names_out=False
)
# Final transformer: chain the stages sequentially so each ColumnTransformer
# can see the columns created by the previous one.
preprocessor = make_pipeline(
    preprocessing_initial,
    preprocessing_travel_category,
    preprocessing_cat
)
# Run. fit_transform fits the pipeline itself, so the previous separate
# preprocessor.fit(...) call was redundant (it fitted everything twice).
data_processed = preprocessor.fit_transform(titanic_data)
data_processed.columns