如何在同一管道中使用通过ColumnTransformer管道创建的列?

问题描述（投票：0 · 回答：1）

我正在使用sklearn的

ColumnTransformer
方法来构建数据处理的管道。在
ColumnTransformer
中,我正在为数据集创建新列,这些新列是其他列的组合,但在下一步中,我想使用刚刚创建的这个新列,但收到此列不存在的错误。我知道它在原始数据集中不存在,但为什么我不能使用这个新列?或者如果我可以:如何? 代码如下:

def sum_name(function_transformer, feature_names_in):
    """Feature-names-out callback for the relatives transformer.

    Must return exactly the output column name(s) that the paired
    ``sum_relatives`` transformer produces.
    """
    return ["total_relatives"]  # must match the column created in sum_relatives

def sum_relatives(X):
    """Sum SibSp and Parch into a single 'total_relatives' column.

    Returns ONLY the engineered column so the output agrees with the
    single name declared by ``sum_name`` (feature_names_out); returning
    the full frame conflicts with that declaration.
    """
    X_copy = X.copy()  # never mutate the caller's frame
    X_copy['total_relatives'] = X_copy['SibSp'] + X_copy['Parch']
    return X_copy[['total_relatives']]

def cat_travel_name(function_transformer, feature_names_in):
    """Feature-names-out callback for the travel-category transformer."""
    return ["traveling_category"]  # must match the column created in categorize_travel

def categorize_travel(X):
    """Bucket passengers into travel-size categories from 'total_relatives'.

    A: traveling alone (0 relatives), B: small group (1-3),
    C: large group (4+); anything else maps to 'Unknown'.
    Returns only the engineered column, matching ``cat_travel_name``.
    """
    import numpy as np  # local import: numpy is not imported in the shown code

    X_copy = X.copy()

    conditions = [
        (X_copy['total_relatives'] == 0),
        (X_copy['total_relatives'] >= 1) & (X_copy['total_relatives'] <= 3),
        (X_copy['total_relatives'] >= 4)
    ]
    categories = ['A', 'B', 'C']

    X_copy['traveling_category'] = np.select(conditions, categories, default='Unknown')
    return X_copy[['traveling_category']]

# Explicit ordinal order for passenger class: 1st < 2nd < 3rd.
class_order = [[1, 2, 3]]

# Encode Pclass as ordinal values using the order declared above.
ord_pipeline = make_pipeline(
    OrdinalEncoder(categories=class_order)    
    )

def interval_name(function_transformer, feature_names_in):
    """Feature-names-out callback for the age-binning transformer."""
    return ["age_interval"]  # must match the column created in age_transformer

def age_transformer(X):
    """Impute missing ages with the per-class median, then bin into intervals.

    Returns ONLY the engineered 'age_interval' column so the output agrees
    with the single name declared by ``interval_name``.
    """
    X_copy = X.copy()  # never mutate the caller's frame
    # Fill each missing Age with the median Age of that passenger's Pclass.
    # One vectorized pass replaces the original per-class iterrows loop.
    class_median = X_copy.groupby('Pclass')['Age'].transform('median')
    X_copy['Age'] = X_copy['Age'].fillna(class_median)
    # Decade bins; the last bin widens to absorb the sparse 70+ tail.
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 100]
    X_copy['age_interval'] = pd.cut(X_copy['Age'], bins=bins)
    return X_copy[['age_interval']]

def age_processor():
    """Build the pipeline that imputes and bins the Age column."""
    age_step = FunctionTransformer(age_transformer, feature_names_out=interval_name)
    return make_pipeline(age_step)

# Pipeline producing the engineered 'total_relatives' feature (SibSp + Parch).
total_relatives_pipeline = make_pipeline(
    FunctionTransformer(sum_relatives, feature_names_out=sum_name)
)

# Pipeline producing the engineered 'traveling_category' feature.
travel_category_pipeline = make_pipeline(
    FunctionTransformer(categorize_travel, feature_names_out=cat_travel_name)
)

# Categorical columns: impute the most frequent value, then one-hot encode.
# handle_unknown="ignore" encodes unseen categories as all-zeros at transform
# time instead of raising.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
    )

# Numeric columns: standardize to zero mean / unit variance.
num_pipeline = make_pipeline(
        StandardScaler()
    )

# NOTE(review): the branches of a ColumnTransformer all run in parallel on
# the ORIGINAL input frame. 'total_relatives', 'traveling_category' and
# 'age_interval' are created inside sibling branches here, so they do not
# exist yet when the 'travel_category' and 'cat' branches select their
# columns — fit_transform raises "column not found". Chain sequential
# ColumnTransformers in a Pipeline instead.
preprocessing = ColumnTransformer([
        ("ord", ord_pipeline, ['Pclass']),
        ("age_processing", age_processor(), ['Pclass', 'Age']),
        ("total_relatives", total_relatives_pipeline, ['SibSp', 'Parch']),
        ("travel_category", travel_category_pipeline, ['total_relatives']),
        ("cat", cat_pipeline, ['Sex', 'Embarked', 'traveling_category', 'age_interval']),
        ("num", num_pipeline, ['Fare']),
    ])

当我使用以下命令调用

fit_transform
方法时，此代码给出错误：
total_relatives
不是数据集的列：

# Fitting the parallel ColumnTransformer above raises here:
# 'total_relatives' is not a column of the original dataset.
data_processed = preprocessing.fit_transform(titanic_data)

嗯,确实如此,

total_relatives
确实不是原始数据集的列,它是在
ColumnTransformer
内动态创建的。如果我想使用“travel_category”步骤,是否需要再次创建它?我不能从上一步中恢复并使用它吗?

同样的问题也会出现在后续步骤中，因为

traveling_category
和
age_interval
都是在前面的步骤中创建的，并非来自原始数据集。

python-3.x scikit-learn pipeline
1个回答
0
投票

函数转换器（FunctionTransformer）返回的列与其

feature_names_out=
声明的名称不匹配：返回的特征数量和名称都与声明冲突。我首先确保每个函数转换器只返回它应该返回的那一列（或那几列），并统一了名称。这样，
feature_names_out=
中声明的特征数量和名称就与每个转换器实际返回的数据一致了。

第二个问题是：在

ColumnTransformer
中创建了新特征，却又试图在同一个
ColumnTransformer
的其他分支中访问它们。这是行不通的，因为列转换器的各个分支是并行执行的。正确的做法是把多个列转换器按顺序串联起来，让后一个转换器能够访问前一个转换器创建的新特征。我已对代码做了相应修改。

我做了一些其他的小改动,包括强制转换器返回

pandas
数据帧而不是
numpy
数组。

新的预处理器:

其输出列名称:

Index(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'traveling_category_A', 'traveling_category_B', 'traveling_category_C',
       'age_interval_(0, 10]', 'age_interval_(10, 20]',
       'age_interval_(20, 30]', 'age_interval_(30, 40]',
       'age_interval_(40, 50]', 'age_interval_(50, 60]',
       'age_interval_(60, 70]', 'age_interval_(70, 100]', 'Pclass', 'Fare',
       'PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'],
      dtype='object')

代码:

# Load the raw Titanic dataset (path relative to this script).
import pandas as pd
titanic_data = pd.read_csv('../titanic.csv')

from sklearn.pipeline import make_pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Make every transformer return a pandas DataFrame (with column names)
# instead of a bare numpy array, so later stages can select columns by name.
from sklearn import set_config
set_config(transform_output='pandas')

#
# Sum relatives
#
def sum_name(function_transformer, feature_names_in):
    """Report the single output column name produced by sum_relatives."""
    names_out = ["total_relatives"]
    return names_out

def sum_relatives(X):
    """Combine SibSp and Parch into one 'total_relatives' column.

    Returns a one-column DataFrame; the input frame is left untouched.
    """
    relatives = X['SibSp'] + X['Parch']
    return relatives.to_frame(name='total_relatives')

# Pipeline producing the engineered 'total_relatives' feature.
total_relatives_pipeline = make_pipeline(
    FunctionTransformer(sum_relatives, feature_names_out=sum_name)
)

#
#Categorize travel
#
def cat_travel_name(function_transformer, feature_names_in):
    """Report the single output column name produced by categorize_travel."""
    names_out = ["traveling_category"]
    return names_out

def categorize_travel(X):
    """Bucket 'total_relatives' into travel-size categories.

    A: traveling alone (0), B: small group (1-3), C: large group (4+);
    anything else falls through to 'Unknown'.
    Returns only the engineered column, matching ``cat_travel_name``.
    """
    # Fix: this script never imports numpy at module level, so the original
    # np.select call raised NameError at fit time.
    import numpy as np

    X_copy = X.copy()

    conditions = [
        (X_copy['total_relatives'] == 0),
        (X_copy['total_relatives'] >= 1) & (X_copy['total_relatives'] <= 3),
        (X_copy['total_relatives'] >= 4)
    ]
    categories = ['A', 'B', 'C']

    X_copy['traveling_category'] = np.select(conditions, categories, default='Unknown')
    return X_copy[['traveling_category']]

# Pipeline producing the engineered 'traveling_category' feature.
travel_category_pipeline = make_pipeline(
    FunctionTransformer(categorize_travel, feature_names_out=cat_travel_name)
)

#
# Ordinal encoder
#
# Explicit ordinal order for passenger class: 1st < 2nd < 3rd.
class_order = [[1, 2, 3]]

# Encode Pclass as ordinal values using the order declared above.
ord_pipeline = make_pipeline(
    OrdinalEncoder(categories=class_order)    
    )

# Categorical columns: impute the most frequent value, then one-hot encode.
# sparse_output=False yields a dense array, required because
# transform_output='pandas' cannot wrap sparse matrices in a DataFrame.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    )

# Numeric pipeline for Fare: standardize to zero mean / unit variance.
fare_pipeline = make_pipeline(
        StandardScaler()
    )

#
# Age transformer
#
def interval_name(function_transformer, feature_names_in):
    """Report the single output column name produced by age_transformer."""
    names_out = ["age_interval"]
    return names_out

def age_transformer(X):
    """Impute missing ages with the per-class median, then bin into intervals.

    Returns only the 'age_interval' column, matching ``interval_name``.
    """
    X_copy = X.copy()  # never mutate the caller's frame
    # Fill each missing Age with the median Age of that passenger's Pclass.
    # groupby().transform('median') replaces the original iterrows loop:
    # same result, one vectorized pass instead of per-class assignment.
    class_median = X_copy.groupby('Pclass')['Age'].transform('median')
    X_copy['Age'] = X_copy['Age'].fillna(class_median)
    # Decade bins; the last bin widens to absorb the sparse 70+ tail.
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 100]
    X_copy['age_interval'] = pd.cut(X_copy['Age'], bins=bins)
    return X_copy[['age_interval']]

def age_processor():
    """Build the pipeline that imputes and bins the Age column."""
    age_step = FunctionTransformer(age_transformer, feature_names_out=interval_name)
    return make_pipeline(age_step)

#
# Column transformers
#
# Stage 1: transforms that only need raw columns. remainder='passthrough'
# keeps the untouched columns (Sex, Embarked, ...) available to later stages;
# verbose_feature_names_out=False preserves the bare column names (no
# "<transformer>__" prefix) so later stages can select them by name.
preprocessing_initial = ColumnTransformer([
        ("ord", ord_pipeline, ['Pclass']),
        ("age_processing", age_processor(), ['Pclass', 'Age']),
        ("num", fare_pipeline, ['Fare']),
        ("total_relatives", total_relatives_pipeline, ['SibSp', 'Parch'])],
        remainder='passthrough',
        verbose_feature_names_out=False
)

# Stage 2: consumes 'total_relatives' created by stage 1. Branches of a
# single ColumnTransformer run in parallel, so this must be a separate,
# sequential transformer to see stage 1's output.
preprocessing_travel_category = ColumnTransformer(
    [("travel_category", travel_category_pipeline, ['total_relatives'])],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Stage 3: one-hot encode the raw and engineered categorical columns,
# including 'traveling_category' (stage 2) and 'age_interval' (stage 1).
preprocessing_cat = ColumnTransformer(
    [("cat", cat_pipeline, ['Sex', 'Embarked', 'traveling_category', 'age_interval'])],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Final transformer: chain the three stages so each one sees the previous
# stage's output (this is what makes the engineered columns visible).
preprocessor = make_pipeline(
    preprocessing_initial,
    preprocessing_travel_category,
    preprocessing_cat
)

# Run. NOTE(review): fit() followed by fit_transform() fits the pipeline
# twice; fit_transform() alone would suffice.
preprocessor.fit(titanic_data)
preprocessor.fit_transform(titanic_data).columns
© www.soinside.com 2019 - 2024. All rights reserved.