在训练 DecisionTreeClassifier 模型之前,我尝试使用 SelectFromModel 从数据集中选择特征,并使用 cross_validate 来评估模型性能。但是,我不知道如何将这两个步骤结合起来使用。
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
# Split the dataset into features and target.
# Replace 'target_column' with the actual name of the target column.
X = dataset.drop('target_column', axis=1)
y = dataset['target_column']

# Feature selection using SelectFromModel(RandomForestClassifier).
# NOTE: the selector is fitted on the whole dataset here, so the
# cross-validation scores below are computed on features that were chosen
# with knowledge of every validation fold (data leakage) and may be
# optimistic.
# The reduced matrix is stored under its own name so that X keeps the
# original, unselected features for any code that runs afterwards.
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
X_selected = selector.fit_transform(X, y)
num_selected_features = X_selected.shape[1]
print(f'Number of Features Selected: {num_selected_features}')

# Decision tree that will be evaluated on the selected features
classifier = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_split=5,
    random_state=0,
)

# Cross-validate the model (5 folds) using accuracy as the evaluation metric
cross_validation_results = cross_validate(
    estimator=classifier,
    X=X_selected,
    y=y,
    cv=5,
    scoring="accuracy",
    return_train_score=True,
)

# Print the cross-validation results
print(cross_validation_results)
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
# Build the two stages: a random-forest-driven feature selector and the
# decision tree that is trained on whatever features the selector keeps.
selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42)
)
classifier = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_split=5,
    random_state=0,
)

# Chain the stages so that selection and classification are fitted together
# as a single estimator.
pipeline_steps = [
    ('feature_selection', selector),
    ('classification', classifier),
]
pipeline = Pipeline(steps=pipeline_steps)

# Evaluate the whole pipeline with 5-fold cross-validation (accuracy).
# NOTE(review): X is expected to be the full, unselected feature matrix here;
# confirm it has not been reduced before this point.
cross_validation_results = cross_validate(
    pipeline,
    X,
    y,
    cv=5,
    scoring="accuracy",
    return_train_score=True,
)

# Show the per-fold timings and train/test scores
print(cross_validation_results)
这样,特征选择器只会在交叉验证的每个训练折叠上拟合,再应用到对应的验证折叠,从而避免潜在的数据泄漏。