使用以下代码:
def __init__(self, model, num_folds=5, batch_size=32, epochs=10, lr=0.001, betas=(0.9, 0.999), eps=1e-8):
"""
Initialize the ModelTrainer with specified parameters.
Args:
model (torch.nn.Module): The PyTorch model to be trained and validated.
num_folds (int): The number of folds for stratified k-fold cross-validation. Default is 5.
batch_size (int): Batch size for training and validation. Default is 32.
epochs (int): Number of epochs for training. Default is 10.
lr (float): Learning rate for the optimizer. Default is 0.001.
betas (tuple): Coefficients used for computing running averages of gradient and its square.
Default is (0.9, 0.999).
eps (float): Term added to the denominator to improve numerical stability in the optimizer.
Default is 1e-8.
"""
self.model = model
self.num_folds = num_folds
self.batch_size = batch_size
self.epochs = epochs
self.lr = lr
self.betas = betas
self.eps = eps
def train_and_validate(self, data, labels):
"""
Train and validate the model using stratified k-fold cross-validation.
Args:
data (torch.Tensor): The input data for training and validation.
labels (torch.Tensor): The labels corresponding to the input data.
Returns:
auc_scores (list): A list of AUC scores for each fold.
sensitivities (list): Sensitivities at 100% specificity for each fold.
mean_auc (float): Mean AUC score across all folds.
mean_sensitivity (float): Mean sensitivity at 100% specificity across all folds.
"""
# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=self.num_folds, shuffle=True, random_state=42)
# Lists to store AUC scores and sensitivities for each fold
auc_scores = []
sensitivities = []
# Lists to store ROC curve data for each fold
tprs = []
mean_fpr = np.linspace(0, 1, 100)
# Iterate over folds
for fold, (train_index, val_index) in enumerate(skf.split(data, labels)):
# Get the data for this fold
X_train_fold, X_val_fold = data[train_index], data[val_index]
y_train_fold, y_val_fold = labels[train_index], labels[val_index]
# Create PyTorch datasets and data loaders for this fold
train_dataset = TensorDataset(X_train_fold, y_train_fold)
val_dataset = TensorDataset(X_val_fold, y_val_fold)
train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=self.batch_size)
# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr, betas=self.betas, eps=self.eps)
# Training loop for this fold
for epoch in range(self.epochs):
self.model.train()
for inputs, labels in train_loader:
optimizer.zero_grad()
outputs = self.model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# Validation loop for this fold
self.model.eval()
val_outputs_list = []
val_labels_list = []
for inputs, labels in val_loader:
with torch.no_grad():
val_outputs = self.model(inputs)
val_outputs_list.append(val_outputs.numpy())
val_labels_list.append(labels.numpy())
val_outputs_np = np.concatenate(val_outputs_list)
val_labels_np = np.concatenate(val_labels_list)
# Calculate ROC curve for this fold
fpr, tpr, thresholds = roc_curve(val_labels_np, val_outputs_np)
roc_auc = auc(fpr, tpr)
auc_scores.append(roc_auc)
# Calculate sensitivity at 100% specificity
sensitivity = np.interp(1e-3, fpr, tpr)
sensitivities.append(sensitivity)
# Store ROC curve data for this fold
tprs.append(np.interp(mean_fpr, fpr, tpr))
tprs[-1][0] = 0.0
# Plot ROC curve for this fold
plt.plot(fpr, tpr, lw=1, alpha=0.3)
# Plot settings
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.show()
# Calculate mean ROC curve
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
# Plot mean ROC curve with thicker line
plt.plot(mean_fpr, mean_tpr, color='black', lw=2, linestyle='--', label=f'Mean ROC (AUC = {mean_auc:.2f})')
plt.legend(loc='lower right')
plt.show()
# Calculate mean sensitivity at 100% specificity
mean_sensitivity = np.mean(sensitivities)
return auc_scores, sensitivities, mean_auc, mean_sensitivity
当第二次折叠想要开始时,我不断地在 y_train_fold, y_val_fold = labels[train_index], labels[val_index] 中出现索引越界错误。第一个折叠运行。我的数据已成型 (864,19,500)和我的二进制标签(864)。任何人都可以发现此代码中可能触发错误的问题吗?
我尝试用 SMOTE 来平衡类别分布,检查标签的长度和形状,但我无法让它工作。在第一次折叠期间,标签的长度为 864,但在第二次折叠期间,当我使用 num_folds=4 时,形状变为 24,导致尺寸为 24 的维度 0 在索引 24 处出现越界错误
如果
labels
是 pandas.Series
对象,那么 labels[train_index]
将使用数据帧的原始索引而不是线性 numpy 索引。两者不一定一致。
确保
labels
转换为 numpy 数组,或者将其保留为系列,但使用 labels.iloc[train_index]
对其进行索引。