在深度训练/验证循环期间使用分层 k 折叠时出现越界错误

问题描述 投票:0回答:1

使用以下代码:

def __init__(self, model, num_folds=5, batch_size=32, epochs=10, lr=0.001, betas=(0.9, 0.999), eps=1e-8):
    """
    Initialize the ModelTrainer with specified parameters.

    Args:
        model (torch.nn.Module): The PyTorch model to be trained and validated.
        num_folds (int): The number of folds for stratified k-fold cross-validation. Default is 5.
        batch_size (int): Batch size for training and validation. Default is 32.
        epochs (int): Number of epochs for training. Default is 10.
        lr (float): Learning rate for the optimizer. Default is 0.001.
        betas (tuple): Coefficients used for computing running averages of gradient and its square.
                       Default is (0.9, 0.999).
        eps (float): Term added to the denominator to improve numerical stability in the optimizer.
                     Default is 1e-8.
    """
    self.model = model
    self.num_folds = num_folds
    self.batch_size = batch_size
    self.epochs = epochs
    self.lr = lr
    self.betas = betas
    self.eps = eps

def train_and_validate(self, data, labels):
    """
    Train and validate the model using stratified k-fold cross-validation.

    Args:
        data (torch.Tensor): The input data for training and validation.
        labels (torch.Tensor): The labels corresponding to the input data.

    Returns:
        auc_scores (list): A list of AUC scores for each fold.
        sensitivities (list): Sensitivities at 100% specificity for each fold.
        mean_auc (float): Mean AUC score across all folds.
        mean_sensitivity (float): Mean sensitivity at 100% specificity across all folds.
    """
    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=self.num_folds, shuffle=True, random_state=42)

    # Lists to store AUC scores and sensitivities for each fold
    auc_scores = []
    sensitivities = []

    # Lists to store ROC curve data for each fold
    tprs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Iterate over folds
    for fold, (train_index, val_index) in enumerate(skf.split(data, labels)):
        # Get the data for this fold
        X_train_fold, X_val_fold = data[train_index], data[val_index]
        y_train_fold, y_val_fold = labels[train_index], labels[val_index]

        # Create PyTorch datasets and data loaders for this fold
        train_dataset = TensorDataset(X_train_fold, y_train_fold)
        val_dataset = TensorDataset(X_val_fold, y_val_fold)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=self.batch_size)

        # Define loss function and optimizer
        criterion = nn.BCELoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr, betas=self.betas, eps=self.eps)

        # Training loop for this fold
        for epoch in range(self.epochs):
            self.model.train()
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

        # Validation loop for this fold
        self.model.eval()
        val_outputs_list = []
        val_labels_list = []
        for inputs, labels in val_loader:
            with torch.no_grad():
                val_outputs = self.model(inputs)
                val_outputs_list.append(val_outputs.numpy())
                val_labels_list.append(labels.numpy())
        val_outputs_np = np.concatenate(val_outputs_list)
        val_labels_np = np.concatenate(val_labels_list)

        # Calculate ROC curve for this fold
        fpr, tpr, thresholds = roc_curve(val_labels_np, val_outputs_np)
        roc_auc = auc(fpr, tpr)
        auc_scores.append(roc_auc)

        # Calculate sensitivity at 100% specificity
        sensitivity = np.interp(1e-3, fpr, tpr)
        sensitivities.append(sensitivity)

        # Store ROC curve data for this fold
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0

        # Plot ROC curve for this fold
        plt.plot(fpr, tpr, lw=1, alpha=0.3)

    # Plot settings
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.show()

    # Calculate mean ROC curve
    mean_tpr = np.mean(tprs, axis=0)
    mean_auc = auc(mean_fpr, mean_tpr)

    # Plot mean ROC curve with thicker line
    plt.plot(mean_fpr, mean_tpr, color='black', lw=2, linestyle='--', label=f'Mean ROC (AUC = {mean_auc:.2f})')
    plt.legend(loc='lower right')
    plt.show()

    # Calculate mean sensitivity at 100% specificity
    mean_sensitivity = np.mean(sensitivities)

    return auc_scores, sensitivities, mean_auc, mean_sensitivity

当第二次折叠想要开始时,我不断地在 y_train_fold, y_val_fold = labels[train_index], labels[val_index] 中出现索引越界错误。第一个折叠运行。我的数据已成型 (864,19,500)和我的二进制标签(864)。任何人都可以发现此代码中可能触发错误的问题吗?

我尝试用 SMOTE 来平衡类别分布,检查标签的长度和形状,但我无法让它工作。在第一次折叠期间,标签的长度为 864,但在第二次折叠期间,当我使用 num_folds=4 时,形状变为 24,导致尺寸为 24 的维度 0 在索引 24 处出现越界错误

python machine-learning k-fold
1个回答
0
投票

如果

labels
pandas.Series
对象,那么
labels[train_index]
将使用数据帧的原始索引而不是线性 numpy 索引。两者不一定一致。

确保

labels
转换为 numpy 数组,或者将其保留为系列,但使用
labels.iloc[train_index]
对其进行索引。

© www.soinside.com 2019 - 2024. All rights reserved.