我在表格数据集 pendigits 数据集上训练 MLP。问题是训练损失和准确性或多或少是稳定的,而验证和测试损失和准确性是完全恒定的。 pendigits 数据集包含 10 个类。我的代码与我在 MNIST 或 CIFAR10 上所做的其他实验完全相同,它们都可以正常工作。唯一发生变化的是数据集从 MNIST/CIFAR10 到 pendigits 和 NN,从 ResNet-18 到简单的 MLP。下面是训练函数和网络:
def train(net, loaders, optimizer, criterion, epochs=100, dev=dev, save_param = True, model_name="only-pendigits"):
    """Train `net` on loaders["train"], evaluating on the "val"/"test" loaders
    every epoch, then plot the loss/accuracy histories.

    Args:
        net: the model to train (moved onto `dev` inside this function).
        loaders: dict with "train"/"val"/"test" DataLoaders.
        optimizer: optimizer already bound to `net`'s parameters.
        criterion: loss function taking (logits, labels).
        epochs: number of epochs to run.
        dev: target device (default read from the enclosing module).
        save_param, model_name: currently unused — kept for interface
            compatibility with callers.

    NOTE(review): `myseed`, `scheduler` and `plt` are read from module
    globals; this function assumes they exist.
    """
    torch.manual_seed(myseed)
    try:
        net = net.to(dev)
        print(net)
        # Per-epoch metric histories, one series per split.
        history_loss = {"train": [], "val": [], "test": []}
        history_accuracy = {"train": [], "val": [], "test": []}
        for epoch in range(epochs):
            # Running sums over batches, reset each epoch.
            sum_loss = {"train": 0, "val": 0, "test": 0}
            sum_accuracy = {"train": 0, "val": 0, "test": 0}
            for split in ["train", "val", "test"]:
                is_train = split == "train"
                # Toggle train/eval mode (matters for dropout/batchnorm layers).
                net.train(is_train)
                # Track gradients only on the training split; val/test run
                # without autograd bookkeeping.
                with torch.set_grad_enabled(is_train):
                    for (input, labels) in loaders[split]:
                        input = input.to(dev)
                        labels = labels.to(dev)
                        pred = net(input)
                        loss = criterion(pred, labels)
                        sum_loss[split] += loss.item()
                        if is_train:
                            # Reset, backprop, and apply the update.
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()
                        # Batch accuracy: fraction of argmax predictions that
                        # match the labels.
                        _, pred_labels = pred.max(1)
                        batch_accuracy = (pred_labels == labels).sum().item() / input.size(0)
                        sum_accuracy[split] += batch_accuracy
            # NOTE(review): `scheduler` is a global created outside this
            # function. If the same scheduler/optimizer pair survives across
            # repeated train() calls, the learning rate keeps decaying and can
            # reach ~0 — at which point the weights effectively stop changing.
            # Recreate optimizer and scheduler for every fresh run; the LR
            # printed below makes this visible.
            scheduler.step()
            # Average the running sums over the number of batches per split.
            epoch_loss = {split: sum_loss[split]/len(loaders[split]) for split in ["train", "val", "test"]}
            epoch_accuracy = {split: sum_accuracy[split]/len(loaders[split]) for split in ["train", "val", "test"]}
            for split in ["train", "val", "test"]:
                history_loss[split].append(epoch_loss[split])
                history_accuracy[split].append(epoch_accuracy[split])
            print(f"Epoch {epoch+1}:",
                  f"TrL={epoch_loss['train']:.4f},",
                  f"TrA={epoch_accuracy['train']:.4f},",
                  f"VL={epoch_loss['val']:.4f},",
                  f"VA={epoch_accuracy['val']:.4f},",
                  f"TeL={epoch_loss['test']:.4f},",
                  f"TeA={epoch_accuracy['test']:.4f},",
                  f"LR={optimizer.param_groups[0]['lr']:.5f},")
    except KeyboardInterrupt:
        print("Interrupted")
    finally:
        # Plot loss curves for all three splits.
        plt.title("Loss")
        for split in ["train", "val", "test"]:
            plt.plot(history_loss[split], label=split)
        plt.legend()
        plt.show()
        # Plot accuracy curves for all three splits.
        plt.title("Accuracy")
        for split in ["train", "val", "test"]:
            plt.plot(history_accuracy[split], label=split)
        plt.legend()
        plt.show()
网络:
# Network for the tabular (text/CSV) pendigits data.
class TextNN(nn.Module):
    """Four-layer MLP: 16 input features -> 128 -> 128 -> 32 -> 10 logits."""

    def __init__(self):
        super().__init__()
        # Seed before creating the layers so the initial weights are
        # reproducible (`myseed` is a module-level global).
        torch.manual_seed(myseed)
        # Layers are created in the same order as before so the seeded
        # initialization is unchanged.
        self.relu = nn.ReLU()
        self.linear1 = nn.Linear(16, 128)  # 16 input columns in the dataset
        self.linear2 = nn.Linear(128, 128)
        self.linear3 = nn.Linear(128, 32)
        self.linear4 = nn.Linear(32, 10)

    def forward(self, tab):
        # Three hidden layers with ReLU, then a linear output head
        # (raw logits — no softmax, matching a CrossEntropyLoss criterion).
        hidden = tab
        for layer in (self.linear1, self.linear2, self.linear3):
            hidden = self.relu(layer(hidden))
        return self.linear4(hidden)
# Instantiate the network and print its layer structure for inspection.
model = TextNN()
print(model)
有没有可能是模型太简单了,什么都没学到?我不这么认为。我认为在训练中存在一些错误(但该函数与我用于 MNIST 或 CIFAR10 的函数完全相同,但可以正常工作),或者在数据加载中。以下是我加载数据集的方式:
# Load the pendigits train/test CSVs (the dataset ships already split).
pentrain = pd.read_csv("pendigits.tr.csv")
pentest = pd.read_csv("pendigits.te.csv")
class TextDataset(Dataset):
    """Dataset over a pandas DataFrame of pendigits rows.

    Each item is (FloatTensor of the 16 `inputN` feature columns, int label
    from the `class` column). The label is cast to a plain Python int so the
    default collate function always produces an integer (Long) label tensor —
    if the `class` column happens to be float-typed, returning the raw pandas
    scalar would collate to a float tensor and break CrossEntropyLoss (the
    commented-out `labels.long()` in the training loop suggests this bit
    before).
    """

    # The 16 feature columns, in model input order.
    FEATURE_COLUMNS = ['input1', 'input2', 'input3', 'input4', 'input5',
                       'input6', 'input7', 'input8', 'input9', 'input10',
                       'input11', 'input12', 'input13', 'input14', 'input15',
                       'input16']

    def __init__(self, excel_file, transform=None):
        """`excel_file` is (despite the name) an already-loaded DataFrame."""
        self.excel_file = excel_file   # kept for backward compatibility
        self.tabular = excel_file
        self.transform = transform

    def __len__(self):
        return len(self.tabular)

    def __getitem__(self, idx):
        # DataLoader samplers may hand over tensor indices; normalize them.
        if torch.is_tensor(idx):
            idx = idx.tolist()
        row = self.tabular.iloc[idx]
        # Plain int label: collates to a Long tensor regardless of the
        # DataFrame's column dtype.
        y = int(row["class"])
        tabular = torch.FloatTensor(row[self.FEATURE_COLUMNS].tolist())
        if self.transform:
            tabular = self.transform(tabular)
        return tabular, y
# Wrap the training DataFrame, then carve out an 80/20 train/val split.
penditrain = TextDataset(excel_file=pentrain, transform=None)
train_size = int(0.80 * len(penditrain))
val_size = len(penditrain) - train_size
# Seed the split explicitly: without a generator, random_split draws from the
# global RNG and the train/val membership changes between runs, making results
# irreproducible (`myseed` is the module-level seed used elsewhere).
pentrain, penval = random_split(
    penditrain,
    (train_size, val_size),
    generator=torch.Generator().manual_seed(myseed),
)
pentest = TextDataset(excel_file=pentest, transform=None)
所有都正确加载,如果我打印一个例子:
# Sanity check: fetch one sample from the (post-split) training subset and
# inspect its shape and label.
text_x, label_x = pentrain[0]
print(text_x.shape, label_x)
text_x
torch.Size([16]) 1
tensor([ 48., 74., 88., 95., 100., 100., 78., 75., 66., 49., 64., 23.,
32., 0., 0., 1.])
这些是我的数据加载器:
# Define a seeded generator so shuffling is reproducible across runs.
generator=torch.Generator()
generator.manual_seed(myseed)
# Define loaders. Only the train loader shuffles (and drops the last partial
# batch); the generator argument on the unshuffled val/test loaders has no
# effect on sample order.
from torch.utils.data import DataLoader
train_loader = DataLoader(pentrain, batch_size=128, num_workers=2, drop_last=True, shuffle=True, generator=generator)
val_loader = DataLoader(penval, batch_size=128, num_workers=2, drop_last=False, shuffle=False, generator=generator)
test_loader = DataLoader(pentest, batch_size=128, num_workers=2, drop_last=False, shuffle=False, generator=generator)
我被这个问题卡了2天了,不知道是什么问题。。。
编辑:基本上,如果我在每个纪元的开头写
print(list(net.parameters()))
,我会看到权重永远不会改变,因此损失和准确性保持不变。为什么权重没有变化?
EDIT2:还有另一个数据集,比如 sklearn 的数字,问题是完全一样的。