I am training a model with the SWA method in PyTorch.
SWA: https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/
The loss in my training code blows up almost immediately, and the model's loss becomes nan.
The loss output is below.
1. loss: tensor(4.8463, device='cuda:0', grad_fn=<NllLossBackward>)
2. loss: tensor(118317.8516, device='cuda:0', grad_fn=<NllLossBackward>)
3. loss: tensor(5.7568e+22, device='cuda:0', grad_fn=<NllLossBackward>)
4. loss: tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
Without the SWA method, the loss does not increase. Is there a problem with how I use the SWA method in my training code?
Thank you in advance for any suggestions.
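For comparison, this is how I understand the intended torchcontrib SWA usage from the linked post (the tiny model and random data here are just placeholders so the sketch runs on its own; they are not my actual setup):

import torch
from torch import nn
from torchcontrib.optim import SWA

# Minimal SWA loop as I understand it from the blog post (illustrative only).
model = nn.Linear(10, 2)
loader = [(torch.randn(4, 10), torch.randint(0, 2, (4,))) for _ in range(20)]
loss_fn = nn.CrossEntropyLoss()

base_opt = torch.optim.SGD(model.parameters(), lr=0.1)
opt = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)

for epoch in range(5):
    for inputs, targets in loader:
        opt.zero_grad()
        loss_fn(model(inputs), targets).backward()
        opt.step()

opt.swap_swa_sgd()  # swap the averaged weights in once, after training

My actual code is below.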
import math
import torch
from torch import nn
from tqdm import tqdm

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#batch_size
batch_size=5
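# (Illustrative only, not my real data) train_dataset / val_dataset could be
# stand-ins shaped like my data, assuming e.g. 10 classes:
# train_dataset = torch.utils.data.TensorDataset(
#     torch.randn(100, 25, 32, 32), torch.randint(0, 10, (100,)))
# val_dataset = torch.utils.data.TensorDataset(
#     torch.randn(20, 25, 32, 32), torch.randint(0, 10, (20,)))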
#DataLoader
train_dataloader=torch.utils.data.DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=batch_size,shuffle=False)
#dict
dataloaders_dict={"train":train_dataloader,"val":val_dataloader}
#example outputs
#train :torch.Size([5, 25, 32, 32])
#target : torch.Size([5])
def train_model_withSWA(net, dataloaders_dict, criterion, optimizer, num_epochs):
    loss_list = []
    acc_list = []
    # validation lists
    val_loss_list = []
    val_acc_list = []

    for epoch in tqdm(range(num_epochs)):
        print("Epoch {}/{}".format(epoch + 1, num_epochs))
        print("--------------------------")

        for phase in ["train", "val"]:
            if phase == "train":
                net.train()
            else:
                net.eval()

            epoch_loss = 0.0
            epoch_corrects = 0

            #if (epoch == 0) and (phase == "train"):
            #    continue

            for inputs, labels in dataloaders_dict[phase]:
                # reset the optimizer's gradients
                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == "train"):
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    outputs = net(inputs)
                    loss = criterion(outputs, labels)
                    print("loss:", loss)

                    _, preds = torch.max(outputs, 1)

                    if phase == "train":
                        loss.backward()
                        optimizer.step()

                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += torch.sum(preds == labels.data)

            # for SWA
            optimizer.swap_swa_sgd()

            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)

            print("{} Loss:{:.4f} Acc:{:.4f}".format(phase, epoch_loss, epoch_acc))

            if phase == "train":
                loss_list.append(epoch_loss)            # epoch_loss is already a Python float
                acc_list.append(epoch_acc.cpu().numpy())  # epoch_acc is a CUDA tensor, so move to CPU first
            else:
                val_loss_list.append(epoch_loss)
                val_acc_list.append(epoch_acc.cpu().numpy())
from torchcontrib.optim import SWA
# ignore warning
import warnings
warnings.filterwarnings('ignore') # set to ignore
#criterion
criterion = nn.CrossEntropyLoss()
net=net.to(device)
base_opt = torch.optim.SGD(net.parameters(), lr=0.1)
optimizer = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
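# As I understand the torchcontrib SWA wrapper (this is my reading of its docs,
# please correct me if I am wrong): averaging starts after swa_start optimizer
# steps, the running average is updated every swa_freq steps, and the learning
# rate is switched to swa_lr from step swa_start onward.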
### train with SWA
train_model_withSWA(net=net, dataloaders_dict=dataloaders_dict,
                    criterion=criterion,
                    optimizer=optimizer,
                    num_epochs=num_epochs)
# model's loss jumps up to nan....
#loss: tensor(4.8463, device='cuda:0', grad_fn=<NllLossBackward>)
#loss: tensor(118317.8516, device='cuda:0', grad_fn=<NllLossBackward>)
#loss: tensor(5.7568e+22, device='cuda:0', grad_fn=<NllLossBackward>)
#loss: tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
In addition, I use the following Self-Attention and Positional Encoder code:
class Self_Attention(nn.Module):
    """ Self-Attention Layer """

    def __init__(self, in_dim):
        super(Self_Attention, self).__init__()

        # pointwise convolutions
        self.query_conv = nn.Conv2d(
            in_channels=in_dim, out_channels=in_dim, kernel_size=1)
        self.key_conv = nn.Conv2d(
            in_channels=in_dim, out_channels=in_dim, kernel_size=1)
        self.value_conv = nn.Conv2d(
            in_channels=in_dim, out_channels=in_dim, kernel_size=1)

        # softmax
        self.softmax = nn.Softmax(dim=-2)

        # output = x + gamma*o
        # gamma starts at 0
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        x = x.to(device)
        X = x

        # B,C',W,H -> B,C',N
        proj_query = self.query_conv(X).view(
            X.shape[0], -1, X.shape[2] * X.shape[3])  # size: B,C',N
        proj_query = proj_query.permute(0, 2, 1)      # transpose
        proj_key = self.key_conv(X).view(
            X.shape[0], -1, X.shape[2] * X.shape[3])  # size: B,C',N

        # batched matrix multiplication
        S = torch.bmm(proj_query, proj_key)

        attention_map_T = self.softmax(S)
        attention_map = attention_map_T.permute(0, 2, 1)

        # Self-Attention Map
        proj_value = self.value_conv(X).view(
            X.shape[0], -1, X.shape[2] * X.shape[3])  # size: B,C,N
        o = torch.bmm(proj_value, attention_map.permute(0, 2, 1))

        # reshape back to B,C,W,H
        o = o.view(X.shape[0], X.shape[1], X.shape[2], X.shape[3])
        out = x + self.gamma * o

        #print("gamma:", self.gamma)
        return out, attention_map
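# Quick shape check for Self_Attention with my input size (illustrative only,
# not part of my training code; uses the global `device` defined above).
attn_check = Self_Attention(in_dim=25).to(device)
out_check, attn_map_check = attn_check(torch.randn(5, 25, 32, 32))
print(out_check.shape)       # torch.Size([5, 25, 32, 32])
print(attn_map_check.shape)  # torch.Size([5, 1024, 1024])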
class PositionalEncoder(nn.Module):
    def __init__(self, d_model=300, max_seq_len=256):
        super(PositionalEncoder, self).__init__()

        self.d_model = d_model

        pe = torch.zeros(max_seq_len, d_model)

        # GPU
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        pe = pe.to(device)

        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = math.cos(pos /
                                          (10000 ** ((2 * (i + 1)) / d_model)))

        self.pe = pe.unsqueeze(0)

        self.pe.requires_grad = False

    def forward(self, x):
        x = x.to(device)
        ret = math.sqrt(self.d_model) * x + self.pe
        return ret
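# Quick shape check for PositionalEncoder with its default sizes (illustrative
# only, not part of my training code).
pe_check = PositionalEncoder(d_model=300, max_seq_len=256)
words_check = torch.randn(2, 256, 300)  # (batch, sequence length, d_model)
print(pe_check(words_check).shape)      # torch.Size([2, 256, 300])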
Without normalization in the data preprocessing, the loss does not blow up to nan.
This may be caused by a mistake in my preprocessing. I do not understand the relationship between normalization and the increasing loss, which happens only with the SWA method... I am sorry about this mistake.
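If it helps, by normalization I mean something along the lines of standardizing the inputs during preprocessing, roughly like this (only a rough sketch with a placeholder tensor, not my exact preprocessing code):

# Rough sketch of input standardization (illustrative; `raw_inputs` is a placeholder).
raw_inputs = torch.randn(100, 25, 32, 32)
mean = raw_inputs.mean()
std = raw_inputs.std()
normalized_inputs = (raw_inputs - mean) / (std + 1e-8)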