RuntimeError: numel: integer multiplication overflow


I am trying to build a generative recurrent GAN architecture for multivariate time-series data. This is the discriminator of my model:

from torchgan.models import Generator, Discriminator
import torch
import torch.nn as nn

class RGANDiscriminator(Discriminator):
    def __init__(self,
                 sequence_length,
                 input_size,
                 hidden_size=None,
                 num_layers=1,
                 dropout=0,
                 last_layer=None,
                 device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                 **kwargs):
        
        hidden_size = hidden_size or input_size
        self.device = device
        self.input_size = input_size
        self.sequence_length = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.label_type = "none"
        # Set kwargs (might override the attributes above)
        for key, value in kwargs.items():
            setattr(self, key, value)

        super(RGANDiscriminator, self).__init__(self.input_size,
                                                self.label_type)
        # Build RNN layer    
        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, 1)
        self.last_layer = last_layer    
        # Initialize all weights.
        self.rnn.apply(init_weights)
        nn.init.xavier_normal_(self.linear.weight)

    def forward(self, x):
        h0 = torch.randn((self.num_layers, x.size(0), self.hidden_size)).to(self.device)
        c0 = torch.randn((self.num_layers, x.size(0), self.hidden_size)).to(self.device)
        print(f"input {x.shape}")
        print(f"x: {x}")
        length = torch.LongTensor([torch.max((x[i,:,0]!=0).nonzero()).item()+1 for i in range(x.shape[0])])
        packed = nn.utils.rnn.pack_padded_sequence(
            x, length, batch_first=True, enforce_sorted=False
        )
        out_packed, (_, _) = self.rnn(packed, (h0, c0))
        y, _ = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)
        y = self.dropout(y)
        y = self.linear(y)
        return y if self.last_layer is None else self.last_layer(y)
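
Note that init_weights is referenced but not defined in the snippet. A minimal smoke test of this discriminator, assuming a hypothetical stand-in initializer and the shapes from the log further down (batch 40, sequence length 382, 26 features):

def init_weights(m):
    # hypothetical stand-in for the initializer referenced above
    for name, p in m.named_parameters():
        if "weight" in name and p.dim() > 1:
            nn.init.xavier_normal_(p)

disc = RGANDiscriminator(sequence_length=382, input_size=26,
                         hidden_size=64, device=torch.device("cpu"))
x = torch.randn(40, 382, 26)
x[:, 350:, 0] = 0        # simulate zero padding at the tail of feature 0
y = disc(x)              # prints the debug lines; y has shape (40, 350, 1)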

And this is the training routine of my model:

def train(self, epochs, writer_frequency=1, saver_frequency=20):
    avg_mmd = []
    for epoch in range(epochs):
        mmd = []
        for batch_idx, (data_attribute, data_feature) in enumerate(self.real_train_dl):
            data_attribute = data_attribute.to(self.device)
            input_feature = data_feature.to(self.device)
            batch_size = data_attribute.shape[0]
            ### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
            noise = gen_noise((batch_size, self.sequence_length[0], self.noise_dim)).to(self.device)
            print(f"Noise:{noise.shape}")
            print(f"data attribute {data_attribute.shape}")
            noise = torch.cat((data_attribute, noise), dim=2)
            print(f"noise again : {noise.shape}")
            input_feature = torch.cat((data_attribute, input_feature), dim=2)
            print(f"input_feature : {input_feature.shape}")
            fake = self.generator(noise)
            print(f"fake :{fake.shape}")
            x = fake.clone()
            x = x.permute(0,2,1)
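            # right-pad the time dimension so the fake sequence matches input_feature's length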
            padded = nn.ConstantPad1d((0, input_feature.shape[1] - fake.shape[1]), 0)(x)
            x = padded.permute(0,2,1)
            print(f"new fake :{x.shape}")
            mmd.append(calculate_mmd_rbf(torch.mean(fake, dim=0).detach().cpu().numpy(),
                                         torch.mean(data_feature, dim=0).detach().cpu().numpy()))
            fake = torch.cat((data_attribute, x), dim=2)
            disc_real = self.discriminator(input_feature).view(-1)
            lossD_real = self.criterion(disc_real, torch.ones_like(disc_real))
            disc_fake = self.discriminator(fake).view(-1)
            lossD_fake = self.criterion(disc_fake, torch.zeros_like(disc_fake))
            lossD = (lossD_real + lossD_fake) / 2
            self.discriminator.zero_grad()
            lossD.backward(retain_graph=True)
            self.optimizer_dis.step()

            ### Train Generator: min log(1 - D(G(z))) <-> max log(D(G(z)))
            output = self.discriminator(fake).view(-1)
            lossG = self.criterion(output, torch.ones_like(output))
            self.generator.zero_grad()
            lossG.backward()
            self.optimizer_gen.step()

Here is the error message:

INFO:config_logger:Batch Size: 40
INFO:config_logger:Noise Dimension: 5
INFO:config_logger:d_rounds: 1
INFO:config_logger:g_rounds: 1
INFO:config_logger:Device: cuda:0
INFO:config_logger:Input Dimension: 14
INFO:config_logger:Output Dimension: 12
INFO:config_logger:Sequence Length: (382,)
Noise:torch.Size([40, 382, 5])
data attribute torch.Size([40, 382, 14])
noise again : torch.Size([40, 382, 19])
input_feature : torch.Size([40, 382, 26])
fake :torch.Size([40, 340, 12])
new fake :torch.Size([40, 382, 12])
input torch.Size([40, 382, 26])
input torch.Size([40, 382, 26])
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-2-5cea213d3975> in <module>
    676                          time_logging_file=time_logging_file, batch_size=batch_size,
    677                          config_logging_file=config_logging_file)
--> 678     trainer.train(epochs=epoch, writer_frequency=1, saver_frequency=save_frequency)

8 frames
<ipython-input-2-5cea213d3975> in train(self, epochs, writer_frequency, saver_frequency)
    592                 disc_real = self.discriminator(input_feature).view(-1)
    593                 lossD_real = self.criterion(disc_real, torch.ones_like(disc_real))
--> 594                 disc_fake = self.discriminator(fake).view(-1)
    595                 lossD_fake = self.criterion(disc_fake, torch.zeros_like(disc_fake))
    596                 lossD = (lossD_real + lossD_fake) / 2

/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1192         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194             return forward_call(*input, **kwargs)
   1195         # Do not call functions when jit is used
   1196         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-2-5cea213d3975> in forward(self, x)
    370         c0 = torch.randn((self.num_layers, x.size(0), self.hidden_size)).to(self.device)
    371         print(f"input {x.shape}")
--> 372         print(f"x: {x}")
    373         length =  torch.LongTensor([torch.max((x[i,:,0]!=0).nonzero()).item()+1 for i in range(x.shape[0])])
    374         packed = nn.utils.rnn.pack_padded_sequence(

/usr/local/lib/python3.8/dist-packages/torch/_tensor.py in __format__(self, format_spec)
    857         if self.dim() == 0 and not self.is_meta and type(self) is Tensor:
    858             return self.item().__format__(format_spec)
--> 859         return object.__format__(self, format_spec)
    860 
    861     @_handle_torch_function_and_wrap_type_error_to_not_implemented

/usr/local/lib/python3.8/dist-packages/torch/_tensor.py in __repr__(self, tensor_contents)
    425             )
    426         # All strings are unicode in Python 3.
--> 427         return torch._tensor_str._str(self, tensor_contents=tensor_contents)
    428 
    429     def backward(

/usr/local/lib/python3.8/dist-packages/torch/_tensor_str.py in _str(self, tensor_contents)
    635     with torch.no_grad():
    636         guard = torch._C._DisableFuncTorch()
--> 637         return _str_intern(self, tensor_contents=tensor_contents)

/usr/local/lib/python3.8/dist-packages/torch/_tensor_str.py in _str_intern(inp, tensor_contents)
    566                         tensor_str = _tensor_str(self.to_dense(), indent)
    567                     else:
--> 568                         tensor_str = _tensor_str(self, indent)
    569 
    570     if self.layout != torch.strided:

/usr/local/lib/python3.8/dist-packages/torch/_tensor_str.py in _tensor_str(self, indent)
    326         )
    327     else:
--> 328         formatter = _Formatter(get_summarized_data(self) if summarize else self)
    329         return _tensor_str_with_formatter(self, indent, summarize, formatter)
    330 

/usr/local/lib/python3.8/dist-packages/torch/_tensor_str.py in __init__(self, tensor)
    113 
    114         else:
--> 115             nonzero_finite_vals = torch.masked_select(
    116                 tensor_view, torch.isfinite(tensor_view) & tensor_view.ne(0)
    117             )

RuntimeError: numel: integer multiplication overflow

I would appreciate it if someone could help me understand why I am getting this error.

python lstm recurrent-neural-network tensor generative-adversarial-network
2 Answers

0 votes

An overflow means the values in your operations blow up until the result takes more memory than the program can handle, i.e. you are experiencing numerical instability. It is usually caused by exploding gradients, and there are mathematical tricks to keep the overflow/underflow from happening:

  • Identities: for example, this question shows how a mathematical identity can be applied to make the computed values numerically more stable.
  • Data normalization: make sure your data is normalized so the layers do not blow up when matrices are multiplied. Also try monitoring the weight values closely to identify the part of the architecture that causes these overflows.
  • Lower the learning rate: at higher learning rates, overflow can also occur through a positive feedback loop.

Here is an article that explains this in more depth, and a minimal sketch of these guards follows below.
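
The sketch uses assumed shapes and a dummy loss; gradient clipping via PyTorch's clip_grad_norm_ is a standard companion fix that the list above implies but does not spell out:

import torch
import torch.nn as nn

model = nn.LSTM(input_size=26, hidden_size=64, batch_first=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # a deliberately low learning rate

x = torch.randn(8, 382, 26)
x = (x - x.mean()) / (x.std() + 1e-8)   # normalize inputs so activations stay bounded

out, _ = model(x)
loss = out.pow(2).mean()                # dummy loss, just to drive the sketch
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # cap the gradient norm
optimizer.step()

# monitor weight magnitudes to locate the part of the architecture that overflows
for name, p in model.named_parameters():
    print(name, p.abs().max().item())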


0 votes

Your class labels may be larger than the number of classes present in the dataset. E.g., you have the CIFAR10 dataset with 10 classes, but somewhere in the dataset a sample carries a class label outside the valid range 0-9.
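
A quick sanity check for that failure mode might look like the following sketch; check_labels is a hypothetical helper:

import torch

def check_labels(labels: torch.Tensor, num_classes: int) -> None:
    """Fail fast if any label falls outside [0, num_classes)."""
    if labels.min() < 0 or labels.max() >= num_classes:
        raise ValueError(f"label out of range [0, {num_classes}): "
                         f"min={labels.min().item()}, max={labels.max().item()}")

check_labels(torch.tensor([3, 9]), num_classes=10)    # passes: valid CIFAR10 labels
# check_labels(torch.tensor([10]), num_classes=10)    # would raise: 10 is out of range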
