Adapting PyTorch's "NLP From Scratch" tutorial for a bidirectional GRU


I took the code from the tutorial below and tried to modify it to support bidirectionality and an arbitrary number of GRU layers.

Link to the tutorial, which uses a unidirectional, single-layer GRU: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

The model runs fine, but when I set bidirectional=True I get a size-mismatch error (shown below). Any idea why?

Encoder:

import torch.nn.init as init
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1, bidirectional=False):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.hidden_var = hidden_size//2 if bidirectional else hidden_size
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        self.n_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size,
                          self.hidden_var, 
                          num_layers=self.n_layers,
                          bidirectional=self.bidirectional)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        #output = (output[:, :, :self.hidden_size] +
        #        output[:, :, self.hidden_size:])
        return output, hidden

    def initHidden(self):
        return torch.zeros(self.n_layers*self.n_directions, 1, self.hidden_var, device=device)
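
For reference, a quick shape check of what this encoder produces once bidirectional=True; input_size=10 and the token index are arbitrary illustration values, while hidden_size=256 and device follow the tutorial:

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # as in the tutorial

enc = EncoderRNN(input_size=10, hidden_size=256, n_layers=1, bidirectional=True).to(device)
hidden = enc.initHidden()                      # (n_layers * 2, 1, hidden_size // 2) = (2, 1, 128)
token = torch.tensor([[3]], device=device)     # arbitrary token index, just for illustration
output, hidden = enc(token, hidden)
print(output.shape)   # torch.Size([1, 1, 256]) -- both directions concatenated on the last dim
print(hidden.shape)   # torch.Size([2, 1, 128]) -- one 128-dim state per direction

So the per-step output still has hidden_size features, but the hidden state no longer has the (1, 1, 256) shape the tutorial's decoder expects.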

Alternative encoder that also includes an LSTM option:

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1, bidirectional=False, method='GRU'):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.hidden_var = hidden_size // 2 if bidirectional else hidden_size
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        self.n_directions = 2 if bidirectional else 1
        self.method = method

        self.embedding = nn.Embedding(input_size, hidden_size)
        if self.method == 'GRU':
            self.net = nn.GRU(hidden_size,
                              self.hidden_var,
                              num_layers=self.n_layers,
                              bidirectional=self.bidirectional)
        else:
            self.net = nn.LSTM(hidden_size,
                               self.hidden_var,
                               num_layers=self.n_layers,
                               bidirectional=self.bidirectional)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.net(output, hidden)
        # output = (output[:, :, :self.hidden_size] +
        #        output[:, :, self.hidden_size:])
        return output, hidden, embedded

    def initHidden(self):
        if self.method == 'GRU':
            return torch.zeros(self.n_layers * self.n_directions, 1, self.hidden_var, device=device)
        else:
            h_state = torch.zeros(self.n_layers * self.n_directions, 1, self.hidden_var, device=device)
            c_state = torch.zeros(self.n_layers * self.n_directions, 1, self.hidden_var, device=device)
            hidden = (h_state, c_state)
            return hidden
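
The extra branches above exist because nn.LSTM passes its state around as an (h, c) tuple while nn.GRU uses a single tensor; a minimal illustration (same arbitrary input_size=10 as before, shapes assume hidden_size=256 and bidirectional=True):

enc_lstm = EncoderRNN(input_size=10, hidden_size=256, bidirectional=True, method='LSTM').to(device)
h0 = enc_lstm.initHidden()
print(type(h0), h0[0].shape)          # <class 'tuple'> torch.Size([2, 1, 128]) -- (h_state, c_state)

enc_gru = EncoderRNN(input_size=10, hidden_size=256, bidirectional=True, method='GRU').to(device)
print(enc_gru.initHidden().shape)     # torch.Size([2, 1, 128]) -- a single tensor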

AttnDecoder:

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        self.n_layers = n_layers

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)

        self.gru = nn.GRU(self.hidden_size,
                          self.hidden_size,
                          num_layers = self.n_layers)

        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)

        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1*self.n_layers, 1, self.hidden_size, device=device)

Everything else is exactly as in the tutorial except for this block (updated to pass the new arguments):

n_layers=1
bidirectional = True
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size, n_layers=n_layers, bidirectional=bidirectional).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1, n_layers=n_layers).to(device)
trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

Error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-133-37084c93a197> in <module>
      5 attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1, n_layers=n_layers).to(device)
      6 
----> 7 trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

<ipython-input-131-774ce8edefa6> in trainIters(encoder, decoder, n_iters, print_every, plot_every, learning_rate)
     16 
     17         loss = train(input_tensor, target_tensor, encoder,
---> 18                      decoder, encoder_optimizer, decoder_optimizer, criterion)
     19         print_loss_total += loss
     20         plot_loss_total += loss

<ipython-input-130-67be7e8c2a58> in train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length)
     39         for di in range(target_length):
     40             decoder_output, decoder_hidden, decoder_attention = decoder(
---> 41                 decoder_input, decoder_hidden, encoder_outputs)
     42             topv, topi = decoder_output.topk(1)
     43             decoder_input = topi.squeeze().detach()  # detach from history as input

~/miniconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    545             result = self._slow_forward(*input, **kwargs)
    546         else:
--> 547             result = self.forward(*input, **kwargs)
    548         for hook in self._forward_hooks.values():
    549             hook_result = hook(self, input, result)

<ipython-input-129-6dd1d30fe28f> in forward(self, input, hidden, encoder_outputs)
     24 
     25         attn_weights = F.softmax(
---> 26             self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
     27         attn_applied = torch.bmm(attn_weights.unsqueeze(0),
     28                                  encoder_outputs.unsqueeze(0))

~/miniconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    545             result = self._slow_forward(*input, **kwargs)
    546         else:
--> 547             result = self.forward(*input, **kwargs)
    548         for hook in self._forward_hooks.values():
    549             hook_result = hook(self, input, result)

~/miniconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/linear.py in forward(self, input)
     85 
     86     def forward(self, input):
---> 87         return F.linear(input, self.weight, self.bias)
     88 
     89     def extra_repr(self):

~/miniconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/functional.py in linear(input, weight, bias)
   1367     if input.dim() == 2 and bias is not None:
   1368         # fused op is marginally faster
-> 1369         ret = torch.addmm(bias, input, weight.t())
   1370     else:
   1371         output = input.matmul(weight.t())

RuntimeError: size mismatch, m1: [1 x 384], m2: [512 x 10] at /tmp/pip-req-build-58y_cjjl/aten/src/TH/generic/THTensorMath.cpp:752
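
For what it's worth, the 384 in the message is the 256-dim embedding concatenated with hidden[0], which after the bidirectional change is only 128-dim (one direction's state), while self.attn was built for a 512-dim input. A minimal sketch of that arithmetic, assuming hidden_size=256 and max_length=10 as in the tutorial:

import torch
import torch.nn as nn

embedded = torch.zeros(1, 1, 256)          # decoder embedding for one step
encoder_hidden = torch.zeros(2, 1, 128)    # bidirectional encoder hidden: 2 directions x 128 units

attn = nn.Linear(256 * 2, 10)              # self.attn expects hidden_size * 2 = 512 input features
cat = torch.cat((embedded[0], encoder_hidden[0]), 1)
print(cat.shape)                           # torch.Size([1, 384])  (256 + 128, not 512)
attn(cat)                                  # raises the size-mismatch RuntimeError shown above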

Any help would be greatly appreciated!

Update based on user3923920's comment (the decoder now also includes an LSTM option):

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1,
                 max_length=MAX_LENGTH, method='GRU', bidirectional=False):

        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        self.n_layers = n_layers
        self.method = method
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)

        if self.method == 'GRU':
            self.net = nn.GRU(self.hidden_size,
                              self.hidden_size,
                              num_layers=self.n_layers)
        else:
            self.net = nn.LSTM(self.hidden_size,
                               self.hidden_size,
                               num_layers=self.n_layers)

        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):

        # Embed
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        self.hidden = hidden

        # Concatenate all of the layers
        hidden_h_rows = ()
        hidden_c_rows = ()

        if self.method == 'LSTM':
            # hidden is a tuple of h_state and c_state
            decoder_h, decoder_c = hidden
            print(decoder_h.shape)
            hidden_shape = decoder_h.shape[0]

            # h_state
            for x in range(0, hidden_shape):
                hidden_h_rows += (decoder_h[x],)

            # c_state
            for x in range(0, hidden_shape):
                hidden_c_rows += (decoder_c[x],)

        elif self.method == "GRU":

            # hidden is not a tuple (GRU)
            decoder_h = hidden
            hidden_shape = decoder_h.shape[0]

            # h_state
            for x in range(0, hidden_shape):
                hidden_h_rows += (decoder_h[x],)

        if self.bidirectional:
            decoder_h_cat = torch.cat(hidden_h_rows, 1)
            # Make sure the h_dim size is compatible with num_layers with concatenation.
            decoder_h = decoder_h_cat.view((self.n_layers, 1, self.hidden_size))  # hidden_size=256

            if self.method == "LSTM":
                decoder_c_cat = torch.cat(hidden_c_rows, 1)
                decoder_c = decoder_c_cat.view((self.n_layers, 1, self.hidden_size))  # hidden_size=256
                hidden_lstm = (decoder_h, decoder_c)

            elif self.method == "GRU":
                hidden_gru = decoder_h

        # Attention Block
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden_lstm[0][0] if self.method == "LSTM" else \
                hidden_gru[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.net(output,
                                  hidden_lstm if self.method == "LSTM" else hidden_gru)  # I am not sure about this!
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):

        if self.method == 'GRU':
            return torch.zeros(self.n_layers * 1, 1, self.hidden_size, device=device)
        else:
            h_state = torch.zeros(self.n_layers * 1, 1, self.hidden_size, device=device)
            c_state = torch.zeros(self.n_layers * 1, 1, self.hidden_size, device=device)
            hidden = (h_state, c_state)
            return hidden
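
For completeness, a rough sketch of a single decoding step with this updated class and the bidirectional encoder above (output_size=12, the token indices, and the SOS index 0 are arbitrary illustration values; hidden_size=256, MAX_LENGTH, and device follow the tutorial):

encoder = EncoderRNN(input_size=10, hidden_size=256, bidirectional=True, method='GRU').to(device)
decoder = AttnDecoderRNN(hidden_size=256, output_size=12, bidirectional=True, method='GRU').to(device)

enc_hidden = encoder.initHidden()                                   # (2, 1, 128)
enc_out, enc_hidden, _ = encoder(torch.tensor([[3]], device=device), enc_hidden)

encoder_outputs = torch.zeros(MAX_LENGTH, 256, device=device)
encoder_outputs[0] = enc_out[0, 0]                                  # 256 features: both directions

dec_input = torch.tensor([[0]], device=device)                      # SOS token, as in the tutorial
dec_out, dec_hidden, attn = decoder(dec_input, enc_hidden, encoder_outputs)
print(dec_out.shape, dec_hidden.shape, attn.shape)                  # shapes: (1, 12), (1, 1, 256), (1, 10)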


1 Answer:

So I'm not sure this is 100% correct, since I'm only just learning how to program RNNs, but I changed the code in a few extra places.
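
The essential change, distilled from the updated decoder above, is to merge each layer's forward and backward hidden states into a single (n_layers, 1, hidden_size) tensor before it is used for attention and passed to the decoder's unidirectional GRU. A minimal sketch of just that reshape (shapes assume hidden_size=256, n_layers=1):

import torch

n_layers, hidden_size = 1, 256
encoder_hidden = torch.zeros(n_layers * 2, 1, hidden_size // 2)     # (2, 1, 128) from the encoder

# Concatenate the per-direction 128-dim states, then reshape to one 256-dim state per layer.
rows = tuple(encoder_hidden[i] for i in range(encoder_hidden.shape[0]))
decoder_hidden = torch.cat(rows, dim=1).view(n_layers, 1, hidden_size)
print(decoder_hidden.shape)                                         # torch.Size([1, 1, 256])

This mirrors the hidden_h_rows / decoder_h_cat logic in the forward above (and the same treatment of the cell state in the LSTM branch).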
