I am trying to reproduce the original Transformer for machine translation in PyTorch:
class Transformer(nn.Module):
    def __init__(self, vocab_size_in, vocab_size_out, embedding_dim, n_heads, key_dim, value_dim, ffn_dim, n=10000,
                 eps=1e-5, padding_token_index=0, p_drop=0.1, n_encoder_layers=1, n_decoder_layers=1):
        super(Transformer, self).__init__()
        # parameters
        self.key_dim = key_dim
        self.n_heads = n_heads
        self.embedding_dim = embedding_dim
        self.eps = eps
        self.ffn_dim = ffn_dim
        self.padding_token_index = padding_token_index
        self.vocab_size_in = vocab_size_in
        # Embedding layers encoder
        self.embedding_layer_enc = EmbeddingLayer(vocab_size_in, embedding_dim, n)
        self.dropout_enc = nn.Dropout(p_drop)
        # Encoder layers
        self.encoder_layers = [EncoderLayer(embedding_dim, key_dim, value_dim, ffn_dim, n_heads, p_drop, eps)] * n_encoder_layers
        # Embedding layers decoder
        self.embedding_layer_dec = EmbeddingLayer(vocab_size_out, embedding_dim)
        self.dropout_dec = nn.Dropout(p_drop)
        # Decoder layers
        self.decoder_layers = [DecoderLayer(embedding_dim, key_dim, value_dim, ffn_dim, n_heads, p_drop, eps)] * n_decoder_layers
        # Linear output layer
        self.output_linear = nn.Linear(embedding_dim, vocab_size_out)

    def forward(self, input, target):
        ...
As you can see, two dropout layers are defined here. In addition, there are more dropout layers inside EncoderLayer and DecoderLayer:
class EncoderLayer(nn.Module):
    def __init__(self, embedding_dim=512, key_dim=512, value_dim=512, ffn_dim=512, n_heads=8, p_drop=0.1, eps=1e-5):
        super().__init__()
        self.multi_head = MultiHeadAttentionLayer(n_heads, embedding_dim, key_dim, value_dim)
        self.dropout_multi_head = nn.Dropout(p_drop)
        self.norm_multi_head = LayerNormalization(embedding_dim, eps)
        self.FFN_in = nn.Linear(embedding_dim, ffn_dim)
        self.FFN_out = nn.Linear(ffn_dim, embedding_dim)
        self.dropout_FFN = nn.Dropout(p_drop)
        self.norm_FFN = LayerNormalization(embedding_dim, eps)

    def forward(self, source, mask=None):
        multi_head_out = self.multi_head(source, mask)  # shape = (n_sentences, len_sentence, embedding_dim)
        multi_head_out = self.dropout_multi_head(multi_head_out)
        multi_head_norm = self.norm_multi_head(source + multi_head_out)
        ffn_in = self.FFN_in(multi_head_norm)
        ffn_in = F.relu(ffn_in)
        ffn_out = self.FFN_out(ffn_in)
        ffn_out = self.dropout_FFN(ffn_out)
        enc_out = self.norm_FFN(multi_head_norm + ffn_out)
        return enc_out
I am testing my code in evaluation mode. The forward step takes an indexed source sequence and target sequence and outputs a tensor of probabilities for each word of the output sequence. So far the test function is trivial: I just want to make sure the output stays the same when I feed in the same input:
def translate_sentence(model, source, target, max_num_words=200):
    for i in range(max_num_words):
        model.eval()
        with torch.no_grad():
            output = model(source, target)
            print(output)
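For a stricter check than eyeballing the prints, consecutive outputs can be compared numerically (a minimal sketch, assuming model, source, and target are set up as above):

model.eval()
with torch.no_grad():
    first = model(source, target)
    for _ in range(10):
        # in eval mode, repeated forward passes on identical input must match
        assert torch.allclose(model(source, target), first)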
However, that is not what happens. As suggested, I added
print(model.dropout_enc.training)
print(model.encoder_layers[0].dropout_multi_head.training)
to check whether the dropout layers are active, and the output is
False
True
So model.eval() disables the dropout layers defined in Transformer's __init__, but not the ones defined inside its sub-layers. Does anyone know how to fix this?
Solved: I had to use nn.ModuleList.
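Concretely, wrapping the layers in nn.ModuleList registers them as sub-modules, so eval()/train() (and parameter registration) propagate to them. Note also that [Layer(...)] * n puts n references to the same layer object in the list, so a list comprehension should be used to build independent layers. A sketch of the two relevant lines in Transformer.__init__, keeping the argument names from above:

# nn.ModuleList registers each layer as a sub-module; the comprehension
# builds n independent layers instead of n aliases of one layer
self.encoder_layers = nn.ModuleList(
    [EncoderLayer(embedding_dim, key_dim, value_dim, ffn_dim, n_heads, p_drop, eps)
     for _ in range(n_encoder_layers)])
self.decoder_layers = nn.ModuleList(
    [DecoderLayer(embedding_dim, key_dim, value_dim, ffn_dim, n_heads, p_drop, eps)
     for _ in range(n_decoder_layers)])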
The __repr__ method of nn.Dropout does not include the training mode in its output, which is why the layer prints the same string no matter which mode it is in. Use the training attribute to check the mode of a sub-layer or parent layer:
>>> model = nn.Dropout(0.5)
>>> model.training
True
>>> model.eval()
Dropout(p=0.5, inplace=False)
>>> model.training
False
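The original problem can also be reproduced in isolation: only sub-modules held by an nn.ModuleList (or assigned directly as attributes) are registered, so only those are switched by eval(). A self-contained sketch with made-up class names:

import torch.nn as nn

class PlainList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = [nn.Dropout(0.5)]                 # plain list: not registered

class Registered(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Dropout(0.5)])  # registered as sub-module

print(PlainList().eval().layers[0].training)    # True  -- eval() never reached it
print(Registered().eval().layers[0].training)   # False -- eval() propagated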