Since it was introduced by Vaswani et al. in the 2017 paper "Attention is All You Need", the Transformer has become a cornerstone of natural language processing (NLP). Its core idea is to use self-attention to capture global dependencies across the input sequence, avoiding the step-by-step sequential dependence of traditional RNN and LSTM models. This article shows how to implement a simple Transformer model in PyTorch.
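Concretely, the scaled dot-product attention at the heart of the model computes Attention(Q, K, V) = softmax(QKᵀ / √d_k) · V, where Q, K and V are the query, key and value matrices and d_k is the dimension of the keys; the scaled_dot_product_attention method in the code below is a direct implementation of this formula.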
A Transformer consists of an encoder and a decoder, each built by stacking several identical layers. Each encoder layer contains two main sub-layers: a multi-head self-attention mechanism and a position-wise feed-forward network; each decoder layer adds a third sub-layer that attends over the encoder output.
In addition, each sub-layer is followed by a residual connection and layer normalization.
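In the implementation below this pattern appears as out = LayerNorm(x + Dropout(Sublayer(x))), where Sublayer is either the attention block or the feed-forward network.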
Below we implement a simple Transformer in PyTorch. For simplicity, the goal is a model with a single encoder layer and a single decoder layer; the Transformer class at the end takes a num_layers argument, so this minimal version corresponds to num_layers=1.
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads  # dimension of each attention head

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        self.dense = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        # (batch, seq_len, d_model) -> (batch, num_heads, seq_len, depth)
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    def forward(self, q, k, v, mask):
        batch_size = q.size(0)

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = self.scaled_dot_product_attention(q, k, v, mask)

        # (batch, num_heads, seq_len, depth) -> (batch, seq_len, d_model)
        scaled_attention = scaled_attention.permute(0, 2, 1, 3)
        concat_attention = scaled_attention.reshape(batch_size, -1, self.d_model)

        output = self.dense(concat_attention)
        return output, attention_weights

    def scaled_dot_product_attention(self, q, k, v, mask):
        matmul_qk = torch.matmul(q, k.transpose(-2, -1))
        dk = torch.tensor(k.size(-1), dtype=torch.float32)
        scaled_attention_logits = matmul_qk / torch.sqrt(dk)
        if mask is not None:
            # positions where mask == 1 are pushed towards -inf before the softmax
            scaled_attention_logits += (mask * -1e9)
        attention_weights = F.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, v)
        return output, attention_weights
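As a quick, illustrative sanity check (the sizes here are arbitrary, not values from the article), a random batch passed through MultiHeadAttention keeps its (batch, seq_len, d_model) shape, and the attention weights carry one seq_len × seq_len matrix per head:

mha = MultiHeadAttention(d_model=128, num_heads=8)
x = torch.rand(2, 10, 128)              # (batch=2, seq_len=10, d_model=128)
out, weights = mha(x, x, x, mask=None)  # self-attention: q = k = v = x
print(out.shape)                        # torch.Size([2, 10, 128])
print(weights.shape)                    # torch.Size([2, 8, 10, 10])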
class FeedForward(nn.Module):
    def __init__(self, d_model, dff):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, dff)
        self.linear2 = nn.Linear(dff, d_model)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = self.linear2(x)
        return x
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, dff)

        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)

    def forward(self, x, mask):
        # Sub-layer 1: multi-head self-attention with residual connection and layer norm
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)

        # Sub-layer 2: position-wise feed-forward network with residual connection and layer norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)  # masked self-attention
        self.mha2 = MultiHeadAttention(d_model, num_heads)  # encoder-decoder attention
        self.ffn = FeedForward(d_model, dff)

        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.layernorm3 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)
        self.dropout3 = nn.Dropout(rate)

    def forward(self, x, enc_output, look_ahead_mask, padding_mask):
        # Sub-layer 1: masked self-attention over the target sequence
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1)
        out1 = self.layernorm1(x + attn1)

        # Sub-layer 2: attention over the encoder output
        attn2, attn_weights_block2 = self.mha2(out1, enc_output, enc_output, padding_mask)
        attn2 = self.dropout2(attn2)
        out2 = self.layernorm2(out1 + attn2)

        # Sub-layer 3: position-wise feed-forward network
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output)
        out3 = self.layernorm3(out2 + ffn_output)

        return out3, attn_weights_block1, attn_weights_block2
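The decoder layer above expects a look-ahead mask and a padding mask, but the article never shows how to build them. Given the convention in scaled_dot_product_attention (positions where the mask is 1 are suppressed), a minimal sketch of the two helpers, assuming the padding token id is 0, could look like this (create_padding_mask and create_look_ahead_mask are names introduced here for illustration):

def create_padding_mask(seq, pad_id=0):
    # 1 where the token is padding; shape (batch, 1, 1, seq_len) broadcasts over heads and queries
    return (seq == pad_id).float()[:, None, None, :]

def create_look_ahead_mask(size):
    # 1 strictly above the diagonal: position i must not attend to positions j > i
    return torch.triu(torch.ones(size, size), diagonal=1)

In practice, the mask used for the decoder's self-attention is usually the element-wise maximum of the look-ahead mask and the target-side padding mask.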
class Transformer(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, rate=0.1):
        super(Transformer, self).__init__()
        self.num_layers = num_layers  # needed by forward(); missing in the original listing

        self.encoder = nn.ModuleList([EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)])

        # separate embeddings for the source and target vocabularies
        self.embedding = nn.Embedding(input_vocab_size, d_model)
        self.target_embedding = nn.Embedding(target_vocab_size, d_model)

        # registered as a buffer so it follows the model when moving across devices
        self.register_buffer('pos_encoding', self.positional_encoding(d_model))

        self.final_layer = nn.Linear(d_model, target_vocab_size)

    def positional_encoding(self, d_model, max_len=10000):
        # sinusoidal positional encoding, shape (1, max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)

    def forward(self, inp, tar, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        # encoder: token embedding + positional encoding, then the stacked encoder layers
        seq_len = inp.size(1)
        inp = self.embedding(inp) + self.pos_encoding[:, :seq_len, :]
        for i in range(self.num_layers):
            inp = self.encoder[i](inp, enc_padding_mask)

        # decoder: attends to its own (masked) history and to the encoder output
        seq_len = tar.size(1)
        tar = self.target_embedding(tar) + self.pos_encoding[:, :seq_len, :]
        for i in range(self.num_layers):
            tar, _, _ = self.decoder[i](tar, inp, look_ahead_mask, dec_padding_mask)

        # project to target-vocabulary logits
        final_output = self.final_layer(tar)
        return final_output
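To check that the pieces fit together, here is a small end-to-end forward pass; the hyperparameters and sequence lengths are arbitrary demonstration values, and the mask helpers are the ones sketched above:

# toy hyperparameters, chosen only for the demonstration
model = Transformer(num_layers=2, d_model=128, num_heads=8, dff=512,
                    input_vocab_size=1000, target_vocab_size=1200)

inp = torch.randint(1, 1000, (2, 12))   # source token ids: batch of 2, length 12
tar = torch.randint(1, 1200, (2, 9))    # target token ids: batch of 2, length 9

enc_padding_mask = create_padding_mask(inp)
dec_padding_mask = create_padding_mask(inp)   # used by the encoder-decoder attention
look_ahead_mask = torch.maximum(create_look_ahead_mask(tar.size(1)),
                                create_padding_mask(tar))

logits = model(inp, tar, enc_padding_mask, look_ahead_mask, dec_padding_mask)
print(logits.shape)  # torch.Size([2, 9, 1200]) -> (batch, target_len, target_vocab_size)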
This article walked through a simple Transformer implementation in PyTorch: we first implemented multi-head self-attention and the feed-forward network, then built the encoder and decoder layers, and finally combined these components into a complete Transformer model. Although the implementation is deliberately simple, it covers the core ideas of the Transformer and provides a basis for further optimization and extension.
In practice, Transformer models usually need more layers and additional training techniques such as learning-rate scheduling and gradient clipping. They are also applied to a wide range of tasks, including machine translation, text generation and image processing. We hope this article gives readers a solid starting point for understanding and applying Transformer models.
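As a concrete illustration of those training techniques, the sketch below shows a single training step with teacher forcing, a simple linear warmup schedule and gradient clipping; the optimizer settings, the warmup length and the padding id are placeholder assumptions, not recommended values:

# Illustrative training step (assumes `model` and the mask helpers defined above).
criterion = nn.CrossEntropyLoss(ignore_index=0)  # assumes 0 is the padding id
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lr_lambda=lambda step: min(1.0, (step + 1) / 4000))  # linear warmup over 4000 steps

def train_step(inp, tar):
    # teacher forcing: predict tar[:, 1:] from tar[:, :-1]
    tar_inp, tar_real = tar[:, :-1], tar[:, 1:]
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)
    look_ahead_mask = torch.maximum(create_look_ahead_mask(tar_inp.size(1)),
                                    create_padding_mask(tar_inp))

    logits = model(inp, tar_inp, enc_padding_mask, look_ahead_mask, dec_padding_mask)
    loss = criterion(logits.reshape(-1, logits.size(-1)), tar_real.reshape(-1))

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # gradient clipping
    optimizer.step()
    scheduler.step()
    return loss.item()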