LLM Fundamentals: Implementing the Transformer in Code
1. The figure below shows the Transformer architecture. Its main components are Positional Encoding, Multi-Head Self-Attention, Masked Multi-Head Attention, and Encoder-Decoder (cross) Attention.

2. Positional Encoding
The Transformer uses a positional encoding based on sine (sin) and cosine (cos) functions, defined as follows:

PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

where pos is the token's position in the sequence and i indexes pairs of embedding dimensions.
import torch
import torch.nn as nn

class PositionEncoding(nn.Module):
    def __init__(self, max_len, d_model):
        super().__init__()
        # Positional encoding matrix, shape (max_len, d_model)
        pe = torch.zeros(size=(max_len, d_model))
        # Position of each token in the sequence, shape (max_len, 1)
        pos = torch.arange(0, max_len).unsqueeze(1)
        # The 2i term from the formula, shape (d_model/2,)
        _2i = torch.arange(0, d_model, 2)
        # Compute 10000**(2i/d_model), shape (d_model/2,)
        div_term = torch.pow(10000, (_2i / d_model))
        # Fill even and odd dimensions, shape (max_len, d_model)
        pe[:, 0::2] = torch.sin(pos / div_term)
        pe[:, 1::2] = torch.cos(pos / div_term)
        # Register as a buffer: moves with the module, but is not a trained parameter
        self.register_buffer("pe", pe)

    def forward(self, x):
        # x: (N, L, E=d_model)
        # Extract the current sequence length L. (pad_sequence in the DataLoader pads
        # each batch to its longest sentence, so L is at most max_len.)
        seq_len = x.shape[1]
        # Slice the first L position vectors, shape (L, E); broadcasts over the batch
        part_pe = self.pe[0:seq_len]
        return x + part_pe
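As a quick sanity check, here is a minimal usage sketch (the batch size, sequence length, and model width are made-up illustration values):

pe = PositionEncoding(max_len=512, d_model=64)
x = torch.zeros(2, 10, 64)    # dummy embeddings, (N=2, L=10, E=64)
print(pe(x).shape)            # torch.Size([2, 10, 64])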
3. Multi-Head Self-Attention
The encoder's Multi-Head Self-Attention is composed of several Self-Attention heads running in parallel.
3.1 Self-Attention Head


import torch
import torch.nn as nn

class EncoderAttentionHead(nn.Module):
    def __init__(self, d_model, head_size):
        super().__init__()
        self.head_size = head_size
        self.query = nn.Linear(d_model, head_size)
        self.key = nn.Linear(d_model, head_size)
        self.value = nn.Linear(d_model, head_size)

    def forward(self, x):
        # Project the input into Q, K, V
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)
        # Dot product of Q and K, shape (N, L, L)
        attention = Q @ K.transpose(-2, -1)
        # Scale by sqrt(head_size)
        attention = attention / (self.head_size ** 0.5)
        # Normalize the scores with softmax
        attention = torch.softmax(attention, dim=-1)
        # Weighted sum of the values
        attention = attention @ V
        return attention
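A single head maps the d_model-wide input down to head_size per position; a minimal sketch with made-up sizes:

head = EncoderAttentionHead(d_model=64, head_size=16)
x = torch.randn(2, 10, 64)    # (N, L, E)
print(head(x).shape)          # torch.Size([2, 10, 16])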
3.2 Multi-Head Attention

import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        # Width of each head; note that d_model must be divisible by n_heads
        self.head_size = d_model // n_heads
        self.heads = nn.ModuleList([
            EncoderAttentionHead(d_model, self.head_size) for _ in range(n_heads)
        ])
        # Output projection applied after concatenating the heads
        self.multi_header = nn.Linear(d_model, d_model)

    def forward(self, x):
        # Run all heads and concatenate along the feature dimension
        multi_header = torch.cat([head(x) for head in self.heads], dim=-1)
        out = self.multi_header(multi_header)
        return out
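Concatenating n_heads outputs of width head_size = d_model // n_heads restores the original width, so the block preserves the input shape; a minimal sketch with made-up sizes:

mha = MultiHeadAttention(d_model=64, n_heads=4)   # head_size = 16
x = torch.randn(2, 10, 64)
print(mha(x).shape)           # torch.Size([2, 10, 64])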
4. Encoder
An Encoder Layer is composed of Multi-Head Self-Attention and a feed-forward network, each wrapped in a residual connection; LayerNorm is applied before each sublayer (the pre-LN variant).

class TransformerEncoder(nn.Module):
    def __init__(self, d_model, n_heads, r_mlp=4):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        # Multi-head self-attention
        self.mha = MultiHeadAttention(d_model, n_heads)
        # Layer normalization before the attention sublayer
        self.ln1 = nn.LayerNorm(d_model)
        # Feed-forward network
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * r_mlp),
            nn.GELU(),
            nn.Linear(d_model * r_mlp, d_model)
        )
        # Layer normalization before the feed-forward sublayer
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Residual connection around multi-head self-attention (pre-LN)
        out = x + self.mha(self.ln1(x))
        # Residual connection around the feed-forward network (pre-LN)
        out = out + self.ffn(self.ln2(out))
        return out
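Because the layer maps (N, L, d_model) to a tensor of the same shape, layers stack directly; a minimal sketch with made-up sizes (the full model in section 7 stacks them the same way):

encoder = nn.Sequential(*[TransformerEncoder(d_model=64, n_heads=4) for _ in range(2)])
x = torch.randn(2, 10, 64)
print(encoder(x).shape)       # torch.Size([2, 10, 64])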
5. Masked Multi-Head Attention
Masked Multi-Head Attention is composed of Masked Attention Heads.
5.1 Masked Attention Head
The decoder introduces a mask into its self-attention. When computing attention, the mask prevents the model from attending to positions after the current one, so each position can rely only on itself and the preceding context, as the sketch below illustrates.
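For intuition, a tiny numeric sketch of the causal mask used below: entries above the diagonal are set to -inf, so after softmax each row attends only to itself and earlier positions.

import torch

scores = torch.zeros(4, 4)    # dummy attention scores for a length-4 sequence
mask = torch.triu(torch.ones_like(scores), diagonal=1).bool()
print(torch.softmax(scores.masked_fill(mask, float('-inf')), dim=-1))
# row 0: [1.00, 0, 0, 0]; row 1: [0.50, 0.50, 0, 0]; row 2: [0.33, 0.33, 0.33, 0]; ...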


class DecoderAttentionHead(nn.Module):
    def __init__(self, d_model, head_size):
        super().__init__()
        self.head_size = head_size
        self.query = nn.Linear(d_model, head_size)
        self.key = nn.Linear(d_model, head_size)
        self.value = nn.Linear(d_model, head_size)

    def forward(self, x, encoder_out=None, masked=False):
        # Q always comes from the decoder input
        Q = self.query(x)
        # For encoder-decoder (cross) attention, K and V come from the encoder output
        if encoder_out is not None:
            x = encoder_out
        K = self.key(x)
        V = self.value(x)
        # Dot product of Q and K
        attention = Q @ K.transpose(-2, -1)
        # Scale
        attention = attention / (self.head_size ** 0.5)
        if masked:
            # Block attention to future positions (the upper triangle)
            mask = torch.triu(torch.ones_like(attention), diagonal=1).bool()
            # masked_fill is not in-place, so the result must be reassigned
            attention = attention.masked_fill(mask, float('-inf'))
        # Normalize the scores with softmax
        attention = torch.softmax(attention, dim=-1)
        # Weighted sum of the values
        attention = attention @ V
        return attention
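In cross-attention mode the key/value length comes from the encoder, so source and target lengths may differ; a minimal sketch with made-up sizes:

head = DecoderAttentionHead(d_model=64, head_size=16)
y = torch.randn(2, 5, 64)                 # decoder input, L_tgt = 5
enc = torch.randn(2, 7, 64)               # encoder output, L_src = 7
print(head(y, masked=True).shape)         # masked self-attention: (2, 5, 16)
print(head(y, encoder_out=enc).shape)     # cross attention: (2, 5, 16)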
5.2 Masked Multi-Head Attention

class MaskedMultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        # Width of each head; again, d_model must be divisible by n_heads
        self.head_size = d_model // n_heads
        self.heads = nn.ModuleList([
            DecoderAttentionHead(d_model, self.head_size) for _ in range(n_heads)
        ])
        self.multi_header = nn.Linear(d_model, d_model)

    def forward(self, x, encoder_out=None, masked=False):
        # Run all heads and concatenate along the feature dimension
        multi_header = torch.cat([head(x, encoder_out, masked) for head in self.heads], dim=-1)
        out = self.multi_header(multi_header)
        return out
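The same module serves both decoder attention sublayers, selected by its arguments; a minimal sketch with made-up sizes:

mmha = MaskedMultiHeadAttention(d_model=64, n_heads=4)
y = torch.randn(2, 5, 64)
enc = torch.randn(2, 7, 64)
self_attn = mmha(y, masked=True)           # masked self-attention, (2, 5, 64)
cross_attn = mmha(y, encoder_out=enc)      # encoder-decoder attention, (2, 5, 64)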
6. Decoder
A Decoder Layer is composed of a Masked Multi-Head Self-Attention sublayer, a Multi-Head Encoder-Decoder (cross) Attention sublayer, and a feed-forward network. Only the self-attention sublayer uses the causal mask; cross attention may attend to every encoder position.
class TransformerDecoder(nn.Module):
    def __init__(self, d_model, n_heads, r_mlp=4):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        # Masked multi-head self-attention
        self.mha = MaskedMultiHeadAttention(d_model, n_heads)
        # Layer normalization
        self.ln1 = nn.LayerNorm(d_model)
        # Multi-head encoder-decoder (cross) attention
        self.edha = MaskedMultiHeadAttention(d_model, n_heads)
        # Layer normalization
        self.ln2 = nn.LayerNorm(d_model)
        # Feed-forward network
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * r_mlp),
            nn.GELU(),
            nn.Linear(d_model * r_mlp, d_model)
        )
        # Layer normalization
        self.ln3 = nn.LayerNorm(d_model)

    def forward(self, x, encoder_out):
        # Residual connection around masked self-attention (pre-LN)
        out = x + self.mha(self.ln1(x), masked=True)
        # Residual connection around cross attention; no causal mask here,
        # since the decoder may attend to every encoder position
        out = out + self.edha(self.ln2(out), encoder_out=encoder_out)
        # Residual connection around the feed-forward network
        out = out + self.ffn(self.ln3(out))
        return out
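A minimal shape check for one decoder layer (sizes are made-up illustration values):

decoder = TransformerDecoder(d_model=64, n_heads=4)
y = torch.randn(2, 5, 64)      # target-side embeddings
enc = torch.randn(2, 7, 64)    # encoder output
print(decoder(y, enc).shape)   # torch.Size([2, 5, 64])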
7. The complete Transformer model, with an encoder-decoder architecture, for Chinese-to-English translation.
class TranslationTransformer(nn.Module):
    def __init__(self, d_model, max_seq_length, n_heads, n_layers,
                 cn_vocab_size, cn_padding_idx, en_vocab_size, en_padding_idx):
        super().__init__()
        self.d_model = d_model                  # model width (embedding dimension)
        self.max_seq_length = max_seq_length    # maximum number of input tokens
        self.n_heads = n_heads                  # number of attention heads
        self.cn_vocab_size = cn_vocab_size      # Chinese vocabulary size
        self.cn_padding_idx = cn_padding_idx    # index of the Chinese padding token
        self.en_vocab_size = en_vocab_size      # English vocabulary size
        self.en_padding_idx = en_padding_idx    # index of the English padding token
        # Chinese token embedding
        self.cn_embd = nn.Embedding(cn_vocab_size, d_model, padding_idx=cn_padding_idx)
        # Positional encoding for the source side
        self.cn_positional_encoding = PositionEncoding(self.max_seq_length, self.d_model)
        # Encoder stack
        self.transformer_encoder = nn.Sequential(*[
            TransformerEncoder(self.d_model, self.n_heads)
            for _ in range(n_layers)
        ])
        # English token embedding
        self.en_embd = nn.Embedding(en_vocab_size, d_model, padding_idx=en_padding_idx)
        # Positional encoding for the target side
        self.en_positional_encoding = PositionEncoding(self.max_seq_length, self.d_model)
        # Decoder stack; nn.Sequential cannot forward two arguments (x, encoder_out),
        # so a ModuleList with an explicit loop is used instead
        self.transformer_decoder = nn.ModuleList([
            TransformerDecoder(self.d_model, self.n_heads)
            for _ in range(n_layers)
        ])
        # Projects decoder states to English vocabulary logits; no Softmax here,
        # since nn.CrossEntropyLoss expects raw logits
        self.predict_word = nn.Linear(self.d_model, self.en_vocab_size)

    def forward(self, cn_x, en_y):
        x = self.cn_embd(cn_x)
        x = self.cn_positional_encoding(x)
        x = self.transformer_encoder(x)
        y = self.en_embd(en_y)
        y = self.en_positional_encoding(y)
        # Each decoder layer consumes the previous layer's output and the encoder output
        for layer in self.transformer_decoder:
            y = layer(y, encoder_out=x)
        return self.predict_word(y)
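A minimal end-to-end smoke test (the vocabulary sizes, padding index 0, batch size, and sequence lengths are made-up illustration values, and the shift of en_y assumes the usual teacher-forcing setup):

model = TranslationTransformer(d_model=64, max_seq_length=128, n_heads=4, n_layers=2,
                               cn_vocab_size=5000, cn_padding_idx=0,
                               en_vocab_size=6000, en_padding_idx=0)
cn_x = torch.randint(1, 5000, (2, 10))     # source token ids, (N=2, L_src=10)
en_y = torch.randint(1, 6000, (2, 12))     # target token ids, (N=2, L_tgt=12)
logits = model(cn_x, en_y)                 # (2, 12, 6000)
# Teacher forcing: predict token t+1 from tokens up to t
loss = nn.CrossEntropyLoss(ignore_index=0)(
    logits[:, :-1].reshape(-1, 6000), en_y[:, 1:].reshape(-1))
loss.backward()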