Estimated duration: 6-8 weeks. Goal: develop a deep understanding of the Transformer architecture, learn to use pre-trained models, and lay the groundwork for studying LLMs.
Self-Attention lets every position in a sequence attend to every other position.

Input:  X = [x_1, x_2, ..., x_n]
Output: Z = [z_1, z_2, ..., z_n]

Each z_i is a weighted sum over all x, with the weights determined by content.
1. Linear projections produce Q, K, V:
   Q = X · W_Q  (Query)
   K = X · W_K  (Key)
   V = X · W_V  (Value)
2. Compute the attention output:
   Attention(Q,K,V) = softmax(QK^T / √d_k) · V
   where the division by √d_k is a scaling factor that keeps the dot products from growing too large (which would push the softmax into a low-gradient regime).
Let batch_size = B, seq_len = L, d_model = D, d_k = d_v = D/h.

X: (B, L, D)
W_Q, W_K, W_V: (D, d_k)
Q, K, V: (B, L, d_k)
QK^T: (B, L, L)  # the attention matrix
Output: (B, L, d_v)
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class SelfAttention(nn.Module):
    def __init__(self, d_model, d_k):
        super().__init__()
        self.d_k = d_k
        self.W_Q = nn.Linear(d_model, d_k)
        self.W_K = nn.Linear(d_model, d_k)
        self.W_V = nn.Linear(d_model, d_k)

    def forward(self, x, mask=None):
        # x: (batch, seq_len, d_model)
        Q = self.W_Q(x)  # (batch, seq_len, d_k)
        K = self.W_K(x)
        V = self.W_V(x)
        # Attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        # scores: (batch, seq_len, seq_len)
        # Apply the mask (optional)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        # Softmax
        attn_weights = F.softmax(scores, dim=-1)
        # Weighted sum
        output = torch.matmul(attn_weights, V)
        return output, attn_weights
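A minimal shape check for the layer above; the sizes are illustrative, not from the text:

# Sanity check: output keeps the sequence length, weights form an L×L attention matrix
x = torch.randn(2, 5, 16)                  # (batch=2, seq_len=5, d_model=16)
attn = SelfAttention(d_model=16, d_k=8)
out, weights = attn(x)
print(out.shape, weights.shape)            # torch.Size([2, 5, 8]) torch.Size([2, 5, 5])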
Multi-Head Attention: run several attention heads in parallel.

Advantages:
1. Attend to different representation subspaces
2. Attend to different positions
3. Increase model capacity

MultiHead(Q,K,V) = Concat(head_1, ..., head_h) · W_O
head_i = Attention(Q·W_Q^i, K·W_K^i, V·W_V^i)

Parameter count:
- Per head: 3 × d_model × (d_model/h)
- Output projection: d_model × d_model
(e.g. d_model=512, h=8: 3×512×64 ≈ 98K weights per head, about 1.05M weights in total including W_O)
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)
        self.W_O = nn.Linear(d_model, d_model)

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        # Linear projections + split into heads
        Q = self.W_Q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_K(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_V(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        # (batch, num_heads, seq_len, d_k)
        # Attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn_weights = F.softmax(scores, dim=-1)
        attn_output = torch.matmul(attn_weights, V)
        # Merge the heads
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.W_O(attn_output)
        return output
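A small sketch that checks the output shape and the parameter count worked out above (d_model=512, h=8 gives 4 × 512² weights plus 4 × 512 biases); the sizes are illustrative:

mha = MultiHeadAttention(d_model=512, num_heads=8)
x = torch.randn(2, 10, 512)
print(mha(x, x, x).shape)                           # torch.Size([2, 10, 512])
print(sum(p.numel() for p in mha.parameters()))     # 1050624 = 4*512*512 + 4*512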
Without positional information, Self-Attention is permutation-equivariant:
Attention(permute(X)) = permute(Attention(X))
so positional information has to be injected separately.
PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
Properties:
1. Handles sequences of arbitrary length
2. Relative positions can be expressed as linear transformations of the encodings
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (batch, seq_len, d_model)
        return x + self.pe[:, :x.size(1)]
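A quick check that the encoding is added position-wise and broadcasts over the batch; sizes are illustrative:

pos_enc = PositionalEncoding(d_model=16)
x = torch.zeros(2, 5, 16)                # stand-in for token embeddings
out = pos_enc(x)
print(out.shape)                         # torch.Size([2, 5, 16])
print(torch.allclose(out[0], out[1]))    # True: the same positions get the same encoding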
| Method | Description | Used by |
|---|---|---|
| Learned positional embeddings | Position embeddings trained as parameters | BERT, GPT |
| RoPE | Rotary position embedding | LLaMA, Qwen |
| ALiBi | Linear biases added to attention scores | BLOOM |
| Relative position encoding | Encodes relative positions between tokens | T5 |
RoPE core ideas (a minimal sketch follows the list below):
1. Encode position information as rotations applied to the query and key vectors
2. Attention scores then naturally contain relative position information

Advantages:
1. Extrapolates directly to longer sequences
2. Computationally efficient
3. Works well in practice
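RoPE is not implemented in the code above, so here is a minimal sketch of the common GPT-NeoX/LLaMA-style formulation; the helper names rotate_half and apply_rope are illustrative, not from this document:

def rotate_half(x):
    # Pair dimension i with dimension i + d/2 and rotate: (x1, x2) -> (-x2, x1)
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q, k, base=10000.0):
    # q, k: (batch, num_heads, seq_len, head_dim); head_dim must be even
    head_dim, seq_len = q.size(-1), q.size(-2)
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    positions = torch.arange(seq_len).float()
    freqs = torch.outer(positions, inv_freq)      # (seq_len, head_dim/2)
    emb = torch.cat((freqs, freqs), dim=-1)       # (seq_len, head_dim)
    cos, sin = emb.cos(), emb.sin()
    # Rotating Q and K by position-dependent angles makes Q·K depend on relative offsets
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin

In the MultiHeadAttention class above, this would be applied to Q and K right after the head split and before the score computation.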
Transformer architecture:

[Input Embedding] + [Positional Encoding]        [Output Embedding] + [Positional Encoding]
                │                                                  │
                ▼                                                  ▼
      ┌───────────────┐                                  ┌───────────────┐
      │    Encoder    │                                  │    Decoder    │
      │   ×N layers   │─────────────────────────────────▶│   ×N layers   │
      └───────────────┘                                  └───────────────┘
                │                                                  │
                ▼                                                  ▼
      [Encoded representation]                            [Linear + Softmax]
                                                                   │
                                                                   ▼
                                                         [Output probabilities]
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Self-attention + residual connection + LayerNorm
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        # FFN + residual connection + LayerNorm
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_output))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.masked_self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        # Masked self-attention (prevents attending to future positions)
        attn1 = self.masked_self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn1))
        # Cross-attention (attends to the encoder output)
        attn2 = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn2))
        # FFN
        ffn_output = self.ffn(x)
        x = self.norm3(x + self.dropout(ffn_output))
        return x

class Transformer(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model=512, num_heads=8,
                 num_layers=6, d_ff=2048, max_len=5000, dropout=0.1):
        super().__init__()
        # Embedding layers
        self.src_embed = nn.Embedding(src_vocab, d_model)
        self.tgt_embed = nn.Embedding(tgt_vocab, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        # Encoder
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        # Decoder
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        # Output projection
        self.fc_out = nn.Linear(d_model, tgt_vocab)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def make_src_mask(self, src):
        # (batch, 1, 1, src_len)
        return (src != 0).unsqueeze(1).unsqueeze(2)

    def make_tgt_mask(self, tgt):
        batch_size, tgt_len = tgt.shape
        # Padding mask
        tgt_pad_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        # Causal mask (lower-triangular matrix)
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        return tgt_pad_mask & tgt_sub_mask

    def encode(self, src, src_mask):
        x = self.dropout(self.pos_encoding(self.src_embed(src) * math.sqrt(self.d_model)))
        for layer in self.encoder_layers:
            x = layer(x, src_mask)
        return x

    def decode(self, tgt, enc_output, src_mask, tgt_mask):
        x = self.dropout(self.pos_encoding(self.tgt_embed(tgt) * math.sqrt(self.d_model)))
        for layer in self.decoder_layers:
            x = layer(x, enc_output, src_mask, tgt_mask)
        return x

    def forward(self, src, tgt):
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)
        enc_output = self.encode(src, src_mask)
        dec_output = self.decode(tgt, enc_output, src_mask, tgt_mask)
        output = self.fc_out(dec_output)
        return output
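As a quick sanity check, a minimal sketch that runs the model on random token ids; the hyperparameters and vocabulary sizes are illustrative, and 0 is assumed to be the padding id, matching make_src_mask/make_tgt_mask:

# Toy smoke test of the full encoder-decoder model
model = Transformer(src_vocab=1000, tgt_vocab=1000, d_model=128,
                    num_heads=4, num_layers=2, d_ff=256)
src = torch.randint(1, 1000, (2, 10))    # (batch=2, src_len=10)
tgt = torch.randint(1, 1000, (2, 8))     # (batch=2, tgt_len=8)
logits = model(src, tgt)
print(logits.shape)                      # torch.Size([2, 8, 1000])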
| Design | Purpose |
|---|---|
| Residual connections | Mitigate vanishing gradients; make deep networks easier to train |
| LayerNorm | Stabilize training and speed up convergence |
| FFN | Add non-linearity and expand model capacity |
| Causal Mask | Prevent the decoder from seeing future tokens |
| Padding Mask | Ignore padding positions |
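To make the Causal Mask entry concrete, a small sketch printing the lower-triangular mask for a length-4 target sequence (position i may attend only to positions ≤ i):

causal_mask = torch.tril(torch.ones(4, 4)).bool()
print(causal_mask.int())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 1]])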
Pre-training + Fine-tuning paradigm:
1. Pre-training:
   - Train on large-scale unlabeled data
   - Learn general-purpose language representations
2. Fine-tuning:
   - Adapt the model to a downstream task
   - A small amount of labeled data is enough
BERT: Bidirectional Encoder Representations from Transformers

Characteristics:
1. Uses only the Encoder
2. Bidirectional attention (every position can attend to all other positions)
3. Well suited to understanding tasks (classification, question answering)

| Task | Description |
|---|---|
| MLM (Masked LM) | Randomly mask 15% of the tokens and predict the masked words |
| NSP (Next Sentence Prediction) | Predict whether two sentences are consecutive |
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

# Load a pre-trained model
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')

# Encode text
text = "我爱自然语言处理"
inputs = tokenizer(text, return_tensors='pt')
outputs = model(**inputs)

# Get representations
last_hidden_state = outputs.last_hidden_state  # (batch, seq_len, hidden)
pooler_output = outputs.pooler_output          # [CLS] representation (batch, hidden)

# Fine-tuning for classification
classifier = BertForSequenceClassification.from_pretrained(
    'bert-base-chinese', num_labels=2
)
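To see MLM in action, a small sketch using the fill-mask pipeline; the sentence and the prediction mentioned in the comment are illustrative:

from transformers import pipeline

unmasker = pipeline("fill-mask", model="bert-base-chinese")
print(unmasker("我爱自然语言[MASK]理"))   # the model scores candidates for the masked character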
GPT: Generative Pre-trained Transformer

Characteristics:
1. Uses only the Decoder
2. Unidirectional attention (causal mask: each token sees only earlier tokens)
3. Autoregressive generation
4. Well suited to generation tasks
Pre-training task: language modeling (LM).
Given the preceding context, predict the next token:
P(x_1, x_2, ..., x_n) = Π P(x_i | x_1, ..., x_{i-1})
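A minimal sketch of this factorization with GPT-2: passing labels=input_ids makes the model compute the shifted next-token cross-entropy, i.e. the average of -log P(x_i | x_1, ..., x_{i-1}); the prompt is illustrative:

from transformers import AutoTokenizer, AutoModelForCausalLM

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2")

ids = tok("The cat sat on the", return_tensors="pt").input_ids
out = lm(ids, labels=ids)                        # loss = average -log P(x_i | x_<i)
print(out.loss.item())
print(tok.decode(out.logits[0, -1].argmax()))    # greedy guess for the next token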
| Model | Parameters | Highlights |
|---|---|---|
| GPT-1 | 117M | Validated the pre-train + fine-tune paradigm |
| GPT-2 | 1.5B | Zero-shot capabilities |
| GPT-3 | 175B | In-Context Learning |
| GPT-4 | Undisclosed (rumored ~1.7T) | Multimodal, stronger reasoning |
| Model | Type | Highlights |
|---|---|---|
| RoBERTa | Encoder | Improved BERT with dynamic masking |
| ALBERT | Encoder | Parameter sharing, lightweight |
| T5 | Encoder-Decoder | Unified text-to-text framework |
| BART | Encoder-Decoder | Denoising autoencoder |
| XLNet | Autoregressive (Transformer-XL based) | Permutation language modeling |
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

# Automatic loading by checkpoint name
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModel.from_pretrained("bert-base-chinese")

# Pipelines (even simpler)
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
result = classifier("I love this movie!")

generator = pipeline("text-generation", model="gpt2")
result = generator("Once upon a time", max_length=50)

from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from datasets import load_dataset

# Load the data (SST-2 is English, so use an English checkpoint with a classification head)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = load_dataset("glue", "sst2")

# Preprocessing
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)
trainer.train()
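The Trainer above only reports the loss by default; to report accuracy during evaluation, a compute_metrics hook can be passed in. This is a minimal sketch, and the metric function body is illustrative:

import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float((preds == labels).mean())}

# Pass it when constructing the Trainer:
# trainer = Trainer(..., compute_metrics=compute_metrics)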
Goal: implement a complete Transformer from scratch.
Requirements:
- Implement Multi-Head Attention
- Implement the Encoder and the Decoder
- Train it on a simple translation task
Goal: sentiment classification with BERT.
Requirements:
- Load a pre-trained model with HuggingFace
- Fine-tune BERT for classification
- Reach 90%+ accuracy
After completing the following tasks, move on to Stage 6:

- Attention mechanisms
  - Understand and implement Self-Attention
  - Understand Multi-Head Attention
  - Understand positional encodings (especially RoPE)
- Transformer
  - Hand-write a complete Transformer
  - Understand the differences between the Encoder and the Decoder
  - Understand what the various masks do
- Pre-trained models
  - Understand the differences between BERT and GPT
  - Use HuggingFace Transformers fluently
  - Complete one fine-tuning task

After finishing this stage, move on to Stage 6: Large Language Models.