# Advanced Attention Mechanisms: Sparse Attention and Linear-Complexity Optimization

## 1. Technical Analysis

### 1.1 The Complexity Problem of Standard Attention

Standard Transformer self-attention costs O(n²d) time, and the attention matrix itself takes O(n²) memory. As the sequence length n grows, the cost rises quadratically: doubling the sequence length roughly quadruples both compute and memory.

Complexity comparison:

- Standard attention: O(n²d)
- Sparse attention: O(nd log n)
- Linear attention: O(nd)

### 1.2 Attention Mechanism Variants

| Type | Complexity | Suitable scenario | Representative model |
| --- | --- | --- | --- |
| Standard attention | O(n²d) | Short sequences | Transformer |
| Sparse attention | O(nd log n) | Medium sequences | Longformer |
| Linear attention | O(nd) | Long sequences | Linformer |
| Sliding window | O(ndk) | Local dependencies | Transformer-XL |

### 1.3 Sparse Attention Patterns

(Figure: schematic of sparse attention patterns.)

- Global attention: selected positions attend to all positions
- Sliding window: each position attends only to positions inside its window
- Banded attention: attention is restricted to a diagonal band
- Axial attention: attention is applied along each axis separately

## 2. Core Implementations

### 2.1 Sparse Attention

The block-local `SparseAttention` below processes queries window by window, and `LongformerAttention` combines a sliding window with a small set of global tokens.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SparseAttention(nn.Module):
    """Block-local attention: each block of queries attends to a local key window."""

    def __init__(self, d_model, num_heads, window_size=512):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.window_size = window_size
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        seq_len = Q.size(1)

        # Project and reshape to (batch, heads, seq_len, d_k)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        scale = self.d_k ** 0.5

        outputs = []
        # Attend block by block: each query block sees a surrounding key window
        for i in range(0, seq_len, self.window_size):
            end = min(i + self.window_size, seq_len)
            Q_window = Q[:, :, i:end, :]
            k_start = max(0, i - self.window_size // 2)
            k_end = min(seq_len, i + 3 * self.window_size // 2)
            K_window = K[:, :, k_start:k_end, :]
            V_window = V[:, :, k_start:k_end, :]

            scores = torch.matmul(Q_window, K_window.transpose(-2, -1)) / scale
            attn_weights = F.softmax(scores, dim=-1)
            outputs.append(torch.matmul(attn_weights, V_window))

        output = torch.cat(outputs, dim=2)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)


class LongformerAttention(nn.Module):
    """Longformer-style attention: sliding-window local attention plus a few
    global tokens that attend to the whole sequence."""

    def __init__(self, d_model, num_heads, window_size=512, global_tokens=(0,)):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.window_size = window_size
        self.global_tokens = list(global_tokens)
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x):
        batch_size = x.size(0)
        seq_len = x.size(1)
        Q = self.W_q(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        scale = self.d_k ** 0.5

        output = torch.zeros_like(Q)
        for i in range(seq_len):
            if i in self.global_tokens:
                # Global token: attend to every position
                scores = torch.matmul(Q[:, :, i:i+1, :], K.transpose(-2, -1)) / scale
                attn_weights = F.softmax(scores, dim=-1)
                output[:, :, i:i+1, :] = torch.matmul(attn_weights, V)
            else:
                # Local token: attend to its window plus all global tokens
                start = max(0, i - self.window_size // 2)
                end = min(seq_len, i + self.window_size // 2 + 1)
                scores = torch.matmul(Q[:, :, i:i+1, :],
                                      K[:, :, start:end, :].transpose(-2, -1)) / scale
                v_window = V[:, :, start:end, :]
                for g in self.global_tokens:
                    if start <= g < end:
                        continue  # global token already inside the local window
                    # Append the out-of-window global token's score and value;
                    # softmax is order-invariant as long as scores and values stay aligned
                    g_scores = torch.matmul(Q[:, :, i:i+1, :],
                                            K[:, :, g:g+1, :].transpose(-2, -1)) / scale
                    scores = torch.cat([scores, g_scores], dim=-1)
                    v_window = torch.cat([v_window, V[:, :, g:g+1, :]], dim=2)
                attn_weights = F.softmax(scores, dim=-1)
                output[:, :, i:i+1, :] = torch.matmul(attn_weights, v_window)

        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)
```
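Both modules can be exercised with a quick shape check; the sizes below are illustrative assumptions, not benchmarks from this post:

```python
# Hypothetical smoke test for the two modules above; sizes are assumptions.
import torch

d_model, num_heads, seq_len, batch = 64, 4, 1024, 2
x = torch.randn(batch, seq_len, d_model)

sparse = SparseAttention(d_model, num_heads, window_size=128)
longformer = LongformerAttention(d_model, num_heads, window_size=128, global_tokens=(0,))

print(sparse(x, x, x).shape)   # torch.Size([2, 1024, 64])
print(longformer(x).shape)     # torch.Size([2, 1024, 64])
```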
### 2.2 Linear Attention

`LinearAttention` follows the kernel-feature-map formulation (softmax replaced by φ(x) = elu(x) + 1), and `LinformerAttention` projects the key/value sequence dimension down to a fixed rank k.

```python
class LinearAttention(nn.Module):
    """Kernelized attention: with feature map phi(x) = elu(x) + 1, attention
    becomes phi(Q) (phi(K)^T V), which is linear in sequence length."""

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, Q, K, V):
        batch_size = Q.size(0)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # Positive feature map phi(x) = elu(x) + 1
        Q = F.elu(Q) + 1
        K = F.elu(K) + 1

        # Contract K with V first: a (d_k x d_k) matrix, independent of n
        KV = torch.einsum("bhld,bhlm->bhdm", K, V)
        # Normalizer: 1 / (phi(Q) . sum_l phi(K_l))
        Z = 1.0 / (torch.einsum("bhld,bhd->bhl", Q, K.sum(dim=2)) + 1e-8)
        output = torch.einsum("bhld,bhdm,bhl->bhlm", Q, KV, Z)

        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)


class LinformerAttention(nn.Module):
    """Linformer attention: learnable projections E, F shrink the key/value
    sequence dimension from n to a fixed k, so scores are (n x k), not (n x n).
    Note: E and F act on the sequence axis, so a maximum sequence length must
    be fixed in advance."""

    def __init__(self, d_model, num_heads, max_seq_len=4096, k=128):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.k = k
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        # Low-rank projections over the sequence dimension (n -> k)
        self.E = nn.Parameter(torch.randn(k, max_seq_len))
        self.F = nn.Parameter(torch.randn(k, max_seq_len))

    def forward(self, Q, K, V):
        batch_size = Q.size(0)
        seq_len = Q.size(1)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # Project along the sequence axis: (b, h, n, d_k) -> (b, h, k, d_k)
        K_proj = torch.matmul(self.E[:, :seq_len], K)
        V_proj = torch.matmul(self.F[:, :seq_len], V)

        scores = torch.matmul(Q, K_proj.transpose(-2, -1)) / (self.d_k ** 0.5)
        attn_weights = F.softmax(scores, dim=-1)
        output = torch.matmul(attn_weights, V_proj)

        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)
```

### 2.3 Axial Attention

Axial attention factorizes 2-D attention into a row pass and a column pass. `MultiHeadAttention` here refers to a standard multi-head attention module (as implemented earlier in this series).

```python
class AxialAttention(nn.Module):
    """Axial attention for 2-D feature maps: attend along rows, then columns,
    reducing (H*W)^2 interactions to H*W*(H + W)."""

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.row_attn = MultiHeadAttention(d_model, num_heads)
        self.col_attn = MultiHeadAttention(d_model, num_heads)

    def forward(self, x):
        batch_size, height, width, d_model = x.size()

        # Row pass: each row is an independent sequence of length `width`
        x_row = x.reshape(batch_size * height, width, d_model)
        x_row = self.row_attn(x_row, x_row, x_row)
        x_row = x_row.view(batch_size, height, width, d_model)

        # Column pass: each column is an independent sequence of length `height`
        x_col = x_row.permute(0, 2, 1, 3).contiguous().view(batch_size * width, height, d_model)
        x_col = self.col_attn(x_col, x_col, x_col)
        x_col = x_col.view(batch_size, width, height, d_model).permute(0, 2, 1, 3).contiguous()
        return x_col
```

## 3. Performance Comparison

### 3.1 Complexity Comparison

| Attention type | Time complexity | Space complexity | Practical max length |
| --- | --- | --- | --- |
| Standard attention | O(n²d) | O(n²) | ~1024 |
| Sparse attention | O(nd log n) | O(nd) | ~8192 |
| Linear attention | O(nd) | O(nd) | ~65536 |

### 3.2 Performance at Different Sequence Lengths

| Sequence length | Standard attention | Sparse attention | Linear attention |
| --- | --- | --- | --- |
| 512 | 100 ms | 80 ms | 60 ms |
| 2048 | 1600 ms | 200 ms | 100 ms |
| 8192 | OOM | 800 ms | 300 ms |

### 3.3 Quality/Efficiency Trade-offs

| Model | Accuracy drop | Speedup | Memory savings |
| --- | --- | --- | --- |
| Longformer | 1% | 4x | 8x |
| Linformer | 2% | 10x | 16x |
| Linear Attention | 3% | 20x | 32x |
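To make the linear-scaling numbers above concrete, a small sketch (sizes are illustrative assumptions) shows that `LinearAttention` never materializes an n × n score matrix:

```python
# Hypothetical shape walk-through for LinearAttention; sizes are assumptions.
import torch

d_model, num_heads, seq_len = 64, 4, 16384   # long sequence
attn = LinearAttention(d_model, num_heads)
x = torch.randn(1, seq_len, d_model)

out = attn(x, x, x)
print(out.shape)  # torch.Size([1, 16384, 64])

# The largest intermediate, KV, has shape (batch, heads, d_k, d_k) = (1, 4, 16, 16):
# its size is independent of seq_len, which is where the linear scaling comes from.
```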
## 4. Best Practices

### 4.1 Choosing an Attention Mechanism

```python
def select_attention(sequence_length, task_type):
    """Pick an attention class based on sequence length."""
    if sequence_length <= 1024:
        return MultiHeadAttention
    elif sequence_length <= 8192:
        return LongformerAttention
    else:
        return LinformerAttention


class AttentionFactory:
    @staticmethod
    def create(config):
        if config["type"] == "standard":
            return MultiHeadAttention(config["d_model"], config["num_heads"])
        elif config["type"] == "sparse":
            return LongformerAttention(config["d_model"], config["num_heads"],
                                       config["window_size"])
        elif config["type"] == "linear":
            return LinearAttention(config["d_model"], config["num_heads"])
        raise ValueError(f"unknown attention type: {config['type']}")
```

### 4.2 Long-Text Processing

```python
class LongTextProcessor:
    """Split a long input into overlapping chunks, run the model on each chunk,
    and stitch the outputs back together."""

    def __init__(self, model, chunk_size=512, overlap=128):
        self.model = model
        self.chunk_size = chunk_size
        self.overlap = overlap

    def process(self, inputs):
        # inputs: (batch, seq_len, d_model)
        seq_len = inputs.size(1)
        chunks = []
        for i in range(0, seq_len, self.chunk_size - self.overlap):
            chunks.append(inputs[:, i:i + self.chunk_size])
        outputs = [self.model(chunk) for chunk in chunks]
        return self._merge_outputs(outputs)

    def _merge_outputs(self, outputs):
        # Keep the first chunk whole; every later chunk drops its first
        # `overlap` positions, which the previous chunk already produced
        merged = [outputs[0]]
        for output in outputs[1:]:
            merged.append(output[:, self.overlap:])
        return torch.cat(merged, dim=1)
```

## 5. Summary

- Attention optimization is the key to handling long text: sparse attention lowers complexity by restricting the attention range, linear attention reaches O(nd) through kernel feature maps, and axial attention suits two-dimensional data.
- The choice is a trade-off between sequence length and accuracy. Per the comparison above, sparse attention handles sequences around 8192 tokens with roughly a 1% accuracy drop, while linear attention handles far longer sequences at a slightly larger drop.
- For long-text tasks, Longformer or Linformer are the recommended starting points; for short sequences, standard attention remains the best option.
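As a closing sketch, the factory and the chunked processor can be wired together; the configuration values and the toy wrapper below are illustrative assumptions, not part of the original setup:

```python
# Hypothetical end-to-end wiring of the pieces above; config values and the
# toy wrapper are assumptions for illustration.
import torch

config = {"type": "linear", "d_model": 64, "num_heads": 4}
attn = AttentionFactory.create(config)


class ToyEncoder(torch.nn.Module):
    """Minimal tensor-in/tensor-out wrapper so LongTextProcessor can call it."""

    def __init__(self, attn):
        super().__init__()
        self.attn = attn

    def forward(self, x):
        return self.attn(x, x, x)


processor = LongTextProcessor(ToyEncoder(attn), chunk_size=512, overlap=128)
long_input = torch.randn(1, 4096, 64)  # (batch, seq_len, d_model)
out = processor.process(long_input)
print(out.shape)  # torch.Size([1, 4096, 64])
```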