== Applying ==

'''Implementing scaled dot-product attention from scratch:'''

<syntaxhighlight lang="python">
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model=512, n_heads=8, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0
        self.d_k = d_model // n_heads
        self.n_heads = n_heads
        # One projection per role; each maps d_model -> d_model,
        # then the result is split across heads.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, x):
        B, T, D = x.shape
        return x.view(B, T, self.n_heads, self.d_k).transpose(1, 2)  # (B, H, T, d_k)

    def forward(self, query, key, value, mask=None):
        B, T, _ = query.shape
        Q = self.split_heads(self.W_q(query))
        K = self.split_heads(self.W_k(key))
        V = self.split_heads(self.W_v(value))

        # Scaled dot-product attention: softmax(QK^T / sqrt(d_k)) V
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attn = self.dropout(F.softmax(scores, dim=-1))

        out = torch.matmul(attn, V)  # (B, H, T, d_k)
        out = out.transpose(1, 2).contiguous().view(B, T, -1)  # merge heads
        return self.W_o(out), attn
</syntaxhighlight>

; Attention variant selection guide
: '''Standard self-attention''' → Transformers, BERT, GPT (seq len ≤ 4096)
: '''Flash Attention 2''' → any modern transformer; same output, 2–4× faster, O(n) memory (see the SDPA sketch below)
: '''Grouped Query Attention (GQA)''' → LLaMA 2/3, Mistral; shrinks the KV cache at inference (sketched below)
: '''Sliding window attention''' → Longformer, Mistral; O(n·w) complexity for long documents
: '''Cross-attention''' → encoder-decoder models (T5, Whisper, LLaVA projection)
: '''Linear attention''' → Mamba alternative; sub-quadratic, trades quality for speed
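A minimal smoke test for the module above, as a sketch: the batch size, sequence length, and causal mask here are illustrative, not from the original. The mask shape <code>(1, 1, T, T)</code> broadcasts over batch and heads in the <code>masked_fill</code> step.

<syntaxhighlight lang="python">
import torch

# Illustrative usage of the MultiHeadAttention module defined above.
mha = MultiHeadAttention(d_model=512, n_heads=8)
x = torch.randn(2, 16, 512)  # (batch, seq_len, d_model)

# Causal mask: position t may attend only to positions <= t.
# Shape (1, 1, T, T) broadcasts over the (B, H, T, T) score tensor.
causal = torch.tril(torch.ones(16, 16)).view(1, 1, 16, 16)

out, attn = mha(x, x, x, mask=causal)  # self-attention: query = key = value
print(out.shape)   # torch.Size([2, 16, 512])
print(attn.shape)  # torch.Size([2, 8, 16, 16])
</syntaxhighlight>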
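For the Flash Attention entry in the guide, PyTorch 2.x ships <code>torch.nn.functional.scaled_dot_product_attention</code>, which computes the same softmax(QK<sup>T</sup>/√d<sub>k</sub>)V and dispatches to fused Flash or memory-efficient kernels when the hardware supports them. A sketch of the swap follows; note the fused kernel never materializes the n×n attention matrix, so the per-weight dropout and returned attention map from the class above are not available this way.

<syntaxhighlight lang="python">
import torch
import torch.nn.functional as F

# Drop-in replacement for the manual score/softmax/matmul steps above.
# Q, K, V have shape (B, H, T, d_k), as produced by split_heads;
# the sizes here are illustrative.
B, H, T, d_k = 2, 8, 16, 64
Q = torch.randn(B, H, T, d_k)
K = torch.randn(B, H, T, d_k)
V = torch.randn(B, H, T, d_k)

# is_causal=True applies the lower-triangular mask inside the kernel;
# scaling by 1/sqrt(d_k) is the default.
out = F.scaled_dot_product_attention(Q, K, V, is_causal=True)
print(out.shape)  # torch.Size([2, 8, 16, 64])
</syntaxhighlight>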
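The GQA entry can also be illustrated with a shape-level sketch (all names and sizes here are hypothetical, not from any particular library): K and V are projected to fewer heads than Q and only those few heads are cached, then repeated across each group of query heads before the usual attention computation.

<syntaxhighlight lang="python">
import torch

# Illustrative GQA shapes: 8 query heads share 2 KV heads (group size 4),
# so the KV cache is 4x smaller than with standard multi-head attention.
B, T, d_k = 2, 16, 64
n_q_heads, n_kv_heads = 8, 2
Q = torch.randn(B, n_q_heads, T, d_k)
K = torch.randn(B, n_kv_heads, T, d_k)  # only the 2 KV heads are cached
V = torch.randn(B, n_kv_heads, T, d_k)

# Expand each KV head across its group of query heads.
group = n_q_heads // n_kv_heads
K = K.repeat_interleave(group, dim=1)  # (B, 8, T, d_k)
V = V.repeat_interleave(group, dim=1)

scores = Q @ K.transpose(-2, -1) / d_k ** 0.5
out = torch.softmax(scores, dim=-1) @ V
print(out.shape)  # torch.Size([2, 8, 16, 64])
</syntaxhighlight>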