Editing Model Compression and Quantization (section)

== <span style="color: #FFFFFF;">Applying</span> ==
'''LLM quantization with bitsandbytes:'''
<syntaxhighlight lang="python">
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Hugging Face Hub repository id, shared by the model and tokenizer loads
# so the two can never drift apart.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit quantization (NF4) with double quantization — roughly 5–6GB of VRAM for an 8B model
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # Compute in BF16 despite 4-bit storage
    bnb_4bit_use_double_quant=True,          # Quantize the quantization constants too
    bnb_4bit_quant_type="nf4",               # NF4: optimized for normally distributed weights
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# device_map="auto" picks the placement, so send the inputs wherever the
# model actually landed instead of hard-coding "cuda".
inputs = tokenizer("What is quantum computing?", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=200)
# Drop special tokens so only the readable text is printed.
print(tokenizer.decode(out[0], skip_special_tokens=True))
</syntaxhighlight>

'''Knowledge distillation:'''
<syntaxhighlight lang="python">
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, true_labels, T=4.0, alpha=0.7):
    """Combined distillation + task loss.

    Computes ``alpha * soft_loss + (1 - alpha) * hard_loss`` where the soft
    loss is the temperature-scaled KL divergence between teacher and student
    distributions and the hard loss is cross-entropy against the labels.

    Args:
        student_logits: Raw student outputs; classes on the last dimension.
            Assumed to be ``(batch, num_classes)`` — confirm against caller.
        teacher_logits: Raw teacher outputs, same shape as ``student_logits``.
            Treated as a constant target: no gradient flows back through it.
        true_labels: Ground-truth targets in a form accepted by
            ``F.cross_entropy`` (presumably class indices of shape ``(batch,)``).
        T: Softmax temperature applied to both sets of logits.
        alpha: Weight of the distillation term; the task term gets ``1 - alpha``.

    Returns:
        Scalar tensor holding the weighted sum of the two losses.
    """
    # The teacher only supplies targets; detach so backward() never updates
    # (or spends memory/time on) the teacher's graph.
    teacher_logits = teacher_logits.detach()

    # Soft target loss: KL(teacher_soft || student_soft) at temperature T.
    # F.kl_div expects log-probabilities as input and probabilities as target.
    soft_teacher = F.softmax(teacher_logits / T, dim=-1)
    soft_student = F.log_softmax(student_logits / T, dim=-1)
    # Scaling by T**2 offsets the 1/T**2 shrinkage of the soft-target
    # gradients so both terms stay on a comparable scale.
    distill = F.kl_div(soft_student, soft_teacher, reduction='batchmean') * (T ** 2)

    # Hard target loss: standard cross-entropy on the unscaled student logits.
    task = F.cross_entropy(student_logits, true_labels)
    return alpha * distill + (1 - alpha) * task
</syntaxhighlight>

; Compression technique selection
: '''LLM, limited VRAM''' → GPTQ (INT4), AWQ, GGUF (llama.cpp)
: '''Fine-tuning large model''' → LoRA, or QLoRA (4-bit base + LoRA adapters)
: '''Edge deployment (vision)''' → INT8 PTQ with TensorRT or ONNX Runtime
: '''Creating a smaller model''' → Knowledge distillation (teacher→student)
: '''Redundant model structure''' → Structured pruning (remove attention heads, layers)
</div>

<div style="background-color: #8B4500; color: #FFFFFF; padding: 20px; border-radius: 8px; margin-bottom: 15px;">