Editing Self-Supervised Learning (section)

== <span style="color: #FFFFFF;">Applying</span> ==
'''Contrastive self-supervised pre-training with SimCLR:'''
<syntaxhighlight lang="python">
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
from torchvision.models import resnet50

class SimCLR(nn.Module):
    """SimCLR contrastive pre-training: ResNet-50 encoder plus MLP projection head.

    The forward pass takes two augmented views of the same batch of images and
    returns the NT-Xent loss that pulls the two views of each image together
    while pushing all other images in the batch apart.

    Args:
        projection_dim: Output size of the projection head.
        temperature: Softmax temperature dividing the cosine similarities.
    """

    def __init__(self, projection_dim=128, temperature=0.5):
        super().__init__()
        self.temperature = temperature
        # Backbone: ResNet-50 without final FC (randomly initialised, no
        # pretrained weights). Everything up to the global pooling is kept.
        backbone = resnet50(weights=None)
        self.encoder = nn.Sequential(*list(backbone.children())[:-1])
        # Projection head: 2-layer MLP mapping the 2048-d features to the
        # space in which the contrastive loss is computed.
        self.projector = nn.Sequential(
            nn.Linear(2048, 2048), nn.ReLU(),
            nn.Linear(2048, projection_dim)
        )

    def forward(self, x1, x2):
        """Return the NT-Xent loss for two augmented views of one batch.

        Args:
            x1: First view, batch-first image tensor.
            x2: Second view, same shape as ``x1``; row ``i`` of both tensors
                must originate from the same source image.

        Returns:
            Scalar loss tensor.
        """
        # Flatten from dim 1 instead of a bare squeeze(): squeeze() would also
        # drop the batch axis when the batch holds a single sample, which then
        # breaks the dim=1 normalisation below.
        h1 = torch.flatten(self.encoder(x1), start_dim=1)
        h2 = torch.flatten(self.encoder(x2), start_dim=1)
        # L2-normalise so the dot products in the loss are cosine similarities.
        z1 = F.normalize(self.projector(h1), dim=1)
        z2 = F.normalize(self.projector(h2), dim=1)
        return self.nt_xent_loss(z1, z2)

    def nt_xent_loss(self, z1, z2):
        """NT-Xent (Normalized Temperature-scaled Cross Entropy) loss.

        Args:
            z1: (N, D) L2-normalised embeddings of the first view.
            z2: (N, D) L2-normalised embeddings of the second view.

        Returns:
            Scalar tensor: mean cross-entropy over all 2N anchors, where the
            positive for anchor ``i`` is the other view of the same sample.
        """
        n = z1.size(0)
        z = torch.cat([z1, z2], dim=0)  # 2N x D
        sim = torch.mm(z, z.T) / self.temperature  # 2N x 2N
        # Mask self-similarity so an anchor can never be its own candidate.
        # The mask is built on z's device; a CPU mask fails on accelerators.
        self_mask = torch.eye(2 * n, dtype=torch.bool, device=z.device)
        sim = sim.masked_fill(self_mask, float('-inf'))
        # Positive pairs are at offsets [i, i+N] and [i+N, i]
        idx = torch.arange(n, device=z.device)
        labels = torch.cat([idx + n, idx])
        return F.cross_entropy(sim, labels)

# Augmentation pipeline for SSL (SimCLR-style). Apply it twice to the same
# image to obtain the two views that form a positive pair.
ssl_transform = T.Compose([
    T.RandomResizedCrop(224, scale=(0.2, 1.0)),
    T.RandomHorizontalFlip(),
    # SimCLR applies colour jitter stochastically (p=0.8), not to every sample.
    T.RandomApply([T.ColorJitter(0.8, 0.8, 0.8, 0.2)], p=0.8),
    T.RandomGrayscale(p=0.2),
    # Blur is applied to half of the samples; the kernel must be odd and is
    # roughly 10% of the 224 px crop, with sigma drawn from [0.1, 2.0].
    T.RandomApply([T.GaussianBlur(kernel_size=23, sigma=(0.1, 2.0))], p=0.5),
    T.ToTensor(),
    # ImageNet channel statistics.
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
</syntaxhighlight>

; SSL method selection guide
: '''NLP pretraining''' → MLM (BERT-style); causal LM (GPT-style)
: '''Vision: contrastive / joint-embedding''' → SimCLR, MoCo v3 (contrastive); BYOL (non-contrastive, no negatives)
: '''Vision: masked reconstruction''' → MAE, BEiT, SimMIM
: '''Vision: self-distillation''' → DINO, DINOv2
: '''Audio''' → wav2vec 2.0, HuBERT (masked acoustic modeling)
: '''Multimodal''' → CLIP (image-text contrastive), FLAVA
</div>

<div style="background-color: #8B4500; color: #FFFFFF; padding: 20px; border-radius: 8px; margin-bottom: 15px;">