Editing Visual Grounding (section)

== <span style="color: #FFFFFF;">Applying</span> ==
'''Open-vocabulary grounding with Grounding DINO + SAM:'''
<syntaxhighlight lang="python">
from PIL import Image
import torch
import numpy as np

# Method 1: Grounding DINO for bounding box grounding
from groundingdino.util.inference import load_model, load_image, predict, annotate

# Build the detector from its config file and the matching checkpoint.
model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py",
                   "weights/groundingdino_swint_ogc.pth")

# load_image returns two values: the first is handed to annotate() for
# drawing, the second is what predict() consumes.
image_source, image = load_image("street_scene.jpg")

# Ground a natural language description.
# Each " . "-separated phrase in the caption is a separate thing to locate.
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption="the woman in the red dress . the yellow car on the left",
    box_threshold=0.35,
    text_threshold=0.25
)
annotated = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
# annotate() returns an OpenCV-style BGR frame, while PIL interprets a
# 3-channel array as RGB. Reverse the channel axis before saving, otherwise
# red and blue are swapped in the output file.
annotated_rgb = np.ascontiguousarray(annotated[..., ::-1])
Image.fromarray(annotated_rgb).save("grounded_output.jpg")
print(f"Found {len(boxes)} objects: {phrases}")

# Method 2: LangSAM (Grounding DINO + SAM combined)
from lang_sam import LangSAM

lang_sam = LangSAM()
image = Image.open("garden.jpg").convert("RGB")
# Get segmentation masks for any text description.
# NOTE(review): this 4-tuple return matches the older LangSAM API; newer
# releases appear to take lists and return dicts — confirm against the
# installed version.
masks, boxes, phrases, logits = lang_sam.predict(image, "red flowers")

# Convert the PIL image to an array once; each mask works on its own copy.
image_pixels = np.array(image)

for i, (mask, _phrase) in enumerate(zip(masks, phrases)):
    # Masks may arrive as torch tensors rather than numpy arrays, so convert
    # explicitly instead of relying on implicit conversion when indexing.
    if isinstance(mask, torch.Tensor):
        mask = mask.detach().cpu().numpy()
    keep = np.asarray(mask, dtype=bool)

    masked_image = image_pixels.copy()
    masked_image[~keep] = 0  # Keep only the grounded region
    Image.fromarray(masked_image).save(f"grounded_mask_{i}.png")
</syntaxhighlight>

; Visual grounding systems
: '''Open-vocabulary detection''' → Grounding DINO, GLIP, OWL-ViT (Google)
: '''Segmentation from text''' → LangSAM, SEEM, X-Decoder
: '''Referring expression''' → MDETR, TransVG, SeqTR
: '''Multimodal reasoning''' → Qwen2-VL, InternVL2, LLaVA-1.6 (grounded output)
: '''Video grounding''' → TubeDETR, MOMA, CLIP4Clip for temporal grounding
</div>

<div style="background-color: #8B4500; color: #FFFFFF; padding: 20px; border-radius: 8px; margin-bottom: 15px;">