Editing AI Alignment (section)

== <span style="color: #FFFFFF;">Applying</span> ==
'''Implementing a Constitutional AI critique–revision loop:'''

<syntaxhighlight lang="python">
from openai import OpenAI

# Shared API client used for every model call in this example. It is built
# with no arguments, so credentials and endpoint come from the library's
# defaults (presumably the OPENAI_API_KEY environment variable; confirm
# against the openai package documentation).
client = OpenAI()

# Numbered principles that a response is critiqued against. The text is
# interpolated verbatim into the critique prompt, so editing this list changes
# what the critique step looks for.
CONSTITUTION = """
1. Choose the response that is least likely to cause harm.
2. Choose the response that is most honest and non-deceptive.
3. Choose the response that is most helpful to the user's long-term wellbeing.
4. Avoid responses that would assist in creating weapons or dangerous materials.
"""

def _ask_model(prompt, model):
    """Send ``prompt`` as a single user message and return the reply text."""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    # NOTE(review): the reply text is passed through unchanged; if the API
    # returns no text content this is presumably None -- confirm and handle
    # at the call site if that matters for your use case.
    return response.choices[0].message.content


def constitutional_revision(
    original_response,
    constitution=CONSTITUTION,
    *,
    model="gpt-4o",
):
    """Run one critique-then-revise pass over an AI response.

    The model is first asked to critique ``original_response`` against the
    principles in ``constitution``. It is then asked to rewrite the response
    so that it addresses that critique while remaining helpful.

    Args:
        original_response: The response text to be reviewed and rewritten.
        constitution: The principles the critique step checks against.
            Defaults to the module-level ``CONSTITUTION``.
        model: Name of the chat model used for both steps. Keyword-only;
            defaults to ``"gpt-4o"``.

    Returns:
        A ``(revised, critique)`` tuple: the rewritten response followed by
        the critique that motivated the rewrite.
    """

    # Step 1: Critique -- ask the model where the response conflicts with the
    # principles.
    critique_prompt = f"""Given this AI response:
---
{original_response}
---
Review it against these principles:
{constitution}

Identify any problems or ways it could violate the principles."""

    critique = _ask_model(critique_prompt, model)

    # Step 2: Revise -- the revision prompt carries only the original response
    # and the critique; the principles reach this step through the critique.
    revision_prompt = f"""Original response:
{original_response}

Critique of the response:
{critique}

Rewrite the response to address the issues identified in the critique while remaining helpful."""

    revised = _ask_model(revision_prompt, model)

    return revised, critique
</syntaxhighlight>

; Key alignment techniques and approaches
: '''RLHF''' → Collect human preference pairs → train reward model → optimize policy with PPO
: '''DPO (Direct Preference Optimization)''' → Directly optimize the policy on preference pairs without a separate reward model
: '''Constitutional AI''' → Chain: generate → critique against principles → revise → train on revised outputs
: '''RLAIF''' → Use AI feedback instead of (or in addition to) human feedback for scalability
: '''Debate''' → Two AI models argue; a human judges which argument is more convincing
: '''Mechanistic interpretability''' → Reverse-engineer the circuits inside transformers that implement specific behaviors
</div>

<div style="background-color: #8B4500; color: #FFFFFF; padding: 20px; border-radius: 8px; margin-bottom: 15px;">