if-emotion-ux/backend/claude_api_server_rag.py
Danny Stocker 39f38d19ad feat: Add Export modal, Settings with Personality DNA, Privacy mode
UI Enhancements:
- Add Export button to header with download icon
- Export modal with 4 format options: PDF, Markdown, JSON, Plain Text
- Settings modal with Personality DNA toggle and API configuration
- Privacy mode (Off the Record) toggle in header
- Improved header layout with proper button spacing

Backend:
- Add Claude API server for backend integration
- Add RAG-enabled variant for future document retrieval

Technical:
- Add data-testid for Export button for testing
- Update dependencies for deployment compatibility

Deployed to Proxmox container 200 at 85.239.243.227

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-30 15:58:55 +01:00


#!/usr/bin/env python3
"""
Claude Max API Server v2.1 - with ChromaDB RAG for Sergio Personality DNA
OpenAI-compatible endpoint for if.emotion frontend with full RAG retrieval.
Based on: https://idsc2025.substack.com/p/how-i-built-claude_max-to-unlock
Usage:
    python claude_api_server_rag.py [--host 0.0.0.0] [--port 3001]
"""
import os
import sys
import json
import subprocess
from pathlib import Path
from datetime import datetime
from typing import Generator
import uuid
try:
    from flask import Flask, request, Response, jsonify
    from flask_cors import CORS
except ImportError:
    print("Installing flask...")
    subprocess.run([sys.executable, "-m", "pip", "install", "flask", "flask-cors", "-q"])
    from flask import Flask, request, Response, jsonify
    from flask_cors import CORS

# ChromaDB for RAG
try:
    import chromadb
    CHROMADB_AVAILABLE = True
except ImportError:
    print("ChromaDB not available - running without RAG")
    CHROMADB_AVAILABLE = False
app = Flask(__name__)
CORS(app)
# Configuration
CLAUDE_CLI = Path.home() / ".local/bin/claude"
CREDENTIALS_FILE = Path.home() / ".claude/.credentials.json"
CHROMADB_PATH = "/root/sergio_chatbot/chromadb"
# Collection names (all 4 populated)
COLLECTIONS = {
    "personality": "sergio_personality",  # 20 docs - frameworks, values, constraints
    "rhetorical": "sergio_rhetorical",    # 5 docs - rhetorical devices
    "humor": "sergio_humor",              # 28 docs - humor patterns
    "corpus": "sergio_corpus"             # 70 docs - conversation examples
}
# Initialize ChromaDB
chroma_client = None
chroma_collections = {}
if CHROMADB_AVAILABLE:
    try:
        chroma_client = chromadb.PersistentClient(path=CHROMADB_PATH)
        for key, name in COLLECTIONS.items():
            try:
                chroma_collections[key] = chroma_client.get_collection(name)
                print(f"Loaded {name}: {chroma_collections[key].count()} docs")
            except Exception as e:
                print(f"Warning: Could not load {name}: {e}")
    except Exception as e:
        print(f"ChromaDB initialization failed: {e}")
# Sergio System Prompt (base)
SERGIO_SYSTEM_PROMPT = """You are Sergio, a Spanish-speaking psychologist who specializes in debunking abstract spiritual language and operationalizing emotional concepts into concrete, testable frameworks.
CORE PERSONALITY:
- Direct and brash communication style - challenge unfalsifiable claims
- Systems-first thinker (context/culture factors, not individual blame)
- Bilingual Spanish/English code-switching is natural for you
- Convert abstract emotional language into concrete operational definitions
- Apply the Identity=Interaction framework: identity emerges from relational patterns, not fixed essence
VOICE GUIDELINES (MANDATORY):
- NEVER use bullet points or numbered lists - always narrative flow
- Variable sentence length pattern: short punchy + longer flowing explanation + short again
- Validate emotional reality FIRST, then challenge interpretation
- End with concrete operationalization: "What specific behavior in the next 30 minutes?"
RHETORICAL TOOLS:
- Aspiradora metaphor: When someone drowns in complexity, simplify to binary. "Una aspiradora no necesita 50 tipos de suciedad etiquetados. It needs one question: Is there dirt? Yes or no?"
- Reframing: "The problem isn't X. The problem is Y."
- Pattern exposure: "Here's what actually happens..."
- Counterexample testing: "What would falsify that belief?"
SPANISH USAGE:
- Use Spanish for emotional validation: "Mira, eso no está mal"
- Use Spanish for cultural concepts: vínculos, vergüenza ajena, sobremadre
- Use colloquial markers: tío, vale, pues, mira
- NEVER use formal Spanish: no obstante, asimismo, consecuentemente
ANTI-PATTERNS (NEVER DO):
- Never pathologize neurodivergence - frame as context mismatch, not deficit
- Never use "Furthermore", "In conclusion", "One could argue"
- Never create equal-length paragraphs
- Never give prescriptions without mechanism explanations
EXAMPLE RESPONSE STRUCTURE:
Hook (challenge assumption) → Narrative (explain mechanism) → Operationalization (concrete action) → Provocation (opening question)
{personality_context}"""
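
# The {personality_context} placeholder above is filled per request with the
# text returned by retrieve_context() (see chat_completions below), so the
# base prompt stays static while the RAG excerpts change with each message.
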
def retrieve_context(user_message: str) -> str:
    """Query all ChromaDB collections for relevant Sergio context."""
    if not chroma_client or not chroma_collections:
        return ""
    context_parts = []
    try:
        # Query corpus for similar conversation examples (most important)
        if "corpus" in chroma_collections:
            corpus_results = chroma_collections["corpus"].query(
                query_texts=[user_message],
                n_results=3
            )
            if corpus_results and corpus_results['documents'] and corpus_results['documents'][0]:
                context_parts.append("CONVERSATION EXAMPLES FROM SERGIO:")
                for doc in corpus_results['documents'][0]:
                    context_parts.append(doc[:500])  # Truncate long examples
        # Query personality for frameworks
        if "personality" in chroma_collections:
            personality_results = chroma_collections["personality"].query(
                query_texts=[user_message],
                n_results=2
            )
            if personality_results and personality_results['documents'] and personality_results['documents'][0]:
                context_parts.append("\nPERSONALITY FRAMEWORKS:")
                for doc in personality_results['documents'][0]:
                    context_parts.append(doc[:300])
        # Query rhetorical devices
        if "rhetorical" in chroma_collections:
            rhetorical_results = chroma_collections["rhetorical"].query(
                query_texts=[user_message],
                n_results=1
            )
            if rhetorical_results and rhetorical_results['documents'] and rhetorical_results['documents'][0]:
                context_parts.append("\nRHETORICAL DEVICE TO USE:")
                context_parts.append(rhetorical_results['documents'][0][0][:200])
        # Query humor patterns (only when the topic seems appropriate)
        humor_keywords = ['absurd', 'ridicul', 'spirit', 'vibra', 'energ', 'manifest', 'univers']
        if any(kw in user_message.lower() for kw in humor_keywords):
            if "humor" in chroma_collections:
                humor_results = chroma_collections["humor"].query(
                    query_texts=[user_message],
                    n_results=2
                )
                if humor_results and humor_results['documents'] and humor_results['documents'][0]:
                    context_parts.append("\nHUMOR PATTERNS TO DEPLOY:")
                    for doc in humor_results['documents'][0]:
                        context_parts.append(doc[:200])
    except Exception as e:
        print(f"RAG retrieval error: {e}")
    return "\n".join(context_parts) if context_parts else ""
def load_credentials():
    """Load Claude Max credentials."""
    if CREDENTIALS_FILE.exists():
        with open(CREDENTIALS_FILE) as f:
            return json.load(f)
    return None
def call_claude_cli(prompt: str, stream: bool = False) -> Generator[str, None, None]:
    """
    Call Claude CLI using Max subscription authentication.
    """
    env = os.environ.copy()
    # Remove API key to force subscription auth
    if "ANTHROPIC_API_KEY" in env:
        del env["ANTHROPIC_API_KEY"]
    env["CLAUDE_USE_SUBSCRIPTION"] = "true"
    try:
        if stream:
            process = subprocess.Popen(
                [str(CLAUDE_CLI), "--print", prompt],
                env=env,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # merge stderr so CLI errors are streamed, not left in an unread pipe
                text=True,
                bufsize=1
            )
            for line in process.stdout:
                yield line
            process.wait()
        else:
            result = subprocess.run(
                [str(CLAUDE_CLI), "--print", prompt],
                env=env,
                capture_output=True,
                text=True,
                timeout=300
            )
            if result.returncode != 0:
                yield f"[Error: Claude CLI exited with code {result.returncode}: {result.stderr.strip()}]"
            else:
                yield result.stdout
    except subprocess.TimeoutExpired:
        yield "[Error: Request timed out after 300s]"
    except Exception as e:
        yield f"[Error: {str(e)}]"
@app.route('/health', methods=['GET'])
def health():
    """Health check endpoint with RAG status."""
    creds = load_credentials()
    # Get collection counts
    collection_counts = {}
    for key, coll in chroma_collections.items():
        try:
            collection_counts[key] = coll.count()
        except Exception:
            collection_counts[key] = 0
    return jsonify({
        "status": "healthy",
        "service": "claude-max-api",
        "version": "2.1.0-rag",
        "subscription_type": creds.get("claudeAiOauth", {}).get("subscriptionType") if creds else None,
        "cli_path": str(CLAUDE_CLI),
        "cli_exists": CLAUDE_CLI.exists(),
        "chromadb_available": CHROMADB_AVAILABLE,
        "chromadb_path": CHROMADB_PATH,
        "collections": collection_counts
    })
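
# Quick liveness check (assuming the default port):
#   curl http://localhost:3001/health
# A healthy RAG setup reports non-zero counts for all four collections.
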
@app.route('/v1/models', methods=['GET'])
def list_models():
    """OpenAI-compatible models endpoint."""
    return jsonify({
        "object": "list",
        "data": [
            {
                "id": "sergio-rag",
                "object": "model",
                "created": int(datetime.now().timestamp()),
                "owned_by": "infrafabric",
                "permission": [],
                "root": "sergio-rag",
                "parent": None
            },
            {
                "id": "claude-max",
                "object": "model",
                "created": int(datetime.now().timestamp()),
                "owned_by": "anthropic"
            }
        ]
    })
@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    """OpenAI-compatible chat completions with Sergio personality + RAG."""
    data = request.get_json(silent=True) or {}  # tolerate missing/invalid JSON bodies
    messages = data.get('messages', [])
    stream = data.get('stream', False)
    model = data.get('model', 'sergio-rag')
    # Get the latest user message for RAG retrieval
    user_message = ""
    for msg in reversed(messages):
        if msg.get('role') == 'user':
            user_message = msg.get('content', '')
            break
    # Retrieve personality DNA context from ChromaDB
    personality_context = retrieve_context(user_message) if user_message else ""
    # Build system prompt with RAG context
    system_prompt = SERGIO_SYSTEM_PROMPT.format(
        personality_context=personality_context if personality_context else "No additional context retrieved."
    )
    # Flatten the chat history into a single plain-text prompt for the CLI
    prompt_parts = [f"System: {system_prompt}"]
    for msg in messages:
        role = msg.get('role', 'user')
        content = msg.get('content', '')
        if role == 'system':
            prompt_parts.append(f"System: {content}")
        elif role == 'assistant':
            prompt_parts.append(f"Assistant: {content}")
        else:
            prompt_parts.append(f"Human: {content}")
    prompt = "\n\n".join(prompt_parts)
    if stream:
        def generate():
            response_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
            created = int(datetime.now().timestamp())
            for chunk in call_claude_cli(prompt, stream=True):
                chunk_payload = {
                    "id": response_id,
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": model,
                    "choices": [{
                        "index": 0,
                        "delta": {"content": chunk},
                        "finish_reason": None
                    }]
                }
                yield f"data: {json.dumps(chunk_payload)}\n\n"
            final = {
                "id": response_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": model,
                "choices": [{
                    "index": 0,
                    "delta": {},
                    "finish_reason": "stop"
                }]
            }
            yield f"data: {json.dumps(final)}\n\n"
            yield "data: [DONE]\n\n"
        return Response(generate(), mimetype='text/event-stream')
    else:
        response_text = "".join(call_claude_cli(prompt, stream=False))
        return jsonify({
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion",
            "created": int(datetime.now().timestamp()),
            "model": model,
            "choices": [{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response_text.strip()
                },
                "finish_reason": "stop"
            }],
            # Rough usage estimate (~4 characters per token); the CLI does not report token counts
            "usage": {
                "prompt_tokens": len(prompt) // 4,
                "completion_tokens": len(response_text) // 4,
                "total_tokens": (len(prompt) + len(response_text)) // 4
            }
        })
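
# Streaming example (a sketch, assuming the default port): the response is
# standard SSE, one "chat.completion.chunk" JSON object per "data:" line,
# terminated by "data: [DONE]".
#   curl -N http://localhost:3001/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "sergio-rag", "stream": true, "messages": [{"role": "user", "content": "Hola"}]}'
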
@app.route('/api/chat/completions', methods=['POST'])
def api_chat_completions():
    """Alternative endpoint (Open WebUI style)."""
    return chat_completions()
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--port', type=int, default=3001)
    parser.add_argument('--host', default='0.0.0.0')
    args = parser.parse_args()

    # Show RAG status
    rag_status = "ENABLED" if chroma_collections else "DISABLED"
    total_docs = sum(c.count() for c in chroma_collections.values()) if chroma_collections else 0
    print(f"""
╔═══════════════════════════════════════════════════════════════╗
║   Claude Max API Server v2.1 (with RAG)                        ║
║   Backend for if.emotion with Sergio Personality DNA           ║
╠═══════════════════════════════════════════════════════════════╣
║   Endpoint:  http://{args.host}:{args.port}/v1/chat/completions
║   Health:    http://{args.host}:{args.port}/health
║   Models:    http://{args.host}:{args.port}/v1/models
╠═══════════════════════════════════════════════════════════════╣
║   RAG Status: {rag_status}
║   Total Docs: {total_docs} documents across 4 collections
╚═══════════════════════════════════════════════════════════════╝
""")
    # Debug mode stays off: the Werkzeug debugger must not be exposed on a
    # publicly reachable host (this service is deployed on a public IP).
    app.run(host=args.host, port=args.port, debug=False, threaded=True)