Add VoiceDNA profile from manual replies

2025-12-24 12:09:08 +00:00 · 2025-12-24 12:09:08 +00:00 · 25e306d4b0
commit 25e306d4b0
parent 817995ac6b
3 changed files with 951 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -141,6 +141,18 @@ Generate the deeper “no raw quotes” report directly from an Instagram export

 - `python3 -m sergio_instagram_messaging.generate_dm_report_detailed --export-input /path/to/export-root --out /root/tmp/dm_history_report_en_detailed.md`

+## VoiceDNA (manual reply style)
+
+`voice_dna/socialmediatorr.voice-dna.json` is a **safe-to-store** style fingerprint generated from the last 6 months of **manual (non-template) DM replies** (no raw DM quotes are included).
+
+It also encodes a hard rule for the bot:
+
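+- Always reply in the **user’s input language** (English / Spanish / French / Catalan). If the latest message is too short to detect a language, reuse the last language confidently detected in the same thread; if there is still no signal, ask a short one-line clarification about which language they prefer.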
+
+Regenerate from a local Instagram export folder:
+
+- `python3 -m sergio_instagram_messaging.generate_voice_dna --export-input /path/to/export-root --out voice_dna/socialmediatorr.voice-dna.json --owner-name "Sergio de Vocht" --window-months 6`
+
 ## Webhooks (new messages → auto-reply)

 Meta webhooks are two steps:
--- a/sergio_instagram_messaging/generate_voice_dna.py
+++ b/sergio_instagram_messaging/generate_voice_dna.py
@ -0,0 +1,471 @@
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import statistics
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Iterable, Literal
+
+from .analyze_instagram_export import canonicalize_text
+from .generate_dm_report_detailed import _infer_owner_name, _load_events, _resolve_export_root
+
+
+# Closed set of labels a message's language can be classified as. The last
+# label is used when the text does not carry enough signal to classify.
+VoiceLang = Literal["English", "Spanish", "French", "Catalan", "Too short to tell"]
+
+
+# Case-insensitive: "http://", "https://" or "www." followed by everything up
+# to the next whitespace.
+_RE_URL = re.compile(r"(?i)\b(?:https?://|www\.)\S+")
+# "@" that is not preceded by a word character (so "user@example" is not a
+# handle), followed by at least two word / "." / "_" characters.
+_RE_HANDLE = re.compile(r"(?<!\w)@[\w._]{2,}")
+# Any single digit character.
+_RE_DIGIT = re.compile(r"\d")
+
+
+def _now_utc_iso() -> str:
+    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()
+
+
+def _parse_iso_utc(s: str) -> datetime:
+    """
+    Accept an ISO timestamp like:
+      - 2025-12-24T12:34:56Z
+      - 2025-12-24T12:34:56+00:00
+      - 2025-12-24T12:34:56
+    Defaults to UTC if tzinfo is missing.
+    """
+    raw = (s or "").strip()
+    if not raw:
+        raise ValueError("empty datetime")
+    if raw.endswith("Z"):
+        raw = raw[:-1] + "+00:00"
+    dt = datetime.fromisoformat(raw)
+    if dt.tzinfo is None:
+        dt = dt.replace(tzinfo=timezone.utc)
+    return dt.astimezone(timezone.utc).replace(microsecond=0)
+
+
+def _days_in_month(year: int, month: int) -> int:
+    if month in (1, 3, 5, 7, 8, 10, 12):
+        return 31
+    if month in (4, 6, 9, 11):
+        return 30
+    # February
+    leap = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)
+    return 29 if leap else 28
+
+
+def _subtract_months(dt: datetime, months: int) -> datetime:
+    if months <= 0:
+        return dt
+    year = dt.year
+    month = dt.month - months
+    while month <= 0:
+        month += 12
+        year -= 1
+    day = min(dt.day, _days_in_month(year, month))
+    return dt.replace(year=year, month=month, day=day)
+
+
+def _tokenize(text: str) -> list[str]:
+    return re.findall(r"\b[\w']+\b", (text or "").lower(), flags=re.UNICODE)
+
+
+def _guess_lang(text: str) -> VoiceLang:
+    # Keep this consistent with generate_dm_report_detailed.py
+    from .generate_dm_report_detailed import _guess_lang as _guess_lang_impl
+
+    return _guess_lang_impl(text)
+
+
+def _is_system_new_follower_message(text: str) -> bool:
+    s = (text or "").strip().lower()
+    return s.startswith("you messaged") and "followed your account" in s
+
+
+def _iter_emojis(text: str) -> Iterable[str]:
+    # Heuristic: treat unicode "Symbol, other" as emoji-like and ignore punctuation/formatting.
+    # This avoids counting typographic quotes (e.g., “ ”) and ellipses (…) as emojis.
+    import unicodedata
+
+    for ch in (text or ""):
+        if ch.isspace():
+            continue
+        cat = unicodedata.category(ch)
+        if cat != "So":
+            continue
+        yield ch
+
+
+def _quantile(values: list[int], q: float) -> int | None:
+    if not values:
+        return None
+    if q <= 0:
+        return min(values)
+    if q >= 1:
+        return max(values)
+    xs = sorted(values)
+    idx = int(round((len(xs) - 1) * q))
+    return xs[max(0, min(len(xs) - 1, idx))]
+
+
+def _pct(num: int, den: int) -> float:
+    return 0.0 if den <= 0 else (num / den) * 100.0
+
+
+# Per-language lowercase tokens. A reply counts as "starting with a greeting"
+# only when its first token is in the set for the reply's detected language,
+# so replies labelled "Too short to tell" never match (empty set).
+_GREETING_WORDS: dict[VoiceLang, set[str]] = {
+    "English": {"hi", "hey", "hello"},
+    "Spanish": {"hola", "buenas", "buenos"},
+    "French": {"salut", "bonjour"},
+    "Catalan": {"hola", "bon"},
+    "Too short to tell": set(),
+}
+
+# Tokens that mark a reply as containing thanks, looked up by detected language.
+# Catalan lists the word both with and without the accent.
+_THANKS_WORDS: dict[VoiceLang, set[str]] = {
+    "English": {"thanks", "thank"},
+    "Spanish": {"gracias"},
+    "French": {"merci"},
+    "Catalan": {"gracies", "gràcies"},
+    "Too short to tell": set(),
+}
+
+# Tokens that mark a reply as containing call-to-action terms (link, call,
+# book, price and their translations), looked up by detected language.
+_CTA_WORDS: dict[VoiceLang, set[str]] = {
+    "English": {"link", "call", "book", "ebook", "price"},
+    "Spanish": {"enlace", "link", "llamada", "libro", "precio"},
+    "French": {"lien", "appel", "livre", "prix"},
+    "Catalan": {"enllaç", "enllac", "trucada", "llibre", "preu"},
+    "Too short to tell": set(),
+}
+
+
+@dataclass(frozen=True)
+class OutgoingSample:
+    """Immutable per-message style features for one outgoing (owner) message.
+
+    Only derived features and the canonicalized text are stored; instances are
+    built by _sample_from_text().
+    """
+
+    # Message timestamp; compared against epoch-millisecond window bounds by the caller.
+    ts_ms: int
+    # Canonicalized message text, used to match the message against repeated templates.
+    canon: str
+    # Detected language label of the stripped message text.
+    lang: VoiceLang
+    # Length of the stripped message text in characters.
+    chars: int
+    # Number of word tokens in the message.
+    words: int
+    # True when at least one emoji-like character was found.
+    has_emoji: bool
+    # Every emoji-like character found, in order, duplicates included.
+    emoji: tuple[str, ...]
+    # True when the text contains "?" or "¿" anywhere.
+    has_question: bool
+    # True when the stripped text ends with "?".
+    ends_with_question: bool
+    # True when the text contains "!" or "¡" anywhere.
+    has_exclamation: bool
+    # True when the stripped text contains a newline.
+    has_linebreak: bool
+    # True when the text contains an http(s):// or www. link.
+    has_url: bool
+    # True when the text contains an @handle.
+    has_handle: bool
+    # True when the text contains at least one digit.
+    has_number: bool
+    # True when the first token is a greeting word for the detected language.
+    starts_with_greeting: bool
+    # True when any token is a thanks word for the detected language.
+    contains_thanks: bool
+    # True when any token is a call-to-action word for the detected language.
+    contains_cta_terms: bool
+
+
+def _sample_from_text(*, ts_ms: int, canon: str, text: str) -> OutgoingSample:
+    s = (text or "").strip()
+    lang = _guess_lang(s)
+    toks = _tokenize(s)
+
+    emojis = tuple(_iter_emojis(s))
+    has_question = "?" in s or "¿" in s
+    ends_with_question = s.endswith("?")
+    has_exclamation = "!" in s or "¡" in s
+    has_linebreak = "\n" in s
+    has_url = bool(_RE_URL.search(s))
+    has_handle = bool(_RE_HANDLE.search(s))
+    has_number = bool(_RE_DIGIT.search(s))
+
+    first_word = toks[0] if toks else ""
+    starts_with_greeting = bool(first_word and first_word in _GREETING_WORDS.get(lang, set()))
+    contains_thanks = bool(set(toks) & _THANKS_WORDS.get(lang, set()))
+    contains_cta_terms = bool(set(toks) & _CTA_WORDS.get(lang, set()))
+
+    return OutgoingSample(
+        ts_ms=int(ts_ms),
+        canon=canon,
+        lang=lang,
+        chars=len(s),
+        words=len(toks),
+        has_emoji=bool(emojis),
+        emoji=emojis,
+        has_question=has_question,
+        ends_with_question=ends_with_question,
+        has_exclamation=has_exclamation,
+        has_linebreak=has_linebreak,
+        has_url=has_url,
+        has_handle=has_handle,
+        has_number=has_number,
+        starts_with_greeting=starts_with_greeting,
+        contains_thanks=contains_thanks,
+        contains_cta_terms=contains_cta_terms,
+    )
+
+
+def _lang_bucket(samples: list[OutgoingSample]) -> dict[VoiceLang, list[OutgoingSample]]:
+    by: dict[VoiceLang, list[OutgoingSample]] = defaultdict(list)
+    for s in samples:
+        by[s.lang].append(s)
+    # Ensure stable order in output (even if empty).
+    for lang in ("English", "Spanish", "French", "Catalan", "Too short to tell"):
+        by.setdefault(lang, [])
+    return dict(by)
+
+
+def _summarize_samples(samples: list[OutgoingSample]) -> dict[str, Any]:
+    if not samples:
+        return {
+            "count": 0,
+            "length": {"chars": {}, "words": {}},
+            "rates": {},
+            "top_emojis": [],
+        }
+
+    chars = [s.chars for s in samples]
+    words = [s.words for s in samples]
+
+    emoji_counter: Counter[str] = Counter()
+    for s in samples:
+        emoji_counter.update(s.emoji)
+
+    def rates() -> dict[str, float]:
+        n = len(samples)
+        return {
+            "emoji_messages_pct": _pct(sum(1 for s in samples if s.has_emoji), n),
+            "question_messages_pct": _pct(sum(1 for s in samples if s.has_question), n),
+            "ends_with_question_pct": _pct(sum(1 for s in samples if s.ends_with_question), n),
+            "exclamation_messages_pct": _pct(sum(1 for s in samples if s.has_exclamation), n),
+            "linebreak_messages_pct": _pct(sum(1 for s in samples if s.has_linebreak), n),
+            "url_messages_pct": _pct(sum(1 for s in samples if s.has_url), n),
+            "handle_messages_pct": _pct(sum(1 for s in samples if s.has_handle), n),
+            "number_messages_pct": _pct(sum(1 for s in samples if s.has_number), n),
+            "starts_with_greeting_pct": _pct(sum(1 for s in samples if s.starts_with_greeting), n),
+            "contains_thanks_pct": _pct(sum(1 for s in samples if s.contains_thanks), n),
+            "contains_cta_terms_pct": _pct(sum(1 for s in samples if s.contains_cta_terms), n),
+        }
+
+    def length_summary(values: list[int]) -> dict[str, int | float | None]:
+        return {
+            "min": min(values) if values else None,
+            "p10": _quantile(values, 0.10),
+            "median": int(statistics.median(values)) if values else None,
+            "p90": _quantile(values, 0.90),
+            "max": max(values) if values else None,
+            "mean": (sum(values) / len(values)) if values else None,
+        }
+
+    return {
+        "count": len(samples),
+        "length": {"chars": length_summary(chars), "words": length_summary(words)},
+        "rates": rates(),
+        "top_emojis": [{"emoji": e, "count": c} for e, c in emoji_counter.most_common(20)],
+    }
+
+
+def _choose_language_policy(supported_from_inbox: Counter[VoiceLang], supported_from_owner: Counter[VoiceLang]) -> dict[str, Any]:
+    supported = []
+    for lang in ("English", "Spanish", "French", "Catalan"):
+        if supported_from_inbox.get(lang, 0) > 0 or supported_from_owner.get(lang, 0) > 0:
+            supported.append(lang)
+    if not supported:
+        supported = ["English", "Spanish", "French", "Catalan"]
+
+    return {
+        "mode": "mirror_user_input_language",
+        "supported_languages": supported,
+        "rules": [
+            "Reply in the same language as the user's most recent message that contains enough text to classify.",
+            "Do not translate the user's message unless they explicitly ask for a translation.",
+            "Do not mix languages inside a single reply unless the user mixes languages first.",
+            "If the user's message is too short to classify, reuse the last confidently detected language in the same thread.",
+            "If there is still no signal, ask a 1-line clarification asking which language they prefer (keep it short).",
+        ],
+    }
+
+
+def generate_voice_dna(
+    *,
+    export_input: Path,
+    out_path: Path,
+    owner_name: str | None,
+    window_months: int,
+    response_window_hours: float,
+    scripted_min_count: int,
+    as_of_utc: datetime,
+) -> Path:
+    """
+    Scan an Instagram export and write the VoiceDNA JSON file.
+
+    Every conversation directory under ``messages/inbox`` is read. Owner text
+    messages are counted by canonical form across the whole export to detect
+    repeated templates; owner messages inside the recent window that follow an
+    inbound message closely enough become candidate replies, and candidates
+    whose canonical form is not a template are summarized as "manual" replies.
+
+    Args:
+        export_input: Path handed to _resolve_export_root() to locate the export.
+        out_path: Destination JSON file; its parent directory is created if missing.
+        owner_name: Sender name of the account owner. When empty/None it is
+            inferred from the export.
+        window_months: Size of the analysis window, in calendar months before
+            ``as_of_utc``.
+        response_window_hours: Maximum delay after the latest inbound message
+            for an owner message to count as a reply.
+        scripted_min_count: Canonical texts sent at least this many times are
+            treated as scripted templates.
+        as_of_utc: End of the analysis window (converted to UTC, microseconds
+            dropped).
+
+    Returns:
+        ``out_path``, after the file has been written.
+
+    Raises:
+        RuntimeError: When no owner name was given and none could be inferred.
+    """
+    export_root = _resolve_export_root(export_input)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    if not owner_name or not owner_name.strip():
+        owner_name = _infer_owner_name(export_root)
+    if not owner_name:
+        raise RuntimeError("Could not infer owner name. Pass --owner-name.")
+
+    # Window bounds as epoch milliseconds, the unit event timestamps are compared in.
+    as_of_utc = as_of_utc.astimezone(timezone.utc).replace(microsecond=0)
+    start_utc = _subtract_months(as_of_utc, int(window_months))
+    start_ts_ms = int(start_utc.timestamp() * 1000)
+    end_ts_ms = int(as_of_utc.timestamp() * 1000)
+    response_window_ms = int(float(response_window_hours) * 3600 * 1000)
+
+    # template_counts spans the full export; the other accumulators only cover the window.
+    template_counts: Counter[str] = Counter()
+    inbound_lang_counts_window: Counter[VoiceLang] = Counter()
+
+    candidate_responses: list[OutgoingSample] = []
+    scanned_conversations = 0
+    scanned_message_files = 0
+
+    # NOTE(review): iterdir() raises if the inbox directory is missing; main()
+    # maps FileNotFoundError to exit code 2.
+    inbox_root = export_root / "messages" / "inbox"
+    for conv_dir in inbox_root.iterdir():
+        if not conv_dir.is_dir():
+            continue
+        scanned_conversations += 1
+        parts = sorted(conv_dir.glob("message*.json"), key=lambda p: p.name)
+        if not parts:
+            continue
+        scanned_message_files += len(parts)
+        events = _load_events(parts, owner_name=owner_name)
+        if not events:
+            continue
+
+        # Timestamp of the latest inbound event seen so far in this conversation.
+        # NOTE(review): assumes _load_events returns events in chronological order — confirm.
+        last_inbound_ts: int | None = None
+        for e in events:
+            # Track inbound language (for mirroring policy) for the same recent window.
+            if not e.is_owner and isinstance(e.text, str) and e.text.strip():
+                if start_ts_ms <= e.ts_ms <= end_ts_ms:
+                    inbound_lang_counts_window[_guess_lang(e.text)] += 1
+                last_inbound_ts = e.ts_ms
+                continue
+
+            # Inbound events without usable text still reset the reply clock.
+            if not e.is_owner:
+                last_inbound_ts = e.ts_ms
+                continue
+
+            # From here on the event is an owner message; skip ones without text.
+            if not isinstance(e.text, str) or not e.text.strip():
+                continue
+
+            # Count every owner text (no window filter) so templates are
+            # detected from the whole export.
+            canon = canonicalize_text(e.text)
+            if canon:
+                template_counts[canon] += 1
+
+            # Style samples: only use manual "responses" in the recent window.
+            if _is_system_new_follower_message(e.text):
+                continue
+            if not (start_ts_ms <= e.ts_ms <= end_ts_ms):
+                continue
+            # A reply needs a preceding inbound event that is not newer than
+            # the owner message and not older than the response window.
+            if last_inbound_ts is None:
+                continue
+            if e.ts_ms < last_inbound_ts:
+                continue
+            if (e.ts_ms - last_inbound_ts) > response_window_ms:
+                continue
+
+            if not canon:
+                continue
+            candidate_responses.append(_sample_from_text(ts_ms=e.ts_ms, canon=canon, text=e.text))
+
+    # Template detection has to wait until all conversations have been counted.
+    scripted_templates = {canon for canon, cnt in template_counts.items() if cnt >= int(scripted_min_count)}
+    manual_responses = [s for s in candidate_responses if s.canon not in scripted_templates]
+
+    owner_lang_counts = Counter([s.lang for s in manual_responses])
+    policy = _choose_language_policy(inbound_lang_counts_window, owner_lang_counts)
+
+    by_lang = _lang_bucket(manual_responses)
+    per_lang_summary = {lang: _summarize_samples(by_lang[lang]) for lang in by_lang}
+
+    # Only aggregates, counts and the export directory name go into the output;
+    # no message text is written.
+    voice_dna: dict[str, Any] = {
+        "schema_version": "voice_dna/v1",
+        "created_at_utc": _now_utc_iso(),
+        "subject": {
+            # NOTE(review): the account handle is hard-coded, not derived from the export.
+            "account": "@socialmediatorr",
+            "owner_name": owner_name,
+            "scope": "Instagram DMs",
+        },
+        "source": {
+            "type": "instagram_export",
+            "window": {
+                "months": int(window_months),
+                "start_utc": start_utc.isoformat(),
+                "end_utc": as_of_utc.isoformat(),
+                "response_window_hours": float(response_window_hours),
+            },
+            "classification": {
+                "manual_reply_definition": "owner text message within response_window_hours of the most recent inbound message, excluding repeated templates",
+                "scripted_template_definition": f"owner canonicalized text sent >= {int(scripted_min_count)} times across full export",
+                "system_messages_excluded": ["you messaged <user> because they followed your account"],
+            },
+            "scan": {
+                "export_root_hint": export_root.name,
+                "scanned_conversations": scanned_conversations,
+                "scanned_message_files": scanned_message_files,
+                "candidate_responses_in_window": len(candidate_responses),
+                "manual_responses_in_window": len(manual_responses),
+                "scripted_template_count": len(scripted_templates),
+            },
+        },
+        "policies": {
+            "language": policy,
+        },
+        "language_observed": {
+            "inbound_last_window_counts": dict(inbound_lang_counts_window),
+            "manual_reply_counts": dict(owner_lang_counts),
+        },
+        "style": {
+            "manual_replies": {
+                "overall": _summarize_samples(manual_responses),
+                "by_language": per_lang_summary,
+            }
+        },
+    }
+
+    # ensure_ascii=False keeps emojis and accented characters readable in the file.
+    out_path.write_text(json.dumps(voice_dna, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+    return out_path
+
+
+def main(argv: list[str] | None = None) -> int:
+    ap = argparse.ArgumentParser(
+        description="Generate a VoiceDNA JSON file for recent manual DM replies (style fingerprint + language mirroring policy)."
+    )
+    ap.add_argument("--export-input", required=True, help="Instagram export root (contains messages/inbox)")
+    ap.add_argument("--out", required=True, help="output path (JSON)")
+    ap.add_argument("--owner-name", default=None, help='owner sender_name (e.g., "Sergio de Vocht")')
+    ap.add_argument("--window-months", type=int, default=6, help="how many recent months to use")
+    ap.add_argument(
+        "--response-window-hours",
+        type=float,
+        default=72.0,
+        help="max hours after inbound message for an outgoing message to count as a response",
+    )
+    ap.add_argument(
+        "--scripted-min-count",
+        type=int,
+        default=50,
+        help="owner canonicalized text sent >= this count is treated as scripted/templated",
+    )
+    ap.add_argument(
+        "--as-of-utc",
+        default=None,
+        help="analysis end time (UTC ISO); default: now",
+    )
+    args = ap.parse_args(argv)
+
+    as_of = _parse_iso_utc(args.as_of_utc) if args.as_of_utc else datetime.now(timezone.utc).replace(microsecond=0)
+
+    try:
+        out = generate_voice_dna(
+            export_input=Path(args.export_input),
+            out_path=Path(args.out),
+            owner_name=(args.owner_name.strip() if args.owner_name else None),
+            window_months=int(args.window_months),
+            response_window_hours=float(args.response_window_hours),
+            scripted_min_count=int(args.scripted_min_count),
+            as_of_utc=as_of,
+        )
+        print(json.dumps({"ok": True, "out": str(out)}, ensure_ascii=False))
+        return 0
+    except FileNotFoundError as e:
+        print(str(e))
+        return 2
+    except Exception as e:
+        print(f"VoiceDNA generation failed: {e}")
+        return 1
+
+
+# When run as a script or with "python -m", use main()'s return value as the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/voice_dna/socialmediatorr.voice-dna.json
+++ b/voice_dna/socialmediatorr.voice-dna.json
@ -0,0 +1,468 @@
+{
+  "schema_version": "voice_dna/v1",
+  "created_at_utc": "2025-12-24T12:08:24+00:00",
+  "subject": {
+    "account": "@socialmediatorr",
+    "owner_name": "Sergio de Vocht",
+    "scope": "Instagram DMs"
+  },
+  "source": {
+    "type": "instagram_export",
+    "window": {
+      "months": 6,
+      "start_utc": "2025-06-24T12:08:20+00:00",
+      "end_utc": "2025-12-24T12:08:20+00:00",
+      "response_window_hours": 72.0
+    },
+    "classification": {
+      "manual_reply_definition": "owner text message within response_window_hours of the most recent inbound message, excluding repeated templates",
+      "scripted_template_definition": "owner canonicalized text sent >= 50 times across full export",
+      "system_messages_excluded": [
+        "you messaged <user> because they followed your account"
+      ]
+    },
+    "scan": {
+      "export_root_hint": "socialmediatorr-ig-export-raw-20251224",
+      "scanned_conversations": 10100,
+      "scanned_message_files": 10061,
+      "candidate_responses_in_window": 18934,
+      "manual_responses_in_window": 825,
+      "scripted_template_count": 24
+    }
+  },
+  "policies": {
+    "language": {
+      "mode": "mirror_user_input_language",
+      "supported_languages": [
+        "English",
+        "Spanish",
+        "French",
+        "Catalan"
+      ],
+      "rules": [
+        "Reply in the same language as the user's most recent message that contains enough text to classify.",
+        "Do not translate the user's message unless they explicitly ask for a translation.",
+        "Do not mix languages inside a single reply unless the user mixes languages first.",
+        "If the user's message is too short to classify, reuse the last confidently detected language in the same thread.",
+        "If there is still no signal, ask a 1-line clarification asking which language they prefer (keep it short)."
+      ]
+    }
+  },
+  "language_observed": {
+    "inbound_last_window_counts": {
+      "English": 2828,
+      "Too short to tell": 5024,
+      "Spanish": 2176,
+      "French": 191,
+      "Catalan": 16
+    },
+    "manual_reply_counts": {
+      "Spanish": 541,
+      "Too short to tell": 255,
+      "English": 17,
+      "French": 1,
+      "Catalan": 11
+    }
+  },
+  "style": {
+    "manual_replies": {
+      "overall": {
+        "count": 825,
+        "length": {
+          "chars": {
+            "min": 2,
+            "p10": 13,
+            "median": 54,
+            "p90": 199,
+            "max": 928,
+            "mean": 87.47878787878788
+          },
+          "words": {
+            "min": 1,
+            "p10": 2,
+            "median": 11,
+            "p90": 34,
+            "max": 169,
+            "mean": 15.436363636363636
+          }
+        },
+        "rates": {
+          "emoji_messages_pct": 16.363636363636363,
+          "question_messages_pct": 32.484848484848484,
+          "ends_with_question_pct": 27.151515151515156,
+          "exclamation_messages_pct": 18.303030303030305,
+          "linebreak_messages_pct": 9.212121212121211,
+          "url_messages_pct": 4.121212121212121,
+          "handle_messages_pct": 0.0,
+          "number_messages_pct": 9.454545454545455,
+          "starts_with_greeting_pct": 4.96969696969697,
+          "contains_thanks_pct": 4.484848484848484,
+          "contains_cta_terms_pct": 4.848484848484849
+        },
+        "top_emojis": [
+          {
+            "emoji": "🙌",
+            "count": 41
+          },
+          {
+            "emoji": "🫶",
+            "count": 23
+          },
+          {
+            "emoji": "👋",
+            "count": 19
+          },
+          {
+            "emoji": "🎁",
+            "count": 17
+          },
+          {
+            "emoji": "💪",
+            "count": 14
+          },
+          {
+            "emoji": "👇",
+            "count": 12
+          },
+          {
+            "emoji": "😜",
+            "count": 7
+          },
+          {
+            "emoji": "😉",
+            "count": 4
+          },
+          {
+            "emoji": "🤣",
+            "count": 2
+          },
+          {
+            "emoji": "🙏",
+            "count": 2
+          },
+          {
+            "emoji": "⬆",
+            "count": 2
+          },
+          {
+            "emoji": "👀",
+            "count": 2
+          },
+          {
+            "emoji": "¦",
+            "count": 1
+          },
+          {
+            "emoji": "😅",
+            "count": 1
+          },
+          {
+            "emoji": "🤝",
+            "count": 1
+          },
+          {
+            "emoji": "☺",
+            "count": 1
+          },
+          {
+            "emoji": "🫡",
+            "count": 1
+          },
+          {
+            "emoji": "😁",
+            "count": 1
+          },
+          {
+            "emoji": "👌",
+            "count": 1
+          },
+          {
+            "emoji": "🚀",
+            "count": 1
+          }
+        ]
+      },
+      "by_language": {
+        "Spanish": {
+          "count": 541,
+          "length": {
+            "chars": {
+              "min": 5,
+              "p10": 35,
+              "median": 99,
+              "p90": 211,
+              "max": 928,
+              "mean": 116.06469500924214
+            },
+            "words": {
+              "min": 1,
+              "p10": 6,
+              "median": 18,
+              "p90": 38,
+              "max": 169,
+              "mean": 20.478743068391868
+            }
+          },
+          "rates": {
+            "emoji_messages_pct": 20.70240295748614,
+            "question_messages_pct": 39.55637707948244,
+            "ends_with_question_pct": 32.34750462107209,
+            "exclamation_messages_pct": 16.266173752310536,
+            "linebreak_messages_pct": 13.67837338262477,
+            "url_messages_pct": 2.2181146025878005,
+            "handle_messages_pct": 0.0,
+            "number_messages_pct": 9.242144177449168,
+            "starts_with_greeting_pct": 7.578558225508318,
+            "contains_thanks_pct": 6.839186691312385,
+            "contains_cta_terms_pct": 7.024029574861368
+          },
+          "top_emojis": [
+            {
+              "emoji": "🙌",
+              "count": 24
+            },
+            {
+              "emoji": "🫶",
+              "count": 23
+            },
+            {
+              "emoji": "👋",
+              "count": 19
+            },
+            {
+              "emoji": "🎁",
+              "count": 17
+            },
+            {
+              "emoji": "👇",
+              "count": 12
+            },
+            {
+              "emoji": "💪",
+              "count": 10
+            },
+            {
+              "emoji": "😜",
+              "count": 7
+            },
+            {
+              "emoji": "😉",
+              "count": 4
+            },
+            {
+              "emoji": "🤣",
+              "count": 2
+            },
+            {
+              "emoji": "🙏",
+              "count": 2
+            },
+            {
+              "emoji": "⬆",
+              "count": 2
+            },
+            {
+              "emoji": "👀",
+              "count": 2
+            },
+            {
+              "emoji": "😅",
+              "count": 1
+            },
+            {
+              "emoji": "🤝",
+              "count": 1
+            },
+            {
+              "emoji": "☺",
+              "count": 1
+            },
+            {
+              "emoji": "😁",
+              "count": 1
+            },
+            {
+              "emoji": "👌",
+              "count": 1
+            },
+            {
+              "emoji": "🚀",
+              "count": 1
+            },
+            {
+              "emoji": "⏱",
+              "count": 1
+            },
+            {
+              "emoji": "🫂",
+              "count": 1
+            }
+          ]
+        },
+        "Too short to tell": {
+          "count": 255,
+          "length": {
+            "chars": {
+              "min": 2,
+              "p10": 10,
+              "median": 30,
+              "p90": 53,
+              "max": 151,
+              "mean": 32.01176470588236
+            },
+            "words": {
+              "min": 1,
+              "p10": 1,
+              "median": 4,
+              "p90": 11,
+              "max": 29,
+              "mean": 5.749019607843137
+            }
+          },
+          "rates": {
+            "emoji_messages_pct": 7.8431372549019605,
+            "question_messages_pct": 19.607843137254903,
+            "ends_with_question_pct": 17.647058823529413,
+            "exclamation_messages_pct": 23.52941176470588,
+            "linebreak_messages_pct": 0.39215686274509803,
+            "url_messages_pct": 4.705882352941177,
+            "handle_messages_pct": 0.0,
+            "number_messages_pct": 7.0588235294117645,
+            "starts_with_greeting_pct": 0.0,
+            "contains_thanks_pct": 0.0,
+            "contains_cta_terms_pct": 0.0
+          },
+          "top_emojis": [
+            {
+              "emoji": "🙌",
+              "count": 15
+            },
+            {
+              "emoji": "💪",
+              "count": 4
+            },
+            {
+              "emoji": "🫡",
+              "count": 1
+            }
+          ]
+        },
+        "English": {
+          "count": 17,
+          "length": {
+            "chars": {
+              "min": 4,
+              "p10": 11,
+              "median": 23,
+              "p90": 38,
+              "max": 40,
+              "mean": 23.176470588235293
+            },
+            "words": {
+              "min": 2,
+              "p10": 3,
+              "median": 4,
+              "p90": 6,
+              "max": 8,
+              "mean": 4.588235294117647
+            }
+          },
+          "rates": {
+            "emoji_messages_pct": 11.76470588235294,
+            "question_messages_pct": 23.52941176470588,
+            "ends_with_question_pct": 23.52941176470588,
+            "exclamation_messages_pct": 17.647058823529413,
+            "linebreak_messages_pct": 0.0,
+            "url_messages_pct": 0.0,
+            "handle_messages_pct": 0.0,
+            "number_messages_pct": 0.0,
+            "starts_with_greeting_pct": 0.0,
+            "contains_thanks_pct": 0.0,
+            "contains_cta_terms_pct": 11.76470588235294
+          },
+          "top_emojis": [
+            {
+              "emoji": "🙌",
+              "count": 2
+            }
+          ]
+        },
+        "French": {
+          "count": 1,
+          "length": {
+            "chars": {
+              "min": 29,
+              "p10": 29,
+              "median": 29,
+              "p90": 29,
+              "max": 29,
+              "mean": 29.0
+            },
+            "words": {
+              "min": 5,
+              "p10": 5,
+              "median": 5,
+              "p90": 5,
+              "max": 5,
+              "mean": 5.0
+            }
+          },
+          "rates": {
+            "emoji_messages_pct": 100.0,
+            "question_messages_pct": 0.0,
+            "ends_with_question_pct": 0.0,
+            "exclamation_messages_pct": 0.0,
+            "linebreak_messages_pct": 0.0,
+            "url_messages_pct": 0.0,
+            "handle_messages_pct": 0.0,
+            "number_messages_pct": 0.0,
+            "starts_with_greeting_pct": 0.0,
+            "contains_thanks_pct": 0.0,
+            "contains_cta_terms_pct": 0.0
+          },
+          "top_emojis": [
+            {
+              "emoji": "¦",
+              "count": 1
+            }
+          ]
+        },
+        "Catalan": {
+          "count": 11,
+          "length": {
+            "chars": {
+              "min": 26,
+              "p10": 48,
+              "median": 52,
+              "p90": 99,
+              "max": 205,
+              "mean": 72.0909090909091
+            },
+            "words": {
+              "min": 5,
+              "p10": 5,
+              "median": 8,
+              "p90": 10,
+              "max": 34,
+              "mean": 9.727272727272727
+            }
+          },
+          "rates": {
+            "emoji_messages_pct": 0.0,
+            "question_messages_pct": 0.0,
+            "ends_with_question_pct": 0.0,
+            "exclamation_messages_pct": 0.0,
+            "linebreak_messages_pct": 9.090909090909092,
+            "url_messages_pct": 90.9090909090909,
+            "handle_messages_pct": 0.0,
+            "number_messages_pct": 90.9090909090909,
+            "starts_with_greeting_pct": 0.0,
+            "contains_thanks_pct": 0.0,
+            "contains_cta_terms_pct": 0.0
+          },
+          "top_emojis": []
+        }
+      }
+    }
+  }
+}