Add VoiceDNA profile from manual replies

parent 817995ac6b
commit 25e306d4b0

3 changed files with 951 additions and 0 deletions

README.md (12 changed lines)

@@ -141,6 +141,18 @@ Generate the deeper “no raw quotes” report directly from an Instagram export
- `python3 -m sergio_instagram_messaging.generate_dm_report_detailed --export-input /path/to/export-root --out /root/tmp/dm_history_report_en_detailed.md`

## VoiceDNA (manual reply style)

`voice_dna/socialmediatorr.voice-dna.json` is a **safe-to-store** style fingerprint generated from the last 6 months of **manual (non-template) DM replies**; no raw DM quotes are included.

It also encodes a hard rule for the bot:

- Always reply in the **user’s input language** (English / Spanish / French / Catalan), and ask a short clarification question if the user’s message is too short to detect the language (a minimal sketch of this mirroring logic follows).
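A minimal sketch of that mirroring rule, assuming a caller-supplied language detector (`pick_reply_language` and `detect` are illustrative names, not part of this repo):

```python
from collections.abc import Callable

SUPPORTED = ("English", "Spanish", "French", "Catalan")

def pick_reply_language(
    user_text: str,
    thread_history: list[str],
    detect: Callable[[str], str | None],  # hypothetical detector: supported language or None if too short
) -> str | None:
    """Mirror the user's input language, per the VoiceDNA language policy."""
    lang = detect(user_text)
    if lang in SUPPORTED:
        return lang
    # Too short to classify: reuse the last confidently detected language in the thread.
    for earlier in reversed(thread_history):
        lang = detect(earlier)
        if lang in SUPPORTED:
            return lang
    # Still no signal: the caller should send a one-line "which language do you prefer?" question.
    return None
```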
Regenerate from a local Instagram export folder:

- `python3 -m sergio_instagram_messaging.generate_voice_dna --export-input /path/to/export-root --out voice_dna/socialmediatorr.voice-dna.json --owner-name "Sergio de Vocht" --window-months 6`
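The generated profile can then be read back when drafting replies; a small sketch (keys as written by `generate_voice_dna`, not a stable API):

```python
import json
from pathlib import Path

profile = json.loads(Path("voice_dna/socialmediatorr.voice-dna.json").read_text(encoding="utf-8"))

policy = profile["policies"]["language"]
overall = profile["style"]["manual_replies"]["overall"]

print(policy["supported_languages"])          # ["English", "Spanish", "French", "Catalan"]
print(overall["length"]["words"]["median"])   # typical manual-reply length in words
print(overall["rates"]["emoji_messages_pct"]) # share of manual replies that contain an emoji
```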

## Webhooks (new messages → auto-reply)

Meta webhooks are two steps:

sergio_instagram_messaging/generate_voice_dna.py (471 lines, new file)

@@ -0,0 +1,471 @@
from __future__ import annotations

import argparse
import json
import re
import statistics
from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable, Literal

from .analyze_instagram_export import canonicalize_text
from .generate_dm_report_detailed import _infer_owner_name, _load_events, _resolve_export_root


VoiceLang = Literal["English", "Spanish", "French", "Catalan", "Too short to tell"]


_RE_URL = re.compile(r"(?i)\b(?:https?://|www\.)\S+")
_RE_HANDLE = re.compile(r"(?<!\w)@[\w._]{2,}")
_RE_DIGIT = re.compile(r"\d")


def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()


def _parse_iso_utc(s: str) -> datetime:
    """
    Accept an ISO timestamp like:
    - 2025-12-24T12:34:56Z
    - 2025-12-24T12:34:56+00:00
    - 2025-12-24T12:34:56
    Defaults to UTC if tzinfo is missing.
    """
    raw = (s or "").strip()
    if not raw:
        raise ValueError("empty datetime")
    if raw.endswith("Z"):
        raw = raw[:-1] + "+00:00"
    dt = datetime.fromisoformat(raw)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc).replace(microsecond=0)


def _days_in_month(year: int, month: int) -> int:
    if month in (1, 3, 5, 7, 8, 10, 12):
        return 31
    if month in (4, 6, 9, 11):
        return 30
    # February
    leap = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)
    return 29 if leap else 28


def _subtract_months(dt: datetime, months: int) -> datetime:
    if months <= 0:
        return dt
    year = dt.year
    month = dt.month - months
    while month <= 0:
        month += 12
        year -= 1
    day = min(dt.day, _days_in_month(year, month))
    return dt.replace(year=year, month=month, day=day)
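# For example, a 6-month window ending 2025-12-24T12:08:20+00:00 starts at 2025-06-24T12:08:20+00:00
# (as in the generated profile below); the day is clamped to the target month's length, so
# subtracting 1 month from March 31 in a non-leap year gives February 28.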


def _tokenize(text: str) -> list[str]:
    return re.findall(r"\b[\w']+\b", (text or "").lower(), flags=re.UNICODE)


def _guess_lang(text: str) -> VoiceLang:
    # Keep this consistent with generate_dm_report_detailed.py
    from .generate_dm_report_detailed import _guess_lang as _guess_lang_impl

    return _guess_lang_impl(text)


def _is_system_new_follower_message(text: str) -> bool:
    s = (text or "").strip().lower()
    return s.startswith("you messaged") and "followed your account" in s


def _iter_emojis(text: str) -> Iterable[str]:
    # Heuristic: treat unicode "Symbol, other" as emoji-like and ignore punctuation/formatting.
    # This avoids counting typographic quotes (e.g., “ ”) and ellipses (…) as emojis.
    import unicodedata

    for ch in (text or ""):
        if ch.isspace():
            continue
        cat = unicodedata.category(ch)
        if cat != "So":
            continue
        yield ch
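# For example, list(_iter_emojis("Gracias 🙌🙌… “genial”")) == ["🙌", "🙌"]:
# the ellipsis and typographic quotes fall in punctuation categories, not "So".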


def _quantile(values: list[int], q: float) -> int | None:
    if not values:
        return None
    if q <= 0:
        return min(values)
    if q >= 1:
        return max(values)
    xs = sorted(values)
    idx = int(round((len(xs) - 1) * q))
    return xs[max(0, min(len(xs) - 1, idx))]
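# Nearest-rank on the sorted values: _quantile(list(range(1, 11)), 0.90) == 9
# (index round((10 - 1) * 0.9) == 8), with the index clamped to the list bounds.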


def _pct(num: int, den: int) -> float:
    return 0.0 if den <= 0 else (num / den) * 100.0


_GREETING_WORDS: dict[VoiceLang, set[str]] = {
    "English": {"hi", "hey", "hello"},
    "Spanish": {"hola", "buenas", "buenos"},
    "French": {"salut", "bonjour"},
    "Catalan": {"hola", "bon"},
    "Too short to tell": set(),
}

_THANKS_WORDS: dict[VoiceLang, set[str]] = {
    "English": {"thanks", "thank"},
    "Spanish": {"gracias"},
    "French": {"merci"},
    "Catalan": {"gracies", "gràcies"},
    "Too short to tell": set(),
}

_CTA_WORDS: dict[VoiceLang, set[str]] = {
    "English": {"link", "call", "book", "ebook", "price"},
    "Spanish": {"enlace", "link", "llamada", "libro", "precio"},
    "French": {"lien", "appel", "livre", "prix"},
    "Catalan": {"enllaç", "enllac", "trucada", "llibre", "preu"},
    "Too short to tell": set(),
}


@dataclass(frozen=True)
class OutgoingSample:
    ts_ms: int
    canon: str
    lang: VoiceLang
    chars: int
    words: int
    has_emoji: bool
    emoji: tuple[str, ...]
    has_question: bool
    ends_with_question: bool
    has_exclamation: bool
    has_linebreak: bool
    has_url: bool
    has_handle: bool
    has_number: bool
    starts_with_greeting: bool
    contains_thanks: bool
    contains_cta_terms: bool


def _sample_from_text(*, ts_ms: int, canon: str, text: str) -> OutgoingSample:
    s = (text or "").strip()
    lang = _guess_lang(s)
    toks = _tokenize(s)

    emojis = tuple(_iter_emojis(s))
    has_question = "?" in s or "¿" in s
    ends_with_question = s.endswith("?")
    has_exclamation = "!" in s or "¡" in s
    has_linebreak = "\n" in s
    has_url = bool(_RE_URL.search(s))
    has_handle = bool(_RE_HANDLE.search(s))
    has_number = bool(_RE_DIGIT.search(s))

    first_word = toks[0] if toks else ""
    starts_with_greeting = bool(first_word and first_word in _GREETING_WORDS.get(lang, set()))
    contains_thanks = bool(set(toks) & _THANKS_WORDS.get(lang, set()))
    contains_cta_terms = bool(set(toks) & _CTA_WORDS.get(lang, set()))

    return OutgoingSample(
        ts_ms=int(ts_ms),
        canon=canon,
        lang=lang,
        chars=len(s),
        words=len(toks),
        has_emoji=bool(emojis),
        emoji=emojis,
        has_question=has_question,
        ends_with_question=ends_with_question,
        has_exclamation=has_exclamation,
        has_linebreak=has_linebreak,
        has_url=has_url,
        has_handle=has_handle,
        has_number=has_number,
        starts_with_greeting=starts_with_greeting,
        contains_thanks=contains_thanks,
        contains_cta_terms=contains_cta_terms,
    )


def _lang_bucket(samples: list[OutgoingSample]) -> dict[VoiceLang, list[OutgoingSample]]:
    by: dict[VoiceLang, list[OutgoingSample]] = defaultdict(list)
    for s in samples:
        by[s.lang].append(s)
    # Ensure stable order in output (even if empty).
    for lang in ("English", "Spanish", "French", "Catalan", "Too short to tell"):
        by.setdefault(lang, [])
    return dict(by)


def _summarize_samples(samples: list[OutgoingSample]) -> dict[str, Any]:
    if not samples:
        return {
            "count": 0,
            "length": {"chars": {}, "words": {}},
            "rates": {},
            "top_emojis": [],
        }

    chars = [s.chars for s in samples]
    words = [s.words for s in samples]

    emoji_counter: Counter[str] = Counter()
    for s in samples:
        emoji_counter.update(s.emoji)

    def rates() -> dict[str, float]:
        n = len(samples)
        return {
            "emoji_messages_pct": _pct(sum(1 for s in samples if s.has_emoji), n),
            "question_messages_pct": _pct(sum(1 for s in samples if s.has_question), n),
            "ends_with_question_pct": _pct(sum(1 for s in samples if s.ends_with_question), n),
            "exclamation_messages_pct": _pct(sum(1 for s in samples if s.has_exclamation), n),
            "linebreak_messages_pct": _pct(sum(1 for s in samples if s.has_linebreak), n),
            "url_messages_pct": _pct(sum(1 for s in samples if s.has_url), n),
            "handle_messages_pct": _pct(sum(1 for s in samples if s.has_handle), n),
            "number_messages_pct": _pct(sum(1 for s in samples if s.has_number), n),
            "starts_with_greeting_pct": _pct(sum(1 for s in samples if s.starts_with_greeting), n),
            "contains_thanks_pct": _pct(sum(1 for s in samples if s.contains_thanks), n),
            "contains_cta_terms_pct": _pct(sum(1 for s in samples if s.contains_cta_terms), n),
        }

    def length_summary(values: list[int]) -> dict[str, int | float | None]:
        return {
            "min": min(values) if values else None,
            "p10": _quantile(values, 0.10),
            "median": int(statistics.median(values)) if values else None,
            "p90": _quantile(values, 0.90),
            "max": max(values) if values else None,
            "mean": (sum(values) / len(values)) if values else None,
        }

    return {
        "count": len(samples),
        "length": {"chars": length_summary(chars), "words": length_summary(words)},
        "rates": rates(),
        "top_emojis": [{"emoji": e, "count": c} for e, c in emoji_counter.most_common(20)],
    }


def _choose_language_policy(supported_from_inbox: Counter[VoiceLang], supported_from_owner: Counter[VoiceLang]) -> dict[str, Any]:
    supported = []
    for lang in ("English", "Spanish", "French", "Catalan"):
        if supported_from_inbox.get(lang, 0) > 0 or supported_from_owner.get(lang, 0) > 0:
            supported.append(lang)
    if not supported:
        supported = ["English", "Spanish", "French", "Catalan"]

    return {
        "mode": "mirror_user_input_language",
        "supported_languages": supported,
        "rules": [
            "Reply in the same language as the user's most recent message that contains enough text to classify.",
            "Do not translate the user's message unless they explicitly ask for a translation.",
            "Do not mix languages inside a single reply unless the user mixes languages first.",
            "If the user's message is too short to classify, reuse the last confidently detected language in the same thread.",
            "If there is still no signal, ask a 1-line clarification asking which language they prefer (keep it short).",
        ],
    }
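# For example, with the inbound window counts observed below ("English": 2828, "Spanish": 2176,
# "French": 191, "Catalan": 16) all four languages end up in supported_languages, in fixed order.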


def generate_voice_dna(
    *,
    export_input: Path,
    out_path: Path,
    owner_name: str | None,
    window_months: int,
    response_window_hours: float,
    scripted_min_count: int,
    as_of_utc: datetime,
) -> Path:
    export_root = _resolve_export_root(export_input)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    if not owner_name or not owner_name.strip():
        owner_name = _infer_owner_name(export_root)
    if not owner_name:
        raise RuntimeError("Could not infer owner name. Pass --owner-name.")

    as_of_utc = as_of_utc.astimezone(timezone.utc).replace(microsecond=0)
    start_utc = _subtract_months(as_of_utc, int(window_months))
    start_ts_ms = int(start_utc.timestamp() * 1000)
    end_ts_ms = int(as_of_utc.timestamp() * 1000)
    response_window_ms = int(float(response_window_hours) * 3600 * 1000)

    template_counts: Counter[str] = Counter()
    inbound_lang_counts_window: Counter[VoiceLang] = Counter()

    candidate_responses: list[OutgoingSample] = []
    scanned_conversations = 0
    scanned_message_files = 0

    inbox_root = export_root / "messages" / "inbox"
    for conv_dir in inbox_root.iterdir():
        if not conv_dir.is_dir():
            continue
        scanned_conversations += 1
        parts = sorted(conv_dir.glob("message*.json"), key=lambda p: p.name)
        if not parts:
            continue
        scanned_message_files += len(parts)
        events = _load_events(parts, owner_name=owner_name)
        if not events:
            continue

        last_inbound_ts: int | None = None
        for e in events:
            # Track inbound language (for mirroring policy) for the same recent window.
            if not e.is_owner and isinstance(e.text, str) and e.text.strip():
                if start_ts_ms <= e.ts_ms <= end_ts_ms:
                    inbound_lang_counts_window[_guess_lang(e.text)] += 1
                last_inbound_ts = e.ts_ms
                continue

            if not e.is_owner:
                last_inbound_ts = e.ts_ms
                continue

            if not isinstance(e.text, str) or not e.text.strip():
                continue

            canon = canonicalize_text(e.text)
            if canon:
                template_counts[canon] += 1

            # Style samples: only use manual "responses" in the recent window.
            if _is_system_new_follower_message(e.text):
                continue
            if not (start_ts_ms <= e.ts_ms <= end_ts_ms):
                continue
            if last_inbound_ts is None:
                continue
            if e.ts_ms < last_inbound_ts:
                continue
            if (e.ts_ms - last_inbound_ts) > response_window_ms:
                continue

            if not canon:
                continue
            candidate_responses.append(_sample_from_text(ts_ms=e.ts_ms, canon=canon, text=e.text))

    scripted_templates = {canon for canon, cnt in template_counts.items() if cnt >= int(scripted_min_count)}
    manual_responses = [s for s in candidate_responses if s.canon not in scripted_templates]

    owner_lang_counts = Counter([s.lang for s in manual_responses])
    policy = _choose_language_policy(inbound_lang_counts_window, owner_lang_counts)

    by_lang = _lang_bucket(manual_responses)
    per_lang_summary = {lang: _summarize_samples(by_lang[lang]) for lang in by_lang}

    voice_dna: dict[str, Any] = {
        "schema_version": "voice_dna/v1",
        "created_at_utc": _now_utc_iso(),
        "subject": {
            "account": "@socialmediatorr",
            "owner_name": owner_name,
            "scope": "Instagram DMs",
        },
        "source": {
            "type": "instagram_export",
            "window": {
                "months": int(window_months),
                "start_utc": start_utc.isoformat(),
                "end_utc": as_of_utc.isoformat(),
                "response_window_hours": float(response_window_hours),
            },
            "classification": {
                "manual_reply_definition": "owner text message within response_window_hours of the most recent inbound message, excluding repeated templates",
                "scripted_template_definition": f"owner canonicalized text sent >= {int(scripted_min_count)} times across full export",
                "system_messages_excluded": ["you messaged <user> because they followed your account"],
            },
            "scan": {
                "export_root_hint": export_root.name,
                "scanned_conversations": scanned_conversations,
                "scanned_message_files": scanned_message_files,
                "candidate_responses_in_window": len(candidate_responses),
                "manual_responses_in_window": len(manual_responses),
                "scripted_template_count": len(scripted_templates),
            },
        },
        "policies": {
            "language": policy,
        },
        "language_observed": {
            "inbound_last_window_counts": dict(inbound_lang_counts_window),
            "manual_reply_counts": dict(owner_lang_counts),
        },
        "style": {
            "manual_replies": {
                "overall": _summarize_samples(manual_responses),
                "by_language": per_lang_summary,
            }
        },
    }

    out_path.write_text(json.dumps(voice_dna, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    return out_path


def main(argv: list[str] | None = None) -> int:
    ap = argparse.ArgumentParser(
        description="Generate a VoiceDNA JSON file for recent manual DM replies (style fingerprint + language mirroring policy)."
    )
    ap.add_argument("--export-input", required=True, help="Instagram export root (contains messages/inbox)")
    ap.add_argument("--out", required=True, help="output path (JSON)")
    ap.add_argument("--owner-name", default=None, help='owner sender_name (e.g., "Sergio de Vocht")')
    ap.add_argument("--window-months", type=int, default=6, help="how many recent months to use")
    ap.add_argument(
        "--response-window-hours",
        type=float,
        default=72.0,
        help="max hours after inbound message for an outgoing message to count as a response",
    )
    ap.add_argument(
        "--scripted-min-count",
        type=int,
        default=50,
        help="owner canonicalized text sent >= this count is treated as scripted/templated",
    )
    ap.add_argument(
        "--as-of-utc",
        default=None,
        help="analysis end time (UTC ISO); default: now",
    )
    args = ap.parse_args(argv)

    as_of = _parse_iso_utc(args.as_of_utc) if args.as_of_utc else datetime.now(timezone.utc).replace(microsecond=0)

    try:
        out = generate_voice_dna(
            export_input=Path(args.export_input),
            out_path=Path(args.out),
            owner_name=(args.owner_name.strip() if args.owner_name else None),
            window_months=int(args.window_months),
            response_window_hours=float(args.response_window_hours),
            scripted_min_count=int(args.scripted_min_count),
            as_of_utc=as_of,
        )
        print(json.dumps({"ok": True, "out": str(out)}, ensure_ascii=False))
        return 0
    except FileNotFoundError as e:
        print(str(e))
        return 2
    except Exception as e:
        print(f"VoiceDNA generation failed: {e}")
        return 1


if __name__ == "__main__":
    raise SystemExit(main())

voice_dna/socialmediatorr.voice-dna.json (468 lines, new file)

@@ -0,0 +1,468 @@
{
  "schema_version": "voice_dna/v1",
  "created_at_utc": "2025-12-24T12:08:24+00:00",
  "subject": {
    "account": "@socialmediatorr",
    "owner_name": "Sergio de Vocht",
    "scope": "Instagram DMs"
  },
  "source": {
    "type": "instagram_export",
    "window": {
      "months": 6,
      "start_utc": "2025-06-24T12:08:20+00:00",
      "end_utc": "2025-12-24T12:08:20+00:00",
      "response_window_hours": 72.0
    },
    "classification": {
      "manual_reply_definition": "owner text message within response_window_hours of the most recent inbound message, excluding repeated templates",
      "scripted_template_definition": "owner canonicalized text sent >= 50 times across full export",
      "system_messages_excluded": ["you messaged <user> because they followed your account"]
    },
    "scan": {
      "export_root_hint": "socialmediatorr-ig-export-raw-20251224",
      "scanned_conversations": 10100,
      "scanned_message_files": 10061,
      "candidate_responses_in_window": 18934,
      "manual_responses_in_window": 825,
      "scripted_template_count": 24
    }
  },
  "policies": {
    "language": {
      "mode": "mirror_user_input_language",
      "supported_languages": ["English", "Spanish", "French", "Catalan"],
      "rules": [
        "Reply in the same language as the user's most recent message that contains enough text to classify.",
        "Do not translate the user's message unless they explicitly ask for a translation.",
        "Do not mix languages inside a single reply unless the user mixes languages first.",
        "If the user's message is too short to classify, reuse the last confidently detected language in the same thread.",
        "If there is still no signal, ask a 1-line clarification asking which language they prefer (keep it short)."
      ]
    }
  },
  "language_observed": {
    "inbound_last_window_counts": { "English": 2828, "Too short to tell": 5024, "Spanish": 2176, "French": 191, "Catalan": 16 },
    "manual_reply_counts": { "Spanish": 541, "Too short to tell": 255, "English": 17, "French": 1, "Catalan": 11 }
  },
  "style": {
    "manual_replies": {
      "overall": {
        "count": 825,
        "length": {
          "chars": { "min": 2, "p10": 13, "median": 54, "p90": 199, "max": 928, "mean": 87.47878787878788 },
          "words": { "min": 1, "p10": 2, "median": 11, "p90": 34, "max": 169, "mean": 15.436363636363636 }
        },
        "rates": {
          "emoji_messages_pct": 16.363636363636363,
          "question_messages_pct": 32.484848484848484,
          "ends_with_question_pct": 27.151515151515156,
          "exclamation_messages_pct": 18.303030303030305,
          "linebreak_messages_pct": 9.212121212121211,
          "url_messages_pct": 4.121212121212121,
          "handle_messages_pct": 0.0,
          "number_messages_pct": 9.454545454545455,
          "starts_with_greeting_pct": 4.96969696969697,
          "contains_thanks_pct": 4.484848484848484,
          "contains_cta_terms_pct": 4.848484848484849
        },
        "top_emojis": [
          { "emoji": "🙌", "count": 41 },
          { "emoji": "🫶", "count": 23 },
          { "emoji": "👋", "count": 19 },
          { "emoji": "🎁", "count": 17 },
          { "emoji": "💪", "count": 14 },
          { "emoji": "👇", "count": 12 },
          { "emoji": "😜", "count": 7 },
          { "emoji": "😉", "count": 4 },
          { "emoji": "🤣", "count": 2 },
          { "emoji": "🙏", "count": 2 },
          { "emoji": "⬆", "count": 2 },
          { "emoji": "👀", "count": 2 },
          { "emoji": "¦", "count": 1 },
          { "emoji": "😅", "count": 1 },
          { "emoji": "🤝", "count": 1 },
          { "emoji": "☺", "count": 1 },
          { "emoji": "🫡", "count": 1 },
          { "emoji": "😁", "count": 1 },
          { "emoji": "👌", "count": 1 },
          { "emoji": "🚀", "count": 1 }
        ]
      },
      "by_language": {
        "Spanish": {
          "count": 541,
          "length": {
            "chars": { "min": 5, "p10": 35, "median": 99, "p90": 211, "max": 928, "mean": 116.06469500924214 },
            "words": { "min": 1, "p10": 6, "median": 18, "p90": 38, "max": 169, "mean": 20.478743068391868 }
          },
          "rates": {
            "emoji_messages_pct": 20.70240295748614,
            "question_messages_pct": 39.55637707948244,
            "ends_with_question_pct": 32.34750462107209,
            "exclamation_messages_pct": 16.266173752310536,
            "linebreak_messages_pct": 13.67837338262477,
            "url_messages_pct": 2.2181146025878005,
            "handle_messages_pct": 0.0,
            "number_messages_pct": 9.242144177449168,
            "starts_with_greeting_pct": 7.578558225508318,
            "contains_thanks_pct": 6.839186691312385,
            "contains_cta_terms_pct": 7.024029574861368
          },
          "top_emojis": [
            { "emoji": "🙌", "count": 24 },
            { "emoji": "🫶", "count": 23 },
            { "emoji": "👋", "count": 19 },
            { "emoji": "🎁", "count": 17 },
            { "emoji": "👇", "count": 12 },
            { "emoji": "💪", "count": 10 },
            { "emoji": "😜", "count": 7 },
            { "emoji": "😉", "count": 4 },
            { "emoji": "🤣", "count": 2 },
            { "emoji": "🙏", "count": 2 },
            { "emoji": "⬆", "count": 2 },
            { "emoji": "👀", "count": 2 },
            { "emoji": "😅", "count": 1 },
            { "emoji": "🤝", "count": 1 },
            { "emoji": "☺", "count": 1 },
            { "emoji": "😁", "count": 1 },
            { "emoji": "👌", "count": 1 },
            { "emoji": "🚀", "count": 1 },
            { "emoji": "⏱", "count": 1 },
            { "emoji": "🫂", "count": 1 }
          ]
        },
        "Too short to tell": {
          "count": 255,
          "length": {
            "chars": { "min": 2, "p10": 10, "median": 30, "p90": 53, "max": 151, "mean": 32.01176470588236 },
            "words": { "min": 1, "p10": 1, "median": 4, "p90": 11, "max": 29, "mean": 5.749019607843137 }
          },
          "rates": {
            "emoji_messages_pct": 7.8431372549019605,
            "question_messages_pct": 19.607843137254903,
            "ends_with_question_pct": 17.647058823529413,
            "exclamation_messages_pct": 23.52941176470588,
            "linebreak_messages_pct": 0.39215686274509803,
            "url_messages_pct": 4.705882352941177,
            "handle_messages_pct": 0.0,
            "number_messages_pct": 7.0588235294117645,
            "starts_with_greeting_pct": 0.0,
            "contains_thanks_pct": 0.0,
            "contains_cta_terms_pct": 0.0
          },
          "top_emojis": [
            { "emoji": "🙌", "count": 15 },
            { "emoji": "💪", "count": 4 },
            { "emoji": "🫡", "count": 1 }
          ]
        },
        "English": {
          "count": 17,
          "length": {
            "chars": { "min": 4, "p10": 11, "median": 23, "p90": 38, "max": 40, "mean": 23.176470588235293 },
            "words": { "min": 2, "p10": 3, "median": 4, "p90": 6, "max": 8, "mean": 4.588235294117647 }
          },
          "rates": {
            "emoji_messages_pct": 11.76470588235294,
            "question_messages_pct": 23.52941176470588,
            "ends_with_question_pct": 23.52941176470588,
            "exclamation_messages_pct": 17.647058823529413,
            "linebreak_messages_pct": 0.0,
            "url_messages_pct": 0.0,
            "handle_messages_pct": 0.0,
            "number_messages_pct": 0.0,
            "starts_with_greeting_pct": 0.0,
            "contains_thanks_pct": 0.0,
            "contains_cta_terms_pct": 11.76470588235294
          },
          "top_emojis": [
            { "emoji": "🙌", "count": 2 }
          ]
        },
        "French": {
          "count": 1,
          "length": {
            "chars": { "min": 29, "p10": 29, "median": 29, "p90": 29, "max": 29, "mean": 29.0 },
            "words": { "min": 5, "p10": 5, "median": 5, "p90": 5, "max": 5, "mean": 5.0 }
          },
          "rates": {
            "emoji_messages_pct": 100.0,
            "question_messages_pct": 0.0,
            "ends_with_question_pct": 0.0,
            "exclamation_messages_pct": 0.0,
            "linebreak_messages_pct": 0.0,
            "url_messages_pct": 0.0,
            "handle_messages_pct": 0.0,
            "number_messages_pct": 0.0,
            "starts_with_greeting_pct": 0.0,
            "contains_thanks_pct": 0.0,
            "contains_cta_terms_pct": 0.0
          },
          "top_emojis": [
            { "emoji": "¦", "count": 1 }
          ]
        },
        "Catalan": {
          "count": 11,
          "length": {
            "chars": { "min": 26, "p10": 48, "median": 52, "p90": 99, "max": 205, "mean": 72.0909090909091 },
            "words": { "min": 5, "p10": 5, "median": 8, "p90": 10, "max": 34, "mean": 9.727272727272727 }
          },
          "rates": {
            "emoji_messages_pct": 0.0,
            "question_messages_pct": 0.0,
            "ends_with_question_pct": 0.0,
            "exclamation_messages_pct": 0.0,
            "linebreak_messages_pct": 9.090909090909092,
            "url_messages_pct": 90.9090909090909,
            "handle_messages_pct": 0.0,
            "number_messages_pct": 90.9090909090909,
            "starts_with_greeting_pct": 0.0,
            "contains_thanks_pct": 0.0,
            "contains_cta_terms_pct": 0.0
          },
          "top_emojis": []
        }
      }
    }
  }
}
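The stored counts are internally consistent; a quick sanity check (a sketch only, run from the repository root):

```python
import json
from pathlib import Path

profile = json.loads(Path("voice_dna/socialmediatorr.voice-dna.json").read_text(encoding="utf-8"))

per_lang = profile["style"]["manual_replies"]["by_language"]
overall = profile["style"]["manual_replies"]["overall"]

# Per-language manual reply counts (541 + 255 + 17 + 1 + 11) add up to the overall 825,
# which also matches scan.manual_responses_in_window.
assert sum(bucket["count"] for bucket in per_lang.values()) == overall["count"] == 825
assert profile["source"]["scan"]["manual_responses_in_window"] == overall["count"]
```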