From 25e306d4b0b2e1ee1cd4e18fcbeb64f70590da39 Mon Sep 17 00:00:00 2001
From: danny
Date: Wed, 24 Dec 2025 12:09:08 +0000
Subject: [PATCH] Add VoiceDNA profile from manual replies

---
 README.md                                |  12 +
 .../generate_voice_dna.py                | 471 ++++++++++++++++++
 voice_dna/socialmediatorr.voice-dna.json | 468 +++++++++++++++++
 3 files changed, 951 insertions(+)
 create mode 100644 sergio_instagram_messaging/generate_voice_dna.py
 create mode 100644 voice_dna/socialmediatorr.voice-dna.json

diff --git a/README.md b/README.md
index 1f45c3a..54ceb10 100644
--- a/README.md
+++ b/README.md
@@ -141,6 +141,18 @@ Generate the deeper “no raw quotes” report directly from an Instagram export
 
 - `python3 -m sergio_instagram_messaging.generate_dm_report_detailed --export-input /path/to/export-root --out /root/tmp/dm_history_report_en_detailed.md`
 
+## VoiceDNA (manual reply style)
+
+`voice_dna/socialmediatorr.voice-dna.json` is a **safe-to-store** style fingerprint generated from the last 6 months of **manual (non-template) DM replies** (no raw DM quotes are included).
+
+It also encodes a hard rule for the bot:
+
+- Always reply in the **user’s input language** (English / Spanish / French / Catalan), with a short clarification if the user’s message is too short to detect.
+
+Regenerate from a local Instagram export folder:
+
+- `python3 -m sergio_instagram_messaging.generate_voice_dna --export-input /path/to/export-root --out voice_dna/socialmediatorr.voice-dna.json --owner-name "Sergio de Vocht" --window-months 6`
+
 ## Webhooks (new messages → auto-reply)
 
 Meta webhooks are two steps:
diff --git a/sergio_instagram_messaging/generate_voice_dna.py b/sergio_instagram_messaging/generate_voice_dna.py
new file mode 100644
index 0000000..eab39e2
--- /dev/null
+++ b/sergio_instagram_messaging/generate_voice_dna.py
@@ -0,0 +1,471 @@
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import statistics
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Iterable, Literal
+
+from .analyze_instagram_export import canonicalize_text
+from .generate_dm_report_detailed import _infer_owner_name, _load_events, _resolve_export_root
+
+
+VoiceLang = Literal["English", "Spanish", "French", "Catalan", "Too short to tell"]
+
+
+_RE_URL = re.compile(r"(?i)\b(?:https?://|www\.)\S+")
+_RE_HANDLE = re.compile(r"(?<!\w)@\w+")
+_RE_DIGIT = re.compile(r"\d")
+
+
+def _now_utc_iso() -> str:
+    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()
+
+
+def _parse_iso_utc(s: str) -> datetime:
+    """
+    Accept an ISO timestamp like:
+    - 2025-12-24T12:34:56Z
+    - 2025-12-24T12:34:56+00:00
+    - 2025-12-24T12:34:56
+    Defaults to UTC if tzinfo is missing.
+ """ + raw = (s or "").strip() + if not raw: + raise ValueError("empty datetime") + if raw.endswith("Z"): + raw = raw[:-1] + "+00:00" + dt = datetime.fromisoformat(raw) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone(timezone.utc).replace(microsecond=0) + + +def _days_in_month(year: int, month: int) -> int: + if month in (1, 3, 5, 7, 8, 10, 12): + return 31 + if month in (4, 6, 9, 11): + return 30 + # February + leap = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0) + return 29 if leap else 28 + + +def _subtract_months(dt: datetime, months: int) -> datetime: + if months <= 0: + return dt + year = dt.year + month = dt.month - months + while month <= 0: + month += 12 + year -= 1 + day = min(dt.day, _days_in_month(year, month)) + return dt.replace(year=year, month=month, day=day) + + +def _tokenize(text: str) -> list[str]: + return re.findall(r"\b[\w']+\b", (text or "").lower(), flags=re.UNICODE) + + +def _guess_lang(text: str) -> VoiceLang: + # Keep this consistent with generate_dm_report_detailed.py + from .generate_dm_report_detailed import _guess_lang as _guess_lang_impl + + return _guess_lang_impl(text) + + +def _is_system_new_follower_message(text: str) -> bool: + s = (text or "").strip().lower() + return s.startswith("you messaged") and "followed your account" in s + + +def _iter_emojis(text: str) -> Iterable[str]: + # Heuristic: treat unicode "Symbol, other" as emoji-like and ignore punctuation/formatting. + # This avoids counting typographic quotes (e.g., “ ”) and ellipses (…) as emojis. + import unicodedata + + for ch in (text or ""): + if ch.isspace(): + continue + cat = unicodedata.category(ch) + if cat != "So": + continue + yield ch + + +def _quantile(values: list[int], q: float) -> int | None: + if not values: + return None + if q <= 0: + return min(values) + if q >= 1: + return max(values) + xs = sorted(values) + idx = int(round((len(xs) - 1) * q)) + return xs[max(0, min(len(xs) - 1, idx))] + + +def _pct(num: int, den: int) -> float: + return 0.0 if den <= 0 else (num / den) * 100.0 + + +_GREETING_WORDS: dict[VoiceLang, set[str]] = { + "English": {"hi", "hey", "hello"}, + "Spanish": {"hola", "buenas", "buenos"}, + "French": {"salut", "bonjour"}, + "Catalan": {"hola", "bon"}, + "Too short to tell": set(), +} + +_THANKS_WORDS: dict[VoiceLang, set[str]] = { + "English": {"thanks", "thank"}, + "Spanish": {"gracias"}, + "French": {"merci"}, + "Catalan": {"gracies", "gràcies"}, + "Too short to tell": set(), +} + +_CTA_WORDS: dict[VoiceLang, set[str]] = { + "English": {"link", "call", "book", "ebook", "price"}, + "Spanish": {"enlace", "link", "llamada", "libro", "precio"}, + "French": {"lien", "appel", "livre", "prix"}, + "Catalan": {"enllaç", "enllac", "trucada", "llibre", "preu"}, + "Too short to tell": set(), +} + + +@dataclass(frozen=True) +class OutgoingSample: + ts_ms: int + canon: str + lang: VoiceLang + chars: int + words: int + has_emoji: bool + emoji: tuple[str, ...] + has_question: bool + ends_with_question: bool + has_exclamation: bool + has_linebreak: bool + has_url: bool + has_handle: bool + has_number: bool + starts_with_greeting: bool + contains_thanks: bool + contains_cta_terms: bool + + +def _sample_from_text(*, ts_ms: int, canon: str, text: str) -> OutgoingSample: + s = (text or "").strip() + lang = _guess_lang(s) + toks = _tokenize(s) + + emojis = tuple(_iter_emojis(s)) + has_question = "?" in s or "¿" in s + ends_with_question = s.endswith("?") + has_exclamation = "!" 
in s or "¡" in s + has_linebreak = "\n" in s + has_url = bool(_RE_URL.search(s)) + has_handle = bool(_RE_HANDLE.search(s)) + has_number = bool(_RE_DIGIT.search(s)) + + first_word = toks[0] if toks else "" + starts_with_greeting = bool(first_word and first_word in _GREETING_WORDS.get(lang, set())) + contains_thanks = bool(set(toks) & _THANKS_WORDS.get(lang, set())) + contains_cta_terms = bool(set(toks) & _CTA_WORDS.get(lang, set())) + + return OutgoingSample( + ts_ms=int(ts_ms), + canon=canon, + lang=lang, + chars=len(s), + words=len(toks), + has_emoji=bool(emojis), + emoji=emojis, + has_question=has_question, + ends_with_question=ends_with_question, + has_exclamation=has_exclamation, + has_linebreak=has_linebreak, + has_url=has_url, + has_handle=has_handle, + has_number=has_number, + starts_with_greeting=starts_with_greeting, + contains_thanks=contains_thanks, + contains_cta_terms=contains_cta_terms, + ) + + +def _lang_bucket(samples: list[OutgoingSample]) -> dict[VoiceLang, list[OutgoingSample]]: + by: dict[VoiceLang, list[OutgoingSample]] = defaultdict(list) + for s in samples: + by[s.lang].append(s) + # Ensure stable order in output (even if empty). + for lang in ("English", "Spanish", "French", "Catalan", "Too short to tell"): + by.setdefault(lang, []) + return dict(by) + + +def _summarize_samples(samples: list[OutgoingSample]) -> dict[str, Any]: + if not samples: + return { + "count": 0, + "length": {"chars": {}, "words": {}}, + "rates": {}, + "top_emojis": [], + } + + chars = [s.chars for s in samples] + words = [s.words for s in samples] + + emoji_counter: Counter[str] = Counter() + for s in samples: + emoji_counter.update(s.emoji) + + def rates() -> dict[str, float]: + n = len(samples) + return { + "emoji_messages_pct": _pct(sum(1 for s in samples if s.has_emoji), n), + "question_messages_pct": _pct(sum(1 for s in samples if s.has_question), n), + "ends_with_question_pct": _pct(sum(1 for s in samples if s.ends_with_question), n), + "exclamation_messages_pct": _pct(sum(1 for s in samples if s.has_exclamation), n), + "linebreak_messages_pct": _pct(sum(1 for s in samples if s.has_linebreak), n), + "url_messages_pct": _pct(sum(1 for s in samples if s.has_url), n), + "handle_messages_pct": _pct(sum(1 for s in samples if s.has_handle), n), + "number_messages_pct": _pct(sum(1 for s in samples if s.has_number), n), + "starts_with_greeting_pct": _pct(sum(1 for s in samples if s.starts_with_greeting), n), + "contains_thanks_pct": _pct(sum(1 for s in samples if s.contains_thanks), n), + "contains_cta_terms_pct": _pct(sum(1 for s in samples if s.contains_cta_terms), n), + } + + def length_summary(values: list[int]) -> dict[str, int | float | None]: + return { + "min": min(values) if values else None, + "p10": _quantile(values, 0.10), + "median": int(statistics.median(values)) if values else None, + "p90": _quantile(values, 0.90), + "max": max(values) if values else None, + "mean": (sum(values) / len(values)) if values else None, + } + + return { + "count": len(samples), + "length": {"chars": length_summary(chars), "words": length_summary(words)}, + "rates": rates(), + "top_emojis": [{"emoji": e, "count": c} for e, c in emoji_counter.most_common(20)], + } + + +def _choose_language_policy(supported_from_inbox: Counter[VoiceLang], supported_from_owner: Counter[VoiceLang]) -> dict[str, Any]: + supported = [] + for lang in ("English", "Spanish", "French", "Catalan"): + if supported_from_inbox.get(lang, 0) > 0 or supported_from_owner.get(lang, 0) > 0: + supported.append(lang) + if not supported: + 
supported = ["English", "Spanish", "French", "Catalan"] + + return { + "mode": "mirror_user_input_language", + "supported_languages": supported, + "rules": [ + "Reply in the same language as the user's most recent message that contains enough text to classify.", + "Do not translate the user's message unless they explicitly ask for a translation.", + "Do not mix languages inside a single reply unless the user mixes languages first.", + "If the user's message is too short to classify, reuse the last confidently detected language in the same thread.", + "If there is still no signal, ask a 1-line clarification asking which language they prefer (keep it short).", + ], + } + + +def generate_voice_dna( + *, + export_input: Path, + out_path: Path, + owner_name: str | None, + window_months: int, + response_window_hours: float, + scripted_min_count: int, + as_of_utc: datetime, +) -> Path: + export_root = _resolve_export_root(export_input) + out_path.parent.mkdir(parents=True, exist_ok=True) + + if not owner_name or not owner_name.strip(): + owner_name = _infer_owner_name(export_root) + if not owner_name: + raise RuntimeError("Could not infer owner name. Pass --owner-name.") + + as_of_utc = as_of_utc.astimezone(timezone.utc).replace(microsecond=0) + start_utc = _subtract_months(as_of_utc, int(window_months)) + start_ts_ms = int(start_utc.timestamp() * 1000) + end_ts_ms = int(as_of_utc.timestamp() * 1000) + response_window_ms = int(float(response_window_hours) * 3600 * 1000) + + template_counts: Counter[str] = Counter() + inbound_lang_counts_window: Counter[VoiceLang] = Counter() + + candidate_responses: list[OutgoingSample] = [] + scanned_conversations = 0 + scanned_message_files = 0 + + inbox_root = export_root / "messages" / "inbox" + for conv_dir in inbox_root.iterdir(): + if not conv_dir.is_dir(): + continue + scanned_conversations += 1 + parts = sorted(conv_dir.glob("message*.json"), key=lambda p: p.name) + if not parts: + continue + scanned_message_files += len(parts) + events = _load_events(parts, owner_name=owner_name) + if not events: + continue + + last_inbound_ts: int | None = None + for e in events: + # Track inbound language (for mirroring policy) for the same recent window. + if not e.is_owner and isinstance(e.text, str) and e.text.strip(): + if start_ts_ms <= e.ts_ms <= end_ts_ms: + inbound_lang_counts_window[_guess_lang(e.text)] += 1 + last_inbound_ts = e.ts_ms + continue + + if not e.is_owner: + last_inbound_ts = e.ts_ms + continue + + if not isinstance(e.text, str) or not e.text.strip(): + continue + + canon = canonicalize_text(e.text) + if canon: + template_counts[canon] += 1 + + # Style samples: only use manual "responses" in the recent window. 
+ if _is_system_new_follower_message(e.text): + continue + if not (start_ts_ms <= e.ts_ms <= end_ts_ms): + continue + if last_inbound_ts is None: + continue + if e.ts_ms < last_inbound_ts: + continue + if (e.ts_ms - last_inbound_ts) > response_window_ms: + continue + + if not canon: + continue + candidate_responses.append(_sample_from_text(ts_ms=e.ts_ms, canon=canon, text=e.text)) + + scripted_templates = {canon for canon, cnt in template_counts.items() if cnt >= int(scripted_min_count)} + manual_responses = [s for s in candidate_responses if s.canon not in scripted_templates] + + owner_lang_counts = Counter([s.lang for s in manual_responses]) + policy = _choose_language_policy(inbound_lang_counts_window, owner_lang_counts) + + by_lang = _lang_bucket(manual_responses) + per_lang_summary = {lang: _summarize_samples(by_lang[lang]) for lang in by_lang} + + voice_dna: dict[str, Any] = { + "schema_version": "voice_dna/v1", + "created_at_utc": _now_utc_iso(), + "subject": { + "account": "@socialmediatorr", + "owner_name": owner_name, + "scope": "Instagram DMs", + }, + "source": { + "type": "instagram_export", + "window": { + "months": int(window_months), + "start_utc": start_utc.isoformat(), + "end_utc": as_of_utc.isoformat(), + "response_window_hours": float(response_window_hours), + }, + "classification": { + "manual_reply_definition": "owner text message within response_window_hours of the most recent inbound message, excluding repeated templates", + "scripted_template_definition": f"owner canonicalized text sent >= {int(scripted_min_count)} times across full export", + "system_messages_excluded": ["you messaged because they followed your account"], + }, + "scan": { + "export_root_hint": export_root.name, + "scanned_conversations": scanned_conversations, + "scanned_message_files": scanned_message_files, + "candidate_responses_in_window": len(candidate_responses), + "manual_responses_in_window": len(manual_responses), + "scripted_template_count": len(scripted_templates), + }, + }, + "policies": { + "language": policy, + }, + "language_observed": { + "inbound_last_window_counts": dict(inbound_lang_counts_window), + "manual_reply_counts": dict(owner_lang_counts), + }, + "style": { + "manual_replies": { + "overall": _summarize_samples(manual_responses), + "by_language": per_lang_summary, + } + }, + } + + out_path.write_text(json.dumps(voice_dna, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + return out_path + + +def main(argv: list[str] | None = None) -> int: + ap = argparse.ArgumentParser( + description="Generate a VoiceDNA JSON file for recent manual DM replies (style fingerprint + language mirroring policy)." 
+ ) + ap.add_argument("--export-input", required=True, help="Instagram export root (contains messages/inbox)") + ap.add_argument("--out", required=True, help="output path (JSON)") + ap.add_argument("--owner-name", default=None, help='owner sender_name (e.g., "Sergio de Vocht")') + ap.add_argument("--window-months", type=int, default=6, help="how many recent months to use") + ap.add_argument( + "--response-window-hours", + type=float, + default=72.0, + help="max hours after inbound message for an outgoing message to count as a response", + ) + ap.add_argument( + "--scripted-min-count", + type=int, + default=50, + help="owner canonicalized text sent >= this count is treated as scripted/templated", + ) + ap.add_argument( + "--as-of-utc", + default=None, + help="analysis end time (UTC ISO); default: now", + ) + args = ap.parse_args(argv) + + as_of = _parse_iso_utc(args.as_of_utc) if args.as_of_utc else datetime.now(timezone.utc).replace(microsecond=0) + + try: + out = generate_voice_dna( + export_input=Path(args.export_input), + out_path=Path(args.out), + owner_name=(args.owner_name.strip() if args.owner_name else None), + window_months=int(args.window_months), + response_window_hours=float(args.response_window_hours), + scripted_min_count=int(args.scripted_min_count), + as_of_utc=as_of, + ) + print(json.dumps({"ok": True, "out": str(out)}, ensure_ascii=False)) + return 0 + except FileNotFoundError as e: + print(str(e)) + return 2 + except Exception as e: + print(f"VoiceDNA generation failed: {e}") + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/voice_dna/socialmediatorr.voice-dna.json b/voice_dna/socialmediatorr.voice-dna.json new file mode 100644 index 0000000..3caec1b --- /dev/null +++ b/voice_dna/socialmediatorr.voice-dna.json @@ -0,0 +1,468 @@ +{ + "schema_version": "voice_dna/v1", + "created_at_utc": "2025-12-24T12:08:24+00:00", + "subject": { + "account": "@socialmediatorr", + "owner_name": "Sergio de Vocht", + "scope": "Instagram DMs" + }, + "source": { + "type": "instagram_export", + "window": { + "months": 6, + "start_utc": "2025-06-24T12:08:20+00:00", + "end_utc": "2025-12-24T12:08:20+00:00", + "response_window_hours": 72.0 + }, + "classification": { + "manual_reply_definition": "owner text message within response_window_hours of the most recent inbound message, excluding repeated templates", + "scripted_template_definition": "owner canonicalized text sent >= 50 times across full export", + "system_messages_excluded": [ + "you messaged because they followed your account" + ] + }, + "scan": { + "export_root_hint": "socialmediatorr-ig-export-raw-20251224", + "scanned_conversations": 10100, + "scanned_message_files": 10061, + "candidate_responses_in_window": 18934, + "manual_responses_in_window": 825, + "scripted_template_count": 24 + } + }, + "policies": { + "language": { + "mode": "mirror_user_input_language", + "supported_languages": [ + "English", + "Spanish", + "French", + "Catalan" + ], + "rules": [ + "Reply in the same language as the user's most recent message that contains enough text to classify.", + "Do not translate the user's message unless they explicitly ask for a translation.", + "Do not mix languages inside a single reply unless the user mixes languages first.", + "If the user's message is too short to classify, reuse the last confidently detected language in the same thread.", + "If there is still no signal, ask a 1-line clarification asking which language they prefer (keep it short)." 
+ ] + } + }, + "language_observed": { + "inbound_last_window_counts": { + "English": 2828, + "Too short to tell": 5024, + "Spanish": 2176, + "French": 191, + "Catalan": 16 + }, + "manual_reply_counts": { + "Spanish": 541, + "Too short to tell": 255, + "English": 17, + "French": 1, + "Catalan": 11 + } + }, + "style": { + "manual_replies": { + "overall": { + "count": 825, + "length": { + "chars": { + "min": 2, + "p10": 13, + "median": 54, + "p90": 199, + "max": 928, + "mean": 87.47878787878788 + }, + "words": { + "min": 1, + "p10": 2, + "median": 11, + "p90": 34, + "max": 169, + "mean": 15.436363636363636 + } + }, + "rates": { + "emoji_messages_pct": 16.363636363636363, + "question_messages_pct": 32.484848484848484, + "ends_with_question_pct": 27.151515151515156, + "exclamation_messages_pct": 18.303030303030305, + "linebreak_messages_pct": 9.212121212121211, + "url_messages_pct": 4.121212121212121, + "handle_messages_pct": 0.0, + "number_messages_pct": 9.454545454545455, + "starts_with_greeting_pct": 4.96969696969697, + "contains_thanks_pct": 4.484848484848484, + "contains_cta_terms_pct": 4.848484848484849 + }, + "top_emojis": [ + { + "emoji": "🙌", + "count": 41 + }, + { + "emoji": "🫶", + "count": 23 + }, + { + "emoji": "👋", + "count": 19 + }, + { + "emoji": "🎁", + "count": 17 + }, + { + "emoji": "💪", + "count": 14 + }, + { + "emoji": "👇", + "count": 12 + }, + { + "emoji": "😜", + "count": 7 + }, + { + "emoji": "😉", + "count": 4 + }, + { + "emoji": "🤣", + "count": 2 + }, + { + "emoji": "🙏", + "count": 2 + }, + { + "emoji": "⬆", + "count": 2 + }, + { + "emoji": "👀", + "count": 2 + }, + { + "emoji": "¦", + "count": 1 + }, + { + "emoji": "😅", + "count": 1 + }, + { + "emoji": "🤝", + "count": 1 + }, + { + "emoji": "☺", + "count": 1 + }, + { + "emoji": "🫡", + "count": 1 + }, + { + "emoji": "😁", + "count": 1 + }, + { + "emoji": "👌", + "count": 1 + }, + { + "emoji": "🚀", + "count": 1 + } + ] + }, + "by_language": { + "Spanish": { + "count": 541, + "length": { + "chars": { + "min": 5, + "p10": 35, + "median": 99, + "p90": 211, + "max": 928, + "mean": 116.06469500924214 + }, + "words": { + "min": 1, + "p10": 6, + "median": 18, + "p90": 38, + "max": 169, + "mean": 20.478743068391868 + } + }, + "rates": { + "emoji_messages_pct": 20.70240295748614, + "question_messages_pct": 39.55637707948244, + "ends_with_question_pct": 32.34750462107209, + "exclamation_messages_pct": 16.266173752310536, + "linebreak_messages_pct": 13.67837338262477, + "url_messages_pct": 2.2181146025878005, + "handle_messages_pct": 0.0, + "number_messages_pct": 9.242144177449168, + "starts_with_greeting_pct": 7.578558225508318, + "contains_thanks_pct": 6.839186691312385, + "contains_cta_terms_pct": 7.024029574861368 + }, + "top_emojis": [ + { + "emoji": "🙌", + "count": 24 + }, + { + "emoji": "🫶", + "count": 23 + }, + { + "emoji": "👋", + "count": 19 + }, + { + "emoji": "🎁", + "count": 17 + }, + { + "emoji": "👇", + "count": 12 + }, + { + "emoji": "💪", + "count": 10 + }, + { + "emoji": "😜", + "count": 7 + }, + { + "emoji": "😉", + "count": 4 + }, + { + "emoji": "🤣", + "count": 2 + }, + { + "emoji": "🙏", + "count": 2 + }, + { + "emoji": "⬆", + "count": 2 + }, + { + "emoji": "👀", + "count": 2 + }, + { + "emoji": "😅", + "count": 1 + }, + { + "emoji": "🤝", + "count": 1 + }, + { + "emoji": "☺", + "count": 1 + }, + { + "emoji": "😁", + "count": 1 + }, + { + "emoji": "👌", + "count": 1 + }, + { + "emoji": "🚀", + "count": 1 + }, + { + "emoji": "⏱", + "count": 1 + }, + { + "emoji": "🫂", + "count": 1 + } + ] + }, + "Too short to tell": { + "count": 
255, + "length": { + "chars": { + "min": 2, + "p10": 10, + "median": 30, + "p90": 53, + "max": 151, + "mean": 32.01176470588236 + }, + "words": { + "min": 1, + "p10": 1, + "median": 4, + "p90": 11, + "max": 29, + "mean": 5.749019607843137 + } + }, + "rates": { + "emoji_messages_pct": 7.8431372549019605, + "question_messages_pct": 19.607843137254903, + "ends_with_question_pct": 17.647058823529413, + "exclamation_messages_pct": 23.52941176470588, + "linebreak_messages_pct": 0.39215686274509803, + "url_messages_pct": 4.705882352941177, + "handle_messages_pct": 0.0, + "number_messages_pct": 7.0588235294117645, + "starts_with_greeting_pct": 0.0, + "contains_thanks_pct": 0.0, + "contains_cta_terms_pct": 0.0 + }, + "top_emojis": [ + { + "emoji": "🙌", + "count": 15 + }, + { + "emoji": "💪", + "count": 4 + }, + { + "emoji": "🫡", + "count": 1 + } + ] + }, + "English": { + "count": 17, + "length": { + "chars": { + "min": 4, + "p10": 11, + "median": 23, + "p90": 38, + "max": 40, + "mean": 23.176470588235293 + }, + "words": { + "min": 2, + "p10": 3, + "median": 4, + "p90": 6, + "max": 8, + "mean": 4.588235294117647 + } + }, + "rates": { + "emoji_messages_pct": 11.76470588235294, + "question_messages_pct": 23.52941176470588, + "ends_with_question_pct": 23.52941176470588, + "exclamation_messages_pct": 17.647058823529413, + "linebreak_messages_pct": 0.0, + "url_messages_pct": 0.0, + "handle_messages_pct": 0.0, + "number_messages_pct": 0.0, + "starts_with_greeting_pct": 0.0, + "contains_thanks_pct": 0.0, + "contains_cta_terms_pct": 11.76470588235294 + }, + "top_emojis": [ + { + "emoji": "🙌", + "count": 2 + } + ] + }, + "French": { + "count": 1, + "length": { + "chars": { + "min": 29, + "p10": 29, + "median": 29, + "p90": 29, + "max": 29, + "mean": 29.0 + }, + "words": { + "min": 5, + "p10": 5, + "median": 5, + "p90": 5, + "max": 5, + "mean": 5.0 + } + }, + "rates": { + "emoji_messages_pct": 100.0, + "question_messages_pct": 0.0, + "ends_with_question_pct": 0.0, + "exclamation_messages_pct": 0.0, + "linebreak_messages_pct": 0.0, + "url_messages_pct": 0.0, + "handle_messages_pct": 0.0, + "number_messages_pct": 0.0, + "starts_with_greeting_pct": 0.0, + "contains_thanks_pct": 0.0, + "contains_cta_terms_pct": 0.0 + }, + "top_emojis": [ + { + "emoji": "¦", + "count": 1 + } + ] + }, + "Catalan": { + "count": 11, + "length": { + "chars": { + "min": 26, + "p10": 48, + "median": 52, + "p90": 99, + "max": 205, + "mean": 72.0909090909091 + }, + "words": { + "min": 5, + "p10": 5, + "median": 8, + "p90": 10, + "max": 34, + "mean": 9.727272727272727 + } + }, + "rates": { + "emoji_messages_pct": 0.0, + "question_messages_pct": 0.0, + "ends_with_question_pct": 0.0, + "exclamation_messages_pct": 0.0, + "linebreak_messages_pct": 9.090909090909092, + "url_messages_pct": 90.9090909090909, + "handle_messages_pct": 0.0, + "number_messages_pct": 90.9090909090909, + "starts_with_greeting_pct": 0.0, + "contains_thanks_pct": 0.0, + "contains_cta_terms_pct": 0.0 + }, + "top_emojis": [] + } + } + } + } +}
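
Not part of the patch above: a minimal consumer sketch showing how the bot side could apply the `policies.language` block from the generated `voice_dna/socialmediatorr.voice-dna.json`. The helper name `pick_reply_language` is illustrative only, and the crude `detect_lang()` placeholder stands in for the repo's real `_guess_lang()` (which is not shown in this patch); it assumes the script runs from the repo root.

```python
from __future__ import annotations

import json
from pathlib import Path

# Path created by this patch; adjust if the profile lives elsewhere.
VOICE_DNA_PATH = Path("voice_dna/socialmediatorr.voice-dna.json")


def detect_lang(text: str) -> str:
    """Placeholder detector; the repo's _guess_lang() would be used instead."""
    t = (text or "").strip().lower()
    if len(t) < 4:
        return "Too short to tell"
    if any(w in t for w in ("hola", "gracias", "precio", "enlace")):
        return "Spanish"
    if any(w in t for w in ("bonjour", "merci", "prix")):
        return "French"
    if any(w in t for w in ("gràcies", "enllaç", "preu")):
        return "Catalan"
    return "English"


def pick_reply_language(user_text: str, last_confident_lang: str | None) -> tuple[str | None, bool]:
    """Return (language_to_reply_in, needs_clarification) per the policies.language rules."""
    policy = json.loads(VOICE_DNA_PATH.read_text(encoding="utf-8"))["policies"]["language"]
    supported = set(policy["supported_languages"])

    guess = detect_lang(user_text)
    if guess in supported:
        return guess, False                # mirror the user's input language
    if last_confident_lang in supported:
        return last_confident_lang, False  # too short: reuse last confident thread language
    return None, True                      # still no signal: ask a 1-line clarification


if __name__ == "__main__":
    print(pick_reply_language("hola, cuánto cuesta el ebook?", None))  # ('Spanish', False)
    print(pick_reply_language("ok", "English"))                        # ('English', False)
    print(pick_reply_language("ok", None))                             # (None, True)
```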