Add VoiceDNA profile from manual replies
This commit is contained in:
parent
817995ac6b
commit
25e306d4b0
3 changed files with 951 additions and 0 deletions
12
README.md
12
README.md
|
|
@ -141,6 +141,18 @@ Generate the deeper “no raw quotes” report directly from an Instagram export
|
|||
|
||||
- `python3 -m sergio_instagram_messaging.generate_dm_report_detailed --export-input /path/to/export-root --out /root/tmp/dm_history_report_en_detailed.md`
|
||||
|
||||
## VoiceDNA (manual reply style)
|
||||
|
||||
`voice_dna/socialmediatorr.voice-dna.json` is a **safe-to-store** style fingerprint generated from the last 6 months of **manual (non-template) DM replies** (no raw DM quotes are included).
|
||||
|
||||
It also encodes a hard rule for the bot:
|
||||
|
||||
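- Always reply in the **user’s input language** (English / Spanish / French / Catalan). If the latest message is too short to classify, reuse the last language detected in the same thread; if there is still no signal, ask a short clarifying question about which language they prefer.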
|
||||
|
||||
Regenerate from a local Instagram export folder:
|
||||
|
||||
- `python3 -m sergio_instagram_messaging.generate_voice_dna --export-input /path/to/export-root --out voice_dna/socialmediatorr.voice-dna.json --owner-name "Sergio de Vocht" --window-months 6`
|
||||
|
||||
## Webhooks (new messages → auto-reply)
|
||||
|
||||
Meta webhooks are two steps:
|
||||
|
|
|
|||
471
sergio_instagram_messaging/generate_voice_dna.py
Normal file
471
sergio_instagram_messaging/generate_voice_dna.py
Normal file
|
|
@ -0,0 +1,471 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import statistics
|
||||
from collections import Counter, defaultdict
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, Literal
|
||||
|
||||
from .analyze_instagram_export import canonicalize_text
|
||||
from .generate_dm_report_detailed import _infer_owner_name, _load_events, _resolve_export_root
|
||||
|
||||
|
||||
# Closed set of language labels used throughout this module; the last label
# marks text for which no language could be determined.
VoiceLang = Literal["English", "Spanish", "French", "Catalan", "Too short to tell"]


# Case-insensitive: "http://", "https://" or "www." followed by non-whitespace characters.
_RE_URL = re.compile(r"(?i)\b(?:https?://|www\.)\S+")
# "@" followed by two or more word/dot/underscore characters, not preceded by a word character.
_RE_HANDLE = re.compile(r"(?<!\w)@[\w._]{2,}")
# Any single digit.
_RE_DIGIT = re.compile(r"\d")
|
||||
|
||||
|
||||
def _now_utc_iso() -> str:
|
||||
return datetime.now(timezone.utc).replace(microsecond=0).isoformat()
|
||||
|
||||
|
||||
def _parse_iso_utc(s: str) -> datetime:
|
||||
"""
|
||||
Accept an ISO timestamp like:
|
||||
- 2025-12-24T12:34:56Z
|
||||
- 2025-12-24T12:34:56+00:00
|
||||
- 2025-12-24T12:34:56
|
||||
Defaults to UTC if tzinfo is missing.
|
||||
"""
|
||||
raw = (s or "").strip()
|
||||
if not raw:
|
||||
raise ValueError("empty datetime")
|
||||
if raw.endswith("Z"):
|
||||
raw = raw[:-1] + "+00:00"
|
||||
dt = datetime.fromisoformat(raw)
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
return dt.astimezone(timezone.utc).replace(microsecond=0)
|
||||
|
||||
|
||||
def _days_in_month(year: int, month: int) -> int:
|
||||
if month in (1, 3, 5, 7, 8, 10, 12):
|
||||
return 31
|
||||
if month in (4, 6, 9, 11):
|
||||
return 30
|
||||
# February
|
||||
leap = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)
|
||||
return 29 if leap else 28
|
||||
|
||||
|
||||
def _subtract_months(dt: datetime, months: int) -> datetime:
|
||||
if months <= 0:
|
||||
return dt
|
||||
year = dt.year
|
||||
month = dt.month - months
|
||||
while month <= 0:
|
||||
month += 12
|
||||
year -= 1
|
||||
day = min(dt.day, _days_in_month(year, month))
|
||||
return dt.replace(year=year, month=month, day=day)
|
||||
|
||||
|
||||
def _tokenize(text: str) -> list[str]:
|
||||
return re.findall(r"\b[\w']+\b", (text or "").lower(), flags=re.UNICODE)
|
||||
|
||||
|
||||
def _guess_lang(text: str) -> VoiceLang:
    """Classify the language of ``text`` by delegating to the detailed DM report module."""
    # Imported at call time; sharing one implementation keeps this module
    # consistent with generate_dm_report_detailed.py.
    from .generate_dm_report_detailed import _guess_lang as _shared_guess_lang

    return _shared_guess_lang(text)
|
||||
|
||||
|
||||
def _is_system_new_follower_message(text: str) -> bool:
|
||||
s = (text or "").strip().lower()
|
||||
return s.startswith("you messaged") and "followed your account" in s
|
||||
|
||||
|
||||
def _iter_emojis(text: str) -> Iterable[str]:
|
||||
# Heuristic: treat unicode "Symbol, other" as emoji-like and ignore punctuation/formatting.
|
||||
# This avoids counting typographic quotes (e.g., “ ”) and ellipses (…) as emojis.
|
||||
import unicodedata
|
||||
|
||||
for ch in (text or ""):
|
||||
if ch.isspace():
|
||||
continue
|
||||
cat = unicodedata.category(ch)
|
||||
if cat != "So":
|
||||
continue
|
||||
yield ch
|
||||
|
||||
|
||||
def _quantile(values: list[int], q: float) -> int | None:
|
||||
if not values:
|
||||
return None
|
||||
if q <= 0:
|
||||
return min(values)
|
||||
if q >= 1:
|
||||
return max(values)
|
||||
xs = sorted(values)
|
||||
idx = int(round((len(xs) - 1) * q))
|
||||
return xs[max(0, min(len(xs) - 1, idx))]
|
||||
|
||||
|
||||
def _pct(num: int, den: int) -> float:
|
||||
return 0.0 if den <= 0 else (num / den) * 100.0
|
||||
|
||||
|
||||
# Lower-case words treated as a greeting when they are the first token of a
# reply, keyed by language label. "Too short to tell" has no words of its own.
_GREETING_WORDS: dict[VoiceLang, set[str]] = {
    "English": {"hi", "hey", "hello"},
    "Spanish": {"hola", "buenas", "buenos"},
    "French": {"salut", "bonjour"},
    "Catalan": {"hola", "bon"},
    "Too short to tell": set(),
}

# Lower-case tokens that mark a reply as containing thanks, keyed by language label.
_THANKS_WORDS: dict[VoiceLang, set[str]] = {
    "English": {"thanks", "thank"},
    "Spanish": {"gracias"},
    "French": {"merci"},
    "Catalan": {"gracies", "gràcies"},
    "Too short to tell": set(),
}

# Lower-case tokens counted as call-to-action terms (link / call / book / price),
# keyed by language label.
_CTA_WORDS: dict[VoiceLang, set[str]] = {
    "English": {"link", "call", "book", "ebook", "price"},
    "Spanish": {"enlace", "link", "llamada", "libro", "precio"},
    "French": {"lien", "appel", "livre", "prix"},
    "Catalan": {"enllaç", "enllac", "trucada", "llibre", "preu"},
    "Too short to tell": set(),
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class OutgoingSample:
    """Immutable per-message feature record for one owner reply.

    Instances are built by ``_sample_from_text``; apart from ``canon`` they hold
    only derived features of the message text.
    """

    ts_ms: int  # message timestamp; compared against millisecond window bounds by the caller
    canon: str  # canonicalized text supplied by the caller, used to filter repeated templates
    lang: VoiceLang  # language label guessed for the stripped text
    chars: int  # length of the stripped text
    words: int  # number of word tokens
    has_emoji: bool  # True when at least one emoji-like character was found
    emoji: tuple[str, ...]  # emoji-like characters in order of appearance
    has_question: bool  # text contains "?" or "¿"
    ends_with_question: bool  # stripped text ends with "?"
    has_exclamation: bool  # text contains "!" or "¡"
    has_linebreak: bool  # stripped text contains "\n"
    has_url: bool  # _RE_URL matched
    has_handle: bool  # _RE_HANDLE matched
    has_number: bool  # _RE_DIGIT matched
    starts_with_greeting: bool  # first token is a greeting word
    contains_thanks: bool  # any token is a thanks word
    contains_cta_terms: bool  # any token is a call-to-action word
|
||||
|
||||
|
||||
def _sample_from_text(*, ts_ms: int, canon: str, text: str) -> OutgoingSample:
    """Build the style-feature record for one owner message.

    Args:
        ts_ms: message timestamp, stored as an int.
        canon: canonicalized form of the message, stored unchanged.
        text: raw message text; it is stripped before any feature is computed.

    Returns:
        An ``OutgoingSample`` holding the guessed language, length counts and
        boolean style flags (emoji, punctuation, URL/handle/digit presence,
        greeting / thanks / call-to-action vocabulary).

    Vocabulary flags use the word list of the guessed language. When the
    language is "Too short to tell" there is no language-specific list, so the
    union of every language's list is used instead; otherwise one-word replies
    such as a bare greeting or thank-you would never be counted.
    """
    s = (text or "").strip()
    lang = _guess_lang(s)
    toks = _tokenize(s)
    tok_set = set(toks)

    def vocabulary(table: dict[VoiceLang, set[str]]) -> set[str]:
        # Undetermined language: fall back to all known words rather than an empty set.
        if lang == "Too short to tell":
            return set().union(*table.values())
        return table.get(lang, set())

    emojis = tuple(_iter_emojis(s))
    first_word = toks[0] if toks else ""

    return OutgoingSample(
        ts_ms=int(ts_ms),
        canon=canon,
        lang=lang,
        chars=len(s),
        words=len(toks),
        has_emoji=bool(emojis),
        emoji=emojis,
        # Spanish opening marks count as well as the closing ones.
        has_question="?" in s or "¿" in s,
        ends_with_question=s.endswith("?"),
        has_exclamation="!" in s or "¡" in s,
        has_linebreak="\n" in s,
        has_url=bool(_RE_URL.search(s)),
        has_handle=bool(_RE_HANDLE.search(s)),
        has_number=bool(_RE_DIGIT.search(s)),
        starts_with_greeting=bool(first_word and first_word in vocabulary(_GREETING_WORDS)),
        contains_thanks=bool(tok_set & vocabulary(_THANKS_WORDS)),
        contains_cta_terms=bool(tok_set & vocabulary(_CTA_WORDS)),
    )
|
||||
|
||||
|
||||
def _lang_bucket(samples: list[OutgoingSample]) -> dict[VoiceLang, list[OutgoingSample]]:
|
||||
by: dict[VoiceLang, list[OutgoingSample]] = defaultdict(list)
|
||||
for s in samples:
|
||||
by[s.lang].append(s)
|
||||
# Ensure stable order in output (even if empty).
|
||||
for lang in ("English", "Spanish", "French", "Catalan", "Too short to tell"):
|
||||
by.setdefault(lang, [])
|
||||
return dict(by)
|
||||
|
||||
|
||||
def _summarize_samples(samples: list[OutgoingSample]) -> dict[str, Any]:
|
||||
if not samples:
|
||||
return {
|
||||
"count": 0,
|
||||
"length": {"chars": {}, "words": {}},
|
||||
"rates": {},
|
||||
"top_emojis": [],
|
||||
}
|
||||
|
||||
chars = [s.chars for s in samples]
|
||||
words = [s.words for s in samples]
|
||||
|
||||
emoji_counter: Counter[str] = Counter()
|
||||
for s in samples:
|
||||
emoji_counter.update(s.emoji)
|
||||
|
||||
def rates() -> dict[str, float]:
|
||||
n = len(samples)
|
||||
return {
|
||||
"emoji_messages_pct": _pct(sum(1 for s in samples if s.has_emoji), n),
|
||||
"question_messages_pct": _pct(sum(1 for s in samples if s.has_question), n),
|
||||
"ends_with_question_pct": _pct(sum(1 for s in samples if s.ends_with_question), n),
|
||||
"exclamation_messages_pct": _pct(sum(1 for s in samples if s.has_exclamation), n),
|
||||
"linebreak_messages_pct": _pct(sum(1 for s in samples if s.has_linebreak), n),
|
||||
"url_messages_pct": _pct(sum(1 for s in samples if s.has_url), n),
|
||||
"handle_messages_pct": _pct(sum(1 for s in samples if s.has_handle), n),
|
||||
"number_messages_pct": _pct(sum(1 for s in samples if s.has_number), n),
|
||||
"starts_with_greeting_pct": _pct(sum(1 for s in samples if s.starts_with_greeting), n),
|
||||
"contains_thanks_pct": _pct(sum(1 for s in samples if s.contains_thanks), n),
|
||||
"contains_cta_terms_pct": _pct(sum(1 for s in samples if s.contains_cta_terms), n),
|
||||
}
|
||||
|
||||
def length_summary(values: list[int]) -> dict[str, int | float | None]:
|
||||
return {
|
||||
"min": min(values) if values else None,
|
||||
"p10": _quantile(values, 0.10),
|
||||
"median": int(statistics.median(values)) if values else None,
|
||||
"p90": _quantile(values, 0.90),
|
||||
"max": max(values) if values else None,
|
||||
"mean": (sum(values) / len(values)) if values else None,
|
||||
}
|
||||
|
||||
return {
|
||||
"count": len(samples),
|
||||
"length": {"chars": length_summary(chars), "words": length_summary(words)},
|
||||
"rates": rates(),
|
||||
"top_emojis": [{"emoji": e, "count": c} for e, c in emoji_counter.most_common(20)],
|
||||
}
|
||||
|
||||
|
||||
def _choose_language_policy(supported_from_inbox: Counter[VoiceLang], supported_from_owner: Counter[VoiceLang]) -> dict[str, Any]:
|
||||
supported = []
|
||||
for lang in ("English", "Spanish", "French", "Catalan"):
|
||||
if supported_from_inbox.get(lang, 0) > 0 or supported_from_owner.get(lang, 0) > 0:
|
||||
supported.append(lang)
|
||||
if not supported:
|
||||
supported = ["English", "Spanish", "French", "Catalan"]
|
||||
|
||||
return {
|
||||
"mode": "mirror_user_input_language",
|
||||
"supported_languages": supported,
|
||||
"rules": [
|
||||
"Reply in the same language as the user's most recent message that contains enough text to classify.",
|
||||
"Do not translate the user's message unless they explicitly ask for a translation.",
|
||||
"Do not mix languages inside a single reply unless the user mixes languages first.",
|
||||
"If the user's message is too short to classify, reuse the last confidently detected language in the same thread.",
|
||||
"If there is still no signal, ask a 1-line clarification asking which language they prefer (keep it short).",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def generate_voice_dna(
    *,
    export_input: Path,
    out_path: Path,
    owner_name: str | None,
    window_months: int,
    response_window_hours: float,
    scripted_min_count: int,
    as_of_utc: datetime,
) -> Path:
    """Scan an Instagram export and write a VoiceDNA JSON document to ``out_path``.

    Every conversation directory under ``<export_root>/messages/inbox`` is
    visited. Owner text messages are counted by canonical text across the whole
    export; owner text messages inside the analysis window that follow an
    inbound message by at most ``response_window_hours`` become candidate
    replies. Candidates whose canonical text was sent at least
    ``scripted_min_count`` times are dropped as scripted templates, and the
    remainder is summarized overall and per language.

    Args:
        export_input: path handed to ``_resolve_export_root`` to locate the export.
        out_path: destination JSON file; its parent directory is created if missing.
        owner_name: sender name of the account owner; inferred from the export when
            empty or None.
        window_months: size of the analysis window, in calendar months before ``as_of_utc``.
        response_window_hours: maximum delay after the latest inbound message for an
            owner message to count as a response.
        scripted_min_count: repetition threshold at which a canonical text is treated
            as a template.
        as_of_utc: end of the analysis window (converted to UTC, microseconds dropped).

    Returns:
        ``out_path``.

    Raises:
        RuntimeError: when no owner name is given and none can be inferred.
    """
    export_root = _resolve_export_root(export_input)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    if not owner_name or not owner_name.strip():
        owner_name = _infer_owner_name(export_root)
    if not owner_name:
        raise RuntimeError("Could not infer owner name. Pass --owner-name.")

    # Window bounds and the response delay are converted to milliseconds so they
    # can be compared directly with the events' ts_ms values.
    as_of_utc = as_of_utc.astimezone(timezone.utc).replace(microsecond=0)
    start_utc = _subtract_months(as_of_utc, int(window_months))
    start_ts_ms = int(start_utc.timestamp() * 1000)
    end_ts_ms = int(as_of_utc.timestamp() * 1000)
    response_window_ms = int(float(response_window_hours) * 3600 * 1000)

    # template_counts covers the full export; the other accumulators only the window.
    template_counts: Counter[str] = Counter()
    inbound_lang_counts_window: Counter[VoiceLang] = Counter()

    candidate_responses: list[OutgoingSample] = []
    scanned_conversations = 0
    scanned_message_files = 0

    inbox_root = export_root / "messages" / "inbox"
    # NOTE(review): iterdir() order is filesystem-dependent, so the key order of the
    # emitted counters (and emoji tie-breaks) may vary between runs — confirm
    # whether sorted() is wanted here.
    for conv_dir in inbox_root.iterdir():
        if not conv_dir.is_dir():
            continue
        # Counted before the message-file check, so directories without message
        # files still count as scanned conversations.
        scanned_conversations += 1
        parts = sorted(conv_dir.glob("message*.json"), key=lambda p: p.name)
        if not parts:
            continue
        scanned_message_files += len(parts)
        events = _load_events(parts, owner_name=owner_name)
        if not events:
            continue

        # Timestamp of the latest inbound message seen so far in this conversation.
        # NOTE(review): this presumes _load_events returns events in chronological
        # order — confirm against that helper.
        last_inbound_ts: int | None = None
        for e in events:
            # Track inbound language (for mirroring policy) for the same recent window.
            if not e.is_owner and isinstance(e.text, str) and e.text.strip():
                if start_ts_ms <= e.ts_ms <= end_ts_ms:
                    inbound_lang_counts_window[_guess_lang(e.text)] += 1
                last_inbound_ts = e.ts_ms
                continue

            # Inbound events without usable text still reset the response clock.
            if not e.is_owner:
                last_inbound_ts = e.ts_ms
                continue

            # From here on the event is an owner message; skip it when it has no text.
            if not isinstance(e.text, str) or not e.text.strip():
                continue

            # Template frequency is counted over every owner text message,
            # regardless of the window or response checks below.
            canon = canonicalize_text(e.text)
            if canon:
                template_counts[canon] += 1

            # Style samples: only use manual "responses" in the recent window.
            if _is_system_new_follower_message(e.text):
                continue
            if not (start_ts_ms <= e.ts_ms <= end_ts_ms):
                continue
            if last_inbound_ts is None:
                continue
            if e.ts_ms < last_inbound_ts:
                continue
            if (e.ts_ms - last_inbound_ts) > response_window_ms:
                continue

            if not canon:
                continue
            candidate_responses.append(_sample_from_text(ts_ms=e.ts_ms, canon=canon, text=e.text))

    # Template filtering happens after the scan so the counts reflect the full export.
    scripted_templates = {canon for canon, cnt in template_counts.items() if cnt >= int(scripted_min_count)}
    manual_responses = [s for s in candidate_responses if s.canon not in scripted_templates]

    owner_lang_counts = Counter([s.lang for s in manual_responses])
    policy = _choose_language_policy(inbound_lang_counts_window, owner_lang_counts)

    by_lang = _lang_bucket(manual_responses)
    per_lang_summary = {lang: _summarize_samples(by_lang[lang]) for lang in by_lang}

    voice_dna: dict[str, Any] = {
        "schema_version": "voice_dna/v1",
        "created_at_utc": _now_utc_iso(),
        "subject": {
            # NOTE(review): the account handle is hard-coded rather than derived from the export.
            "account": "@socialmediatorr",
            "owner_name": owner_name,
            "scope": "Instagram DMs",
        },
        "source": {
            "type": "instagram_export",
            "window": {
                "months": int(window_months),
                "start_utc": start_utc.isoformat(),
                "end_utc": as_of_utc.isoformat(),
                "response_window_hours": float(response_window_hours),
            },
            "classification": {
                "manual_reply_definition": "owner text message within response_window_hours of the most recent inbound message, excluding repeated templates",
                "scripted_template_definition": f"owner canonicalized text sent >= {int(scripted_min_count)} times across full export",
                "system_messages_excluded": ["you messaged <user> because they followed your account"],
            },
            "scan": {
                # Only the final path component is recorded, not the full path.
                "export_root_hint": export_root.name,
                "scanned_conversations": scanned_conversations,
                "scanned_message_files": scanned_message_files,
                "candidate_responses_in_window": len(candidate_responses),
                "manual_responses_in_window": len(manual_responses),
                "scripted_template_count": len(scripted_templates),
            },
        },
        "policies": {
            "language": policy,
        },
        "language_observed": {
            "inbound_last_window_counts": dict(inbound_lang_counts_window),
            "manual_reply_counts": dict(owner_lang_counts),
        },
        "style": {
            "manual_replies": {
                "overall": _summarize_samples(manual_responses),
                "by_language": per_lang_summary,
            }
        },
    }

    # ensure_ascii=False keeps emoji and accented characters readable in the file.
    out_path.write_text(json.dumps(voice_dna, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    return out_path
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: parse arguments and generate the VoiceDNA file.

    Args:
        argv: argument list to parse; None lets argparse read ``sys.argv``.

    Returns:
        0 on success (a one-line JSON status is printed), 2 when a
        FileNotFoundError is raised during generation, 1 on any other failure.

    Invalid arguments, including a malformed ``--as-of-utc`` timestamp, are
    reported through argparse's usage error, which exits with status 2.
    """
    ap = argparse.ArgumentParser(
        description="Generate a VoiceDNA JSON file for recent manual DM replies (style fingerprint + language mirroring policy)."
    )
    ap.add_argument("--export-input", required=True, help="Instagram export root (contains messages/inbox)")
    ap.add_argument("--out", required=True, help="output path (JSON)")
    ap.add_argument("--owner-name", default=None, help='owner sender_name (e.g., "Sergio de Vocht")')
    ap.add_argument("--window-months", type=int, default=6, help="how many recent months to use")
    ap.add_argument(
        "--response-window-hours",
        type=float,
        default=72.0,
        help="max hours after inbound message for an outgoing message to count as a response",
    )
    ap.add_argument(
        "--scripted-min-count",
        type=int,
        default=50,
        help="owner canonicalized text sent >= this count is treated as scripted/templated",
    )
    ap.add_argument(
        "--as-of-utc",
        default=None,
        help="analysis end time (UTC ISO); default: now",
    )
    args = ap.parse_args(argv)

    if args.as_of_utc:
        try:
            as_of = _parse_iso_utc(args.as_of_utc)
        except ValueError as e:
            # Treat a malformed timestamp like any other bad argument instead of
            # letting the ValueError escape as a traceback.
            ap.error(f"invalid --as-of-utc value {args.as_of_utc!r}: {e}")
    else:
        as_of = datetime.now(timezone.utc).replace(microsecond=0)

    try:
        out = generate_voice_dna(
            export_input=Path(args.export_input),
            out_path=Path(args.out),
            owner_name=(args.owner_name.strip() if args.owner_name else None),
            window_months=int(args.window_months),
            response_window_hours=float(args.response_window_hours),
            scripted_min_count=int(args.scripted_min_count),
            as_of_utc=as_of,
        )
        print(json.dumps({"ok": True, "out": str(out)}, ensure_ascii=False))
        return 0
    except FileNotFoundError as e:
        print(str(e))
        return 2
    except Exception as e:
        # Top-level boundary: report the failure and signal it through the exit status.
        print(f"VoiceDNA generation failed: {e}")
        return 1
|
||||
|
||||
|
||||
# When run as a script or with ``python -m``, use main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
468
voice_dna/socialmediatorr.voice-dna.json
Normal file
468
voice_dna/socialmediatorr.voice-dna.json
Normal file
|
|
@ -0,0 +1,468 @@
|
|||
{
|
||||
"schema_version": "voice_dna/v1",
|
||||
"created_at_utc": "2025-12-24T12:08:24+00:00",
|
||||
"subject": {
|
||||
"account": "@socialmediatorr",
|
||||
"owner_name": "Sergio de Vocht",
|
||||
"scope": "Instagram DMs"
|
||||
},
|
||||
"source": {
|
||||
"type": "instagram_export",
|
||||
"window": {
|
||||
"months": 6,
|
||||
"start_utc": "2025-06-24T12:08:20+00:00",
|
||||
"end_utc": "2025-12-24T12:08:20+00:00",
|
||||
"response_window_hours": 72.0
|
||||
},
|
||||
"classification": {
|
||||
"manual_reply_definition": "owner text message within response_window_hours of the most recent inbound message, excluding repeated templates",
|
||||
"scripted_template_definition": "owner canonicalized text sent >= 50 times across full export",
|
||||
"system_messages_excluded": [
|
||||
"you messaged <user> because they followed your account"
|
||||
]
|
||||
},
|
||||
"scan": {
|
||||
"export_root_hint": "socialmediatorr-ig-export-raw-20251224",
|
||||
"scanned_conversations": 10100,
|
||||
"scanned_message_files": 10061,
|
||||
"candidate_responses_in_window": 18934,
|
||||
"manual_responses_in_window": 825,
|
||||
"scripted_template_count": 24
|
||||
}
|
||||
},
|
||||
"policies": {
|
||||
"language": {
|
||||
"mode": "mirror_user_input_language",
|
||||
"supported_languages": [
|
||||
"English",
|
||||
"Spanish",
|
||||
"French",
|
||||
"Catalan"
|
||||
],
|
||||
"rules": [
|
||||
"Reply in the same language as the user's most recent message that contains enough text to classify.",
|
||||
"Do not translate the user's message unless they explicitly ask for a translation.",
|
||||
"Do not mix languages inside a single reply unless the user mixes languages first.",
|
||||
"If the user's message is too short to classify, reuse the last confidently detected language in the same thread.",
|
||||
"If there is still no signal, ask a 1-line clarification asking which language they prefer (keep it short)."
|
||||
]
|
||||
}
|
||||
},
|
||||
"language_observed": {
|
||||
"inbound_last_window_counts": {
|
||||
"English": 2828,
|
||||
"Too short to tell": 5024,
|
||||
"Spanish": 2176,
|
||||
"French": 191,
|
||||
"Catalan": 16
|
||||
},
|
||||
"manual_reply_counts": {
|
||||
"Spanish": 541,
|
||||
"Too short to tell": 255,
|
||||
"English": 17,
|
||||
"French": 1,
|
||||
"Catalan": 11
|
||||
}
|
||||
},
|
||||
"style": {
|
||||
"manual_replies": {
|
||||
"overall": {
|
||||
"count": 825,
|
||||
"length": {
|
||||
"chars": {
|
||||
"min": 2,
|
||||
"p10": 13,
|
||||
"median": 54,
|
||||
"p90": 199,
|
||||
"max": 928,
|
||||
"mean": 87.47878787878788
|
||||
},
|
||||
"words": {
|
||||
"min": 1,
|
||||
"p10": 2,
|
||||
"median": 11,
|
||||
"p90": 34,
|
||||
"max": 169,
|
||||
"mean": 15.436363636363636
|
||||
}
|
||||
},
|
||||
"rates": {
|
||||
"emoji_messages_pct": 16.363636363636363,
|
||||
"question_messages_pct": 32.484848484848484,
|
||||
"ends_with_question_pct": 27.151515151515156,
|
||||
"exclamation_messages_pct": 18.303030303030305,
|
||||
"linebreak_messages_pct": 9.212121212121211,
|
||||
"url_messages_pct": 4.121212121212121,
|
||||
"handle_messages_pct": 0.0,
|
||||
"number_messages_pct": 9.454545454545455,
|
||||
"starts_with_greeting_pct": 4.96969696969697,
|
||||
"contains_thanks_pct": 4.484848484848484,
|
||||
"contains_cta_terms_pct": 4.848484848484849
|
||||
},
|
||||
"top_emojis": [
|
||||
{
|
||||
"emoji": "🙌",
|
||||
"count": 41
|
||||
},
|
||||
{
|
||||
"emoji": "🫶",
|
||||
"count": 23
|
||||
},
|
||||
{
|
||||
"emoji": "👋",
|
||||
"count": 19
|
||||
},
|
||||
{
|
||||
"emoji": "🎁",
|
||||
"count": 17
|
||||
},
|
||||
{
|
||||
"emoji": "💪",
|
||||
"count": 14
|
||||
},
|
||||
{
|
||||
"emoji": "👇",
|
||||
"count": 12
|
||||
},
|
||||
{
|
||||
"emoji": "😜",
|
||||
"count": 7
|
||||
},
|
||||
{
|
||||
"emoji": "😉",
|
||||
"count": 4
|
||||
},
|
||||
{
|
||||
"emoji": "🤣",
|
||||
"count": 2
|
||||
},
|
||||
{
|
||||
"emoji": "🙏",
|
||||
"count": 2
|
||||
},
|
||||
{
|
||||
"emoji": "⬆",
|
||||
"count": 2
|
||||
},
|
||||
{
|
||||
"emoji": "👀",
|
||||
"count": 2
|
||||
},
|
||||
{
|
||||
"emoji": "¦",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "😅",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "🤝",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "☺",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "🫡",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "😁",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "👌",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "🚀",
|
||||
"count": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"by_language": {
|
||||
"Spanish": {
|
||||
"count": 541,
|
||||
"length": {
|
||||
"chars": {
|
||||
"min": 5,
|
||||
"p10": 35,
|
||||
"median": 99,
|
||||
"p90": 211,
|
||||
"max": 928,
|
||||
"mean": 116.06469500924214
|
||||
},
|
||||
"words": {
|
||||
"min": 1,
|
||||
"p10": 6,
|
||||
"median": 18,
|
||||
"p90": 38,
|
||||
"max": 169,
|
||||
"mean": 20.478743068391868
|
||||
}
|
||||
},
|
||||
"rates": {
|
||||
"emoji_messages_pct": 20.70240295748614,
|
||||
"question_messages_pct": 39.55637707948244,
|
||||
"ends_with_question_pct": 32.34750462107209,
|
||||
"exclamation_messages_pct": 16.266173752310536,
|
||||
"linebreak_messages_pct": 13.67837338262477,
|
||||
"url_messages_pct": 2.2181146025878005,
|
||||
"handle_messages_pct": 0.0,
|
||||
"number_messages_pct": 9.242144177449168,
|
||||
"starts_with_greeting_pct": 7.578558225508318,
|
||||
"contains_thanks_pct": 6.839186691312385,
|
||||
"contains_cta_terms_pct": 7.024029574861368
|
||||
},
|
||||
"top_emojis": [
|
||||
{
|
||||
"emoji": "🙌",
|
||||
"count": 24
|
||||
},
|
||||
{
|
||||
"emoji": "🫶",
|
||||
"count": 23
|
||||
},
|
||||
{
|
||||
"emoji": "👋",
|
||||
"count": 19
|
||||
},
|
||||
{
|
||||
"emoji": "🎁",
|
||||
"count": 17
|
||||
},
|
||||
{
|
||||
"emoji": "👇",
|
||||
"count": 12
|
||||
},
|
||||
{
|
||||
"emoji": "💪",
|
||||
"count": 10
|
||||
},
|
||||
{
|
||||
"emoji": "😜",
|
||||
"count": 7
|
||||
},
|
||||
{
|
||||
"emoji": "😉",
|
||||
"count": 4
|
||||
},
|
||||
{
|
||||
"emoji": "🤣",
|
||||
"count": 2
|
||||
},
|
||||
{
|
||||
"emoji": "🙏",
|
||||
"count": 2
|
||||
},
|
||||
{
|
||||
"emoji": "⬆",
|
||||
"count": 2
|
||||
},
|
||||
{
|
||||
"emoji": "👀",
|
||||
"count": 2
|
||||
},
|
||||
{
|
||||
"emoji": "😅",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "🤝",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "☺",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "😁",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "👌",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "🚀",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "⏱",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"emoji": "🫂",
|
||||
"count": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"Too short to tell": {
|
||||
"count": 255,
|
||||
"length": {
|
||||
"chars": {
|
||||
"min": 2,
|
||||
"p10": 10,
|
||||
"median": 30,
|
||||
"p90": 53,
|
||||
"max": 151,
|
||||
"mean": 32.01176470588236
|
||||
},
|
||||
"words": {
|
||||
"min": 1,
|
||||
"p10": 1,
|
||||
"median": 4,
|
||||
"p90": 11,
|
||||
"max": 29,
|
||||
"mean": 5.749019607843137
|
||||
}
|
||||
},
|
||||
"rates": {
|
||||
"emoji_messages_pct": 7.8431372549019605,
|
||||
"question_messages_pct": 19.607843137254903,
|
||||
"ends_with_question_pct": 17.647058823529413,
|
||||
"exclamation_messages_pct": 23.52941176470588,
|
||||
"linebreak_messages_pct": 0.39215686274509803,
|
||||
"url_messages_pct": 4.705882352941177,
|
||||
"handle_messages_pct": 0.0,
|
||||
"number_messages_pct": 7.0588235294117645,
|
||||
"starts_with_greeting_pct": 0.0,
|
||||
"contains_thanks_pct": 0.0,
|
||||
"contains_cta_terms_pct": 0.0
|
||||
},
|
||||
"top_emojis": [
|
||||
{
|
||||
"emoji": "🙌",
|
||||
"count": 15
|
||||
},
|
||||
{
|
||||
"emoji": "💪",
|
||||
"count": 4
|
||||
},
|
||||
{
|
||||
"emoji": "🫡",
|
||||
"count": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"English": {
|
||||
"count": 17,
|
||||
"length": {
|
||||
"chars": {
|
||||
"min": 4,
|
||||
"p10": 11,
|
||||
"median": 23,
|
||||
"p90": 38,
|
||||
"max": 40,
|
||||
"mean": 23.176470588235293
|
||||
},
|
||||
"words": {
|
||||
"min": 2,
|
||||
"p10": 3,
|
||||
"median": 4,
|
||||
"p90": 6,
|
||||
"max": 8,
|
||||
"mean": 4.588235294117647
|
||||
}
|
||||
},
|
||||
"rates": {
|
||||
"emoji_messages_pct": 11.76470588235294,
|
||||
"question_messages_pct": 23.52941176470588,
|
||||
"ends_with_question_pct": 23.52941176470588,
|
||||
"exclamation_messages_pct": 17.647058823529413,
|
||||
"linebreak_messages_pct": 0.0,
|
||||
"url_messages_pct": 0.0,
|
||||
"handle_messages_pct": 0.0,
|
||||
"number_messages_pct": 0.0,
|
||||
"starts_with_greeting_pct": 0.0,
|
||||
"contains_thanks_pct": 0.0,
|
||||
"contains_cta_terms_pct": 11.76470588235294
|
||||
},
|
||||
"top_emojis": [
|
||||
{
|
||||
"emoji": "🙌",
|
||||
"count": 2
|
||||
}
|
||||
]
|
||||
},
|
||||
"French": {
|
||||
"count": 1,
|
||||
"length": {
|
||||
"chars": {
|
||||
"min": 29,
|
||||
"p10": 29,
|
||||
"median": 29,
|
||||
"p90": 29,
|
||||
"max": 29,
|
||||
"mean": 29.0
|
||||
},
|
||||
"words": {
|
||||
"min": 5,
|
||||
"p10": 5,
|
||||
"median": 5,
|
||||
"p90": 5,
|
||||
"max": 5,
|
||||
"mean": 5.0
|
||||
}
|
||||
},
|
||||
"rates": {
|
||||
"emoji_messages_pct": 100.0,
|
||||
"question_messages_pct": 0.0,
|
||||
"ends_with_question_pct": 0.0,
|
||||
"exclamation_messages_pct": 0.0,
|
||||
"linebreak_messages_pct": 0.0,
|
||||
"url_messages_pct": 0.0,
|
||||
"handle_messages_pct": 0.0,
|
||||
"number_messages_pct": 0.0,
|
||||
"starts_with_greeting_pct": 0.0,
|
||||
"contains_thanks_pct": 0.0,
|
||||
"contains_cta_terms_pct": 0.0
|
||||
},
|
||||
"top_emojis": [
|
||||
{
|
||||
"emoji": "¦",
|
||||
"count": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"Catalan": {
|
||||
"count": 11,
|
||||
"length": {
|
||||
"chars": {
|
||||
"min": 26,
|
||||
"p10": 48,
|
||||
"median": 52,
|
||||
"p90": 99,
|
||||
"max": 205,
|
||||
"mean": 72.0909090909091
|
||||
},
|
||||
"words": {
|
||||
"min": 5,
|
||||
"p10": 5,
|
||||
"median": 8,
|
||||
"p90": 10,
|
||||
"max": 34,
|
||||
"mean": 9.727272727272727
|
||||
}
|
||||
},
|
||||
"rates": {
|
||||
"emoji_messages_pct": 0.0,
|
||||
"question_messages_pct": 0.0,
|
||||
"ends_with_question_pct": 0.0,
|
||||
"exclamation_messages_pct": 0.0,
|
||||
"linebreak_messages_pct": 9.090909090909092,
|
||||
"url_messages_pct": 90.9090909090909,
|
||||
"handle_messages_pct": 0.0,
|
||||
"number_messages_pct": 90.9090909090909,
|
||||
"starts_with_greeting_pct": 0.0,
|
||||
"contains_thanks_pct": 0.0,
|
||||
"contains_cta_terms_pct": 0.0
|
||||
},
|
||||
"top_emojis": []
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue