Add VoiceDNA profile from manual replies

parent 817995ac6b
commit 25e306d4b0

3 changed files with 951 additions and 0 deletions

README.md (12 changed lines)

@@ -141,6 +141,18 @@ Generate the deeper “no raw quotes” report directly from an Instagram export
- `python3 -m sergio_instagram_messaging.generate_dm_report_detailed --export-input /path/to/export-root --out /root/tmp/dm_history_report_en_detailed.md`

## VoiceDNA (manual reply style)

`voice_dna/socialmediatorr.voice-dna.json` is a **safe-to-store** style fingerprint generated from the last 6 months of **manual (non-template) DM replies**; no raw DM quotes are included.

It also encodes a hard rule for the bot:

- Always reply in the **user’s input language** (English / Spanish / French / Catalan), and ask a short clarification question if the user’s message is too short to detect the language (a minimal sketch of this mirroring logic follows).
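A minimal sketch of that mirroring rule, assuming a caller-supplied language detector (`pick_reply_language` and `detect` are illustrative names, not part of this repo):

```python
from collections.abc import Callable

SUPPORTED = ("English", "Spanish", "French", "Catalan")

def pick_reply_language(
    user_text: str,
    thread_history: list[str],
    detect: Callable[[str], str | None],  # hypothetical detector: supported language or None if too short
) -> str | None:
    """Mirror the user's input language, per the VoiceDNA language policy."""
    lang = detect(user_text)
    if lang in SUPPORTED:
        return lang
    # Too short to classify: reuse the last confidently detected language in the thread.
    for earlier in reversed(thread_history):
        lang = detect(earlier)
        if lang in SUPPORTED:
            return lang
    # Still no signal: the caller should send a one-line "which language do you prefer?" question.
    return None
```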
Regenerate from a local Instagram export folder:

- `python3 -m sergio_instagram_messaging.generate_voice_dna --export-input /path/to/export-root --out voice_dna/socialmediatorr.voice-dna.json --owner-name "Sergio de Vocht" --window-months 6`
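The generated profile can then be read back when drafting replies; a small sketch (keys as written by `generate_voice_dna`, not a stable API):

```python
import json
from pathlib import Path

profile = json.loads(Path("voice_dna/socialmediatorr.voice-dna.json").read_text(encoding="utf-8"))

policy = profile["policies"]["language"]
overall = profile["style"]["manual_replies"]["overall"]

print(policy["supported_languages"])          # ["English", "Spanish", "French", "Catalan"]
print(overall["length"]["words"]["median"])   # typical manual-reply length in words
print(overall["rates"]["emoji_messages_pct"]) # share of manual replies that contain an emoji
```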

## Webhooks (new messages → auto-reply)

Meta webhooks are two steps:

sergio_instagram_messaging/generate_voice_dna.py (471 lines, new file)

@@ -0,0 +1,471 @@
from __future__ import annotations

import argparse
import json
import re
import statistics
from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable, Literal

from .analyze_instagram_export import canonicalize_text
from .generate_dm_report_detailed import _infer_owner_name, _load_events, _resolve_export_root


VoiceLang = Literal["English", "Spanish", "French", "Catalan", "Too short to tell"]


_RE_URL = re.compile(r"(?i)\b(?:https?://|www\.)\S+")
_RE_HANDLE = re.compile(r"(?<!\w)@[\w._]{2,}")
_RE_DIGIT = re.compile(r"\d")


def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()


def _parse_iso_utc(s: str) -> datetime:
    """
    Accept an ISO timestamp like:
    - 2025-12-24T12:34:56Z
    - 2025-12-24T12:34:56+00:00
    - 2025-12-24T12:34:56
    Defaults to UTC if tzinfo is missing.
    """
    raw = (s or "").strip()
    if not raw:
        raise ValueError("empty datetime")
    if raw.endswith("Z"):
        raw = raw[:-1] + "+00:00"
    dt = datetime.fromisoformat(raw)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc).replace(microsecond=0)


def _days_in_month(year: int, month: int) -> int:
    if month in (1, 3, 5, 7, 8, 10, 12):
        return 31
    if month in (4, 6, 9, 11):
        return 30
    # February
    leap = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)
    return 29 if leap else 28


def _subtract_months(dt: datetime, months: int) -> datetime:
    if months <= 0:
        return dt
    year = dt.year
    month = dt.month - months
    while month <= 0:
        month += 12
        year -= 1
    day = min(dt.day, _days_in_month(year, month))
    return dt.replace(year=year, month=month, day=day)
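# For example, a 6-month window ending 2025-12-24T12:08:20+00:00 starts at 2025-06-24T12:08:20+00:00
# (as in the generated profile below); the day is clamped to the target month's length, so
# subtracting 1 month from March 31 in a non-leap year gives February 28.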


def _tokenize(text: str) -> list[str]:
    return re.findall(r"\b[\w']+\b", (text or "").lower(), flags=re.UNICODE)


def _guess_lang(text: str) -> VoiceLang:
    # Keep this consistent with generate_dm_report_detailed.py
    from .generate_dm_report_detailed import _guess_lang as _guess_lang_impl

    return _guess_lang_impl(text)


def _is_system_new_follower_message(text: str) -> bool:
    s = (text or "").strip().lower()
    return s.startswith("you messaged") and "followed your account" in s


def _iter_emojis(text: str) -> Iterable[str]:
    # Heuristic: treat unicode "Symbol, other" as emoji-like and ignore punctuation/formatting.
    # This avoids counting typographic quotes (e.g., “ ”) and ellipses (…) as emojis.
    import unicodedata

    for ch in (text or ""):
        if ch.isspace():
            continue
        cat = unicodedata.category(ch)
        if cat != "So":
            continue
        yield ch
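# For example, list(_iter_emojis("Gracias 🙌🙌… “genial”")) == ["🙌", "🙌"]:
# the ellipsis and typographic quotes fall in punctuation categories, not "So".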


def _quantile(values: list[int], q: float) -> int | None:
    if not values:
        return None
    if q <= 0:
        return min(values)
    if q >= 1:
        return max(values)
    xs = sorted(values)
    idx = int(round((len(xs) - 1) * q))
    return xs[max(0, min(len(xs) - 1, idx))]
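# Nearest-rank on the sorted values: _quantile(list(range(1, 11)), 0.90) == 9
# (index round((10 - 1) * 0.9) == 8), with the index clamped to the list bounds.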


def _pct(num: int, den: int) -> float:
    return 0.0 if den <= 0 else (num / den) * 100.0


_GREETING_WORDS: dict[VoiceLang, set[str]] = {
    "English": {"hi", "hey", "hello"},
    "Spanish": {"hola", "buenas", "buenos"},
    "French": {"salut", "bonjour"},
    "Catalan": {"hola", "bon"},
    "Too short to tell": set(),
}

_THANKS_WORDS: dict[VoiceLang, set[str]] = {
    "English": {"thanks", "thank"},
    "Spanish": {"gracias"},
    "French": {"merci"},
    "Catalan": {"gracies", "gràcies"},
    "Too short to tell": set(),
}

_CTA_WORDS: dict[VoiceLang, set[str]] = {
    "English": {"link", "call", "book", "ebook", "price"},
    "Spanish": {"enlace", "link", "llamada", "libro", "precio"},
    "French": {"lien", "appel", "livre", "prix"},
    "Catalan": {"enllaç", "enllac", "trucada", "llibre", "preu"},
    "Too short to tell": set(),
}


@dataclass(frozen=True)
class OutgoingSample:
    ts_ms: int
    canon: str
    lang: VoiceLang
    chars: int
    words: int
    has_emoji: bool
    emoji: tuple[str, ...]
    has_question: bool
    ends_with_question: bool
    has_exclamation: bool
    has_linebreak: bool
    has_url: bool
    has_handle: bool
    has_number: bool
    starts_with_greeting: bool
    contains_thanks: bool
    contains_cta_terms: bool


def _sample_from_text(*, ts_ms: int, canon: str, text: str) -> OutgoingSample:
    s = (text or "").strip()
    lang = _guess_lang(s)
    toks = _tokenize(s)

    emojis = tuple(_iter_emojis(s))
    has_question = "?" in s or "¿" in s
    ends_with_question = s.endswith("?")
    has_exclamation = "!" in s or "¡" in s
    has_linebreak = "\n" in s
    has_url = bool(_RE_URL.search(s))
    has_handle = bool(_RE_HANDLE.search(s))
    has_number = bool(_RE_DIGIT.search(s))

    first_word = toks[0] if toks else ""
    starts_with_greeting = bool(first_word and first_word in _GREETING_WORDS.get(lang, set()))
    contains_thanks = bool(set(toks) & _THANKS_WORDS.get(lang, set()))
    contains_cta_terms = bool(set(toks) & _CTA_WORDS.get(lang, set()))

    return OutgoingSample(
        ts_ms=int(ts_ms),
        canon=canon,
        lang=lang,
        chars=len(s),
        words=len(toks),
        has_emoji=bool(emojis),
        emoji=emojis,
        has_question=has_question,
        ends_with_question=ends_with_question,
        has_exclamation=has_exclamation,
        has_linebreak=has_linebreak,
        has_url=has_url,
        has_handle=has_handle,
        has_number=has_number,
        starts_with_greeting=starts_with_greeting,
        contains_thanks=contains_thanks,
        contains_cta_terms=contains_cta_terms,
    )


def _lang_bucket(samples: list[OutgoingSample]) -> dict[VoiceLang, list[OutgoingSample]]:
    by: dict[VoiceLang, list[OutgoingSample]] = defaultdict(list)
    for s in samples:
        by[s.lang].append(s)
    # Ensure stable order in output (even if empty).
    for lang in ("English", "Spanish", "French", "Catalan", "Too short to tell"):
        by.setdefault(lang, [])
    return dict(by)


def _summarize_samples(samples: list[OutgoingSample]) -> dict[str, Any]:
    if not samples:
        return {
            "count": 0,
            "length": {"chars": {}, "words": {}},
            "rates": {},
            "top_emojis": [],
        }

    chars = [s.chars for s in samples]
    words = [s.words for s in samples]

    emoji_counter: Counter[str] = Counter()
    for s in samples:
        emoji_counter.update(s.emoji)

    def rates() -> dict[str, float]:
        n = len(samples)
        return {
            "emoji_messages_pct": _pct(sum(1 for s in samples if s.has_emoji), n),
            "question_messages_pct": _pct(sum(1 for s in samples if s.has_question), n),
            "ends_with_question_pct": _pct(sum(1 for s in samples if s.ends_with_question), n),
            "exclamation_messages_pct": _pct(sum(1 for s in samples if s.has_exclamation), n),
            "linebreak_messages_pct": _pct(sum(1 for s in samples if s.has_linebreak), n),
            "url_messages_pct": _pct(sum(1 for s in samples if s.has_url), n),
            "handle_messages_pct": _pct(sum(1 for s in samples if s.has_handle), n),
            "number_messages_pct": _pct(sum(1 for s in samples if s.has_number), n),
            "starts_with_greeting_pct": _pct(sum(1 for s in samples if s.starts_with_greeting), n),
            "contains_thanks_pct": _pct(sum(1 for s in samples if s.contains_thanks), n),
            "contains_cta_terms_pct": _pct(sum(1 for s in samples if s.contains_cta_terms), n),
        }

    def length_summary(values: list[int]) -> dict[str, int | float | None]:
        return {
            "min": min(values) if values else None,
            "p10": _quantile(values, 0.10),
            "median": int(statistics.median(values)) if values else None,
            "p90": _quantile(values, 0.90),
            "max": max(values) if values else None,
            "mean": (sum(values) / len(values)) if values else None,
        }

    return {
        "count": len(samples),
        "length": {"chars": length_summary(chars), "words": length_summary(words)},
        "rates": rates(),
        "top_emojis": [{"emoji": e, "count": c} for e, c in emoji_counter.most_common(20)],
    }


def _choose_language_policy(supported_from_inbox: Counter[VoiceLang], supported_from_owner: Counter[VoiceLang]) -> dict[str, Any]:
    supported = []
    for lang in ("English", "Spanish", "French", "Catalan"):
        if supported_from_inbox.get(lang, 0) > 0 or supported_from_owner.get(lang, 0) > 0:
            supported.append(lang)
    if not supported:
        supported = ["English", "Spanish", "French", "Catalan"]

    return {
        "mode": "mirror_user_input_language",
        "supported_languages": supported,
        "rules": [
            "Reply in the same language as the user's most recent message that contains enough text to classify.",
            "Do not translate the user's message unless they explicitly ask for a translation.",
            "Do not mix languages inside a single reply unless the user mixes languages first.",
            "If the user's message is too short to classify, reuse the last confidently detected language in the same thread.",
            "If there is still no signal, ask a 1-line clarification asking which language they prefer (keep it short).",
        ],
    }
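# For example, with the inbound window counts observed below ("English": 2828, "Spanish": 2176,
# "French": 191, "Catalan": 16) all four languages end up in supported_languages, in fixed order.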


def generate_voice_dna(
    *,
    export_input: Path,
    out_path: Path,
    owner_name: str | None,
    window_months: int,
    response_window_hours: float,
    scripted_min_count: int,
    as_of_utc: datetime,
) -> Path:
    export_root = _resolve_export_root(export_input)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    if not owner_name or not owner_name.strip():
        owner_name = _infer_owner_name(export_root)
    if not owner_name:
        raise RuntimeError("Could not infer owner name. Pass --owner-name.")

    as_of_utc = as_of_utc.astimezone(timezone.utc).replace(microsecond=0)
    start_utc = _subtract_months(as_of_utc, int(window_months))
    start_ts_ms = int(start_utc.timestamp() * 1000)
    end_ts_ms = int(as_of_utc.timestamp() * 1000)
    response_window_ms = int(float(response_window_hours) * 3600 * 1000)

    template_counts: Counter[str] = Counter()
    inbound_lang_counts_window: Counter[VoiceLang] = Counter()

    candidate_responses: list[OutgoingSample] = []
    scanned_conversations = 0
    scanned_message_files = 0

    inbox_root = export_root / "messages" / "inbox"
    for conv_dir in inbox_root.iterdir():
        if not conv_dir.is_dir():
            continue
        scanned_conversations += 1
        parts = sorted(conv_dir.glob("message*.json"), key=lambda p: p.name)
        if not parts:
            continue
        scanned_message_files += len(parts)
        events = _load_events(parts, owner_name=owner_name)
        if not events:
            continue

        last_inbound_ts: int | None = None
        for e in events:
            # Track inbound language (for mirroring policy) for the same recent window.
            if not e.is_owner and isinstance(e.text, str) and e.text.strip():
                if start_ts_ms <= e.ts_ms <= end_ts_ms:
                    inbound_lang_counts_window[_guess_lang(e.text)] += 1
                last_inbound_ts = e.ts_ms
                continue

            if not e.is_owner:
                last_inbound_ts = e.ts_ms
                continue

            if not isinstance(e.text, str) or not e.text.strip():
                continue

            canon = canonicalize_text(e.text)
            if canon:
                template_counts[canon] += 1

            # Style samples: only use manual "responses" in the recent window.
            if _is_system_new_follower_message(e.text):
                continue
            if not (start_ts_ms <= e.ts_ms <= end_ts_ms):
                continue
            if last_inbound_ts is None:
                continue
            if e.ts_ms < last_inbound_ts:
                continue
            if (e.ts_ms - last_inbound_ts) > response_window_ms:
                continue

            if not canon:
                continue
            candidate_responses.append(_sample_from_text(ts_ms=e.ts_ms, canon=canon, text=e.text))

    scripted_templates = {canon for canon, cnt in template_counts.items() if cnt >= int(scripted_min_count)}
    manual_responses = [s for s in candidate_responses if s.canon not in scripted_templates]

    owner_lang_counts = Counter([s.lang for s in manual_responses])
    policy = _choose_language_policy(inbound_lang_counts_window, owner_lang_counts)

    by_lang = _lang_bucket(manual_responses)
    per_lang_summary = {lang: _summarize_samples(by_lang[lang]) for lang in by_lang}

    voice_dna: dict[str, Any] = {
        "schema_version": "voice_dna/v1",
        "created_at_utc": _now_utc_iso(),
        "subject": {
            "account": "@socialmediatorr",
            "owner_name": owner_name,
            "scope": "Instagram DMs",
        },
        "source": {
            "type": "instagram_export",
            "window": {
                "months": int(window_months),
                "start_utc": start_utc.isoformat(),
                "end_utc": as_of_utc.isoformat(),
                "response_window_hours": float(response_window_hours),
            },
            "classification": {
                "manual_reply_definition": "owner text message within response_window_hours of the most recent inbound message, excluding repeated templates",
                "scripted_template_definition": f"owner canonicalized text sent >= {int(scripted_min_count)} times across full export",
                "system_messages_excluded": ["you messaged <user> because they followed your account"],
            },
            "scan": {
                "export_root_hint": export_root.name,
                "scanned_conversations": scanned_conversations,
                "scanned_message_files": scanned_message_files,
                "candidate_responses_in_window": len(candidate_responses),
                "manual_responses_in_window": len(manual_responses),
                "scripted_template_count": len(scripted_templates),
            },
        },
        "policies": {
            "language": policy,
        },
        "language_observed": {
            "inbound_last_window_counts": dict(inbound_lang_counts_window),
            "manual_reply_counts": dict(owner_lang_counts),
        },
        "style": {
            "manual_replies": {
                "overall": _summarize_samples(manual_responses),
                "by_language": per_lang_summary,
            }
        },
    }

    out_path.write_text(json.dumps(voice_dna, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    return out_path


def main(argv: list[str] | None = None) -> int:
    ap = argparse.ArgumentParser(
        description="Generate a VoiceDNA JSON file for recent manual DM replies (style fingerprint + language mirroring policy)."
    )
    ap.add_argument("--export-input", required=True, help="Instagram export root (contains messages/inbox)")
    ap.add_argument("--out", required=True, help="output path (JSON)")
    ap.add_argument("--owner-name", default=None, help='owner sender_name (e.g., "Sergio de Vocht")')
    ap.add_argument("--window-months", type=int, default=6, help="how many recent months to use")
    ap.add_argument(
        "--response-window-hours",
        type=float,
        default=72.0,
        help="max hours after inbound message for an outgoing message to count as a response",
    )
    ap.add_argument(
        "--scripted-min-count",
        type=int,
        default=50,
        help="owner canonicalized text sent >= this count is treated as scripted/templated",
    )
    ap.add_argument(
        "--as-of-utc",
        default=None,
        help="analysis end time (UTC ISO); default: now",
    )
    args = ap.parse_args(argv)

    as_of = _parse_iso_utc(args.as_of_utc) if args.as_of_utc else datetime.now(timezone.utc).replace(microsecond=0)

    try:
        out = generate_voice_dna(
            export_input=Path(args.export_input),
            out_path=Path(args.out),
            owner_name=(args.owner_name.strip() if args.owner_name else None),
            window_months=int(args.window_months),
            response_window_hours=float(args.response_window_hours),
            scripted_min_count=int(args.scripted_min_count),
            as_of_utc=as_of,
        )
        print(json.dumps({"ok": True, "out": str(out)}, ensure_ascii=False))
        return 0
    except FileNotFoundError as e:
        print(str(e))
        return 2
    except Exception as e:
        print(f"VoiceDNA generation failed: {e}")
        return 1


if __name__ == "__main__":
    raise SystemExit(main())

voice_dna/socialmediatorr.voice-dna.json (468 lines, new file)

@@ -0,0 +1,468 @@
{
  "schema_version": "voice_dna/v1",
  "created_at_utc": "2025-12-24T12:08:24+00:00",
  "subject": {
    "account": "@socialmediatorr",
    "owner_name": "Sergio de Vocht",
    "scope": "Instagram DMs"
  },
  "source": {
    "type": "instagram_export",
    "window": {
      "months": 6,
      "start_utc": "2025-06-24T12:08:20+00:00",
      "end_utc": "2025-12-24T12:08:20+00:00",
      "response_window_hours": 72.0
    },
    "classification": {
      "manual_reply_definition": "owner text message within response_window_hours of the most recent inbound message, excluding repeated templates",
      "scripted_template_definition": "owner canonicalized text sent >= 50 times across full export",
      "system_messages_excluded": ["you messaged <user> because they followed your account"]
    },
    "scan": {
      "export_root_hint": "socialmediatorr-ig-export-raw-20251224",
      "scanned_conversations": 10100,
      "scanned_message_files": 10061,
      "candidate_responses_in_window": 18934,
      "manual_responses_in_window": 825,
      "scripted_template_count": 24
    }
  },
  "policies": {
    "language": {
      "mode": "mirror_user_input_language",
      "supported_languages": ["English", "Spanish", "French", "Catalan"],
      "rules": [
        "Reply in the same language as the user's most recent message that contains enough text to classify.",
        "Do not translate the user's message unless they explicitly ask for a translation.",
        "Do not mix languages inside a single reply unless the user mixes languages first.",
        "If the user's message is too short to classify, reuse the last confidently detected language in the same thread.",
        "If there is still no signal, ask a 1-line clarification asking which language they prefer (keep it short)."
      ]
    }
  },
  "language_observed": {
    "inbound_last_window_counts": { "English": 2828, "Too short to tell": 5024, "Spanish": 2176, "French": 191, "Catalan": 16 },
    "manual_reply_counts": { "Spanish": 541, "Too short to tell": 255, "English": 17, "French": 1, "Catalan": 11 }
  },
  "style": {
    "manual_replies": {
      "overall": {
        "count": 825,
        "length": {
          "chars": { "min": 2, "p10": 13, "median": 54, "p90": 199, "max": 928, "mean": 87.47878787878788 },
          "words": { "min": 1, "p10": 2, "median": 11, "p90": 34, "max": 169, "mean": 15.436363636363636 }
        },
        "rates": {
          "emoji_messages_pct": 16.363636363636363,
          "question_messages_pct": 32.484848484848484,
          "ends_with_question_pct": 27.151515151515156,
          "exclamation_messages_pct": 18.303030303030305,
          "linebreak_messages_pct": 9.212121212121211,
          "url_messages_pct": 4.121212121212121,
          "handle_messages_pct": 0.0,
          "number_messages_pct": 9.454545454545455,
          "starts_with_greeting_pct": 4.96969696969697,
          "contains_thanks_pct": 4.484848484848484,
          "contains_cta_terms_pct": 4.848484848484849
        },
        "top_emojis": [
          { "emoji": "🙌", "count": 41 },
          { "emoji": "🫶", "count": 23 },
          { "emoji": "👋", "count": 19 },
          { "emoji": "🎁", "count": 17 },
          { "emoji": "💪", "count": 14 },
          { "emoji": "👇", "count": 12 },
          { "emoji": "😜", "count": 7 },
          { "emoji": "😉", "count": 4 },
          { "emoji": "🤣", "count": 2 },
          { "emoji": "🙏", "count": 2 },
          { "emoji": "⬆", "count": 2 },
          { "emoji": "👀", "count": 2 },
          { "emoji": "¦", "count": 1 },
          { "emoji": "😅", "count": 1 },
          { "emoji": "🤝", "count": 1 },
          { "emoji": "☺", "count": 1 },
          { "emoji": "🫡", "count": 1 },
          { "emoji": "😁", "count": 1 },
          { "emoji": "👌", "count": 1 },
          { "emoji": "🚀", "count": 1 }
        ]
      },
      "by_language": {
        "Spanish": {
          "count": 541,
          "length": {
            "chars": { "min": 5, "p10": 35, "median": 99, "p90": 211, "max": 928, "mean": 116.06469500924214 },
            "words": { "min": 1, "p10": 6, "median": 18, "p90": 38, "max": 169, "mean": 20.478743068391868 }
          },
          "rates": {
            "emoji_messages_pct": 20.70240295748614,
            "question_messages_pct": 39.55637707948244,
            "ends_with_question_pct": 32.34750462107209,
            "exclamation_messages_pct": 16.266173752310536,
            "linebreak_messages_pct": 13.67837338262477,
            "url_messages_pct": 2.2181146025878005,
            "handle_messages_pct": 0.0,
            "number_messages_pct": 9.242144177449168,
            "starts_with_greeting_pct": 7.578558225508318,
            "contains_thanks_pct": 6.839186691312385,
            "contains_cta_terms_pct": 7.024029574861368
          },
          "top_emojis": [
            { "emoji": "🙌", "count": 24 },
            { "emoji": "🫶", "count": 23 },
            { "emoji": "👋", "count": 19 },
            { "emoji": "🎁", "count": 17 },
            { "emoji": "👇", "count": 12 },
            { "emoji": "💪", "count": 10 },
            { "emoji": "😜", "count": 7 },
            { "emoji": "😉", "count": 4 },
            { "emoji": "🤣", "count": 2 },
            { "emoji": "🙏", "count": 2 },
            { "emoji": "⬆", "count": 2 },
            { "emoji": "👀", "count": 2 },
            { "emoji": "😅", "count": 1 },
            { "emoji": "🤝", "count": 1 },
            { "emoji": "☺", "count": 1 },
            { "emoji": "😁", "count": 1 },
            { "emoji": "👌", "count": 1 },
            { "emoji": "🚀", "count": 1 },
            { "emoji": "⏱", "count": 1 },
            { "emoji": "🫂", "count": 1 }
          ]
        },
        "Too short to tell": {
          "count": 255,
          "length": {
            "chars": { "min": 2, "p10": 10, "median": 30, "p90": 53, "max": 151, "mean": 32.01176470588236 },
            "words": { "min": 1, "p10": 1, "median": 4, "p90": 11, "max": 29, "mean": 5.749019607843137 }
          },
          "rates": {
            "emoji_messages_pct": 7.8431372549019605,
            "question_messages_pct": 19.607843137254903,
            "ends_with_question_pct": 17.647058823529413,
            "exclamation_messages_pct": 23.52941176470588,
            "linebreak_messages_pct": 0.39215686274509803,
            "url_messages_pct": 4.705882352941177,
            "handle_messages_pct": 0.0,
            "number_messages_pct": 7.0588235294117645,
            "starts_with_greeting_pct": 0.0,
            "contains_thanks_pct": 0.0,
            "contains_cta_terms_pct": 0.0
          },
          "top_emojis": [
            { "emoji": "🙌", "count": 15 },
            { "emoji": "💪", "count": 4 },
            { "emoji": "🫡", "count": 1 }
          ]
        },
        "English": {
          "count": 17,
          "length": {
            "chars": { "min": 4, "p10": 11, "median": 23, "p90": 38, "max": 40, "mean": 23.176470588235293 },
            "words": { "min": 2, "p10": 3, "median": 4, "p90": 6, "max": 8, "mean": 4.588235294117647 }
          },
          "rates": {
            "emoji_messages_pct": 11.76470588235294,
            "question_messages_pct": 23.52941176470588,
            "ends_with_question_pct": 23.52941176470588,
            "exclamation_messages_pct": 17.647058823529413,
            "linebreak_messages_pct": 0.0,
            "url_messages_pct": 0.0,
            "handle_messages_pct": 0.0,
            "number_messages_pct": 0.0,
            "starts_with_greeting_pct": 0.0,
            "contains_thanks_pct": 0.0,
            "contains_cta_terms_pct": 11.76470588235294
          },
          "top_emojis": [
            { "emoji": "🙌", "count": 2 }
          ]
        },
        "French": {
          "count": 1,
          "length": {
            "chars": { "min": 29, "p10": 29, "median": 29, "p90": 29, "max": 29, "mean": 29.0 },
            "words": { "min": 5, "p10": 5, "median": 5, "p90": 5, "max": 5, "mean": 5.0 }
          },
          "rates": {
            "emoji_messages_pct": 100.0,
            "question_messages_pct": 0.0,
            "ends_with_question_pct": 0.0,
            "exclamation_messages_pct": 0.0,
            "linebreak_messages_pct": 0.0,
            "url_messages_pct": 0.0,
            "handle_messages_pct": 0.0,
            "number_messages_pct": 0.0,
            "starts_with_greeting_pct": 0.0,
            "contains_thanks_pct": 0.0,
            "contains_cta_terms_pct": 0.0
          },
          "top_emojis": [
            { "emoji": "¦", "count": 1 }
          ]
        },
        "Catalan": {
          "count": 11,
          "length": {
            "chars": { "min": 26, "p10": 48, "median": 52, "p90": 99, "max": 205, "mean": 72.0909090909091 },
            "words": { "min": 5, "p10": 5, "median": 8, "p90": 10, "max": 34, "mean": 9.727272727272727 }
          },
          "rates": {
            "emoji_messages_pct": 0.0,
            "question_messages_pct": 0.0,
            "ends_with_question_pct": 0.0,
            "exclamation_messages_pct": 0.0,
            "linebreak_messages_pct": 9.090909090909092,
            "url_messages_pct": 90.9090909090909,
            "handle_messages_pct": 0.0,
            "number_messages_pct": 90.9090909090909,
            "starts_with_greeting_pct": 0.0,
            "contains_thanks_pct": 0.0,
            "contains_cta_terms_pct": 0.0
          },
          "top_emojis": []
        }
      }
    }
  }
}
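The stored counts are internally consistent; a quick sanity check (a sketch only, run from the repository root):

```python
import json
from pathlib import Path

profile = json.loads(Path("voice_dna/socialmediatorr.voice-dna.json").read_text(encoding="utf-8"))

per_lang = profile["style"]["manual_replies"]["by_language"]
overall = profile["style"]["manual_replies"]["overall"]

# Per-language manual reply counts (541 + 255 + 17 + 1 + 11) add up to the overall 825,
# which also matches scan.manual_responses_in_window.
assert sum(bucket["count"] for bucket in per_lang.values()) == overall["count"] == 825
assert profile["source"]["scan"]["manual_responses_in_window"] == overall["count"]
```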