Sanitize DM reports and add Mermaid tooling

This commit is contained in:
danny 2025-12-24 13:13:48 +00:00
parent a6222083e6
commit a140b3787a
10 changed files with 516 additions and 427 deletions

5
.gitignore vendored
View file

@ -7,4 +7,7 @@ venv/
.vscode/ .vscode/
/dist/ /dist/
/build/ /build/
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*

View file

@ -103,6 +103,12 @@ This produces the “Sergio persona” artifacts needed for the DM agent:
Outputs are written with mode `600` and may contain sensitive DM content. Keep them out of git. Outputs are written with mode `600` and may contain sensitive DM content. Keep them out of git.
This repo includes **sanitized** example reports (no verbatim client DMs) under:
- `reports/socialmediatorr/`
Raw analysis artifacts (e.g., training pairs, rescued threads, template caches) should remain in a private working directory such as `/root/tmp/` and should not be committed.
### Analyze a raw Instagram export folder (recommended) ### Analyze a raw Instagram export folder (recommended)
Optional: index first (lets you filter recency without scanning every thread): Optional: index first (lets you filter recency without scanning every thread):

174
package-lock.json generated Normal file
View file

@ -0,0 +1,174 @@
{
"name": "emo-social-insta-dm-agent-tools",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "emo-social-insta-dm-agent-tools",
"devDependencies": {
"@mermaid-js/parser": "^0.6.3"
}
},
"node_modules/@chevrotain/cst-dts-gen": {
"version": "11.0.3",
"resolved": "https://registry.npmjs.org/@chevrotain/cst-dts-gen/-/cst-dts-gen-11.0.3.tgz",
"integrity": "sha512-BvIKpRLeS/8UbfxXxgC33xOumsacaeCKAjAeLyOn7Pcp95HiRbrpl14S+9vaZLolnbssPIUuiUd8IvgkRyt6NQ==",
"dev": true,
"license": "Apache-2.0",
"dependencies": {
"@chevrotain/gast": "11.0.3",
"@chevrotain/types": "11.0.3",
"lodash-es": "4.17.21"
}
},
"node_modules/@chevrotain/gast": {
"version": "11.0.3",
"resolved": "https://registry.npmjs.org/@chevrotain/gast/-/gast-11.0.3.tgz",
"integrity": "sha512-+qNfcoNk70PyS/uxmj3li5NiECO+2YKZZQMbmjTqRI3Qchu8Hig/Q9vgkHpI3alNjr7M+a2St5pw5w5F6NL5/Q==",
"dev": true,
"license": "Apache-2.0",
"dependencies": {
"@chevrotain/types": "11.0.3",
"lodash-es": "4.17.21"
}
},
"node_modules/@chevrotain/regexp-to-ast": {
"version": "11.0.3",
"resolved": "https://registry.npmjs.org/@chevrotain/regexp-to-ast/-/regexp-to-ast-11.0.3.tgz",
"integrity": "sha512-1fMHaBZxLFvWI067AVbGJav1eRY7N8DDvYCTwGBiE/ytKBgP8azTdgyrKyWZ9Mfh09eHWb5PgTSO8wi7U824RA==",
"dev": true,
"license": "Apache-2.0"
},
"node_modules/@chevrotain/types": {
"version": "11.0.3",
"resolved": "https://registry.npmjs.org/@chevrotain/types/-/types-11.0.3.tgz",
"integrity": "sha512-gsiM3G8b58kZC2HaWR50gu6Y1440cHiJ+i3JUvcp/35JchYejb2+5MVeJK0iKThYpAa/P2PYFV4hoi44HD+aHQ==",
"dev": true,
"license": "Apache-2.0"
},
"node_modules/@chevrotain/utils": {
"version": "11.0.3",
"resolved": "https://registry.npmjs.org/@chevrotain/utils/-/utils-11.0.3.tgz",
"integrity": "sha512-YslZMgtJUyuMbZ+aKvfF3x1f5liK4mWNxghFRv7jqRR9C3R3fAOGTTKvxXDa2Y1s9zSbcpuO0cAxDYsc9SrXoQ==",
"dev": true,
"license": "Apache-2.0"
},
"node_modules/@mermaid-js/parser": {
"version": "0.6.3",
"resolved": "https://registry.npmjs.org/@mermaid-js/parser/-/parser-0.6.3.tgz",
"integrity": "sha512-lnjOhe7zyHjc+If7yT4zoedx2vo4sHaTmtkl1+or8BRTnCtDmcTpAjpzDSfCZrshM5bCoz0GyidzadJAH1xobA==",
"dev": true,
"license": "MIT",
"dependencies": {
"langium": "3.3.1"
}
},
"node_modules/chevrotain": {
"version": "11.0.3",
"resolved": "https://registry.npmjs.org/chevrotain/-/chevrotain-11.0.3.tgz",
"integrity": "sha512-ci2iJH6LeIkvP9eJW6gpueU8cnZhv85ELY8w8WiFtNjMHA5ad6pQLaJo9mEly/9qUyCpvqX8/POVUTf18/HFdw==",
"dev": true,
"license": "Apache-2.0",
"dependencies": {
"@chevrotain/cst-dts-gen": "11.0.3",
"@chevrotain/gast": "11.0.3",
"@chevrotain/regexp-to-ast": "11.0.3",
"@chevrotain/types": "11.0.3",
"@chevrotain/utils": "11.0.3",
"lodash-es": "4.17.21"
}
},
"node_modules/chevrotain-allstar": {
"version": "0.3.1",
"resolved": "https://registry.npmjs.org/chevrotain-allstar/-/chevrotain-allstar-0.3.1.tgz",
"integrity": "sha512-b7g+y9A0v4mxCW1qUhf3BSVPg+/NvGErk/dOkrDaHA0nQIQGAtrOjlX//9OQtRlSCy+x9rfB5N8yC71lH1nvMw==",
"dev": true,
"license": "MIT",
"dependencies": {
"lodash-es": "^4.17.21"
},
"peerDependencies": {
"chevrotain": "^11.0.0"
}
},
"node_modules/langium": {
"version": "3.3.1",
"resolved": "https://registry.npmjs.org/langium/-/langium-3.3.1.tgz",
"integrity": "sha512-QJv/h939gDpvT+9SiLVlY7tZC3xB2qK57v0J04Sh9wpMb6MP1q8gB21L3WIo8T5P1MSMg3Ep14L7KkDCFG3y4w==",
"dev": true,
"license": "MIT",
"dependencies": {
"chevrotain": "~11.0.3",
"chevrotain-allstar": "~0.3.0",
"vscode-languageserver": "~9.0.1",
"vscode-languageserver-textdocument": "~1.0.11",
"vscode-uri": "~3.0.8"
},
"engines": {
"node": ">=16.0.0"
}
},
"node_modules/lodash-es": {
"version": "4.17.21",
"resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.17.21.tgz",
"integrity": "sha512-mKnC+QJ9pWVzv+C4/U3rRsHapFfHvQFoFB92e52xeyGMcX6/OlIl78je1u8vePzYZSkkogMPJ2yjxxsb89cxyw==",
"dev": true,
"license": "MIT"
},
"node_modules/vscode-jsonrpc": {
"version": "8.2.0",
"resolved": "https://registry.npmjs.org/vscode-jsonrpc/-/vscode-jsonrpc-8.2.0.tgz",
"integrity": "sha512-C+r0eKJUIfiDIfwJhria30+TYWPtuHJXHtI7J0YlOmKAo7ogxP20T0zxB7HZQIFhIyvoBPwWskjxrvAtfjyZfA==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/vscode-languageserver": {
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/vscode-languageserver/-/vscode-languageserver-9.0.1.tgz",
"integrity": "sha512-woByF3PDpkHFUreUa7Hos7+pUWdeWMXRd26+ZX2A8cFx6v/JPTtd4/uN0/jB6XQHYaOlHbio03NTHCqrgG5n7g==",
"dev": true,
"license": "MIT",
"dependencies": {
"vscode-languageserver-protocol": "3.17.5"
},
"bin": {
"installServerIntoExtension": "bin/installServerIntoExtension"
}
},
"node_modules/vscode-languageserver-protocol": {
"version": "3.17.5",
"resolved": "https://registry.npmjs.org/vscode-languageserver-protocol/-/vscode-languageserver-protocol-3.17.5.tgz",
"integrity": "sha512-mb1bvRJN8SVznADSGWM9u/b07H7Ecg0I3OgXDuLdn307rl/J3A9YD6/eYOssqhecL27hK1IPZAsaqh00i/Jljg==",
"dev": true,
"license": "MIT",
"dependencies": {
"vscode-jsonrpc": "8.2.0",
"vscode-languageserver-types": "3.17.5"
}
},
"node_modules/vscode-languageserver-textdocument": {
"version": "1.0.12",
"resolved": "https://registry.npmjs.org/vscode-languageserver-textdocument/-/vscode-languageserver-textdocument-1.0.12.tgz",
"integrity": "sha512-cxWNPesCnQCcMPeenjKKsOCKQZ/L6Tv19DTRIGuLWe32lyzWhihGVJ/rcckZXJxfdKCFvRLS3fpBIsV/ZGX4zA==",
"dev": true,
"license": "MIT"
},
"node_modules/vscode-languageserver-types": {
"version": "3.17.5",
"resolved": "https://registry.npmjs.org/vscode-languageserver-types/-/vscode-languageserver-types-3.17.5.tgz",
"integrity": "sha512-Ld1VelNuX9pdF39h2Hgaeb5hEZM2Z3jUrrMgWQAu82jMtZp7p3vJT3BzToKtZI7NgQssZje5o0zryOrhQvzQAg==",
"dev": true,
"license": "MIT"
},
"node_modules/vscode-uri": {
"version": "3.0.8",
"resolved": "https://registry.npmjs.org/vscode-uri/-/vscode-uri-3.0.8.tgz",
"integrity": "sha512-AyFQ0EVmsOZOlAnxoFOGOq1SQDWAB7C6aqMGS23svWAllfOaxbuFvcT8D1i8z3Gyn8fraVeZNNmN6e9bxxXkKw==",
"dev": true,
"license": "MIT"
}
}
}

11
package.json Normal file
View file

@ -0,0 +1,11 @@
{
"name": "emo-social-insta-dm-agent-tools",
"private": true,
"type": "module",
"devDependencies": {
"@mermaid-js/parser": "^0.6.3"
},
"scripts": {
"verify:mermaid": "node tools/verify_mermaid.mjs"
}
}

View file

@ -1,163 +1,42 @@
# Socialmediatorr Instagram DM History — Human Readable Report (English) # Instagram DM History — Short Report (English)
- Generated: `2025-12-24T02:28:34+00:00` - Generated: `2025-12-24T02:28:34+00:00`
- Owner name used: `Sergio de Vocht` - Inbox: `@socialmediatorr`
## 1) What This Dataset Represents ## What This Is
This is an all-time audit of Instagram DM conversations for `@socialmediatorr`, focused on extracting repeatable sales + support behavior so an AI agent can reply in Sergio's style. This is a short, plain-English summary of the DM history scan.
The analysis treats the account as a hybrid system: frequent repeated templates (likely automation/scripts) plus lower-frequency custom replies (human Sergio). It avoids quoting private messages and it avoids storing personal identities.
## 2) High-Level Metrics (All-Time) ## Key Numbers
- Conversations analyzed: **10,061** - Conversations analyzed: **10,061**
- Bot-only conversations: **1,883** (18.7%) - Conversations that stayed template-only: **1,883** (18.7%)
- Human-intervened conversations: **8,153** (81.0%) - Conversations that included custom replies: **8,153** (81.0%)
- Conversion (intent signals): **1,923** (19.1%) - Buying/booking signals (weak): **1,923** (19.1%)
- Conversion (confirmed signals): **55** (0.5%) - Buying/booking signals (strong): **55** (0.5%)
Notes on conversion: this uses heuristics (keywords + payment/link mentions). It is directionally useful for ranking scripts, but it is not a ground-truth revenue ledger. Buying/booking signals are detected from text patterns (they are not a payment ledger).
## 3) Sergio Persona (From Manual/Hybrid Replies) ## What You Need to Know
- Typical reply length: median **60.0** chars (p90 **67.0**) The fastest improvements come from standardizing answers to repeated questions and sending them in the right time blocks.
- Questions: **2.4%** | Exclamations: **1.7%** | Emoji: **0.0%** For the full deep report (CET timing, day-of-week patterns, Top 20 questions, and concrete actions), read:
- Language guess (manual replies): en=8043, es=423, unknown=224
Practical implication for an agent: short, direct replies; minimal punctuation; bilingual capability; low/no emoji usage. - `reports/socialmediatorr/dm_history_report_en_detailed.md`
## 4) Bot vs Human Segmentation (What It Means) ## Useful Inventory (Safe Counts Only)
- **[BOT]** = outgoing message template repeated frequently (>= configured threshold). - Total outgoing templates detected: **8,550**
- **[MANUAL]** = outgoing message that is rare/unique (<= configured threshold). - High-frequency repeat templates: **24**
- **[HYBRID]** = messages that look like a bot template but with manual edits (prefix match/similarity). - “Rescue” events detected: **7**
- Training pairs (user → reply) available: **524**
This separation is the foundation for: (1) extracting safe reusable scripts, and (2) extracting human-only replies as training data for a RAG or fine-tune. ## What You Do Not Need to Know
## 5) Top Detected Script Templates (Canonicalized) Do not store or copy these into an automation system unless you have a clear operational reason:
- Names, handles, phone numbers, emails.
- BOT #1: sent **2495**×`crees que es necesario hoy en día llevar a cabo este desarrollo colectivo`
- BOT #2: sent **2483**×`perfecto aquí no hablamos de “mejorar solo a uno mismo” sino de algo mucho más profundo estar bien con los demás para poder estar bien contigo mism`
- BOT #3: sent **2483**×`te lo dejo por aquí dame un minuto`
- BOT #4: sent **2483**×`me gustaría saber tu opinión`
- BOT #5: sent **1878**×`me alegro de que quieras seguir aprendiendo 💪 te dejo por aquí el ebook “<NUM> conceptos de desarrollo personal que te están quitando paz`
- BOT #6: sent **1878**×`no es para que lo leas en modo teoría sino para que puedas detectar ideas que llevas tiempo aplicando y que sin darte cuenta están influyendo en tus relacione`
- BOT #7: sent **706**×`gracias por ese feedback ð`
- BOT #8: sent **706**×`como agradecimiento por seguirme quiero regalarte un video exclusivo que te ayude a empezar este cambio dime ¿dónde sientes que hay más conflicto últimament`
- BOT #9: sent **680**×`you sent a private reply to a comment on your instagram post`
- BOT #10: sent **469**×`por cierto`
## 6) Human Reply Library (Rare/Manual Examples, Canonicalized)
- MANUAL-ish #1: seen **10**×`qué bonito leer eso a veces entender las palabras abre puertas nuevas ¿sientes que en tu entorno hay algo que te gustaría armonizar más`
- MANUAL-ish #2: seen **7**×`buenas aquí sergio ð gracias por responder he preparado un video con muchísimo valor para ayudarte a acabar con esos desbordes y vivir en paz solo tienes que c`
- MANUAL-ish #3: seen **5**×`hola buenas como estas ð espero que estés bien me gustaría saber que es lo q te ha echo estar aquí y querer saber más sobre nuestras formación`
- MANUAL-ish #4: seen **5**×`y si pudieras resolver esto cómo crees que cambiaría tu forma de relacionarte o sentirte`
- MANUAL-ish #5: seen **5**×`para conocerte un poquito más que te gustaría conseguir con emosocial cual es tu mayor desafío actualmente dentro de tus relaciones`
- MANUAL-ish #6: seen **4**×`okey te entiendo perfectamente ð según lo que me comentas creo que esta lista de videos de youtube te va a venir genial para empezar a entender las bases del c`
- MANUAL-ish #7: seen **4**×`buenas aquí sergio ð gracias por responder he preparado un video con muchísimo valor para ayudarte a acabar con esos desbordes y vivir en paz solo tienes que c`
- MANUAL-ish #8: seen **3**×`hola buenas como estas espero que bien cuéntame que te parece el contenido que estamos ofreciendo por whatsapp te leoð ð`
## 7) Bot Template Performance (Reply/Conversion Heuristics)
These come from `bot_performance_audit.csv` and are computed per canonical bot template.
### Most-used bot templates (by volume)
- sent=2495 reply_rate=0.376 intent_rate=0.0766 confirmed_rate=0.012 — `crees que es necesario hoy en día llevar a cabo este desarrollo colectivo`
- sent=2483 reply_rate=0.0334 intent_rate=0.0769 confirmed_rate=0.0121 — `perfecto aquí no hablamos de “mejorar solo a uno mismo” sino de algo mucho más profundo estar bien con los demás para poder estar bien co`
- sent=2483 reply_rate=0.1188 intent_rate=0.0769 confirmed_rate=0.0121 — `te lo dejo por aquí dame un minuto`
- sent=2483 reply_rate=0.0028 intent_rate=0.0769 confirmed_rate=0.0121 — `me gustaría saber tu opinión`
- sent=1878 reply_rate=0.0 intent_rate=0.0 confirmed_rate=0.0005 — `me alegro de que quieras seguir aprendiendo 💪 te dejo por aquí el ebook “<NUM> conceptos de desarrollo personal que te están quitando pa`
- sent=1878 reply_rate=0.1768 intent_rate=0.0 confirmed_rate=0.0005 — `no es para que lo leas en modo teoría sino para que puedas detectar ideas que llevas tiempo aplicando y que sin darte cuenta están influye`
- sent=706 reply_rate=0.0042 intent_rate=0.1048 confirmed_rate=0.017 — `gracias por ese feedback ð`
- sent=706 reply_rate=0.8187 intent_rate=0.1048 confirmed_rate=0.017 — `como agradecimiento por seguirme quiero regalarte un video exclusivo que te ayude a empezar este cambio dime ¿dónde sientes que hay más c`
### Best reply-rate bot templates
- reply_rate=0.8187 sent=706 — `como agradecimiento por seguirme quiero regalarte un video exclusivo que te ayude a empezar este cambio dime ¿dónde sientes que hay más c`
- reply_rate=0.7143 sent=98 — `pudiste entrar correctamente`
- reply_rate=0.7022 sent=178 — `por favor toca una de las siguientes opciones ð`
- reply_rate=0.4701 sent=134 — `pudiste verlo`
- reply_rate=0.4602 sent=176 — `que te pareció ese diccionario hay alguna palabra que sueles utilizar y no te habías dado cuenta`
- reply_rate=0.376 sent=2495 — `crees que es necesario hoy en día llevar a cabo este desarrollo colectivo`
- reply_rate=0.3458 sent=240 — `gracias por tu sinceridad ð`
- reply_rate=0.3291 sent=158 — `te dejo este video donde explico por qué las relaciones de pareja entran en conflicto aunque haya amor`
### Worst reply-rate bot templates
- reply_rate=0.0 sent=1878 — `me alegro de que quieras seguir aprendiendo 💪 te dejo por aquí el ebook “<NUM> conceptos de desarrollo personal que te están quitando pa`
- reply_rate=0.0 sent=337 — `enhorabuena por querer dar ese cambio estás a un paso de transformar tu relación en solo <NUM> días te invito a un taller exclusivo donde`
- reply_rate=0.0 sent=158 — `gracias por compartirlo â ï`
- reply_rate=0.0 sent=131 — `entiendo perfectamente ð`
- reply_rate=0.0 sent=54 — `this account can t receive your message because they don t allow new message requests from everyone`
- reply_rate=0.0028 sent=2483 — `me gustaría saber tu opinión`
- reply_rate=0.0042 sent=706 — `gracias por ese feedback ð`
- reply_rate=0.0334 sent=2483 — `perfecto aquí no hablamos de “mejorar solo a uno mismo” sino de algo mucho más profundo estar bien con los demás para poder estar bien co`
## 8) Objections → Best Sergio Replies (Playbook)
### price
- (1) Ey Alex que tal
- (1) Qué bonito leer eso. A veces entender las palabras abre puertas nuevas. ¿Sientes que en tu entorno hay algo que te gustaría armonizar más?
- (1) Y que es lo que te impide dar ese cambio? Te veo con mucha seguridad
### time
- (1) Brutal esto que dices
- (1) No es una herida ELA! Apego que no te dieron tus padres es solo una parte del espectro, necesitamos validación del mundo y de forma constante, no es una herida del pasado es algo que falta darnos en el presente.
- (1) Vaya, suena bastante frustrante el hecho de querer "bajar esa guardia", y sentir que cuando lo haces, todo cambia
### trust
- (2) Hola Dani, gracias por el mensaje bonito, de verdad. Me alegra mucho saber que el contenido te está ayudando a mirar las cosas desde otro punto de vista
- (2) Qué bonito leer eso. A veces entender las palabras abre puertas nuevas. ¿Sientes que en tu entorno hay algo que te gustaría armonizar más?
- (2) En la plataforma no sale por ningún lugar, y normalmente siempre llegan 2 emails, 1 de confirmación de pago y otro de bienvenida
## 9) Rescue / Save Logic (Human Intervention After Silence/Negativity)
- Rescue events detected (heuristic): **7**
A “rescue” is when a manual/hybrid owner message follows either (a) a user negative signal, or (b) >24h silence after a bot message, and the thread later shows a confirmed conversion signal.
## 10) Product / Offer Evolution (Eras)
This is inferred from mentions of pricing/currency + offer terms (e.g., call/audit/coaching) and summarized quarterly.
Recent quarters (top extracted offer signals):
- stripe(1)
- book(1912); ebook(1912); call(8); calendly(7); coaching(2); stripe(2); pdf(2); paypal(1)
## 11) Charts
- Bot fatigue (weekly reply rate to the dominant bot script): `bot_fatigue_chart.png`
![](bot_fatigue_chart.png)
- Editorial timeline (top bot scripts vs conversions): `editorial_timeline.png`
![](editorial_timeline.png)
## 12) What To Build From This (Agent Requirements)
### Core behavior
- Start with top bot templates for predictable openers and FAQ-style flows.
- Switch to Sergio-style manual patterns on objections, negotiation, or when conversation stalls.
- Use a rescue cadence (time-based triggers) after silence.
### Data products to drive the agent
- Training pairs (manual-only, converted threads): `/root/tmp/socialmediatorr-agent-analysis-alltime-20251224T024000Z/training_pairs.jsonl` (rows: ~524)
- Objection handlers: `/root/tmp/socialmediatorr-agent-analysis-alltime-20251224T024000Z/objection_handlers.json`
- Rescue playbook: `/root/tmp/socialmediatorr-agent-analysis-alltime-20251224T024000Z/rescue_playbook.json`
- Script templates + editorial drift: `/root/tmp/socialmediatorr-agent-analysis-alltime-20251224T024000Z/top_outgoing_templates.json`
### Safety boundaries (recommended)
- Never request or store passwords/2FA codes.
- Avoid medical/legal/financial advice; redirect to a call or a human.
- If user asks to move off-platform, follow Sergio's historical policy and business rules.
## 13) What We Do NOT Need To Know (Ignore / Do Not Store)
- Exact client identities (names, handles, phone numbers, emails) unless required for operational routing.
- Media attachments (photos/videos/audio) for persona cloning; they add storage cost and privacy risk.
- Full verbatim message dumps for every thread; for RAG you only need high-quality pairs and playbook snippets.
- Individual one-off edge cases that never repeat (unless they represent a safety boundary).
- Internal Meta export folder structure details beyond `messages/inbox/**/message*.json`.
## 14) Caveats / Gaps
- The export does not reliably label ManyChat vs Human; bot/human is inferred by repetition and similarity.
- Conversion is heuristic; integrate Stripe/Calendly/CRM events if you want ground-truth attribution.
- Language detection is heuristic; improve it if you need precise bilingual routing.

View file

@ -1,25 +1,22 @@
# Socialmediatorr Instagram DM History : Plain-English Deep Report # Instagram DM History — Plain-English Deep Report
## DM History Deep Report ## What This Is
**Subject:** Instagram direct messages for `@socialmediatorr` **Inbox:** `@socialmediatorr`
**Version:** v1.0 (STYLE BIBLE EN 3.0GM)
**Date:** 2025-12-24 **Date:** 2025-12-24
**Status:** REVIEW REQUIRED **Time zone used:** CET
**Citation:** `if://report/socialmediatorr/instagram/dm-history/`
**Author:** Danny Stocker | InfraFabric Research
### How This Report Was Made ### How This Report Was Made
> This is an automated count of patterns. It is not a therapy note and it is not a sales ledger. > This is a count of patterns. It is not a therapy note and it is not a sales ledger.
This document was generated by reading an Instagram data export and counting repeat patterns over time. It avoids quoting private client messages and it avoids storing personal identities. This document was generated by reading an Instagram data export and counting repeat patterns over time. It avoids quoting private client messages and it avoids storing personal identities.
--- ---
**Context:** This inbox contains a high-volume message-and-reply system over 429 days. **Context:** This inbox contains message history over 429 days.
> Your messaging system is working as a volume engine. The weak point is consistency at the moments where people ask to buy or book. > The system works at scale. The weak point is the “next step” moments: when people ask what to do, what it costs, or where to get it.
The purpose of this report is practical: define what to keep, what to remove, and what to automate safely—without damaging trust. The purpose of this report is practical: define what to keep, what to remove, and what to automate safely—without damaging trust.
@ -35,7 +32,7 @@ Across the observed window, you sent a very large number of messages and you rec
| Total messages | 54,069 | Instagram export | | Total messages | 54,069 | Instagram export |
| Messages you sent | 43,607 | Instagram export | | Messages you sent | 43,607 | Instagram export |
| Messages people sent you | 10,462 | Instagram export | | Messages people sent you | 10,462 | Instagram export |
| Messages that look like a question or a request | 2,713 | Instagram export | | Messages that look like a question or a request | 2,715 | Instagram export |
| System messages about new followers (auto text in the inbox) | 8,081 | Instagram export | | System messages about new followers (auto text in the inbox) | 8,081 | Instagram export |
### What You Need to Know (In Plain English) ### What You Need to Know (In Plain English)
@ -67,7 +64,7 @@ To avoid guesswork, we start with 3-month blocks (a simple way to smooth noise),
| 2025 Jan-Mar | 21 | 0 | 0 | | 2025 Jan-Mar | 21 | 0 | 0 |
| 2025 Apr-Jun | 92 | 97 | 15 | | 2025 Apr-Jun | 92 | 97 | 15 |
| 2025 Jul-Sep | 623 | 882 | 89 | | 2025 Jul-Sep | 623 | 882 | 89 |
| 2025 Oct-Dec | 9,712 | 42,628 | 2,609 | | 2025 Oct-Dec | 9,712 | 42,628 | 2,611 |
Same data as charts: Same data as charts:
@ -112,8 +109,8 @@ This month-by-month table is the clearest view of how the inbox changed over tim
| 2025-08 | 193 | 230 | 28 | 50.0% | | 2025-08 | 193 | 230 | 28 | 50.0% |
| 2025-09 | 284 | 330 | 24 | 20.8% | | 2025-09 | 284 | 330 | 24 | 20.8% |
| 2025-10 | 787 | 1,190 | 64 | 17.2% | | 2025-10 | 787 | 1,190 | 64 | 17.2% |
| 2025-11 | 854 | 2,194 | 149 | 46.3% | | 2025-11 | 854 | 2,194 | 150 | 46.7% |
| 2025-12 | 8,071 | 39,244 | 2,396 | 89.6% | | 2025-12 | 8,071 | 39,244 | 2,397 | 89.7% |
The busiest month was **2025-12** with **47,315** messages total (87.5% of everything in this export). That single month dominates the shape of the data. The busiest month was **2025-12** with **47,315** messages total (87.5% of everything in this export). That single month dominates the shape of the data.
@ -126,7 +123,7 @@ Use this to time follow-ups and first messages. Do not spread effort evenly acro
| Day of week | Messages from people | Messages you sent | Questions/requests | | Day of week | Messages from people | Messages you sent | Questions/requests |
|---|---:|---:|---:| |---|---:|---:|---:|
| Monday | 1,600 | 8,359 | 131 | | Monday | 1,600 | 8,359 | 131 |
| Tuesday | 1,939 | 9,654 | 192 | | Tuesday | 1,939 | 9,654 | 194 |
| Wednesday | 1,282 | 5,554 | 159 | | Wednesday | 1,282 | 5,554 | 159 |
| Thursday | 2,261 | 6,908 | 1,268 | | Thursday | 2,261 | 6,908 | 1,268 |
| Friday | 1,705 | 5,733 | 803 | | Friday | 1,705 | 5,733 | 803 |
@ -184,7 +181,7 @@ One caution: “fast replies” are often repeat messages. This section shows ov
| Typical time to reply to questions/requests | 2 seconds | Instagram export | | Typical time to reply to questions/requests | 2 seconds | Instagram export |
| Slow end for questions/requests (90% are faster) | 4 seconds | Instagram export | | Slow end for questions/requests (90% are faster) | 4 seconds | Instagram export |
| Messages from people answered within 48 hours | 7,467 (71.4%) | Instagram export | | Messages from people answered within 48 hours | 7,467 (71.4%) | Instagram export |
| Questions/requests answered within 48 hours | 2,278 (84.0%) | Instagram export | | Questions/requests answered within 48 hours | 2,280 (84.0%) | Instagram export |
Breakdown by message type (repeat messages vs custom messages): Breakdown by message type (repeat messages vs custom messages):
@ -242,11 +239,11 @@ This list is grouped by meaning (not by exact wording). It includes very short r
| Rank | Topic (plain English) | Count | Share of all questions/requests | | Rank | Topic (plain English) | Count | Share of all questions/requests |
|---:|---|---:|---:| |---:|---|---:|---:|
| 1 | Just one word: book | 1,857 | 68.4% | | 1 | Just one word: book | 1,857 | 68.4% |
| 2 | What is this? | 206 | 7.6% | | 2 | What is this? | 203 | 7.5% |
| 3 | Can you send the video? | 191 | 7.0% | | 3 | Can you send the video? | 189 | 7.0% |
| 4 | Other question | 120 | 4.4% | | 4 | Other question | 118 | 4.3% |
| 5 | Can you help me? | 74 | 2.7% | | 5 | Can you help me? | 74 | 2.7% |
| 6 | Can you send the link? | 61 | 2.2% | | 6 | Can you send the link? | 70 | 2.6% |
| 7 | What does it cost? | 53 | 2.0% | | 7 | What does it cost? | 53 | 2.0% |
| 8 | Is this therapy? | 44 | 1.6% | | 8 | Is this therapy? | 44 | 1.6% |
| 9 | Where do I get the book? | 36 | 1.3% | | 9 | Where do I get the book? | 36 | 1.3% |
@ -261,12 +258,12 @@ This list is grouped by meaning (not by exact wording). It includes very short r
| 18 | Can I get a refund? | 1 | 0.0% | | 18 | Can I get a refund? | 1 | 0.0% |
| 19 | How long does it take? | 1 | 0.0% | | 19 | How long does it take? | 1 | 0.0% |
In plain terms: **1,893** of **2,713** questions/requests are about the book (69.8%). In plain terms: **1,893** of **2,715** questions/requests are about the book (69.7%).
```mermaid ```mermaid
pie title Questions/Requests: Book vs Everything Else pie title Questions/Requests: Book vs Everything Else
"Book" : 1893 "Book" : 1893
"Everything else" : 820 "Everything else" : 822
``` ```
### Content Patterns (What You Mention When You Sell) ### Content Patterns (What You Mention When You Sell)

View file

@ -1,10 +1,8 @@
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import csv
import json import json
import os import os
import statistics
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@ -21,11 +19,6 @@ def _load_json(path: Path) -> dict[str, Any]:
return json.loads(path.read_text(encoding="utf-8", errors="replace")) return json.loads(path.read_text(encoding="utf-8", errors="replace"))
def _read_csv(path: Path) -> list[dict[str, str]]:
with path.open("r", encoding="utf-8", newline="") as f:
return list(csv.DictReader(f))
def _count_jsonl(path: Path, *, max_lines: int = 5_000_000) -> int: def _count_jsonl(path: Path, *, max_lines: int = 5_000_000) -> int:
n = 0 n = 0
with path.open("r", encoding="utf-8", errors="replace") as f: with path.open("r", encoding="utf-8", errors="replace") as f:
@ -36,284 +29,115 @@ def _count_jsonl(path: Path, *, max_lines: int = 5_000_000) -> int:
return n return n
def _pct(x: float) -> str: def _pct(num: int, den: int) -> str:
return f"{x*100:.1f}%" return "n/a" if den <= 0 else f"{(num/den)*100:.1f}%"
@dataclass(frozen=True) @dataclass(frozen=True)
class ReportInputs: class ReportInputs:
summary: Path summary: Path
templates: Path templates: Path
bot_audit: Path
objections: Path
rescue: Path rescue: Path
eras: Path
training_pairs: Path training_pairs: Path
fatigue_png: Path
editorial_png: Path
def _resolve_inputs(analysis_dir: Path) -> ReportInputs: def _resolve_inputs(analysis_dir: Path) -> ReportInputs:
return ReportInputs( return ReportInputs(
summary=analysis_dir / "summary.json", summary=analysis_dir / "summary.json",
templates=analysis_dir / "top_outgoing_templates.json", templates=analysis_dir / "top_outgoing_templates.json",
bot_audit=analysis_dir / "bot_performance_audit.csv",
objections=analysis_dir / "objection_handlers.json",
rescue=analysis_dir / "rescue_playbook.json", rescue=analysis_dir / "rescue_playbook.json",
eras=analysis_dir / "sergio_eras.csv",
training_pairs=analysis_dir / "training_pairs.jsonl", training_pairs=analysis_dir / "training_pairs.jsonl",
fatigue_png=analysis_dir / "bot_fatigue_chart.png",
editorial_png=analysis_dir / "editorial_timeline.png",
) )
def generate_report(*, analysis_dir: Path, out_path: Path) -> Path: def generate_report(*, analysis_dir: Path, out_path: Path) -> Path:
inp = _resolve_inputs(analysis_dir) inp = _resolve_inputs(analysis_dir)
for p in inp.__dict__.values(): if not inp.summary.exists():
if not Path(p).exists(): raise FileNotFoundError(str(inp.summary))
raise FileNotFoundError(str(p))
summary = _load_json(inp.summary) summary = _load_json(inp.summary)
templates = _load_json(inp.templates)
objections = _load_json(inp.objections)
rescues = _load_json(inp.rescue)
bot_audit = _read_csv(inp.bot_audit)
owner = summary.get("owner_name") or "Unknown"
conv = summary.get("conversations") or {} conv = summary.get("conversations") or {}
conv_total = int(conv.get("total") or 0) conv_total = int(conv.get("total") or 0)
bot_only = int(conv.get("bot_only") or 0) template_only = int(conv.get("bot_only") or 0)
human = int(conv.get("human_intervened") or 0) custom_replies = int(conv.get("human_intervened") or 0)
conversions = summary.get("conversions") or {}
conv_intent = int(conversions.get("intent") or 0)
conv_confirmed = int(conversions.get("confirmed") or 0)
bot_only_rate = (bot_only / conv_total) if conv_total else 0.0 buying = summary.get("conversions") or {}
human_rate = (human / conv_total) if conv_total else 0.0 buying_weak = int(buying.get("intent") or 0)
intent_rate = (conv_intent / conv_total) if conv_total else 0.0 buying_strong = int(buying.get("confirmed") or 0)
confirmed_rate = (conv_confirmed / conv_total) if conv_total else 0.0
manual_style = summary.get("manual_style") or {} templates_total = None
median_len = manual_style.get("median_len_chars") templates_repeat = None
p90_len = manual_style.get("p90_len_chars") if inp.templates.exists():
question_rate = float(manual_style.get("question_rate") or 0.0) t = _load_json(inp.templates)
exclaim_rate = float(manual_style.get("exclaim_rate") or 0.0) templates_total = int(t.get("templates_total") or 0)
emoji_rate = float(manual_style.get("emoji_rate") or 0.0) templates_repeat = int(t.get("bot_templates") or 0)
lang_guess = manual_style.get("lang_guess") or {}
# Templates: prefer canonical strings (safe-ish) and avoid raw samples. rescue_count = None
top_templates = templates.get("top_templates") or [] if inp.rescue.exists():
top_bot = [t for t in top_templates if isinstance(t, dict) and t.get("label_hint") == "bot"]
top_manual = [t for t in top_templates if isinstance(t, dict) and t.get("label_hint") == "manual"]
# Bot audit: best/worst by reply_rate.
def fnum(v: str | None) -> float:
try: try:
return float(v or 0) rescue = _load_json(inp.rescue)
rescue_count = len(rescue) if isinstance(rescue, list) else 0
except Exception: except Exception:
return 0.0 rescue_count = None
bot_audit_sorted = sorted(bot_audit, key=lambda r: fnum(r.get("sent")), reverse=True) pairs_count = _count_jsonl(inp.training_pairs, max_lines=2_000_000) if inp.training_pairs.exists() else None
top_audit = bot_audit_sorted[:10]
best_reply = sorted(bot_audit, key=lambda r: fnum(r.get("reply_rate")), reverse=True)[:10]
worst_reply = sorted(bot_audit, key=lambda r: fnum(r.get("reply_rate")))[:10]
# Objections: most common replies per category. generated_at = summary.get("generated_at") if isinstance(summary.get("generated_at"), str) else None
objection_blocks: list[str] = []
if isinstance(objections, dict):
for cat in ("price", "time", "trust", "stop"):
replies = objections.get(cat) or []
if not isinstance(replies, list) or not replies:
continue
top3 = []
for r in replies[:3]:
if not isinstance(r, dict):
continue
top3.append(f"- ({r.get('count')}) {r.get('reply')}")
if top3:
objection_blocks.append(f"### {cat}\n" + "\n".join(top3))
rescue_count = len(rescues) if isinstance(rescues, list) else 0 report: list[str] = []
pairs_count = _count_jsonl(inp.training_pairs, max_lines=2_000_000) report.append("# Instagram DM History — Short Report (English)")
# Era summary: simple high-level notes.
eras_rows = _read_csv(inp.eras)
era_recent = eras_rows[-6:] if len(eras_rows) > 6 else eras_rows
era_offer_terms: list[str] = []
for row in era_recent:
offers = (row.get("top_offers") or "").strip()
if offers:
era_offer_terms.append(offers)
# A few derived notes.
lang_line = ", ".join(f"{k}={v}" for k, v in lang_guess.items())
# Summarize bot fatigue trend from image existence only (analysis already made it).
report = []
report.append("# Socialmediatorr Instagram DM History — Human Readable Report (English)")
report.append("") report.append("")
report.append(f"- Generated: `{summary.get('generated_at')}`") if generated_at:
report.append(f"- Owner name used: `{owner}`") report.append(f"- Generated: `{generated_at}`")
report.append("- Inbox: `@socialmediatorr`")
report.append("") report.append("")
report.append("## 1) What This Dataset Represents") report.append("## What This Is")
report.append("") report.append("")
report.append( report.append("This is a short, plain-English summary of the DM history scan.")
"This is an all-time audit of Instagram DM conversations for `@socialmediatorr`, focused on extracting repeatable sales + support behavior so an AI agent can reply in Sergios style." report.append("It avoids quoting private messages and it avoids storing personal identities.")
)
report.append(
"The analysis treats the account as a hybrid system: frequent repeated templates (likely automation/scripts) plus lower-frequency custom replies (human Sergio)."
)
report.append("") report.append("")
report.append("## 2) High-Level Metrics (All-Time)") report.append("## Key Numbers")
report.append("") report.append("")
report.append(f"- Conversations analyzed: **{conv_total:,}**") report.append(f"- Conversations analyzed: **{conv_total:,}**")
report.append(f"- Bot-only conversations: **{bot_only:,}** ({_pct(bot_only_rate)})") report.append(f"- Conversations that stayed template-only: **{template_only:,}** ({_pct(template_only, conv_total)})")
report.append(f"- Human-intervened conversations: **{human:,}** ({_pct(human_rate)})") report.append(f"- Conversations that included custom replies: **{custom_replies:,}** ({_pct(custom_replies, conv_total)})")
report.append(f"- Conversion (intent signals): **{conv_intent:,}** ({_pct(intent_rate)})") report.append(f"- Buying/booking signals (weak): **{buying_weak:,}** ({_pct(buying_weak, conv_total)})")
report.append(f"- Conversion (confirmed signals): **{conv_confirmed:,}** ({_pct(confirmed_rate)})") report.append(f"- Buying/booking signals (strong): **{buying_strong:,}** ({_pct(buying_strong, conv_total)})")
report.append("") report.append("")
report.append( report.append("Buying/booking signals are detected from text patterns (they are not a payment ledger).")
"Notes on conversion: this uses heuristics (keywords + payment/link mentions). It is directionally useful for ranking scripts, but it is not a ground-truth revenue ledger."
)
report.append("") report.append("")
report.append("## 3) Sergio Persona (From Manual/Hybrid Replies)") report.append("## What You Need to Know")
report.append("") report.append("")
report.append(f"- Typical reply length: median **{median_len}** chars (p90 **{p90_len}**)") report.append("The fastest improvements come from standardizing answers to repeated questions and sending them in the right time blocks.")
report.append(f"- Questions: **{_pct(question_rate)}** | Exclamations: **{_pct(exclaim_rate)}** | Emoji: **{_pct(emoji_rate)}**") report.append("For the full deep report (CET timing, day-of-week patterns, Top 20 questions, and concrete actions), read:")
report.append(f"- Language guess (manual replies): {lang_line or 'n/a'}")
report.append("") report.append("")
report.append("Practical implication for an agent: short, direct replies; minimal punctuation; bilingual capability; low/no emoji usage.") report.append("- `reports/socialmediatorr/dm_history_report_en_detailed.md`")
report.append("") report.append("")
report.append("## 4) Bot vs Human Segmentation (What It Means)") report.append("## Useful Inventory (Safe Counts Only)")
report.append("") report.append("")
report.append( if templates_total is not None and templates_repeat is not None:
"- **[BOT]** = outgoing message template repeated frequently (>= configured threshold).\n" report.append(f"- Total outgoing templates detected: **{templates_total:,}**")
"- **[MANUAL]** = outgoing message that is rare/unique (<= configured threshold).\n" report.append(f"- High-frequency repeat templates: **{templates_repeat:,}**")
"- **[HYBRID]** = messages that look like a bot template but with manual edits (prefix match/similarity)." if rescue_count is not None:
) report.append(f"- “Rescue” events detected: **{rescue_count:,}**")
report.append("") if pairs_count is not None:
report.append( report.append(f"- Training pairs (user → reply) available: **{pairs_count:,}**")
"This separation is the foundation for: (1) extracting safe reusable scripts, and (2) extracting human-only replies as training data for a RAG or fine-tune." if templates_total is None and rescue_count is None and pairs_count is None:
) report.append("- (No additional artifacts were found next to `summary.json`.)")
report.append("") report.append("")
report.append("## 5) Top Detected Script Templates (Canonicalized)") report.append("## What You Do Not Need to Know")
report.append("") report.append("")
if top_bot: report.append("Do not store or copy these into an automation system unless you have a clear operational reason:")
for i, t in enumerate(top_bot[:10], 1): report.append("- Names, handles, phone numbers, emails.")
canon = (t.get("canonical") or "").strip() report.append("- Full conversation transcripts for every thread.")
count = int(t.get("count") or 0) report.append("- Photos, videos, audio, and other attachments.")
report.append(f"- BOT #{i}: sent **{count}**× — `{canon[:160]}`") report.append("- One-off edge cases that never repeat.")
else:
report.append("- (No high-frequency bot templates detected with current thresholds.)")
report.append("")
report.append("## 6) Human Reply Library (Rare/Manual Examples, Canonicalized)")
report.append("")
if top_manual:
for i, t in enumerate(top_manual[:10], 1):
canon = (t.get("canonical") or "").strip()
count = int(t.get("count") or 0)
report.append(f"- MANUAL-ish #{i}: seen **{count}**× — `{canon[:160]}`")
else:
report.append("- (No low-frequency manual templates included in the cached top list.)")
report.append("")
report.append("## 7) Bot Template Performance (Reply/Conversion Heuristics)")
report.append("")
report.append("These come from `bot_performance_audit.csv` and are computed per canonical bot template.")
report.append("")
if top_audit:
report.append("### Most-used bot templates (by volume)")
for r in top_audit[:8]:
report.append(
f"- sent={r.get('sent')} reply_rate={r.get('reply_rate')} intent_rate={r.get('conversion_intent_rate')} confirmed_rate={r.get('conversion_confirmed_rate')} — `{(r.get('canonical_template') or '')[:140]}`"
)
report.append("")
if best_reply:
report.append("### Best reply-rate bot templates")
for r in best_reply[:8]:
report.append(f"- reply_rate={r.get('reply_rate')} sent={r.get('sent')} — `{(r.get('canonical_template') or '')[:140]}`")
report.append("")
if worst_reply:
report.append("### Worst reply-rate bot templates")
for r in worst_reply[:8]:
report.append(f"- reply_rate={r.get('reply_rate')} sent={r.get('sent')} — `{(r.get('canonical_template') or '')[:140]}`")
report.append("")
report.append("## 8) Objections → Best Sergio Replies (Playbook)")
report.append("")
if objection_blocks:
report.extend(objection_blocks)
else:
report.append("- No objection handlers detected with current keyword rules.")
report.append("")
report.append("## 9) Rescue / Save Logic (Human Intervention After Silence/Negativity)")
report.append("")
report.append(f"- Rescue events detected (heuristic): **{rescue_count:,}**")
report.append(
"A “rescue” is when a manual/hybrid owner message follows either (a) a user negative signal, or (b) >24h silence after a bot message, and the thread later shows a confirmed conversion signal."
)
report.append("")
report.append("## 10) Product / Offer Evolution (Eras)")
report.append("")
report.append(
"This is inferred from mentions of pricing/currency + offer terms (e.g., call/audit/coaching) and summarized quarterly."
)
report.append("")
if era_offer_terms:
report.append("Recent quarters (top extracted offer signals):")
for line in era_offer_terms:
report.append(f"- {line}")
else:
report.append("- No offer signals detected in the most recent quarters with current extraction rules.")
report.append("")
report.append("## 11) Charts")
report.append("")
report.append(f"- Bot fatigue (weekly reply rate to the dominant bot script): `{inp.fatigue_png}`")
report.append(f"- Editorial timeline (top bot scripts vs conversions): `{inp.editorial_png}`")
report.append("")
report.append("## 12) What To Build From This (Agent Requirements)")
report.append("")
report.append("### Core behavior")
report.append("- Start with top bot templates for predictable openers and FAQ-style flows.")
report.append("- Switch to Sergio-style manual patterns on objections, negotiation, or when conversation stalls.")
report.append("- Use a rescue cadence (time-based triggers) after silence.")
report.append("")
report.append("### Data products to drive the agent")
report.append(f"- Training pairs (manual-only, converted threads): `{inp.training_pairs}` (rows: ~{pairs_count:,})")
report.append(f"- Objection handlers: `{inp.objections}`")
report.append(f"- Rescue playbook: `{inp.rescue}`")
report.append(f"- Script templates + editorial drift: `{inp.templates}`")
report.append("")
report.append("### Safety boundaries (recommended)")
report.append("- Never request or store passwords/2FA codes.")
report.append("- Avoid medical/legal/financial advice; redirect to a call or a human.")
report.append("- If user asks to move off-platform, follow Sergios historical policy and business rules.")
report.append("")
report.append("## 13) What We Do NOT Need To Know (Ignore / Do Not Store)")
report.append("")
report.append("- Exact client identities (names, handles, phone numbers, emails) unless required for operational routing.")
report.append("- Media attachments (photos/videos/audio) for persona cloning; they add storage cost and privacy risk.")
report.append("- Full verbatim message dumps for every thread; for RAG you only need high-quality pairs and playbook snippets.")
report.append("- Individual one-off edge cases that never repeat (unless they represent a safety boundary).")
report.append("- Internal Meta export folder structure details beyond `messages/inbox/**/message*.json`.")
report.append("")
report.append("## 14) Caveats / Gaps")
report.append("")
report.append("- The export does not reliably label ManyChat vs Human; bot/human is inferred by repetition and similarity.")
report.append("- Conversion is heuristic; integrate Stripe/Calendly/CRM events if you want ground-truth attribution.")
report.append("- Language detection is heuristic; improve it if you need precise bilingual routing.")
report.append("") report.append("")
out_path.parent.mkdir(parents=True, exist_ok=True) out_path.parent.mkdir(parents=True, exist_ok=True)
@ -323,19 +147,20 @@ def generate_report(*, analysis_dir: Path, out_path: Path) -> Path:
def main(argv: list[str] | None = None) -> int: def main(argv: list[str] | None = None) -> int:
ap = argparse.ArgumentParser(description="Generate a human-readable English report from analyze_instagram_export outputs.") ap = argparse.ArgumentParser(description="Generate a short, safe DM history report from an analysis directory.")
ap.add_argument("--analysis-dir", required=True, help="directory produced by analyze_instagram_export (contains summary.json)") ap.add_argument("--analysis-dir", required=True, help="analyze_instagram_export output directory")
ap.add_argument("--out", default=None, help="output markdown path (default: <analysis-dir>/dm_history_report_en.md)") ap.add_argument("--out", default=None, help="output markdown path (default: dm_history_report_en.md in CWD)")
args = ap.parse_args(argv) args = ap.parse_args(argv)
analysis_dir = Path(args.analysis_dir) analysis_dir = Path(args.analysis_dir)
out_path = Path(args.out) if args.out else (analysis_dir / "dm_history_report_en.md") out_path = Path(args.out) if args.out else (Path.cwd() / "dm_history_report_en.md")
try: try:
p = generate_report(analysis_dir=analysis_dir, out_path=out_path) p = generate_report(analysis_dir=analysis_dir, out_path=out_path)
print(json.dumps({"ok": True, "out": str(p)}, ensure_ascii=False)) print(json.dumps({"ok": True, "out": str(p)}, ensure_ascii=False))
return 0 return 0
except FileNotFoundError as e: except FileNotFoundError as e:
print(f"Missing required input: {e}", file=os.sys.stderr) print(f"Missing analysis input: {e}", file=os.sys.stderr)
return 2 return 2
except Exception as e: except Exception as e:
print(f"Report generation failed: {e}", file=os.sys.stderr) print(f"Report generation failed: {e}", file=os.sys.stderr)

View file

@ -13,7 +13,7 @@ from typing import Any, Iterable, Literal
from .analyze_instagram_export import canonicalize_text from .analyze_instagram_export import canonicalize_text
DEFAULT_LOCAL_TZ_NAME = "Europe/Brussels" DEFAULT_LOCAL_TZ_NAME = "Europe/Paris"
def _safe_chmod_600(path: Path) -> None: def _safe_chmod_600(path: Path) -> None:
@ -418,11 +418,11 @@ def _question_theme(text: str) -> str | None:
toks = s_compact.split() toks = s_compact.split()
if len(toks) == 1: if len(toks) == 1:
w = toks[0] w = toks[0]
if w in {"book", "ebook", "libro", "pdf"}: if w in {"book", "ebook", "libro", "pdf", "livre", "llibre"}:
return "Just one word: book" return "Just one word: book"
if w in {"link", "enlace"}: if w in {"link", "enlace", "lien", "enllac", "enllaç"}:
return "Just one word: link" return "Just one word: link"
if w in {"price", "precio", "cost"}: if w in {"price", "precio", "cost", "prix", "preu"}:
return "Just one word: price" return "Just one word: price"
# "I tried, but it didn't arrive / it doesn't work" # "I tried, but it didn't arrive / it doesn't work"
@ -499,15 +499,48 @@ def _question_theme(text: str) -> str | None:
return "Where are you based?" return "Where are you based?"
# Price / cost # Price / cost
if any(k in s for k in ("price", "cost", "how much", "$", "", "usd", "eur", "precio", "cuanto", "cuánto", "caro")): if any(
k in s
for k in (
"price",
"cost",
"how much",
"$",
"",
"usd",
"eur",
"precio",
"cuanto",
"cuánto",
"caro",
"prix",
"preu",
)
):
return "What does it cost?" return "What does it cost?"
# Link / payment link # Link / payment link
if any(k in s for k in ("link", "send the link", "send me the link", "where is the link", "enlace", "stripe", "paypal", "checkout", "invoice")): if any(
k in s
for k in (
"link",
"send the link",
"send me the link",
"where is the link",
"enlace",
"lien",
"enllaç",
"enllac",
"stripe",
"paypal",
"checkout",
"invoice",
)
):
return "Can you send the link?" return "Can you send the link?"
# Book / ebook / pdf # Book / ebook / pdf
if any(k in s for k in ("book", "ebook", "e-book", "pdf", "libro")): if any(k in s for k in ("book", "ebook", "e-book", "pdf", "libro", "livre", "llibre")):
return "Where do I get the book?" return "Where do I get the book?"
# Call / schedule # Call / schedule
@ -533,7 +566,7 @@ def _question_theme(text: str) -> str | None:
return "How do I book a call?" return "How do I book a call?"
# Video # Video
if any(k in s for k in ("video", "vídeo", "youtube")): if any(k in s for k in ("video", "vídeo", "vidéo", "youtube")):
return "Can you send the video?" return "Can you send the video?"
# Steps / what next # Steps / what next
@ -541,11 +574,45 @@ def _question_theme(text: str) -> str | None:
return "What are the steps?" return "What are the steps?"
# How it works / details # How it works / details
if any(k in s for k in ("how does", "how it works", "how does it work", "how does this work", "como funciona", "cómo funciona", "more info", "details", "explain")): if any(
k in s
for k in (
"how does",
"how it works",
"how does it work",
"how does this work",
"como funciona",
"cómo funciona",
"more info",
"details",
"explain",
"comment ça marche",
"ça marche",
"com funciona",
)
):
return "How does it work?" return "How does it work?"
# What you do / what is this # What you do / what is this
if any(k in s for k in ("what is this", "what do you do", "what is it", "what do you offer", "service", "services", "que es", "qué es", "que haces", "qué haces", "de que va", "de qué va")): if any(
k in s
for k in (
"what is this",
"what do you do",
"what is it",
"what do you offer",
"service",
"services",
"que es",
"qué es",
"que haces",
"qué haces",
"de que va",
"de qué va",
"c'est quoi",
"cest quoi",
)
):
return "What is this?" return "What is this?"
# Trust / legitimacy # Trust / legitimacy
@ -597,6 +664,8 @@ def _offer_terms(text: str) -> set[str]:
("ebook", "Ebook"), ("ebook", "Ebook"),
("e-book", "Ebook"), ("e-book", "Ebook"),
("libro", "Book"), ("libro", "Book"),
("livre", "Book"),
("llibre", "Book"),
("pdf", "PDF"), ("pdf", "PDF"),
("call", "Call"), ("call", "Call"),
("llamada", "Call"), ("llamada", "Call"),
@ -935,20 +1004,17 @@ def generate_report(
now = datetime.now(timezone.utc).date().isoformat() now = datetime.now(timezone.utc).date().isoformat()
report: list[str] = [] report: list[str] = []
report.append("# Socialmediatorr Instagram DM History : Plain-English Deep Report") report.append("# Instagram DM History — Plain-English Deep Report")
report.append("") report.append("")
report.append("## DM History Deep Report") report.append("## What This Is")
report.append("") report.append("")
report.append(f"**Subject:** Instagram direct messages for `@socialmediatorr`") report.append(f"**Inbox:** `@socialmediatorr`")
report.append("**Version:** v1.0 (STYLE BIBLE EN 3.0GM)")
report.append(f"**Date:** {now}") report.append(f"**Date:** {now}")
report.append("**Status:** REVIEW REQUIRED") report.append("**Time zone used:** CET")
report.append("**Citation:** `if://report/socialmediatorr/instagram/dm-history/`")
report.append("**Author:** Danny Stocker | InfraFabric Research")
report.append("") report.append("")
report.append("### How This Report Was Made") report.append("### How This Report Was Made")
report.append("") report.append("")
report.append("> This is an automated count of patterns. It is not a therapy note and it is not a sales ledger.") report.append("> This is a count of patterns. It is not a therapy note and it is not a sales ledger.")
report.append("") report.append("")
report.append( report.append(
"This document was generated by reading an Instagram data export and counting repeat patterns over time. " "This document was generated by reading an Instagram data export and counting repeat patterns over time. "
@ -957,9 +1023,9 @@ def generate_report(
report.append("") report.append("")
report.append("---") report.append("---")
report.append("") report.append("")
report.append(f"**Context:** This inbox contains a high-volume message-and-reply system over {window_days} days.") report.append(f"**Context:** This inbox contains message history over {window_days} days.")
report.append("") report.append("")
report.append("> Your messaging system is working as a volume engine. The weak point is consistency at the moments where people ask to buy or book.") report.append("> The system works at scale. The weak point is the “next step” moments: when people ask what to do, what it costs, or where to get it.")
report.append("") report.append("")
report.append( report.append(
"The purpose of this report is practical: define what to keep, what to remove, and what to automate safely—without damaging trust." "The purpose of this report is practical: define what to keep, what to remove, and what to automate safely—without damaging trust."

19
tools/README.md Normal file
View file

@ -0,0 +1,19 @@
# Tools
## Mermaid checks (Markdown diagrams)
This repo uses Mermaid diagrams in Markdown reports.
Local lint (partial):
- `npm install`
- `npm run verify:mermaid`
Notes:
- This check validates diagram types supported by `@mermaid-js/parser` (for example: `pie`).
- Some diagram types (for example: `flowchart`) are not supported by that parser yet and will be reported as `skipped`.
Full validation (recommended):
- Use Forgejo's built-in PDF export for the report file. If the PDF export succeeds, the diagrams compiled successfully.

109
tools/verify_mermaid.mjs Normal file
View file

@ -0,0 +1,109 @@
import fs from "node:fs";
import path from "node:path";
import process from "node:process";
import { parse } from "@mermaid-js/parser";
/**
 * Report whether a path looks like a Markdown document by extension.
 * @param {string} filePath - Path to test (extension match is case-insensitive).
 * @returns {boolean} True for `.md` / `.markdown` files.
 */
function isMarkdownFile(filePath) {
  return /\.(md|markdown)$/i.test(filePath);
}
/**
 * Yield every regular file under a root path, depth-first.
 * If the root itself is a file, yield just that single path.
 * @param {string} rootPath - File or directory to traverse.
 * @yields {string} Full path of each file found.
 */
function* walkFiles(rootPath) {
  if (fs.statSync(rootPath).isFile()) {
    yield rootPath;
    return;
  }
  for (const entry of fs.readdirSync(rootPath, { withFileTypes: true })) {
    const childPath = path.join(rootPath, entry.name);
    if (entry.isDirectory()) {
      yield* walkFiles(childPath);
    } else if (entry.isFile()) {
      yield childPath;
    }
  }
}
/**
 * Extract the contents of all ```mermaid fenced code blocks in a Markdown string.
 *
 * Fences must open at the start of a line as ```mermaid (nothing but spaces or
 * tabs after the language tag) and close with ``` alone at the start of a line.
 * The previous regex matched fences anywhere in the text and treated any info
 * string merely *starting* with "mermaid" (e.g. ```mermaidjs) as a mermaid
 * block; this version anchors the fence and the language tag.
 *
 * @param {string} markdownText - Full Markdown document text.
 * @returns {string[]} Diagram sources, one per block (possibly empty strings).
 */
function extractMermaidBlocks(markdownText) {
  const blocks = [];
  // Normalize CRLF/CR so the ^ and \n anchors behave the same on Windows files.
  const text = String(markdownText || "").replace(/\r\n?/g, "\n");
  const re = /^```mermaid[ \t]*\n([\s\S]*?)^```[ \t]*$/gm;
  let m;
  while ((m = re.exec(text)) !== null) {
    blocks.push(m[1] || "");
  }
  return blocks;
}
/**
 * Identify which parser-supported diagram type a Mermaid block declares.
 *
 * Strips %%-comment lines and blank lines, then inspects the first keyword of
 * the first meaningful line. The keyword is matched as a whole token
 * (optionally carrying a "-beta"/"-vN" suffix or a trailing colon, as Mermaid
 * headers allow, e.g. "packet-beta" or "gitGraph LR:"), fixing the previous
 * prefix match that misread e.g. "infographic" as "info".
 *
 * @param {string} code - Raw Mermaid diagram source.
 * @returns {string|null} Canonical diagram type, or null when the type is
 *   unknown or not supported by @mermaid-js/parser (e.g. flowchart/sequence).
 */
function detectDiagramType(code) {
  const lines = String(code || "")
    .replace(/\r\n?/g, "\n")
    .split("\n")
    .map((l) => l.trim())
    .filter((l) => l && !l.startsWith("%%"));
  if (!lines.length) return null;
  // First whitespace/colon-delimited token, minus beta/version suffixes.
  const keyword = lines[0].split(/[\s:]+/)[0].replace(/-(beta|v\d+)$/, "");
  // Diagram types @mermaid-js/parser can validate; others (flowchart,
  // sequence, class, ...) are not supported by that parser yet.
  const supported = new Set([
    "pie",
    "gitGraph",
    "architecture",
    "packet",
    "info",
    "radar",
    "treemap",
  ]);
  return supported.has(keyword) ? keyword : null;
}
/**
 * CLI entry point: scan Markdown files for ```mermaid blocks and validate
 * every block whose diagram type is supported by @mermaid-js/parser.
 *
 * Usage: node tools/verify_mermaid.mjs [paths...]   (default root: "reports")
 * Prints a JSON summary; exit code is 0 when every parsed block is valid and
 * 1 otherwise. Blocks of unsupported diagram types are counted as "skipped".
 *
 * @returns {Promise<number>} Process exit code.
 */
async function main() {
  const cliArgs = process.argv.slice(2);
  const roots = cliArgs.length > 0 ? cliArgs : ["reports"];

  let total = 0;
  let failures = 0;
  let skipped = 0;

  for (const root of roots) {
    for (const filePath of walkFiles(root)) {
      if (!isMarkdownFile(filePath)) continue;
      const blocks = extractMermaidBlocks(fs.readFileSync(filePath, "utf8"));
      for (const [index, rawBlock] of blocks.entries()) {
        total += 1;
        const code = String(rawBlock || "").trim();
        const diagramType = detectDiagramType(code);
        if (!diagramType) {
          // Parser cannot handle this type (e.g. flowchart) — report, don't fail.
          skipped += 1;
          continue;
        }
        try {
          await parse(diagramType, code);
        } catch (err) {
          failures += 1;
          const msg =
            err && typeof err === "object" && "message" in err ? String(err.message) : String(err);
          console.error(`[mermaid] ${filePath} block=${index + 1} type=${diagramType}: ${msg}`);
        }
      }
    }
  }

  if (failures === 0) {
    console.log(JSON.stringify({ ok: true, diagrams: total, skipped }, null, 2));
    return 0;
  }
  console.error(JSON.stringify({ ok: false, diagrams: total, failures, skipped }, null, 2));
  return 1;
}

process.exitCode = await main();