/** * TOC Extractor Service * Detects and extracts Table of Contents from OCR'd document pages */ import { v4 as uuidv4 } from 'uuid'; import { getDb } from '../db/db.js'; import fs from 'fs/promises'; import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs'; /** * TOC entry patterns to match: * - "Chapter 4 – Plumbing System ........ 72" * - "4.1 Water System.....................45" * - "Section 3: Electrical . . . . . . . 89" * - "Introduction 12" */ const TOC_PATTERNS = [ // Pattern 1: Title [dots/spaces] PageNum /^(.{3,150?}?)\s*[.\s–-]{3,}\s*(\d{1,4})\s*$/, // Pattern 2: SectionKey Title [dots/spaces] PageNum /^([\d.]+)\s+(.{3,100}?)\s*[.\s–-]{3,}\s*(\d{1,4})\s*$/, // Pattern 3: Title [whitespace] PageNum (simpler) /^(.{5,120}?)\s{3,}(\d{1,4})\s*$/, ]; /** * Detect if a page looks like a TOC page * @param {string} pageText - OCR text from page * @returns {boolean} */ function isTocPage(pageText) { if (!pageText || pageText.length < 100) return false; const lines = pageText.split('\n').map(l => l.trim()).filter(l => l.length > 5); if (lines.length < 5) return false; // Count how many lines match TOC patterns let matchCount = 0; let pageNumbers = []; for (const line of lines) { for (const pattern of TOC_PATTERNS) { if (pattern.test(line)) { matchCount++; const match = line.match(pattern); const pageNum = parseInt(match[match.length - 1]); if (!isNaN(pageNum)) { pageNumbers.push(pageNum); } break; } } } // Heuristics for TOC detection: // 1. At least 5 matching lines // 2. At least 30% of lines match TOC patterns // 3. Page numbers are somewhat sequential or grouped const matchRatio = matchCount / lines.length; const hasSequentialPages = checkSequentiality(pageNumbers); return matchCount >= 5 && matchRatio >= 0.3 && hasSequentialPages; } /** * Check if page numbers show some sequentiality * @param {number[]} pageNumbers * @returns {boolean} */ function checkSequentiality(pageNumbers) { if (pageNumbers.length < 3) return false; // Sort and check for general increasing trend const sorted = [...pageNumbers].sort((a, b) => a - b); let increases = 0; for (let i = 1; i < sorted.length; i++) { if (sorted[i] >= sorted[i - 1]) increases++; } // At least 70% should be increasing return (increases / (sorted.length - 1)) >= 0.7; } /** * Parse section key and determine hierarchy level * @param {string} sectionKey - e.g., "4", "4.1", "4.1.2" * @returns {{ key: string, level: number }} */ function parseSectionKey(sectionKey) { if (!sectionKey) return { key: null, level: 1 }; const trimmed = sectionKey.trim(); const parts = trimmed.split('.'); return { key: trimmed, level: parts.length }; } /** * Extract TOC entries from a page * @param {string} pageText * @param {number} pageNumber * @returns {Array