/**
 * TOC Extractor Service
 * Detects and extracts Table of Contents from OCR'd document pages
 */

import { v4 as uuidv4 } from 'uuid';
import { getDb } from '../db/db.js';
import fs from 'fs/promises';
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';

/**
 * Pattern for lines that start with a numeric section key:
 *   "4.1 Water System.....................45"
 * Groups: 1 = section key, 2 = title, 3 = page number.
 */
const SECTION_KEY_PATTERN = /^([\d.]+)\s+(.{3,100}?)\s*[.\s–-]{3,}\s*(\d{1,4})\s*$/;

/**
 * TOC entry patterns to match, tried in order (most specific first):
 * - "4.1 Water System.....................45"   (section key + leaders)
 * - "Chapter 4 – Plumbing System ........ 72"   (title + leaders)
 * - "Section 3: Electrical . . . . . . . 89"    (title + leaders)
 * - "Introduction      12"                      (title + 3 or more spaces)
 *
 * The section-key pattern must come before the generic leader pattern:
 * every line it matches is also matched by the generic one, which would
 * otherwise swallow the section key into the title.
 */
const TOC_PATTERNS = [
  SECTION_KEY_PATTERN,
  // Title [dots/dashes/spaces] PageNum
  /^(.{3,150}?)\s*[.\s–-]{3,}\s*(\d{1,4})\s*$/,
  // Title [3+ whitespace] PageNum (simpler)
  /^(.{5,120}?)\s{3,}(\d{1,4})\s*$/,
];

const INSERT_TOC_SQL = `
  INSERT INTO document_toc (
    id, document_id, title, section_key, page_start,
    level, parent_id, order_index, toc_page_number, created_at
  ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`;

/**
 * Split page text into trimmed lines long enough to be TOC candidates.
 * @param {string} pageText
 * @returns {string[]} Trimmed lines longer than 5 characters
 */
function splitCandidateLines(pageText) {
  return pageText
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 5);
}

/**
 * Match a single line against the TOC patterns.
 * @param {string} line - Trimmed line of page text
 * @returns {{ match: RegExpMatchArray, hasSectionKey: boolean } | null}
 *   The first successful match, or null when no pattern applies
 */
function matchTocLine(line) {
  for (const pattern of TOC_PATTERNS) {
    const match = line.match(pattern);
    if (match) {
      return { match, hasSectionKey: pattern === SECTION_KEY_PATTERN };
    }
  }
  return null;
}

/**
 * Detect if a page looks like a TOC page
 * @param {string} pageText - OCR text from page
 * @returns {boolean}
 */
function isTocPage(pageText) {
  if (!pageText || pageText.length < 100) return false;

  const lines = splitCandidateLines(pageText);
  if (lines.length < 5) return false;

  // Collect the page number of every line that matches a TOC pattern
  const pageNumbers = [];
  let matchCount = 0;
  for (const line of lines) {
    const found = matchTocLine(line);
    if (!found) continue;

    matchCount++;
    const { match } = found;
    const pageNum = Number.parseInt(match[match.length - 1], 10);
    if (!Number.isNaN(pageNum)) {
      pageNumbers.push(pageNum);
    }
  }

  // Heuristics for TOC detection:
  // 1. At least 5 matching lines
  // 2. At least 30% of lines match TOC patterns
  // 3. Page numbers are somewhat sequential or grouped
  const matchRatio = matchCount / lines.length;
  return matchCount >= 5 && matchRatio >= 0.3 && checkSequentiality(pageNumbers);
}

/**
 * Check if page numbers show some sequentiality
 *
 * The numbers are examined in the order they appear on the page. Sorting
 * them first would make every adjacent pair non-decreasing and the check
 * meaningless.
 *
 * @param {number[]} pageNumbers - Page numbers in document order
 * @returns {boolean} True when at least 70% of adjacent pairs do not decrease
 */
function checkSequentiality(pageNumbers) {
  if (pageNumbers.length < 3) return false;

  let nonDecreasing = 0;
  for (let i = 1; i < pageNumbers.length; i++) {
    if (pageNumbers[i] >= pageNumbers[i - 1]) nonDecreasing++;
  }

  return nonDecreasing / (pageNumbers.length - 1) >= 0.7;
}

/**
 * Parse section key and determine hierarchy level
 *
 * Empty segments are ignored, so a trailing dot ("4.") does not add a level.
 *
 * @param {string} sectionKey - e.g., "4", "4.1", "4.1.2"
 * @returns {{ key: string | null, level: number }}
 */
function parseSectionKey(sectionKey) {
  if (!sectionKey) return { key: null, level: 1 };

  const parts = sectionKey
    .trim()
    .split('.')
    .filter((part) => part.length > 0);
  if (parts.length === 0) return { key: null, level: 1 };

  return { key: parts.join('.'), level: parts.length };
}

/**
 * Extract TOC entries from a page
 * @param {string} pageText - OCR text of a detected TOC page
 * @param {number} pageNumber - Page the TOC text was found on
 * @returns {Array} Entries with title, sectionKey, pageStart, level,
 *   tocPageNumber and orderIndex (order within this page)
 */
function extractTocEntries(pageText, pageNumber) {
  if (!pageText) return [];

  const entries = [];
  for (const line of splitCandidateLines(pageText)) {
    const found = matchTocLine(line);
    if (!found) continue;

    const { match, hasSectionKey } = found;
    const rawSectionKey = hasSectionKey ? match[1] : null;
    const rawTitle = hasSectionKey ? match[2] : match[1];
    const targetPage = Number.parseInt(match[match.length - 1], 10);

    // Clean up title (remove trailing dots/dashes)
    const title = rawTitle.trim().replace(/[.\-–\s]+$/, '').trim();

    // Skip if title is too short or page number invalid
    if (title.length < 3 || Number.isNaN(targetPage) || targetPage < 1) continue;

    const { key, level } = parseSectionKey(rawSectionKey);
    entries.push({
      title,
      sectionKey: key,
      pageStart: targetPage,
      level,
      tocPageNumber: pageNumber,
      orderIndex: entries.length,
    });
  }

  return entries;
}

/**
 * Match TOC entries to their source pages in OCR text
 * Used for PDF outline entries to find which pages they appear on
 *
 * Entries are mutated in place: `tocPageNumber` is set to the first page
 * (lowest page number) whose OCR text contains the entry's title.
 *
 * @param {Array} entries - TOC entries to match
 * @param {string} documentId - Document ID
 * @returns {Array} Entries with tocPageNumber populated
 */
function matchEntriesToSourcePages(entries, documentId) {
  const db = getDb();

  // Get all pages with OCR text
  const pages = db.prepare(`
    SELECT page_number, ocr_text
    FROM document_pages
    WHERE document_id = ? AND ocr_text IS NOT NULL
    ORDER BY page_number ASC
  `).all(documentId);

  if (pages.length === 0) {
    console.log('[TOC] No OCR text available for source page matching');
    return entries;
  }

  let matchCount = 0;

  for (const entry of entries) {
    if (entry.tocPageNumber != null) continue; // Skip if already set

    const titleText = entry.title?.trim();
    if (!titleText || titleText.length < 5) continue;

    // Build the search regex once per entry: the first 8 words of the title,
    // escaped, allowing up to 5 arbitrary characters (including line breaks)
    // between consecutive words.
    const searchPattern = titleText
      .split(/\s+/)
      .slice(0, 8)
      .map((word) => word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('[\\s\\S]{0,5}');
    const regex = new RegExp(searchPattern, 'i');

    // Pages are ordered ascending, so early pages (where a TOC usually is) win
    const hit = pages.find((page) => regex.test(page.ocr_text));
    if (hit) {
      entry.tocPageNumber = hit.page_number;
      matchCount++;
    }
  }

  if (matchCount > 0) {
    console.log(`[TOC] Matched ${matchCount} PDF outline entries to source pages in OCR text`);
  }

  return entries;
}

/**
 * Build parent-child relationships for hierarchical TOC
 *
 * An entry's parent is the closest preceding entry whose section key equals
 * the entry's key with the last segment removed (e.g. "4.1.2" -> "4.1").
 *
 * @param {Array} entries
 * @returns {Array} Copies of the entries with `id` and `parentId` set
 */
function buildHierarchy(entries) {
  const enhanced = entries.map((e) => ({ ...e, id: uuidv4(), parentId: null }));

  // Most recent entry id seen for each section key; equivalent to scanning
  // backwards for the nearest match, without the quadratic cost.
  const lastIdByKey = new Map();

  for (const entry of enhanced) {
    if (!entry.sectionKey) continue;

    if (entry.level !== 1) {
      const parentKey = entry.sectionKey.split('.').slice(0, -1).join('.');
      entry.parentId = lastIdByKey.get(parentKey) ?? null;
    }
    lastIdByKey.set(entry.sectionKey, entry.id);
  }

  return enhanced;
}

/**
 * Resolve the 1-based page number an outline item points to.
 *
 * pdf.js outline items carry `dest` either as a named destination (string,
 * resolved through `getDestination`) or as an explicit destination array
 * that is used directly. The first element of the array is either a page
 * reference object or an integer page index.
 *
 * @param {object} pdfDoc - Loaded pdf.js document
 * @param {object} item - Outline item
 * @returns {Promise<number | null>} Page number, or null when unresolvable
 */
async function resolveOutlinePage(pdfDoc, item) {
  let pageNum = null;

  if (item.dest) {
    try {
      const destArray = typeof item.dest === 'string'
        ? await pdfDoc.getDestination(item.dest)
        : item.dest;

      if (Array.isArray(destArray) && destArray.length > 0) {
        const pageRef = destArray[0];
        if (Number.isInteger(pageRef)) {
          pageNum = pageRef + 1; // Convert 0-based to 1-based
        } else if (pageRef && typeof pageRef === 'object') {
          pageNum = (await pdfDoc.getPageIndex(pageRef)) + 1;
        }
      }
    } catch (err) {
      // Best effort: an unresolvable destination must not abort the walk
      console.debug('[TOC] Could not resolve outline destination:', err && err.message);
    }
  }

  // Fallback: try URL fragment like #page=5
  if (!pageNum && item.url) {
    const m = String(item.url).match(/#page=(\d+)/i);
    if (m) pageNum = Number.parseInt(m[1], 10);
  }

  return Number.isFinite(pageNum) && pageNum >= 1 ? pageNum : null;
}

/**
 * Extract PDF outline/bookmarks as fallback TOC
 * Uses pdfjs-dist to read the PDF's built-in outline/bookmarks
 *
 * Never throws: any failure is logged and reported as an empty outline.
 *
 * @param {string} pdfPath - Absolute path to PDF file
 * @returns {Promise<Array<{title: string, page: number | null, level: number}>>}
 *   Outline entries in depth-first order; `page` is null when unresolved
 */
async function extractPdfOutline(pdfPath) {
  let pdfDoc = null;

  try {
    pdfDoc = await pdfjsLib.getDocument({ url: pdfPath }).promise;

    const outline = await pdfDoc.getOutline();
    if (!outline || outline.length === 0) {
      return [];
    }

    const results = [];

    const walk = async (items, level) => {
      for (const item of items) {
        const title = (item.title || '').trim();
        const page = await resolveOutlinePage(pdfDoc, item);

        results.push({
          title: title || 'Untitled',
          page,
          level,
          _raw: { dest: !!item.dest, url: !!item.url, action: !!item.action },
        });

        // Recurse into children
        if (item.items && item.items.length) {
          await walk(item.items, level + 1);
        }
      }
    };

    await walk(outline, 1);
    return results;
  } catch (err) {
    console.warn('extractPdfOutline failed:', err && err.message);
    return [];
  } finally {
    // Release the document even when outline extraction failed part-way
    try {
      await pdfDoc?.destroy?.();
    } catch (err) {
      console.warn('extractPdfOutline cleanup failed:', err && err.message);
    }
  }
}

/**
 * Convert flat outline results into TOC entries with ids and parent links.
 *
 * @param {Array<{title: string, page: number | null, level: number}>} outlineResults
 *   Outline items in depth-first order
 * @returns {Array} Entries ready for insertion (tocPageNumber is null)
 */
function buildOutlineEntries(outlineResults) {
  const entries = [];
  // Chain of ancestors of the current item, with strictly increasing levels
  const parentStack = [];

  outlineResults.forEach((result, index) => {
    // Drop siblings and deeper items so the top of the stack is the nearest ancestor
    while (parentStack.length > 0 && parentStack[parentStack.length - 1].level >= result.level) {
      parentStack.pop();
    }

    const top = parentStack[parentStack.length - 1];
    const parentId = top && top.level === result.level - 1 ? top.id : null;

    const id = uuidv4();
    entries.push({
      id,
      title: result.title,
      sectionKey: null,
      pageStart: result.page || 1, // unresolved destinations default to page 1
      level: result.level,
      parentId,
      orderIndex: index,
      tocPageNumber: null,
    });
    parentStack.push({ id, level: result.level });
  });

  return entries;
}

/**
 * Remove all stored TOC entries of a document.
 * @param {object} db - Database handle
 * @param {string} documentId
 */
function deleteTocEntries(db, documentId) {
  db.prepare('DELETE FROM document_toc WHERE document_id = ?').run(documentId);
}

/**
 * Insert TOC entries using a prepared INSERT statement.
 * All rows share a single `created_at` timestamp.
 *
 * @param {object} insertStmt - Statement prepared from INSERT_TOC_SQL
 * @param {string} documentId
 * @param {Array} entries - Entries with id, title, sectionKey, pageStart,
 *   level, parentId, orderIndex and tocPageNumber
 */
function insertTocEntries(insertStmt, documentId, entries) {
  const timestamp = Date.now();
  for (const entry of entries) {
    insertStmt.run(
      entry.id,
      documentId,
      entry.title,
      entry.sectionKey,
      entry.pageStart,
      entry.level,
      entry.parentId,
      entry.orderIndex,
      entry.tocPageNumber,
      timestamp
    );
  }
}

/**
 * Extract TOC from entire document
 *
 * Tries the PDF's built-in outline first and falls back to detecting TOC
 * pages in the OCR text. Existing TOC rows of the document are replaced.
 * Never throws: failures are reported through `success: false` and `error`.
 *
 * @param {string} documentId
 * @returns {Promise<{ success: boolean, entriesCount: number, pages: number[], source?: string, message?: string, error?: string }>}
 */
export async function extractTocFromDocument(documentId) {
  const db = getDb();

  try {
    // Validate document exists
    const document = db.prepare(`
      SELECT id, file_path
      FROM documents
      WHERE id = ?
    `).get(documentId);

    if (!document) {
      console.error(`[TOC] Document not found: ${documentId}`);
      return { success: false, error: 'Document not found', entriesCount: 0, pages: [] };
    }

    // Get total page count for the document
    const pageCountResult = db.prepare(`
      SELECT COUNT(*) as count
      FROM document_pages
      WHERE document_id = ?
    `).get(documentId);

    if (pageCountResult.count === 0) {
      console.error(`[TOC] No pages available for TOC extraction in document: ${documentId}`);
      return {
        success: false,
        error: 'No pages available for TOC extraction',
        entriesCount: 0,
        pages: [],
      };
    }

    // PRIORITY: Try PDF outline FIRST (Adobe approach)
    console.log(`[TOC] Attempting PDF outline extraction first for document ${documentId}`);

    if (document.file_path) {
      const outlineResults = await extractPdfOutline(document.file_path);

      if (outlineResults && outlineResults.length > 0) {
        console.log(`[TOC] PDF outline found with ${outlineResults.length} entries, using it as primary TOC source`);

        // Convert simplified outline format to database format
        const outlineEntries = buildOutlineEntries(outlineResults);

        // Match PDF outline entries to their source pages in OCR text
        matchEntriesToSourcePages(outlineEntries, documentId);

        // Save to database (errors here are reported by the outer catch)
        deleteTocEntries(db, documentId);
        insertTocEntries(db.prepare(INSERT_TOC_SQL), documentId, outlineEntries);

        return {
          success: true,
          entriesCount: outlineEntries.length,
          pages: [],
          source: 'pdf-outline',
          message: `Extracted ${outlineEntries.length} entries from PDF outline`,
        };
      }
    }

    // FALLBACK: Try OCR-based TOC detection if PDF outline failed
    console.log(`[TOC] No PDF outline found, falling back to OCR-based TOC detection for document ${documentId}`);

    // Get all pages with OCR text
    const pages = db.prepare(`
      SELECT page_number, ocr_text
      FROM document_pages
      WHERE document_id = ? AND ocr_text IS NOT NULL
      ORDER BY page_number ASC
    `).all(documentId);

    if (pages.length === 0) {
      console.error(`[TOC] No OCR text found for document: ${documentId}`);
      return { success: false, error: 'No OCR text found', entriesCount: 0, pages: [] };
    }

    // Find TOC pages
    const tocPages = pages.filter((page) => isTocPage(page.ocr_text));

    // If no TOC pages found either, give up
    if (tocPages.length === 0) {
      console.log(`[TOC] No TOC pages detected via OCR either for document ${documentId}`);
      return {
        success: false,
        error: 'TOC detection failed: No PDF outline or OCR-detectable TOC found',
        entriesCount: 0,
        pages: [],
      };
    }

    console.log(`[TOC] Found ${tocPages.length} TOC pages in document ${documentId}`);
    const tocPageNumbers = tocPages.map((p) => p.page_number);

    // Extract entries from all TOC pages
    const allEntries = tocPages.flatMap((page) => extractTocEntries(page.ocr_text, page.page_number));

    if (allEntries.length === 0) {
      console.error(`[TOC] TOC parsing failed: No valid entries extracted from detected TOC pages in document ${documentId}`);
      return {
        success: false,
        error: 'TOC parsing failed: No valid entries extracted from detected TOC pages',
        entriesCount: 0,
        pages: tocPageNumbers,
      };
    }

    // Build hierarchy
    let hierarchicalEntries;
    try {
      hierarchicalEntries = buildHierarchy(allEntries);
    } catch (hierarchyError) {
      console.error(`[TOC] TOC parsing failed: Hierarchy building error in document ${documentId}:`, hierarchyError);
      return {
        success: false,
        error: `TOC parsing failed: Hierarchy building error - ${hierarchyError.message}`,
        entriesCount: 0,
        pages: tocPageNumbers,
      };
    }

    // Delete existing TOC entries for this document
    try {
      deleteTocEntries(db, documentId);
    } catch (deleteError) {
      console.error(`[TOC] TOC parsing failed: Database cleanup error in document ${documentId}:`, deleteError);
      return {
        success: false,
        error: `TOC parsing failed: Database cleanup error - ${deleteError.message}`,
        entriesCount: 0,
        pages: tocPageNumbers,
      };
    }

    // Insert new TOC entries
    const insertStmt = db.prepare(INSERT_TOC_SQL);
    try {
      insertTocEntries(insertStmt, documentId, hierarchicalEntries);
    } catch (insertError) {
      console.error(`[TOC] TOC parsing failed: Database insertion error in document ${documentId}:`, insertError);
      return {
        success: false,
        error: `TOC parsing failed: Database insertion error - ${insertError.message}`,
        entriesCount: 0,
        pages: tocPageNumbers,
      };
    }

    console.log(`[TOC] Extracted ${hierarchicalEntries.length} TOC entries for document ${documentId}`);

    return {
      success: true,
      entriesCount: hierarchicalEntries.length,
      pages: tocPageNumbers,
      source: 'ocr-extraction',
    };
  } catch (error) {
    console.error(`[TOC] Unexpected extraction error for document ${documentId}:`, error);
    return {
      success: false,
      error: `Unexpected error during TOC extraction: ${error.message}`,
      entriesCount: 0,
      pages: [],
    };
  }
}

/**
 * Get TOC for a document
 * @param {string} documentId
 * @returns {Array} Flat TOC rows ordered by order_index; parent links are in parent_id
 */
export function getDocumentToc(documentId) {
  const db = getDb();

  return db.prepare(`
    SELECT
      id, document_id, title, section_key, page_start,
      level, parent_id, order_index, toc_page_number
    FROM document_toc
    WHERE document_id = ?
    ORDER BY order_index ASC
  `).all(documentId);
}

/**
 * Build tree structure from flat TOC entries
 *
 * Entries whose parent_id is empty or not present in `entries` become roots.
 *
 * @param {Array} entries - Flat rows with `id` and `parent_id`
 * @returns {Array} Tree with children arrays
 */
export function buildTocTree(entries) {
  // First pass: create one node per entry. A Map keeps arbitrary ids from
  // colliding with Object.prototype keys.
  const nodesById = new Map();
  for (const entry of entries) {
    nodesById.set(entry.id, { ...entry, children: [] });
  }

  // Second pass: attach each node to its parent, or to the root list
  const roots = [];
  for (const entry of entries) {
    const node = nodesById.get(entry.id);
    const parent = entry.parent_id ? nodesById.get(entry.parent_id) : undefined;
    if (parent) {
      parent.children.push(node);
    } else {
      roots.push(node);
    }
  }

  return roots;
}

export default {
  extractTocFromDocument,
  getDocumentToc,
  buildTocTree,
};