navidocs/server/services/toc-extractor.js
Danny Stocker 58b344aa31 FINAL: P0 blockers fixed + Joe Trader + ignore binaries
Fixed:
- Price: €800K-€1.5M, Sunseeker added
- Agent 1: Joe Trader persona + actual sale ads research
- Ignored meilisearch binary + data/ (too large for GitHub)
- SESSION_DEBUG_BLOCKERS.md created

Ready for Session 1 launch.

🤖 Generated with Claude Code
2025-11-13 01:29:59 +01:00

646 lines
18 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* TOC Extractor Service
* Detects and extracts Table of Contents from OCR'd document pages
*/
import { v4 as uuidv4 } from 'uuid';
import { getDb } from '../db/db.js';
import fs from 'fs/promises';
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
/**
 * Regular expressions that recognise a single table-of-contents line.
 *
 * Example lines:
 * - "Chapter 4 Plumbing System ........ 72"
 * - "4.1 Water System.....................45"
 * - "Section 3: Electrical . . . . . . . 89"
 * - "Introduction          12"   (three or more spaces before the page number)
 *
 * Patterns are tried in array order and the first match wins. The array
 * index is significant: index 1 is the only pattern that captures a section
 * key (groups: key, title, page); the others capture (title, page).
 */
const TOC_PATTERNS = [
  // Pattern 1: Title [dots/spaces/dashes] PageNum.
  // The negative lookahead leaves lines that start with a numeric section key
  // ("4.1 ...") to pattern 2; otherwise this more general pattern would always
  // win and the section key would end up inside the title.
  /^(?![\d.]+\s)(.{3,150}?)\s*[.\s-]{3,}\s*(\d{1,4})\s*$/,
  // Pattern 2: SectionKey Title [dots/spaces/dashes] PageNum
  /^([\d.]+)\s+(.{3,100}?)\s*[.\s-]{3,}\s*(\d{1,4})\s*$/,
  // Pattern 3: Title [3+ whitespace] PageNum (simpler)
  /^(.{5,120}?)\s{3,}(\d{1,4})\s*$/,
];
/**
 * Decide whether a page's OCR text looks like a table-of-contents page.
 *
 * A page qualifies when all of the following hold:
 *   1. at least 5 of its non-trivial lines match one of TOC_PATTERNS,
 *   2. those matches are at least 30% of the non-trivial lines,
 *   3. checkSequentiality() accepts the referenced page numbers.
 *
 * @param {string} pageText - OCR text from page
 * @returns {boolean}
 */
function isTocPage(pageText) {
  if (!pageText || pageText.length < 100) {
    return false;
  }

  // Only lines longer than 5 characters (after trimming) are considered.
  const candidateLines = pageText
    .split('\n')
    .map((raw) => raw.trim())
    .filter((text) => text.length > 5);
  if (candidateLines.length < 5) {
    return false;
  }

  const referencedPages = [];
  let hits = 0;
  for (const text of candidateLines) {
    // First pattern that matches wins; later patterns are not consulted.
    const firstPattern = TOC_PATTERNS.find((re) => re.test(text));
    if (firstPattern === undefined) {
      continue;
    }
    hits += 1;

    // The page number is always the last capture group of a pattern.
    const captured = text.match(firstPattern);
    const target = Number.parseInt(captured[captured.length - 1], 10);
    if (!Number.isNaN(target)) {
      referencedPages.push(target);
    }
  }

  const hitRatio = hits / candidateLines.length;
  const looksOrdered = checkSequentiality(referencedPages);
  return hits >= 5 && hitRatio >= 0.3 && looksOrdered;
}
/**
 * Check whether page numbers, in the order they appear on the page, show a
 * generally increasing trend.
 *
 * The numbers are deliberately NOT sorted first: a sorted list is always
 * non-decreasing, which would make this check accept any input with three or
 * more numbers. Comparing neighbours in reading order is what separates a TOC
 * (mostly ascending page references) from unrelated lines ending in digits.
 *
 * @param {number[]} pageNumbers - Page numbers in the order they were found
 * @returns {boolean} true when at least 70% of adjacent pairs are
 *   non-decreasing; false when fewer than 3 numbers are supplied
 */
function checkSequentiality(pageNumbers) {
  if (pageNumbers.length < 3) return false;

  let nonDecreasingPairs = 0;
  for (let i = 1; i < pageNumbers.length; i++) {
    if (pageNumbers[i] >= pageNumbers[i - 1]) nonDecreasingPairs++;
  }

  // At least 70% of neighbouring pairs should be increasing (or equal).
  return nonDecreasingPairs / (pageNumbers.length - 1) >= 0.7;
}
/**
 * Normalise a dotted section key and derive its hierarchy level.
 *
 * Empty segments produced by leading, trailing or doubled dots are dropped,
 * so "1." is level 1 with key "1" rather than level 2 with key "1.". This
 * keeps keys comparable when a child such as "1.1" looks for its parent "1".
 *
 * @param {string} sectionKey - e.g., "4", "4.1", "4.1.2", "1."
 * @returns {{ key: string|null, level: number }} key is null (level 1) when
 *   no section key is given or it contains no non-empty segment
 */
function parseSectionKey(sectionKey) {
  if (!sectionKey) return { key: null, level: 1 };

  const segments = String(sectionKey)
    .trim()
    .split('.')
    .filter((segment) => segment.length > 0);

  if (segments.length === 0) return { key: null, level: 1 };

  return {
    key: segments.join('.'),
    level: segments.length
  };
}
/**
 * Parse every TOC-looking line of a page into a structured entry.
 *
 * Lines are trimmed, lines of 5 characters or fewer are ignored, and each
 * remaining line is matched against TOC_PATTERNS in order. Lines that match
 * no pattern, yield a title shorter than 3 characters, or reference a page
 * number below 1 are skipped.
 *
 * @param {string} pageText - OCR text of the TOC page
 * @param {number} pageNumber - Page the text came from (stored as tocPageNumber)
 * @returns {Array<Object>} Entries shaped as
 *   { title, sectionKey, pageStart, level, tocPageNumber, orderIndex }
 */
function extractTocEntries(pageText, pageNumber) {
  // Index in TOC_PATTERNS of the one pattern that captures a section key.
  const SECTION_KEY_PATTERN_INDEX = 1;

  const entries = [];
  const candidateLines = pageText
    .split('\n')
    .map((raw) => raw.trim())
    .filter((text) => text.length > 5);

  candidateLines.forEach((text) => {
    const patternIndex = TOC_PATTERNS.findIndex((re) => re.test(text));
    if (patternIndex < 0) {
      return;
    }
    const captures = text.match(TOC_PATTERNS[patternIndex]).slice(1);

    let rawKey = null;
    let rawTitle;
    let rawPage;
    if (patternIndex === SECTION_KEY_PATTERN_INDEX) {
      // "4.1 Title .... 45" -> key, title, page
      [rawKey, rawTitle, rawPage] = captures;
    } else {
      // "Title .... 45" -> first defined group is the title, last is the page
      const present = captures.filter((group) => group !== undefined);
      rawTitle = present[0];
      rawPage = present[present.length - 1];
    }

    // Strip leader characters (dots, dashes, spaces) left at the end of the title.
    const title = rawTitle.trim().replace(/[.\-\s]+$/, '').trim();
    const pageStart = Number.parseInt(rawPage, 10);
    if (title.length < 3 || Number.isNaN(pageStart) || pageStart < 1) {
      return;
    }

    const { key, level } = parseSectionKey(rawKey);
    entries.push({
      title,
      sectionKey: key,
      pageStart,
      level,
      tocPageNumber: pageNumber,
      // Position among accepted entries on this page.
      orderIndex: entries.length
    });
  });

  return entries;
}
/**
 * Find, for each TOC entry, the first page whose OCR text contains its title.
 * Used for PDF outline entries, which carry no information about the page on
 * which their title is printed.
 *
 * Entries are mutated in place: `tocPageNumber` is set on every entry that
 * had none (null or undefined) and whose title was found. Entries that
 * already have a page number, or whose title is shorter than 5 characters,
 * are left untouched.
 *
 * @param {Array<Object>} entries - TOC entries to match
 * @param {string} documentId - Document ID
 * @returns {Array<Object>} The same array, with tocPageNumber populated where matched
 */
function matchEntriesToSourcePages(entries, documentId) {
  const db = getDb();

  // Get all pages with OCR text, earliest first (a TOC is usually near the start)
  const pages = db.prepare(`
SELECT page_number, ocr_text
FROM document_pages
WHERE document_id = ? AND ocr_text IS NOT NULL
ORDER BY page_number ASC
`).all(documentId);

  if (pages.length === 0) {
    console.log('[TOC] No OCR text available for source page matching');
    return entries;
  }

  let matchCount = 0;

  for (const entry of entries) {
    // Skip if already set; `!= null` treats a missing property like null.
    if (entry.tocPageNumber != null) continue;

    const titleText = entry.title?.trim();
    if (!titleText || titleText.length < 5) continue;

    // Build the search regex once per entry (it does not depend on the page).
    // Only the first 8 whitespace-separated words are used; a token such as
    // "7.2.9" counts as one word. Regex metacharacters are escaped, and up to
    // 5 arbitrary characters (including line breaks) are allowed between words.
    const searchPattern = titleText
      .split(/\s+/)
      .slice(0, 8)
      .map((word) => word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('[\\s\\S]{0,5}');
    const regex = new RegExp(searchPattern, 'i');

    // First page containing the title wins.
    const sourcePage = pages.find((page) => regex.test(page.ocr_text));
    if (sourcePage) {
      entry.tocPageNumber = sourcePage.page_number;
      matchCount++;
    }
  }

  if (matchCount > 0) {
    console.log(`[TOC] Matched ${matchCount} PDF outline entries to source pages in OCR text`);
  }

  return entries;
}
/**
 * Assign ids and parent links to a flat, ordered list of TOC entries.
 *
 * Each entry is copied and given a fresh uuid plus `parentId: null`. An entry
 * with a section key and a level other than 1 is linked to the closest
 * preceding entry whose section key equals its own key minus the last dotted
 * segment (e.g. "4.1.2" -> "4.1"). If no such entry precedes it, parentId
 * stays null. The input objects are not modified.
 *
 * @param {Array<Object>} entries
 * @returns {Array<Object>} Copies of the entries with id and parentId set
 */
function buildHierarchy(entries) {
  // Section key -> id of the most recent entry seen with that key. Because it
  // is updated after each entry, a lookup always yields the nearest predecessor.
  const latestIdByKey = new Map();

  return entries.map((source) => {
    const node = { ...source, id: uuidv4(), parentId: null };

    if (node.sectionKey && node.level !== 1) {
      const parentKey = node.sectionKey.split('.').slice(0, -1).join('.');
      if (latestIdByKey.has(parentKey)) {
        node.parentId = latestIdByKey.get(parentKey);
      }
    }

    latestIdByKey.set(node.sectionKey, node.id);
    return node;
  });
}
/**
 * Extract PDF outline/bookmarks as fallback TOC
 * Uses pdfjs-dist to read the PDF's built-in outline/bookmarks
 *
 * The outline tree is flattened depth-first; `level` is 1 for top-level
 * bookmarks and increases by one per nesting level. Any failure (unreadable
 * file, malformed outline) is logged and results in an empty array rather
 * than an exception.
 *
 * @param {string} pdfPath - Absolute path to PDF file
 * @returns {Promise<Array<Object>>} Array of TOC entries with
 *   {title, page, level, _raw}; `page` is 1-based, or null when the bookmark
 *   target could not be resolved
 */
async function extractPdfOutline(pdfPath) {
  let pdfDoc = null;
  try {
    pdfDoc = await pdfjsLib.getDocument({ url: pdfPath }).promise;
    const outline = await pdfDoc.getOutline();
    if (!outline || outline.length === 0) {
      return [];
    }

    // Resolve one outline item to a 1-based page number, or null.
    const resolvePageNumber = async (item) => {
      let pageNum = null;

      if (item.dest) {
        try {
          // A named destination (string) must be looked up first; an explicit
          // destination is already the destination array.
          const destArray = typeof item.dest === 'string'
            ? await pdfDoc.getDestination(item.dest)
            : item.dest;
          if (Array.isArray(destArray) && destArray.length > 0) {
            const target = destArray[0];
            if (Number.isInteger(target)) {
              // Some destinations store a 0-based page index directly.
              pageNum = target + 1;
            } else if (target && typeof target === 'object') {
              // Page reference object -> 0-based index -> 1-based number.
              pageNum = (await pdfDoc.getPageIndex(target)) + 1;
            }
          }
        } catch (err) {
          // Deliberately best-effort: an unresolvable bookmark keeps page = null
          // instead of aborting the whole outline.
        }
      }

      // Fallback: try URL fragment like #page=5
      if (!pageNum && item.url) {
        const m = String(item.url).match(/#page=(\d+)/i);
        if (m) pageNum = Number.parseInt(m[1], 10);
      }

      return Number.isFinite(pageNum) && pageNum >= 1 ? pageNum : null;
    };

    const results = [];

    // Depth-first walk that appends every item before its children.
    const walk = async (items, level) => {
      for (const item of items) {
        const title = (item.title || '').trim();
        const page = await resolvePageNumber(item);
        results.push({
          title: title || 'Untitled',
          page,
          level,
          _raw: { dest: !!item.dest, url: !!item.url, action: !!item.action }
        });
        if (item.items && item.items.length) {
          await walk(item.items, level + 1);
        }
      }
    };

    await walk(outline, 1);
    return results;
  } catch (err) {
    console.warn('extractPdfOutline failed:', err && err.message);
    return [];
  } finally {
    // Always release the document, including when the walk throws. A failing
    // destroy() must not discard results that were already extracted.
    if (pdfDoc) {
      try {
        await pdfDoc.destroy?.();
      } catch (destroyErr) {
        console.warn('extractPdfOutline cleanup failed:', destroyErr && destroyErr.message);
      }
    }
  }
}
/**
 * Extract the table of contents of a document and store it in `document_toc`.
 *
 * Sources are tried in this order:
 *   1. The PDF's built-in outline (bookmarks), when the document row has a
 *      file_path and the outline is non-empty.
 *   2. OCR text: pages accepted by isTocPage() are parsed line by line.
 *
 * On success the existing `document_toc` rows of the document are deleted and
 * replaced. This function does not throw; every failure is reported through
 * the returned object.
 *
 * @param {string} documentId
 * @returns {Promise<{ success: boolean, entriesCount: number, pages: number[],
 *   error?: string, source?: string, message?: string }>}
 *   `pages` lists the detected TOC page numbers (always empty for the
 *   outline source); `source` is 'pdf-outline' or 'ocr-extraction'.
 */
export async function extractTocFromDocument(documentId) {
  const db = getDb();

  // Uniform failure payload; `pages` carries the TOC pages detected so far.
  const failed = (error, pages = []) => ({
    success: false,
    error,
    entriesCount: 0,
    pages
  });

  // SQL text is kept verbatim (unindented) so the statement string is unchanged.
  const insertRowSql = `
INSERT INTO document_toc (
id, document_id, title, section_key, page_start,
level, parent_id, order_index, toc_page_number, created_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`;

  // Runs the prepared insert once per entry, in order.
  const writeRows = (statement, rows, createdAt) => {
    for (const row of rows) {
      statement.run(
        row.id,
        documentId,
        row.title,
        row.sectionKey,
        row.pageStart,
        row.level,
        row.parentId,
        row.orderIndex,
        row.tocPageNumber,
        createdAt
      );
    }
  };

  try {
    // --- Preconditions: the document exists and has at least one page ---
    const documentRow = db.prepare(`
SELECT id FROM documents WHERE id = ?
`).get(documentId);
    if (!documentRow) {
      console.error(`[TOC] Document not found: ${documentId}`);
      return failed('Document not found');
    }

    const pageTotals = db.prepare(`
SELECT COUNT(*) as count
FROM document_pages
WHERE document_id = ?
`).get(documentId);
    if (pageTotals.count === 0) {
      console.error(`[TOC] No pages available for TOC extraction in document: ${documentId}`);
      return failed('No pages available for TOC extraction');
    }

    // --- Source 1: PDF outline ---
    console.log(`[TOC] Attempting PDF outline extraction first for document ${documentId}`);
    const fileRow = db.prepare('SELECT file_path FROM documents WHERE id = ?').get(documentId);
    const outlineItems = fileRow?.file_path ? await extractPdfOutline(fileRow.file_path) : null;

    if (outlineItems && outlineItems.length > 0) {
      console.log(`[TOC] PDF outline found with ${outlineItems.length} entries, using it as primary TOC source`);

      // Stack of the entries currently "open" above the item being converted.
      const ancestors = [];
      const outlineEntries = outlineItems.map((item, position) => {
        const id = uuidv4();

        // Parent = nearest open entry exactly one level up (searched from the
        // top of the stack, before the stack is unwound for this item).
        const parent = item.level > 1
          ? [...ancestors].reverse().find((candidate) => candidate.level === item.level - 1)
          : undefined;

        while (ancestors.length > 0 && ancestors[ancestors.length - 1].level >= item.level) {
          ancestors.pop();
        }
        ancestors.push({ id, level: item.level });

        return {
          id,
          title: item.title,
          sectionKey: null,
          // Unresolved bookmark targets fall back to page 1.
          pageStart: item.page || 1,
          level: item.level,
          parentId: parent ? parent.id : null,
          orderIndex: position,
          tocPageNumber: null
        };
      });

      // Fill tocPageNumber from OCR text where the title can be located.
      matchEntriesToSourcePages(outlineEntries, documentId);

      // Replace the stored TOC. Errors here reach the outer catch.
      db.prepare('DELETE FROM document_toc WHERE document_id = ?').run(documentId);
      const outlineInsert = db.prepare(insertRowSql);
      writeRows(outlineInsert, outlineEntries, Date.now());

      return {
        success: true,
        entriesCount: outlineEntries.length,
        pages: [],
        source: 'pdf-outline',
        message: `Extracted ${outlineEntries.length} entries from PDF outline`
      };
    }

    // --- Source 2: OCR-based detection ---
    console.log(`[TOC] No PDF outline found, falling back to OCR-based TOC detection for document ${documentId}`);
    const ocrPages = db.prepare(`
SELECT page_number, ocr_text
FROM document_pages
WHERE document_id = ? AND ocr_text IS NOT NULL
ORDER BY page_number ASC
`).all(documentId);
    if (ocrPages.length === 0) {
      console.error(`[TOC] No OCR text found for document: ${documentId}`);
      return failed('No OCR text found');
    }

    const tocPages = ocrPages.filter((page) => isTocPage(page.ocr_text));
    if (tocPages.length === 0) {
      console.log(`[TOC] No TOC pages detected via OCR either for document ${documentId}`);
      return failed('TOC detection failed: No PDF outline or OCR-detectable TOC found');
    }
    console.log(`[TOC] Found ${tocPages.length} TOC pages in document ${documentId}`);
    const tocPageNumbers = tocPages.map((page) => page.page_number);

    const allEntries = tocPages.flatMap((page) => extractTocEntries(page.ocr_text, page.page_number));
    if (allEntries.length === 0) {
      console.error(`[TOC] TOC parsing failed: No valid entries extracted from detected TOC pages in document ${documentId}`);
      return failed('TOC parsing failed: No valid entries extracted from detected TOC pages', tocPageNumbers);
    }

    let hierarchicalEntries;
    try {
      hierarchicalEntries = buildHierarchy(allEntries);
    } catch (hierarchyError) {
      console.error(`[TOC] TOC parsing failed: Hierarchy building error in document ${documentId}:`, hierarchyError);
      return failed(`TOC parsing failed: Hierarchy building error - ${hierarchyError.message}`, tocPageNumbers);
    }

    // Remove the previous TOC before writing the new one.
    try {
      db.prepare('DELETE FROM document_toc WHERE document_id = ?').run(documentId);
    } catch (deleteError) {
      console.error(`[TOC] TOC parsing failed: Database cleanup error in document ${documentId}:`, deleteError);
      return failed(`TOC parsing failed: Database cleanup error - ${deleteError.message}`, tocPageNumbers);
    }

    // Statement preparation stays outside the try: only row writes are
    // reported as insertion errors.
    const ocrInsert = db.prepare(insertRowSql);
    const createdAt = Date.now();
    try {
      writeRows(ocrInsert, hierarchicalEntries, createdAt);
    } catch (insertError) {
      console.error(`[TOC] TOC parsing failed: Database insertion error in document ${documentId}:`, insertError);
      return failed(`TOC parsing failed: Database insertion error - ${insertError.message}`, tocPageNumbers);
    }

    console.log(`[TOC] Extracted ${hierarchicalEntries.length} TOC entries for document ${documentId}`);
    return {
      success: true,
      entriesCount: hierarchicalEntries.length,
      pages: tocPageNumbers,
      source: 'ocr-extraction'
    };
  } catch (error) {
    console.error(`[TOC] Unexpected extraction error for document ${documentId}:`, error);
    return failed(`Unexpected error during TOC extraction: ${error.message}`);
  }
}
/**
 * Load the stored TOC rows of a document.
 *
 * Rows are returned flat, as they are stored in `document_toc` (snake_case
 * columns), ordered by `order_index`. Use buildTocTree() to nest them.
 *
 * @param {string} documentId
 * @returns {Array<Object>} Flat TOC rows; empty when the document has no TOC
 */
export function getDocumentToc(documentId) {
  const statement = getDb().prepare(`
SELECT
id, document_id, title, section_key, page_start,
level, parent_id, order_index, toc_page_number
FROM document_toc
WHERE document_id = ?
ORDER BY order_index ASC
`);
  return statement.all(documentId);
}
/**
 * Build a tree from flat TOC rows (as returned by getDocumentToc()).
 *
 * Every row is copied and given a `children` array; the input is not
 * modified. A row becomes a root when it has no `parent_id`, when its parent
 * is not among the given rows, or when it names itself as parent.
 *
 * A Map is used for the id lookup instead of a plain object so that ids or
 * parent ids that collide with inherited object properties (e.g.
 * "constructor") are treated as ordinary, possibly missing, keys.
 *
 * @param {Array<Object>} entries - Flat rows with `id` and `parent_id`
 * @returns {Array<Object>} Root nodes, each with a nested `children` array
 */
export function buildTocTree(entries) {
  // Keys are stringified so lookups behave like the object-keyed original.
  const nodesById = new Map();
  for (const entry of entries) {
    nodesById.set(String(entry.id), { ...entry, children: [] });
  }

  const roots = [];
  for (const entry of entries) {
    const node = nodesById.get(String(entry.id));
    const parent = entry.parent_id ? nodesById.get(String(entry.parent_id)) : undefined;

    // A self-reference would otherwise detach the node from the tree.
    if (parent && parent !== node) {
      parent.children.push(node);
    } else {
      roots.push(node);
    }
  }

  return roots;
}
// Default export: the same three functions that are also exported by name
// above, bundled into one object.
export default {
extractTocFromDocument,
getDocumentToc,
buildTocTree
};