navidocs/server/services/toc-extractor.js
ggq-admin fb88b291de feat: Add interactive Table of Contents navigation with i18n support
Implements complete TOC feature for document navigation with bilingual support.

## TOC Detection & Extraction
- Pattern-based TOC detection with 3 regex patterns
- Heuristic validation (30%+ match ratio, 5+ entries, sequential pages)
- Hierarchical section key parsing (e.g., "4.1.2" → level 3, parent "4.1")
- Database schema with parent-child relationships
- Automatic extraction during OCR post-processing
- Server-side LRU caching (200 entries, 30min TTL)

## UI Components
- TocSidebar: Collapsible sidebar (320px) with auto-open on TOC presence
- TocEntry: Recursive component for hierarchical rendering
- Flex layout: Sidebar + PDF viewer side-by-side
- Active page highlighting with real-time sync
- localStorage persistence for sidebar state

## Navigation Features
- Click TOC entry → PDF jumps to page
- Deep link support: URL hash format #p=12
- Page change events: navidocs:pagechange custom event
- URL hash updates on all navigation (next/prev/goTo/TOC)
- Hash change listener for external navigation
- Page clamping and validation

## Search Integration
- "Jump to section" button in search results
- Shows when result has section field
- Navigates to document with page number and hash

## Accessibility
- ARIA attributes: role, aria-label, aria-expanded, aria-current
- Keyboard navigation: Enter/Space on entries, Tab focus
- Screen reader support with aria-live regions
- Semantic HTML with proper list/listitem roles

## Internationalization (i18n)
- Vue I18n integration with vue-i18n package
- English and French translations
- 8 TOC-specific translation keys
- Language switcher component in document viewer
- Locale persistence in localStorage

## Error Handling
- Specific error messages for each failure case
- Validation before processing (doc exists, has pages, has OCR)
- Non-blocking TOC extraction (doesn't fail OCR jobs)
- Detailed error returns: {success, error, entriesCount, pages}

## API Endpoints
- GET /api/documents/:id/toc?format=flat|tree
- POST /api/documents/:id/toc/extract
- Cache invalidation on re-extraction

## Testing
- Smoke test script: 9 comprehensive tests
- E2E testing guide with 5 manual scenarios
- Tests cover: API, caching, validation, navigation, search

## Database
- Migration 002: document_toc table
- Fields: id, document_id, title, section_key, page_start, level, parent_id, order_index
- Foreign keys with CASCADE delete

## Files Changed
- New: TocSidebar.vue, TocEntry.vue, LanguageSwitcher.vue
- New: toc-extractor.js, toc.js routes, i18n setup
- Modified: DocumentView.vue (sidebar, deep links, events)
- Modified: SearchView.vue (Jump to section button)
- Modified: ocr-worker.js (TOC post-processing)
- New: toc-smoke-test.sh, TOC_E2E_TEST.md

Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-20 13:22:45 +02:00

591 lines
17 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* TOC Extractor Service
* Detects and extracts Table of Contents from OCR'd document pages
*/
import { v4 as uuidv4 } from 'uuid';
import { getDb } from '../db/db.js';
import fs from 'fs/promises';
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
/**
 * Regular expressions that recognise a single table-of-contents line.
 *
 * Examples of lines that match:
 * - "Chapter 4 Plumbing System ........ 72"   (pattern 1)
 * - "Section 3: Electrical . . . . . . . 89"  (pattern 1)
 * - "4.1 Water System.....................45" (pattern 2)
 * - "Introduction      12"                    (pattern 3, needs a gap of 3+ spaces)
 *
 * The array order is significant: patterns are tried first to last, and
 * index 1 is the only pattern with three capture groups
 * (section key, title, page). In every pattern the LAST capture group is the
 * page number.
 */
const TOC_PATTERNS = [
  // Pattern 1: Title [dots/spaces/dashes] PageNum
  // The leading lookahead rejects lines that begin with a numeric section key
  // ("4.1 ...") so they fall through to pattern 2 and keep their key.
  /^(?![\d.]+\s)(.{3,150}?)\s*[.\s-]{3,}\s*(\d{1,4})\s*$/,
  // Pattern 2: SectionKey Title [dots/spaces/dashes] PageNum
  /^([\d.]+)\s+(.{3,100}?)\s*[.\s-]{3,}\s*(\d{1,4})\s*$/,
  // Pattern 3: Title [3+ whitespace] PageNum (simpler)
  /^(.{5,120}?)\s{3,}(\d{1,4})\s*$/,
];
/**
 * Decide whether the OCR text of one page looks like a table of contents.
 *
 * A page qualifies when all of the following hold:
 *  - the text is at least 100 characters long,
 *  - at least 5 trimmed lines are longer than 5 characters,
 *  - at least 5 of those lines match one of TOC_PATTERNS,
 *  - the matching lines make up at least 30% of those lines,
 *  - checkSequentiality accepts the page numbers captured from the matches.
 *
 * @param {string} pageText - OCR text from page
 * @returns {boolean}
 */
function isTocPage(pageText) {
  if (!pageText || pageText.length < 100) {
    return false;
  }

  const candidateLines = pageText
    .split('\n')
    .map((raw) => raw.trim())
    .filter((text) => text.length > 5);
  if (candidateLines.length < 5) {
    return false;
  }

  const referencedPages = [];
  let hits = 0;

  for (const text of candidateLines) {
    // The first pattern that matches wins; later patterns are not consulted.
    let found = null;
    for (const re of TOC_PATTERNS) {
      found = text.match(re);
      if (found) {
        break;
      }
    }
    if (!found) {
      continue;
    }

    hits += 1;
    // The page number is always the last capture group.
    const referenced = Number.parseInt(found[found.length - 1], 10);
    if (!Number.isNaN(referenced)) {
      referencedPages.push(referenced);
    }
  }

  const hitRatio = hits / candidateLines.length;
  const pagesLookOrdered = checkSequentiality(referencedPages);

  return hits >= 5 && hitRatio >= 0.3 && pagesLookOrdered;
}
/**
 * Check whether page numbers, in the order they were found on the page,
 * follow a generally increasing trend.
 *
 * The numbers are deliberately NOT sorted first: after sorting every step is
 * non-decreasing, which would make this check accept any input of three or
 * more numbers.
 *
 * @param {number[]} pageNumbers - page numbers in order of appearance
 * @returns {boolean} true when there are at least 3 numbers and at least 70%
 *   of consecutive pairs are non-decreasing
 */
function checkSequentiality(pageNumbers) {
  if (!Array.isArray(pageNumbers) || pageNumbers.length < 3) return false;

  let nonDecreasingSteps = 0;
  for (let i = 1; i < pageNumbers.length; i++) {
    if (pageNumbers[i] >= pageNumbers[i - 1]) nonDecreasingSteps++;
  }

  // At least 70% of the steps should be non-decreasing
  return nonDecreasingSteps / (pageNumbers.length - 1) >= 0.7;
}
/**
 * Normalise a dotted section key and derive its hierarchy level.
 *
 * Empty segments are discarded, so a trailing or doubled dot does not inflate
 * the level: "1." is level 1 with key "1", and "4.1." is level 2 with key
 * "4.1". A key with no usable segment (missing, empty, or only dots) yields
 * `{ key: null, level: 1 }`.
 *
 * @param {string} sectionKey - e.g., "4", "4.1", "4.1.2"
 * @returns {{ key: (string|null), level: number }}
 */
function parseSectionKey(sectionKey) {
  if (!sectionKey) return { key: null, level: 1 };

  const segments = sectionKey
    .trim()
    .split('.')
    .map((segment) => segment.trim())
    .filter((segment) => segment.length > 0);

  if (segments.length === 0) return { key: null, level: 1 };

  return {
    key: segments.join('.'),
    level: segments.length,
  };
}
/**
 * Turn the text of one TOC page into a list of entry objects.
 *
 * Each trimmed line longer than 5 characters is tried against TOC_PATTERNS in
 * order; the first match wins. Lines that match nothing, whose cleaned title
 * is shorter than 3 characters, or whose page number is not a number >= 1 are
 * skipped.
 *
 * @param {string} pageText - OCR text of the TOC page
 * @param {number} pageNumber - page on which this text was found
 * @returns {Array<Object>} entries shaped as
 *   { title, sectionKey, pageStart, level, tocPageNumber, orderIndex }
 */
function extractTocEntries(pageText, pageNumber) {
  const candidateLines = pageText
    .split('\n')
    .map((raw) => raw.trim())
    .filter((text) => text.length > 5);

  const collected = [];

  for (const text of candidateLines) {
    let hit = null;
    let hitIndex = 0;
    for (const [idx, re] of TOC_PATTERNS.entries()) {
      hit = text.match(re);
      if (hit) {
        hitIndex = idx;
        break;
      }
    }
    if (!hit) {
      continue;
    }

    let rawKey;
    let rawTitle;
    let rawPage;
    if (hitIndex === 1) {
      // Section-key pattern: "4.1 Title .... 45"
      rawKey = hit[1];
      rawTitle = hit[2];
      rawPage = hit[3];
    } else {
      // Key-less patterns: first captured group is the title, last is the page
      const captured = hit.slice(1).filter((group) => group !== undefined);
      rawKey = null;
      rawTitle = captured[0];
      rawPage = captured[captured.length - 1];
    }

    // Strip leader characters (dots, dashes, whitespace) left at the end of the title
    const title = rawTitle.trim().replace(/[.\-\s]+$/, '').trim();
    const pageStart = Number.parseInt(rawPage, 10);

    const unusable = title.length < 3 || Number.isNaN(pageStart) || pageStart < 1;
    if (unusable) {
      continue;
    }

    const parsedKey = parseSectionKey(rawKey);
    collected.push({
      title,
      sectionKey: parsedKey.key,
      pageStart,
      level: parsedKey.level,
      tocPageNumber: pageNumber,
      // Entries are numbered in the order they are accepted
      orderIndex: collected.length,
    });
  }

  return collected;
}
/**
 * Assign ids and parent links to a flat, ordered list of TOC entries.
 *
 * Every entry is copied (the input objects are not modified) and given a
 * fresh `id` plus `parentId: null`. An entry that has a section key and a
 * level other than 1 is linked to the closest PRECEDING entry whose section
 * key equals its own key minus the last segment (e.g. "4.1.2" -> "4.1").
 * If no such entry precedes it, `parentId` stays null.
 *
 * The most recent id seen for each section key is kept in a Map, so the whole
 * list is processed in one pass instead of scanning backwards per entry.
 *
 * @param {Array<Object>} entries - entries with `sectionKey` and `level`
 * @returns {Array<Object>} copies of the entries with `id` and `parentId` set
 */
function buildHierarchy(entries) {
  const enhanced = entries.map((e) => ({ ...e, id: uuidv4(), parentId: null }));

  // sectionKey -> id of the latest entry seen so far with that key
  const latestIdByKey = new Map();

  for (const entry of enhanced) {
    if (entry.sectionKey && entry.level !== 1) {
      const parentKey = entry.sectionKey.split('.').slice(0, -1).join('.');
      const parentId = latestIdByKey.get(parentKey);
      if (parentId !== undefined) {
        entry.parentId = parentId;
      }
    }
    // Register after the lookup so an entry can never become its own parent.
    latestIdByKey.set(entry.sectionKey, entry.id);
  }

  return enhanced;
}
/**
 * Extract PDF outline/bookmarks as fallback TOC.
 * Uses pdfjs-dist to read the PDF's built-in outline/bookmarks.
 *
 * Outline items without a title are skipped together with their children.
 * When an item's destination cannot be resolved to a page, the entry is kept
 * with `pageStart` 1. Any failure while reading or parsing the file is logged
 * and reported as `null`; this function does not throw.
 *
 * The loaded pdf.js document is always destroyed before returning, including
 * when outline processing fails part-way.
 *
 * @param {string} filePath - Absolute path to PDF file
 * @param {string} documentId - Document ID, used in log messages only
 * @returns {Promise<Array<Object>|null>} Array of TOC entries
 *   ({ id, title, sectionKey, pageStart, level, parentId, orderIndex,
 *   tocPageNumber }) or null if no usable outline exists
 */
async function extractPdfOutline(filePath, documentId) {
  let pdfDocument = null;

  try {
    console.log(`[TOC] Attempting to extract PDF outline from: ${filePath}`);

    // Read PDF file
    const dataBuffer = await fs.readFile(filePath);

    // Load PDF document
    const loadingTask = pdfjsLib.getDocument({
      data: new Uint8Array(dataBuffer),
      useSystemFonts: true,
      standardFontDataUrl: null // Disable font loading for performance
    });
    pdfDocument = await loadingTask.promise;

    const outline = await pdfDocument.getOutline();
    if (!outline || outline.length === 0) {
      console.log(`[TOC] No PDF outline found in document ${documentId}`);
      return null;
    }
    console.log(`[TOC] Found PDF outline with ${outline.length} top-level items`);

    const entries = [];
    let orderIndex = 0;

    /**
     * Resolve an outline item's destination to a 1-based page number.
     * Falls back to 1 when the item has no destination or it cannot be resolved.
     */
    const resolvePageStart = async (item) => {
      if (!item.dest) return 1;
      try {
        // Destination is either a named reference (string) or an explicit array
        const dest = typeof item.dest === 'string'
          ? await pdfDocument.getDestination(item.dest)
          : item.dest;

        // Explicit destinations look like: [pageRef, fitType, ...params]
        if (Array.isArray(dest)) {
          const target = dest[0];
          if (Number.isInteger(target)) {
            // pdf.js's own viewer treats an integer first element as a
            // zero-based page index rather than a page reference.
            if (target >= 0) return target + 1;
          } else if (target) {
            const pageIndex = await pdfDocument.getPageIndex(target);
            return pageIndex + 1; // Convert 0-based to 1-based
          }
        }
      } catch (e) {
        console.log(`[TOC] Could not resolve page for outline item "${item.title}": ${e.message}`);
      }
      return 1;
    };

    /**
     * Recursively convert an outline item and its children to TOC entries,
     * appending them to `entries` in depth-first order.
     */
    const processOutlineItem = async (item, level = 1, parentId = null) => {
      if (!item || !item.title) return;

      const pageStart = await resolvePageStart(item);
      const entry = {
        id: uuidv4(),
        title: item.title.trim(),
        sectionKey: null, // PDF outlines don't have section keys
        pageStart,
        level,
        parentId,
        orderIndex: orderIndex++,
        tocPageNumber: null // Not from a TOC page, from PDF outline
      };
      entries.push(entry);

      if (Array.isArray(item.items)) {
        // Sequential on purpose: orderIndex must follow outline order
        for (const child of item.items) {
          await processOutlineItem(child, level + 1, entry.id);
        }
      }
    };

    for (const item of outline) {
      await processOutlineItem(item);
    }

    if (entries.length === 0) {
      console.log(`[TOC] PDF outline exists but contains no valid entries for document ${documentId}`);
      return null;
    }

    console.log(`[TOC] Successfully extracted ${entries.length} entries from PDF outline for document ${documentId}`);
    return entries;
  } catch (error) {
    console.error(`[TOC] Error extracting PDF outline for document ${documentId}:`, error);
    return null;
  } finally {
    // Release the document on every path; a cleanup failure must not
    // discard entries that were already extracted.
    if (pdfDocument) {
      try {
        await pdfDocument.destroy();
      } catch (destroyError) {
        console.error(`[TOC] Failed to release PDF document for document ${documentId}:`, destroyError);
      }
    }
  }
}
/**
 * SQL statements issued by extractTocFromDocument. Multi-line statements are
 * assembled with join('\n') so the statement text stays independent of the
 * indentation of this source file.
 */
const TOC_EXTRACTION_SQL = {
  documentExists: ['', 'SELECT id FROM documents WHERE id = ?', ''].join('\n'),
  pageCount: [
    '',
    'SELECT COUNT(*) as count',
    'FROM document_pages',
    'WHERE document_id = ?',
    '',
  ].join('\n'),
  ocrPages: [
    '',
    'SELECT page_number, ocr_text',
    'FROM document_pages',
    'WHERE document_id = ? AND ocr_text IS NOT NULL',
    'ORDER BY page_number ASC',
    '',
  ].join('\n'),
  documentPath: 'SELECT file_path FROM documents WHERE id = ?',
  clearToc: 'DELETE FROM document_toc WHERE document_id = ?',
  insertToc: [
    '',
    'INSERT INTO document_toc (',
    'id, document_id, title, section_key, page_start,',
    'level, parent_id, order_index, toc_page_number, created_at',
    ') VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
    '',
  ].join('\n'),
};

/**
 * Build the result object returned for every unsuccessful extraction.
 * @param {string} error - human-readable failure reason
 * @param {number[]} [pages] - TOC page numbers detected before the failure
 */
function tocExtractionFailure(error, pages = []) {
  return { success: false, error, entriesCount: 0, pages };
}

/**
 * Run the prepared INSERT once per entry, in order.
 * Any error raised by `run` propagates to the caller.
 */
function writeTocRows(insertStmt, documentId, entries, timestamp) {
  for (const entry of entries) {
    insertStmt.run(
      entry.id,
      documentId,
      entry.title,
      entry.sectionKey,
      entry.pageStart,
      entry.level,
      entry.parentId,
      entry.orderIndex,
      entry.tocPageNumber,
      timestamp,
    );
  }
}

/**
 * Extract the table of contents for a document and store it in document_toc.
 *
 * Steps:
 *  1. Check the document exists, has pages, and has pages with OCR text.
 *  2. Collect the pages that isTocPage accepts.
 *  3. If none are found, fall back to the PDF's built-in outline
 *     (extractPdfOutline) using the document's stored file path.
 *  4. Otherwise parse entries from the TOC pages and link them with
 *     buildHierarchy.
 *  5. Delete the document's existing TOC rows, then insert the new ones.
 *
 * Failures after `getDb()` are reported through the returned object rather
 * than thrown. Note that the delete and the inserts are separate statements:
 * if an insert fails, rows deleted or inserted before it are not restored by
 * this function.
 *
 * @param {string} documentId
 * @returns {Promise<{ success: boolean, entriesCount: number, pages: number[], error?: string, source?: string }>}
 *   `error` is set when `success` is false; `source` is 'pdf-outline' or
 *   'ocr-extraction' when `success` is true. `pages` lists the detected TOC
 *   page numbers (empty for the outline fallback).
 */
export async function extractTocFromDocument(documentId) {
  const db = getDb();

  try {
    // --- Preconditions ---------------------------------------------------
    const documentRow = db.prepare(TOC_EXTRACTION_SQL.documentExists).get(documentId);
    if (!documentRow) {
      console.error(`[TOC] Document not found: ${documentId}`);
      return tocExtractionFailure('Document not found');
    }

    const pageTotals = db.prepare(TOC_EXTRACTION_SQL.pageCount).get(documentId);
    if (pageTotals.count === 0) {
      console.error(`[TOC] No pages available for TOC extraction in document: ${documentId}`);
      return tocExtractionFailure('No pages available for TOC extraction');
    }

    const ocrPages = db.prepare(TOC_EXTRACTION_SQL.ocrPages).all(documentId);
    if (ocrPages.length === 0) {
      console.error(`[TOC] No OCR text found for document: ${documentId}`);
      return tocExtractionFailure('No OCR text found');
    }

    // --- TOC page detection ----------------------------------------------
    const tocPages = ocrPages.filter((page) => isTocPage(page.ocr_text));

    // --- Fallback: PDF outline -------------------------------------------
    if (tocPages.length === 0) {
      console.log(`[TOC] No TOC pages detected in document ${documentId}, attempting PDF outline fallback`);

      const pathRow = db.prepare(TOC_EXTRACTION_SQL.documentPath).get(documentId);
      if (!(pathRow && pathRow.file_path)) {
        console.log(`[TOC] Cannot attempt PDF outline fallback: file path not found for document ${documentId}`);
        return tocExtractionFailure('TOC detection failed: No patterns matched');
      }

      const outlineEntries = await extractPdfOutline(pathRow.file_path, documentId);
      if (!outlineEntries || outlineEntries.length === 0) {
        console.log(`[TOC] PDF outline fallback failed for document ${documentId}`);
        return tocExtractionFailure('TOC detection failed: No patterns matched and no PDF outline found');
      }

      console.log(`[TOC] Using PDF outline as TOC for document ${documentId} (${outlineEntries.length} entries)`);

      // Replace whatever TOC was stored before. Errors here are handled by
      // the outer catch and reported as an unexpected error.
      db.prepare(TOC_EXTRACTION_SQL.clearToc).run(documentId);
      const outlineInsert = db.prepare(TOC_EXTRACTION_SQL.insertToc);
      writeTocRows(outlineInsert, documentId, outlineEntries, Date.now());

      return {
        success: true,
        entriesCount: outlineEntries.length,
        pages: [],
        source: 'pdf-outline',
      };
    }

    // --- OCR-based extraction --------------------------------------------
    console.log(`[TOC] Found ${tocPages.length} TOC pages in document ${documentId}`);

    const detectedPageNumbers = () => tocPages.map((page) => page.page_number);

    const allEntries = tocPages.flatMap((page) => extractTocEntries(page.ocr_text, page.page_number));
    if (allEntries.length === 0) {
      console.error(`[TOC] TOC parsing failed: No valid entries extracted from detected TOC pages in document ${documentId}`);
      return tocExtractionFailure(
        'TOC parsing failed: No valid entries extracted from detected TOC pages',
        detectedPageNumbers(),
      );
    }

    let hierarchicalEntries;
    try {
      hierarchicalEntries = buildHierarchy(allEntries);
    } catch (hierarchyError) {
      console.error(`[TOC] TOC parsing failed: Hierarchy building error in document ${documentId}:`, hierarchyError);
      return tocExtractionFailure(
        `TOC parsing failed: Hierarchy building error - ${hierarchyError.message}`,
        detectedPageNumbers(),
      );
    }

    // --- Persist ---------------------------------------------------------
    try {
      db.prepare(TOC_EXTRACTION_SQL.clearToc).run(documentId);
    } catch (deleteError) {
      console.error(`[TOC] TOC parsing failed: Database cleanup error in document ${documentId}:`, deleteError);
      return tocExtractionFailure(
        `TOC parsing failed: Database cleanup error - ${deleteError.message}`,
        detectedPageNumbers(),
      );
    }

    const insertStmt = db.prepare(TOC_EXTRACTION_SQL.insertToc);
    const timestamp = Date.now();
    try {
      writeTocRows(insertStmt, documentId, hierarchicalEntries, timestamp);
    } catch (insertError) {
      console.error(`[TOC] TOC parsing failed: Database insertion error in document ${documentId}:`, insertError);
      return tocExtractionFailure(
        `TOC parsing failed: Database insertion error - ${insertError.message}`,
        detectedPageNumbers(),
      );
    }

    console.log(`[TOC] Extracted ${hierarchicalEntries.length} TOC entries for document ${documentId}`);
    return {
      success: true,
      entriesCount: hierarchicalEntries.length,
      pages: detectedPageNumbers(),
      source: 'ocr-extraction',
    };
  } catch (error) {
    console.error(`[TOC] Unexpected extraction error for document ${documentId}:`, error);
    return tocExtractionFailure(`Unexpected error during TOC extraction: ${error.message}`);
  }
}
/**
 * SQL used by getDocumentToc. Assembled with join('\n') so the statement text
 * does not depend on the indentation of this source file.
 */
const DOCUMENT_TOC_SELECT_SQL = [
  '',
  'SELECT',
  'id, document_id, title, section_key, page_start,',
  'level, parent_id, order_index, toc_page_number',
  'FROM document_toc',
  'WHERE document_id = ?',
  'ORDER BY order_index ASC',
  '',
].join('\n');

/**
 * Load the stored TOC rows of a document as a flat list ordered by
 * `order_index`. Hierarchy is expressed through each row's `parent_id`;
 * use buildTocTree to nest the rows.
 *
 * @param {string} documentId
 * @returns {Array<Object>} rows with id, document_id, title, section_key,
 *   page_start, level, parent_id, order_index, toc_page_number
 */
export function getDocumentToc(documentId) {
  const selectToc = getDb().prepare(DOCUMENT_TOC_SELECT_SQL);
  return selectToc.all(documentId);
}
/**
 * Build a tree from flat TOC rows.
 *
 * Each row is copied into a node with an added `children` array; the input
 * rows are not modified. A node is attached to the node whose `id` equals its
 * `parent_id`. Rows with no `parent_id`, with a `parent_id` that is not
 * present in `entries`, or that name themselves as parent become roots.
 * Sibling order follows the order of `entries`.
 *
 * Ids are looked up through a Map (keyed by their string form) rather than a
 * plain object, so ids such as "constructor" or "__proto__" cannot resolve to
 * inherited object members.
 *
 * @param {Array<Object>} entries - rows with `id` and `parent_id`
 * @returns {Array<Object>} root nodes, each with a `children` array
 */
export function buildTocTree(entries) {
  const nodesById = new Map();

  // First pass: create one node per row
  for (const entry of entries) {
    nodesById.set(String(entry.id), { ...entry, children: [] });
  }

  // Second pass: attach each node to its parent, or collect it as a root
  const roots = [];
  for (const entry of entries) {
    const node = nodesById.get(String(entry.id));
    const hasOtherParent = Boolean(entry.parent_id) && String(entry.parent_id) !== String(entry.id);
    const parent = hasOtherParent ? nodesById.get(String(entry.parent_id)) : undefined;

    if (parent) {
      parent.children.push(node);
    } else {
      roots.push(node);
    }
  }

  return roots;
}
/**
 * Default export: the module's three public functions bundled in one object,
 * for callers that import the service as a whole instead of by name.
 */
const tocExtractorService = {
  extractTocFromDocument,
  getDocumentToc,
  buildTocTree,
};

export default tocExtractorService;