Implements complete TOC feature for document navigation with bilingual support.
## TOC Detection & Extraction
- Pattern-based TOC detection with 3 regex patterns
- Heuristic validation (30%+ match ratio, 5+ entries, sequential pages)
- Hierarchical section key parsing (e.g., "4.1.2" → level 3, parent "4.1")
- Database schema with parent-child relationships
- Automatic extraction during OCR post-processing
- Server-side LRU caching (200 entries, 30min TTL)
## UI Components
- TocSidebar: Collapsible sidebar (320px) with auto-open on TOC presence
- TocEntry: Recursive component for hierarchical rendering
- Flex layout: Sidebar + PDF viewer side-by-side
- Active page highlighting with real-time sync
- localStorage persistence for sidebar state
## Navigation Features
- Click TOC entry → PDF jumps to page
- Deep link support: URL hash format #p=12
- Page change events: navidocs:pagechange custom event
- URL hash updates on all navigation (next/prev/goTo/TOC)
- Hash change listener for external navigation
- Page clamping and validation
## Search Integration
- "Jump to section" button in search results
- Shows when result has section field
- Navigates to document with page number and hash
## Accessibility
- ARIA attributes: role, aria-label, aria-expanded, aria-current
- Keyboard navigation: Enter/Space on entries, Tab focus
- Screen reader support with aria-live regions
- Semantic HTML with proper list/listitem roles
## Internationalization (i18n)
- Vue I18n integration with vue-i18n package
- English and French translations
- 8 TOC-specific translation keys
- Language switcher component in document viewer
- Locale persistence in localStorage
## Error Handling
- Specific error messages for each failure case
- Validation before processing (doc exists, has pages, has OCR)
- Non-blocking TOC extraction (doesn't fail OCR jobs)
- Detailed error returns: {success, error, entriesCount, pages}
## API Endpoints
- GET /api/documents/:id/toc?format=flat|tree
- POST /api/documents/:id/toc/extract
- Cache invalidation on re-extraction
## Testing
- Smoke test script: 9 comprehensive tests
- E2E testing guide with 5 manual scenarios
- Tests cover: API, caching, validation, navigation, search
## Database
- Migration 002: document_toc table
- Fields: id, document_id, title, section_key, page_start, level, parent_id, order_index
- Foreign keys with CASCADE delete
## Files Changed
- New: TocSidebar.vue, TocEntry.vue, LanguageSwitcher.vue
- New: toc-extractor.js, toc.js routes, i18n setup
- Modified: DocumentView.vue (sidebar, deep links, events)
- Modified: SearchView.vue (Jump to section button)
- Modified: ocr-worker.js (TOC post-processing)
- New: toc-smoke-test.sh, TOC_E2E_TEST.md
Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
591 lines
17 KiB
JavaScript
/**
|
||
* TOC Extractor Service
|
||
* Detects and extracts Table of Contents from OCR'd document pages
|
||
*/
|
||
|
||
import { v4 as uuidv4 } from 'uuid';
|
||
import { getDb } from '../db/db.js';
|
||
import fs from 'fs/promises';
|
||
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||
|
||
/**
 * TOC entry patterns to match:
 * - "Chapter 4 – Plumbing System ........ 72"
 * - "4.1 Water System.....................45"
 * - "Section 3: Electrical . . . . . . . 89"
 * - "Introduction    12" (needs 3+ spaces before the page number)
 *
 * Order matters: extractTocEntries() treats index 1 as the section-key
 * pattern, so the generic patterns must not swallow keyed lines.
 */
const TOC_PATTERNS = [
  // Pattern 1: Title [dots/spaces] PageNum
  // Fixed: the quantifier was written "{3,150?}" which is not a valid bounded
  // quantifier in JS (the brace becomes a literal), so this pattern matched
  // nothing useful. The negative lookahead rejects lines that start with a
  // numeric section key ("4.1 ...") so they fall through to Pattern 2, which
  // captures the key — preserving the pattern-index semantics callers rely on.
  /^(?![\d.]+\s)(.{3,150}?)\s*[.\s–-]{3,}\s*(\d{1,4})\s*$/,

  // Pattern 2: SectionKey Title [dots/spaces] PageNum
  /^([\d.]+)\s+(.{3,100}?)\s*[.\s–-]{3,}\s*(\d{1,4})\s*$/,

  // Pattern 3: Title [whitespace] PageNum (simpler)
  /^(.{5,120}?)\s{3,}(\d{1,4})\s*$/,
];
/**
 * Detect if a page looks like a TOC page.
 *
 * Heuristics:
 * 1. At least 5 lines match a TOC pattern
 * 2. At least 30% of the page's non-trivial lines match
 * 3. The captured page numbers show a broadly increasing trend
 *
 * @param {string} pageText - OCR text from page
 * @returns {boolean}
 */
function isTocPage(pageText) {
  // Too little text can't be a meaningful TOC page.
  if (!pageText || pageText.length < 100) return false;

  const lines = pageText.split('\n').map(l => l.trim()).filter(l => l.length > 5);
  if (lines.length < 5) return false;

  // Count how many lines match TOC patterns and collect their page numbers.
  let matchCount = 0;
  const pageNumbers = [];

  for (const line of lines) {
    for (const pattern of TOC_PATTERNS) {
      // Single match() call instead of the previous test()+match() pair,
      // which ran every regex twice per line.
      const match = line.match(pattern);
      if (match) {
        matchCount++;
        // The page number is always the last capture group of each pattern.
        const pageNum = parseInt(match[match.length - 1], 10);
        if (!Number.isNaN(pageNum)) {
          pageNumbers.push(pageNum);
        }
        break;
      }
    }
  }

  const matchRatio = matchCount / lines.length;
  const hasSequentialPages = checkSequentiality(pageNumbers);

  return matchCount >= 5 && matchRatio >= 0.3 && hasSequentialPages;
}
/**
|
||
* Check if page numbers show some sequentiality
|
||
* @param {number[]} pageNumbers
|
||
* @returns {boolean}
|
||
*/
|
||
function checkSequentiality(pageNumbers) {
|
||
if (pageNumbers.length < 3) return false;
|
||
|
||
// Sort and check for general increasing trend
|
||
const sorted = [...pageNumbers].sort((a, b) => a - b);
|
||
let increases = 0;
|
||
|
||
for (let i = 1; i < sorted.length; i++) {
|
||
if (sorted[i] >= sorted[i - 1]) increases++;
|
||
}
|
||
|
||
// At least 70% should be increasing
|
||
return (increases / (sorted.length - 1)) >= 0.7;
|
||
}
|
||
|
||
/**
 * Parse section key and determine hierarchy level.
 *
 * Trailing dots are stripped before counting, so "4.1." — common in
 * "4.1. Title" style TOCs and producible by the `([\d.]+)` capture in
 * TOC_PATTERNS — parses as level 2 rather than level 3. Empty segments
 * ("4..1") are ignored when counting the level.
 *
 * @param {string|null} sectionKey - e.g., "4", "4.1", "4.1.2", "4.1."
 * @returns {{ key: string|null, level: number }}
 */
function parseSectionKey(sectionKey) {
  if (!sectionKey) return { key: null, level: 1 };

  // Normalize: trim whitespace, then drop trailing dot separators.
  const trimmed = sectionKey.trim().replace(/\.+$/, '');
  if (!trimmed) return { key: null, level: 1 };

  // Count only non-empty segments so stray double dots don't inflate depth.
  const parts = trimmed.split('.').filter(p => p.length > 0);

  return {
    key: trimmed,
    level: Math.max(parts.length, 1)
  };
}
/**
 * Extract TOC entries from a single page of OCR text.
 *
 * Each non-trivial line is matched against TOC_PATTERNS in order; the first
 * matching pattern wins. Pattern index 1 is the only one that captures a
 * numeric section key ("4.1 Title .... 45"); the others capture title + page.
 *
 * @param {string} pageText - OCR text for the page
 * @param {number} pageNumber - the page this TOC text came from
 * @returns {Array<Object>} entries: { title, sectionKey, pageStart, level, tocPageNumber, orderIndex }
 */
function extractTocEntries(pageText, pageNumber) {
  const lines = pageText.split('\n').map(l => l.trim()).filter(l => l.length > 5);
  const entries = [];
  let orderIndex = 0;

  for (const line of lines) {
    let match = null;
    let patternType = 0;

    // Try each pattern; remember which one matched (index semantics below).
    for (let i = 0; i < TOC_PATTERNS.length; i++) {
      match = line.match(TOC_PATTERNS[i]);
      if (match) {
        patternType = i;
        break;
      }
    }

    if (!match) continue;

    let title, sectionKey, targetPage;

    // Parse based on pattern type
    if (patternType === 1) {
      // Pattern with section key: "4.1 Title .... 45"
      sectionKey = match[1];
      title = match[2].trim();
      targetPage = parseInt(match[3], 10); // fixed: explicit radix
    } else {
      // Patterns without section key: "Title .... 45".
      // Drop unmatched optional groups; first group is the title, last the page.
      const groups = match.slice(1).filter(g => g !== undefined);
      title = groups[0].trim();
      targetPage = parseInt(groups[groups.length - 1], 10); // fixed: explicit radix
      sectionKey = null;
    }

    // Clean up title (remove trailing dot leaders / dashes).
    title = title.replace(/[.\-–\s]+$/, '').trim();

    // Skip noise: very short titles or invalid page numbers.
    if (title.length < 3 || Number.isNaN(targetPage) || targetPage < 1) continue;

    const { key, level } = parseSectionKey(sectionKey);

    entries.push({
      title,
      sectionKey: key,
      pageStart: targetPage,
      level,
      tocPageNumber: pageNumber,
      orderIndex: orderIndex++
    });
  }

  return entries;
}
/**
 * Build parent-child relationships for hierarchical TOC.
 *
 * Every entry receives a fresh id; an entry whose section key looks like
 * "4.1.2" is linked to the NEAREST PRECEDING entry whose key is "4.1".
 * Entries without a section key, or at level 1, remain roots.
 *
 * @param {Array<Object>} entries
 * @returns {Array<Object>} Entries with id and parentId set
 */
function buildHierarchy(entries) {
  const withIds = entries.map(entry => ({ ...entry, id: uuidv4(), parentId: null }));

  withIds.forEach((entry, index) => {
    // Roots: no section key, or top level.
    if (!entry.sectionKey || entry.level === 1) return;

    // Parent key is the section key minus its last segment: "4.1.2" -> "4.1".
    const segments = entry.sectionKey.split('.');
    const parentKey = segments.slice(0, -1).join('.');

    // Scan backwards so the closest preceding match wins.
    for (let back = index - 1; back >= 0; back--) {
      if (withIds[back].sectionKey === parentKey) {
        entry.parentId = withIds[back].id;
        break;
      }
    }
  });

  return withIds;
}
/**
 * Extract PDF outline/bookmarks as fallback TOC
 * Uses pdfjs-dist to read the PDF's built-in outline/bookmarks
 *
 * Used by extractTocFromDocument() when no TOC page is detected in the OCR
 * text. Entries come back in the same shape that extractTocEntries/
 * buildHierarchy produce, with parent/child links and ids already assigned,
 * so callers can persist them directly. All failures are reported as `null`
 * (logged), never thrown — this is a best-effort fallback.
 *
 * @param {string} filePath - Absolute path to PDF file
 * @param {string} documentId - Document ID for reference
 * @returns {Promise<Array<Object>|null>} Array of TOC entries or null if no outline exists
 */
async function extractPdfOutline(filePath, documentId) {
  try {
    console.log(`[TOC] Attempting to extract PDF outline from: ${filePath}`);

    // Read PDF file
    const dataBuffer = await fs.readFile(filePath);

    // Load PDF document
    const loadingTask = pdfjsLib.getDocument({
      data: new Uint8Array(dataBuffer),
      useSystemFonts: true,
      standardFontDataUrl: null // Disable font loading for performance
    });

    const pdfDocument = await loadingTask.promise;
    const outline = await pdfDocument.getOutline();

    if (!outline || outline.length === 0) {
      console.log(`[TOC] No PDF outline found in document ${documentId}`);
      // Release pdfjs resources before bailing out.
      await pdfDocument.destroy();
      return null;
    }

    console.log(`[TOC] Found PDF outline with ${outline.length} top-level items`);

    // Convert outline to TOC entries
    const entries = [];
    // Shared across all recursion levels: entries get a single global
    // document order, not a per-level one.
    let orderIndex = 0;

    /**
     * Recursively process outline items and convert to TOC entries
     * (depth-first, so children immediately follow their parent in `entries`).
     */
    async function processOutlineItem(item, level = 1, parentId = null) {
      if (!item || !item.title) return;

      // Resolve destination to page number
      let pageStart = 1; // fallback when the destination cannot be resolved
      if (item.dest) {
        try {
          // Get the destination (can be a string reference or direct array)
          const dest = typeof item.dest === 'string'
            ? await pdfDocument.getDestination(item.dest)
            : item.dest;

          // Extract page reference from destination array
          // Format is typically: [pageRef, fitType, ...params]
          if (dest && Array.isArray(dest) && dest[0]) {
            const pageIndex = await pdfDocument.getPageIndex(dest[0]);
            pageStart = pageIndex + 1; // Convert 0-based to 1-based
          }
        } catch (e) {
          // Best-effort: one unresolvable bookmark must not abort the
          // whole outline extraction.
          console.log(`[TOC] Could not resolve page for outline item "${item.title}": ${e.message}`);
          // Keep default pageStart = 1
        }
      }

      const entry = {
        id: uuidv4(),
        title: item.title.trim(),
        sectionKey: null, // PDF outlines don't have section keys
        pageStart: pageStart,
        level: level,
        parentId: parentId,
        orderIndex: orderIndex++,
        tocPageNumber: null // Not from a TOC page, from PDF outline
      };

      entries.push(entry);

      // Process children recursively
      if (item.items && Array.isArray(item.items) && item.items.length > 0) {
        for (const child of item.items) {
          await processOutlineItem(child, level + 1, entry.id);
        }
      }
    }

    // Process all top-level outline items
    for (const item of outline) {
      await processOutlineItem(item);
    }

    // Clean up
    await pdfDocument.destroy();

    if (entries.length === 0) {
      // Outline array was non-empty but every item lacked a title.
      console.log(`[TOC] PDF outline exists but contains no valid entries for document ${documentId}`);
      return null;
    }

    console.log(`[TOC] Successfully extracted ${entries.length} entries from PDF outline for document ${documentId}`);
    return entries;

  } catch (error) {
    // Any pdfjs/file-system error means "no usable outline" — the caller
    // treats null as fallback-failed and reports its own error.
    console.error(`[TOC] Error extracting PDF outline for document ${documentId}:`, error);
    return null;
  }
}
/**
 * Extract TOC from entire document
 *
 * Pipeline: validate the document → load OCR'd pages → detect TOC-looking
 * pages → parse entries → build hierarchy → replace stored rows. When no
 * TOC page is detected, falls back to the PDF's embedded outline via
 * extractPdfOutline(). Never throws: every failure path returns
 * { success: false, error, ... } so callers (e.g. the OCR worker) are not
 * broken by TOC extraction problems.
 *
 * @param {string} documentId
 * @returns {Promise<{ success: boolean, error?: string, entriesCount: number, pages: number[], source?: string }>}
 *   `pages` lists the detected TOC page numbers (empty for the outline path);
 *   `source` is 'ocr-extraction' or 'pdf-outline' on success.
 */
export async function extractTocFromDocument(documentId) {
  const db = getDb();

  try {
    // Validate document exists
    const document = db.prepare(`
      SELECT id FROM documents WHERE id = ?
    `).get(documentId);

    if (!document) {
      console.error(`[TOC] Document not found: ${documentId}`);
      return {
        success: false,
        error: 'Document not found',
        entriesCount: 0,
        pages: []
      };
    }

    // Get total page count for the document
    const pageCountResult = db.prepare(`
      SELECT COUNT(*) as count
      FROM document_pages
      WHERE document_id = ?
    `).get(documentId);

    if (pageCountResult.count === 0) {
      console.error(`[TOC] No pages available for TOC extraction in document: ${documentId}`);
      return {
        success: false,
        error: 'No pages available for TOC extraction',
        entriesCount: 0,
        pages: []
      };
    }

    // Get all pages with OCR text
    const pages = db.prepare(`
      SELECT page_number, ocr_text
      FROM document_pages
      WHERE document_id = ? AND ocr_text IS NOT NULL
      ORDER BY page_number ASC
    `).all(documentId);

    // Pages exist but none have been OCR'd yet — distinct error from the
    // "no pages at all" case above.
    if (pages.length === 0) {
      console.error(`[TOC] No OCR text found for document: ${documentId}`);
      return {
        success: false,
        error: 'No OCR text found',
        entriesCount: 0,
        pages: []
      };
    }

    // Find TOC pages
    const tocPages = [];
    for (const page of pages) {
      if (isTocPage(page.ocr_text)) {
        tocPages.push(page);
      }
    }

    // If no TOC pages found, try PDF outline as fallback
    if (tocPages.length === 0) {
      console.log(`[TOC] No TOC pages detected in document ${documentId}, attempting PDF outline fallback`);

      // Get document file path
      const doc = db.prepare('SELECT file_path FROM documents WHERE id = ?').get(documentId);

      if (!doc || !doc.file_path) {
        console.log(`[TOC] Cannot attempt PDF outline fallback: file path not found for document ${documentId}`);
        return {
          success: false,
          error: 'TOC detection failed: No patterns matched',
          entriesCount: 0,
          pages: []
        };
      }

      // Try extracting PDF outline
      const outlineEntries = await extractPdfOutline(doc.file_path, documentId);

      if (!outlineEntries || outlineEntries.length === 0) {
        console.log(`[TOC] PDF outline fallback failed for document ${documentId}`);
        return {
          success: false,
          error: 'TOC detection failed: No patterns matched and no PDF outline found',
          entriesCount: 0,
          pages: []
        };
      }

      // Save outline entries to database
      console.log(`[TOC] Using PDF outline as TOC for document ${documentId} (${outlineEntries.length} entries)`);

      // Delete existing TOC entries for this document
      db.prepare('DELETE FROM document_toc WHERE document_id = ?').run(documentId);

      // Insert outline entries
      const insertStmt = db.prepare(`
        INSERT INTO document_toc (
          id, document_id, title, section_key, page_start,
          level, parent_id, order_index, toc_page_number, created_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
      `);

      // One shared timestamp so all rows of this extraction share created_at.
      const timestamp = Date.now();
      for (const entry of outlineEntries) {
        insertStmt.run(
          entry.id,
          documentId,
          entry.title,
          entry.sectionKey,
          entry.pageStart,
          entry.level,
          entry.parentId,
          entry.orderIndex,
          entry.tocPageNumber,
          timestamp
        );
      }

      // pages is empty here: outline entries don't come from a TOC page.
      return {
        success: true,
        entriesCount: outlineEntries.length,
        pages: [],
        source: 'pdf-outline'
      };
    }

    console.log(`[TOC] Found ${tocPages.length} TOC pages in document ${documentId}`);

    // Extract entries from all TOC pages
    let allEntries = [];
    for (const page of tocPages) {
      const entries = extractTocEntries(page.ocr_text, page.page_number);
      allEntries = allEntries.concat(entries);
    }

    if (allEntries.length === 0) {
      console.error(`[TOC] TOC parsing failed: No valid entries extracted from detected TOC pages in document ${documentId}`);
      return {
        success: false,
        error: 'TOC parsing failed: No valid entries extracted from detected TOC pages',
        entriesCount: 0,
        pages: tocPages.map(p => p.page_number)
      };
    }

    // Build hierarchy
    let hierarchicalEntries;
    try {
      hierarchicalEntries = buildHierarchy(allEntries);
    } catch (hierarchyError) {
      console.error(`[TOC] TOC parsing failed: Hierarchy building error in document ${documentId}:`, hierarchyError);
      return {
        success: false,
        error: `TOC parsing failed: Hierarchy building error - ${hierarchyError.message}`,
        entriesCount: 0,
        pages: tocPages.map(p => p.page_number)
      };
    }

    // Delete existing TOC entries for this document
    // (separate try/catch so delete vs insert failures report distinctly)
    try {
      db.prepare('DELETE FROM document_toc WHERE document_id = ?').run(documentId);
    } catch (deleteError) {
      console.error(`[TOC] TOC parsing failed: Database cleanup error in document ${documentId}:`, deleteError);
      return {
        success: false,
        error: `TOC parsing failed: Database cleanup error - ${deleteError.message}`,
        entriesCount: 0,
        pages: tocPages.map(p => p.page_number)
      };
    }

    // Insert new TOC entries
    const insertStmt = db.prepare(`
      INSERT INTO document_toc (
        id, document_id, title, section_key, page_start,
        level, parent_id, order_index, toc_page_number, created_at
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `);

    const timestamp = Date.now();
    try {
      for (const entry of hierarchicalEntries) {
        insertStmt.run(
          entry.id,
          documentId,
          entry.title,
          entry.sectionKey,
          entry.pageStart,
          entry.level,
          entry.parentId,
          entry.orderIndex,
          entry.tocPageNumber,
          timestamp
        );
      }
    } catch (insertError) {
      console.error(`[TOC] TOC parsing failed: Database insertion error in document ${documentId}:`, insertError);
      return {
        success: false,
        error: `TOC parsing failed: Database insertion error - ${insertError.message}`,
        entriesCount: 0,
        pages: tocPages.map(p => p.page_number)
      };
    }

    console.log(`[TOC] Extracted ${hierarchicalEntries.length} TOC entries for document ${documentId}`);

    return {
      success: true,
      entriesCount: hierarchicalEntries.length,
      pages: tocPages.map(p => p.page_number),
      source: 'ocr-extraction'
    };

  } catch (error) {
    // Last-resort guard: keep TOC extraction non-throwing for OCR jobs.
    console.error(`[TOC] Unexpected extraction error for document ${documentId}:`, error);
    return {
      success: false,
      error: `Unexpected error during TOC extraction: ${error.message}`,
      entriesCount: 0,
      pages: []
    };
  }
}
/**
 * Get TOC for a document.
 *
 * Returns the flat list of stored TOC rows ordered by order_index;
 * hierarchy is expressed via the parent_id column (see buildTocTree).
 *
 * @param {string} documentId
 * @returns {Array<Object>} TOC entries with hierarchy
 */
export function getDocumentToc(documentId) {
  return getDb().prepare(`
    SELECT
      id, document_id, title, section_key, page_start,
      level, parent_id, order_index, toc_page_number
    FROM document_toc
    WHERE document_id = ?
    ORDER BY order_index ASC
  `).all(documentId);
}
/**
 * Build tree structure from flat TOC entries.
 *
 * Nodes keep every original field and gain a `children` array. An entry
 * whose parent_id is null/absent — or points at an id not present in the
 * input — becomes a root. Input order is preserved at every level.
 *
 * @param {Array<Object>} entries
 * @returns {Array<Object>} Tree with children arrays
 */
export function buildTocTree(entries) {
  // Pass 1: one node per entry, keyed by id.
  const nodesById = new Map();
  for (const entry of entries) {
    nodesById.set(entry.id, { ...entry, children: [] });
  }

  // Pass 2: attach each node to its parent, or promote it to a root.
  const roots = [];
  for (const entry of entries) {
    const node = nodesById.get(entry.id);
    const parent = entry.parent_id ? nodesById.get(entry.parent_id) : undefined;
    if (parent) {
      parent.children.push(node);
    } else {
      roots.push(node);
    }
  }

  return roots;
}
// Aggregate default export mirroring the named exports, for consumers that
// prefer importing the service as a single object.
export default {
  extractTocFromDocument,
  getDocumentToc,
  buildTocTree
};