navidocs/server/services/section-extractor.js
ggq-admin fb88b291de feat: Add interactive Table of Contents navigation with i18n support
Implements complete TOC feature for document navigation with bilingual support.

## TOC Detection & Extraction
- Pattern-based TOC detection with 3 regex patterns
- Heuristic validation (30%+ match ratio, 5+ entries, sequential pages)
- Hierarchical section key parsing (e.g., "4.1.2" → level 3, parent "4.1")
- Database schema with parent-child relationships
- Automatic extraction during OCR post-processing
- Server-side LRU caching (200 entries, 30min TTL)

## UI Components
- TocSidebar: Collapsible sidebar (320px) with auto-open on TOC presence
- TocEntry: Recursive component for hierarchical rendering
- Flex layout: Sidebar + PDF viewer side-by-side
- Active page highlighting with real-time sync
- localStorage persistence for sidebar state

## Navigation Features
- Click TOC entry → PDF jumps to page
- Deep link support: URL hash format #p=12
- Page change events: navidocs:pagechange custom event
- URL hash updates on all navigation (next/prev/goTo/TOC)
- Hash change listener for external navigation
- Page clamping and validation

## Search Integration
- "Jump to section" button in search results
- Shows when result has section field
- Navigates to document with page number and hash

## Accessibility
- ARIA attributes: role, aria-label, aria-expanded, aria-current
- Keyboard navigation: Enter/Space on entries, Tab focus
- Screen reader support with aria-live regions
- Semantic HTML with proper list/listitem roles

## Internationalization (i18n)
- Vue I18n integration with vue-i18n package
- English and French translations
- 8 TOC-specific translation keys
- Language switcher component in document viewer
- Locale persistence in localStorage

## Error Handling
- Specific error messages for each failure case
- Validation before processing (doc exists, has pages, has OCR)
- Non-blocking TOC extraction (doesn't fail OCR jobs)
- Detailed error returns: {success, error, entriesCount, pages}

## API Endpoints
- GET /api/documents/:id/toc?format=flat|tree
- POST /api/documents/:id/toc/extract
- Cache invalidation on re-extraction

## Testing
- Smoke test script: 9 comprehensive tests
- E2E testing guide with 5 manual scenarios
- Tests cover: API, caching, validation, navigation, search

## Database
- Migration 002: document_toc table
- Fields: id, document_id, title, section_key, page_start, level, parent_id, order_index
- Foreign keys with CASCADE delete

## Files Changed
- New: TocSidebar.vue, TocEntry.vue, LanguageSwitcher.vue
- New: toc-extractor.js, toc.js routes, i18n setup
- Modified: DocumentView.vue (sidebar, deep links, events)
- Modified: SearchView.vue (Jump to section button)
- Modified: ocr-worker.js (TOC post-processing)
- New: toc-smoke-test.sh, TOC_E2E_TEST.md

Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-20 13:22:45 +02:00

265 lines
7.4 KiB
JavaScript

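/**
* Section Extractor Service
*
* Extracts section/chapter metadata from PDFs using a three-tier approach,
* tried in this order:
* 1. PDF Outline/Bookmarks (most reliable; currently a stub that always falls through)
* 2. Table of Contents Parsing
* 3. Header Detection via Regex (last resort)
* If no tier yields any sections, the whole document is treated as one section.
*/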
import pdf from 'pdf-parse';
import fs from 'fs';
import { promisify } from 'util';
// Promise-returning wrapper around fs.readFile so file reads can be awaited.
const readFile = promisify(fs.readFile);
/**
 * Turn a section title into a stable, URL-friendly key.
 *
 * The title is lower-cased; every character other than word characters,
 * whitespace, "." and "-" is dropped; each run of whitespace becomes a single
 * hyphen; and hyphens at either end are stripped.
 *
 * @param {string} text - Section title to convert.
 * @returns {string} Slug form of the title.
 */
function slugify(text) {
  const lowered = text.toLowerCase();
  const allowedCharsOnly = lowered.replace(/[^\w\s.-]/g, '');
  const hyphenated = allowedCharsOnly.replace(/\s+/g, '-');
  return hyphenated.replace(/^-+|-+$/g, '');
}
/**
 * Convert a section number into a sortable integer: major * 100 + minor.
 *
 * Examples: "8" -> 800, "8.6" -> 806, "8-6" -> 806, "8/6" -> 806.
 * Only the first two components are used, so "8.6.2" also yields 806, and a
 * minor component of 100 or more overlaps the next major number.
 *
 * @param {string|number|null|undefined} sectionNum - Section number; "-", "/"
 *   and "." are all accepted as separators.
 * @returns {number} Sort order, or 0 when the input is empty or not numeric.
 */
function parseSectionOrder(sectionNum) {
  if (!sectionNum) return 0;

  // String() lets numeric input (8, 8.6) through instead of throwing on
  // .replace; the explicit radix stops prefixes such as "0x" being read as hex.
  const [major = 0, minor = 0] = String(sectionNum)
    .replace(/[-\/]/g, '.')
    .split('.')
    .map((part) => Number.parseInt(part, 10) || 0);

  return major * 100 + minor;
}
/**
 * Extract sections from the PDF outline/bookmarks.
 *
 * Not implemented yet: pdf-parse does not expose the document outline, so
 * reading it needs a different library (pdf-lib or pdfjs-dist). Until then
 * this always resolves to null so that extractSections falls through to its
 * other methods.
 *
 * It deliberately performs no file I/O. Reading the file and running
 * pdf-parse with `max: 0` (which means "no page limit", not "metadata only")
 * would extract the text of every page only to discard it.
 *
 * @param {string} pdfPath - Path to the PDF file (unused until outline
 *   support is added).
 * @returns {Promise<Array<object>|null>} Currently always null.
 */
async function extractFromOutline(pdfPath) {
  // TODO: read the outline with pdfjs-dist or pdf-lib and map each bookmark to
  // { section, sectionKey, sectionOrder, startPage }.
  return null;
}
/**
 * Detect section headers using regex patterns.
 *
 * At most one header is taken per page: the first pattern (in priority order)
 * whose first match on the page has a usable title. Recognised shapes:
 * - "8. Waste Systems" / "8.6 Blackwater Tank" / "8-6 Bilge System"
 * - "CHAPTER 8: WASTE SYSTEMS" / "SECTION 8.6: Blackwater"
 * - an all-caps line such as "ELECTRICAL SYSTEM"
 *
 * If pages with text appear before any header is found, a single
 * "Introduction" section is created starting at the first such page.
 *
 * @param {Array<{pageNumber: number, text: string}>} pages - Page texts.
 *   Anything that is not an array yields an empty result; entries without
 *   text, or with fewer than 10 characters of text, are skipped.
 * @returns {Array<{section: string, sectionKey: string, sectionOrder: number,
 *   startPage: number}>} Detected sections in page order (possibly empty).
 */
function detectSectionHeaders(pages) {
  if (!Array.isArray(pages)) return [];

  // Patterns to match section headers (marine manual focused), highest
  // priority first.
  const headerPatterns = [
    // "8.6 Blackwater Tank" or "8-6 Bilge System" or "8/6 Through-Hull"
    /^\s*(\d+(?:[.\-\/]\d+)*)\s+([A-Z][^\n]{3,60})/m,
    // "CHAPTER 8: WASTE SYSTEMS" or "SECTION 8.6: Blackwater"
    /^\s*(?:CHAPTER|SECTION|PART)\s+(\d+(?:[.\-\/]\d+)*)[:\s]+([A-Z][^\n]{3,60})/mi,
    // Marine-specific: "ELECTRICAL SYSTEM", "PLUMBING", "NAVIGATION EQUIPMENT".
    // Only spaces/tabs are allowed inside the title (not \s), so the heading
    // cannot run on into the lines that follow it.
    /^\s*([A-Z][A-Z \t\-]{4,59})$/m,
    // TOC style: "8.6 Blackwater" at page start
    /^(\d+(?:[.\-\/]\d+)*)\s+([A-Z][a-z][^\n]{3,50})/m,
  ];

  const sections = [];
  let previousOrder = 0;

  for (const page of pages) {
    const text = page?.text;
    if (typeof text !== 'string' || text.length < 10) continue;

    let header = null;
    for (const pattern of headerPatterns) {
      const match = text.match(pattern);
      if (!match) continue;

      // The all-caps pattern has a single group, so its title is match[1].
      const title = (match[2] || match[1]).trim();

      // Measure after trimming so padding cannot make a stub look like a
      // title; too-short captures are usually just a page number.
      if (title.length < 5) continue;

      header = {
        number: match[1],
        title: title.endsWith(':') ? title.slice(0, -1) : title,
      };
      break;
    }

    if (header) {
      // Numbered headers get their order from the number; unnumbered ones
      // simply follow the previous section.
      const order = /\d/.test(header.number)
        ? parseSectionOrder(header.number)
        : previousOrder + 1;

      sections.push({
        section: header.title,
        sectionKey: slugify(header.title),
        sectionOrder: order,
        startPage: page.pageNumber,
      });
      previousOrder = order;
    } else if (sections.length === 0) {
      // Content before the first detected header belongs to a default section.
      sections.push({
        section: 'Introduction',
        sectionKey: 'introduction',
        sectionOrder: 0,
        startPage: page.pageNumber,
      });
    }
  }

  return sections;
}
/**
 * Parse the Table of Contents to extract section structure.
 *
 * A page counts as a TOC page when it has at least three lines shaped like
 * "8.6 Blackwater Tank ........ 73". Entries are collected from the first TOC
 * page and from every TOC page that directly follows it, so a table of
 * contents spanning several pages is read in full. Collection stops at the
 * first non-TOC page after entries have been found.
 *
 * @param {Array<{text: string}>} pages - Page texts in document order.
 * @returns {Array<{section: string, sectionKey: string, sectionOrder: number,
 *   startPage: number}>|null} TOC entries (startPage is the page number
 *   printed in the TOC), or null when no TOC was found or `pages` is not an
 *   array.
 */
function parseTableOfContents(pages) {
  if (!Array.isArray(pages)) return null;

  // Pattern to match TOC entries: "8.6 Blackwater Tank ........ 73"
  const tocPattern = /^\s*(\d+(?:\.\d+)*)\s+([^.\d][^\n]{3,50}?)[\s.]+(\d+)\s*$/gm;
  // Fewer matching lines than this is more likely a stray numbered line.
  const minEntriesPerTocPage = 3;

  const sections = [];

  for (const page of pages) {
    const text = page?.text;
    const matches = typeof text === 'string' ? [...text.matchAll(tocPattern)] : [];

    if (matches.length < minEntriesPerTocPage) {
      // The TOC is a contiguous run of pages; once it has started, the first
      // page that does not look like a TOC ends it.
      if (sections.length > 0) break;
      continue;
    }

    console.log(`[SectionExtractor] Found TOC page with ${matches.length} entries`);

    for (const [, sectionNum, rawTitle, rawPage] of matches) {
      const sectionTitle = rawTitle.trim();
      const pageNum = Number.parseInt(rawPage, 10);

      if (pageNum > 0 && sectionTitle.length >= 5) {
        sections.push({
          section: sectionTitle,
          sectionKey: slugify(sectionTitle),
          sectionOrder: parseSectionOrder(sectionNum),
          startPage: pageNum,
        });
      }
    }
  }

  return sections.length > 0 ? sections : null;
}
/**
 * Main extraction entry point.
 *
 * Runs the extraction strategies one after another and returns the result of
 * the first one that produces at least one section:
 *   1. PDF outline/bookmarks
 *   2. Table of Contents parsing
 *   3. Header detection
 * When none of them finds anything, the document is reported as one section
 * starting on page 1.
 *
 * @param {string} pdfPath - Path to the PDF, used for outline extraction.
 * @param {Array<object>} pages - Page texts handed to the TOC and header
 *   strategies.
 * @returns {Promise<Array<object>>} Non-empty list of section descriptors.
 */
export async function extractSections(pdfPath, pages) {
  console.log('[SectionExtractor] Starting section extraction');

  const hasSections = (candidate) => Boolean(candidate && candidate.length > 0);

  // Method 1: PDF outline/bookmarks
  const fromOutline = await extractFromOutline(pdfPath);
  if (hasSections(fromOutline)) {
    console.log(`[SectionExtractor] Extracted ${fromOutline.length} sections from PDF outline`);
    return fromOutline;
  }

  // Method 2: Table of Contents parsing
  const fromToc = parseTableOfContents(pages);
  if (hasSections(fromToc)) {
    console.log(`[SectionExtractor] Extracted ${fromToc.length} sections from TOC`);
    return fromToc;
  }

  // Method 3: header detection
  const fromHeaders = detectSectionHeaders(pages);
  if (hasSections(fromHeaders)) {
    console.log(`[SectionExtractor] Detected ${fromHeaders.length} sections from headers`);
    return fromHeaders;
  }

  console.log('[SectionExtractor] No sections found, using single section');

  // Fallback: one section covering the entire document
  const wholeDocument = {
    section: 'Complete Manual',
    sectionKey: 'complete-manual',
    sectionOrder: 0,
    startPage: 1
  };
  return [wholeDocument];
}
/**
 * Map pages to their sections.
 *
 * Sections are ordered by start page; each one covers the pages from its
 * start page up to the page before the next section starts, and the last one
 * runs to `totalPages`. Pages before the first section's start page are left
 * unmapped.
 *
 * When `totalPages` is a finite number, no page beyond it is ever mapped.
 * Start pages typically come from OCR'd text, so a single misread number
 * (e.g. 5000 in a 40-page manual) would otherwise make the preceding section
 * span thousands of non-existent pages.
 *
 * @param {Iterable<{section: string, sectionKey: string, sectionOrder: number,
 *   startPage: number}>|null|undefined} sections - Extracted sections; the
 *   input is not modified. A missing value yields an empty map.
 * @param {number} totalPages - Number of pages in the document.
 * @returns {Map<number, {section: string, sectionKey: string,
 *   sectionOrder: number}>} Section info keyed by page number.
 */
export function mapPagesToSections(sections, totalPages) {
  const pageMap = new Map();
  const hasPageLimit = Number.isFinite(totalPages);

  // Copy before sorting so the caller's array keeps its order.
  const ordered = [...(sections ?? [])].sort((a, b) => a.startPage - b.startPage);

  ordered.forEach((current, index) => {
    const next = ordered[index + 1];
    const naturalEnd = next ? next.startPage - 1 : totalPages;
    const endPage = hasPageLimit ? Math.min(naturalEnd, totalPages) : naturalEnd;

    const info = {
      section: current.section,
      sectionKey: current.sectionKey,
      sectionOrder: current.sectionOrder,
    };

    for (let pageNum = current.startPage; pageNum <= endPage; pageNum++) {
      // Each page gets its own object so callers may mutate one entry safely.
      pageMap.set(pageNum, { ...info });
    }
  });

  return pageMap;
}