Fixed:
- Price: €800K-€1.5M, Sunseeker added
- Agent 1: Joe Trader persona + actual sale ads research
- Ignored meilisearch binary + data/ (too large for GitHub)
- SESSION_DEBUG_BLOCKERS.md created
Ready for Session 1 launch.
🤖 Generated with Claude Code
646 lines
18 KiB
JavaScript
646 lines
18 KiB
JavaScript
/**
|
||
* TOC Extractor Service
|
||
* Detects and extracts Table of Contents from OCR'd document pages
|
||
*/
|
||
|
||
import { v4 as uuidv4 } from 'uuid';
|
||
import { getDb } from '../db/db.js';
|
||
import fs from 'fs/promises';
|
||
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||
|
||
/**
 * TOC entry patterns to match:
 * - "Chapter 4 – Plumbing System ........ 72"
 * - "4.1 Water System.....................45"
 * - "Section 3: Electrical . . . . . . . 89"
 * - "Introduction 12"
 */
const TOC_PATTERNS = [
  // Pattern 1: Title [dots/spaces] PageNum
  // BUG FIX: was `.{3,150?}?` — the stray `?` inside the braces makes the
  // braces parse as LITERAL characters (Annex B), so the pattern could never
  // match a real TOC line. Corrected to a lazy bounded quantifier so the
  // leader dots are not swallowed into the title.
  /^(.{3,150}?)\s*[.\s–-]{3,}\s*(\d{1,4})\s*$/,

  // Pattern 2: SectionKey Title [dots/spaces] PageNum
  /^([\d.]+)\s+(.{3,100}?)\s*[.\s–-]{3,}\s*(\d{1,4})\s*$/,

  // Pattern 3: Title [whitespace] PageNum (simpler)
  /^(.{5,120}?)\s{3,}(\d{1,4})\s*$/,
];
|
||
|
||
/**
 * Detect if a page looks like a TOC page.
 *
 * Heuristics: at least 5 lines matching TOC patterns, at least 30% of the
 * non-trivial lines matching, and the referenced page numbers showing a
 * generally increasing trend (see checkSequentiality).
 *
 * @param {string} pageText - OCR text from page
 * @returns {boolean}
 */
function isTocPage(pageText) {
  if (!pageText || pageText.length < 100) return false;

  const lines = pageText.split('\n').map(l => l.trim()).filter(l => l.length > 5);
  if (lines.length < 5) return false;

  // Count how many lines match TOC patterns
  let matchCount = 0;
  const pageNumbers = [];

  for (const line of lines) {
    for (const pattern of TOC_PATTERNS) {
      // Single match() call instead of test() followed by match() —
      // avoids evaluating each regex twice per line.
      const match = line.match(pattern);
      if (match) {
        matchCount++;
        // Last capture group is always the page number
        const pageNum = Number.parseInt(match[match.length - 1], 10);
        if (!Number.isNaN(pageNum)) {
          pageNumbers.push(pageNum);
        }
        break; // First matching pattern wins for this line
      }
    }
  }

  // Heuristics for TOC detection:
  // 1. At least 5 matching lines
  // 2. At least 30% of lines match TOC patterns
  // 3. Page numbers are somewhat sequential or grouped
  const matchRatio = matchCount / lines.length;
  const hasSequentialPages = checkSequentiality(pageNumbers);

  return matchCount >= 5 && matchRatio >= 0.3 && hasSequentialPages;
}
|
||
|
||
/**
 * Check if page numbers show some sequentiality — i.e. a generally
 * increasing trend in the order they appeared on the page.
 *
 * @param {number[]} pageNumbers - Page numbers in the order extracted
 * @returns {boolean} true when at least 70% of consecutive pairs are non-decreasing
 */
function checkSequentiality(pageNumbers) {
  if (pageNumbers.length < 3) return false;

  // BUG FIX: the previous implementation sorted the numbers first, which made
  // every consecutive pair non-decreasing by construction — the check always
  // passed for any list of >= 3 numbers. Evaluate the trend on the ORIGINAL
  // order instead, which is what "sequential TOC page numbers" means.
  let increases = 0;
  for (let i = 1; i < pageNumbers.length; i++) {
    if (pageNumbers[i] >= pageNumbers[i - 1]) increases++;
  }

  // At least 70% of consecutive pairs should be non-decreasing
  return (increases / (pageNumbers.length - 1)) >= 0.7;
}
|
||
|
||
/**
 * Parse a section key and derive its hierarchy depth.
 * @param {string} sectionKey - e.g., "4", "4.1", "4.1.2"
 * @returns {{ key: string, level: number }} The trimmed key and its dot-depth
 */
function parseSectionKey(sectionKey) {
  if (!sectionKey) {
    return { key: null, level: 1 };
  }

  const key = sectionKey.trim();
  // "4" -> level 1, "4.1" -> level 2, "4.1.2" -> level 3, ...
  const level = key.split('.').length;

  return { key, level };
}
|
||
|
||
/**
 * Extract TOC entries from a page of OCR text.
 *
 * @param {string} pageText - OCR text of the (suspected) TOC page
 * @param {number} pageNumber - Page number the TOC text itself lives on
 * @returns {Array<Object>} Entries of shape
 *   { title, sectionKey, pageStart, level, tocPageNumber, orderIndex }
 */
function extractTocEntries(pageText, pageNumber) {
  const lines = pageText.split('\n').map(l => l.trim()).filter(l => l.length > 5);
  const entries = [];
  let orderIndex = 0;

  for (const line of lines) {
    let match = null;
    let patternType = 0;

    // Try each pattern; first hit wins
    for (let i = 0; i < TOC_PATTERNS.length; i++) {
      match = line.match(TOC_PATTERNS[i]);
      if (match) {
        patternType = i;
        break;
      }
    }

    if (!match) continue;

    let title, sectionKey, targetPage;

    // Parse based on pattern type
    if (patternType === 1) {
      // Pattern with section key: "4.1 Title .... 45"
      sectionKey = match[1];
      title = match[2].trim();
      targetPage = Number.parseInt(match[3], 10); // explicit radix
    } else {
      // Patterns without section key: "Title .... 45"
      const groups = match.slice(1).filter(g => g !== undefined);
      title = groups[0].trim();
      targetPage = Number.parseInt(groups[groups.length - 1], 10);
      sectionKey = null;
    }

    // Clean up title (remove trailing dots/dashes left by the leader line)
    title = title.replace(/[.\-–\s]+$/, '').trim();

    // Skip if title is too short or page number invalid
    if (title.length < 3 || Number.isNaN(targetPage) || targetPage < 1) continue;

    const { key, level } = parseSectionKey(sectionKey);

    entries.push({
      title,
      sectionKey: key,
      pageStart: targetPage,
      level,
      tocPageNumber: pageNumber,
      orderIndex: orderIndex++
    });
  }

  return entries;
}
|
||
|
||
/**
 * Match TOC entries to their source pages in OCR text.
 * Used for PDF outline entries to find which pages they appear on.
 *
 * @param {Array<Object>} entries - TOC entries to match (mutated in place:
 *   tocPageNumber is populated where a match is found)
 * @param {string} documentId - Document ID
 * @returns {Array<Object>} The same entries array
 */
function matchEntriesToSourcePages(entries, documentId) {
  const db = getDb();

  // Get all pages with OCR text
  const pages = db.prepare(`
    SELECT page_number, ocr_text
    FROM document_pages
    WHERE document_id = ? AND ocr_text IS NOT NULL
    ORDER BY page_number ASC
  `).all(documentId);

  if (pages.length === 0) {
    console.log('[TOC] No OCR text available for source page matching');
    return entries;
  }

  let matchCount = 0;

  // For each entry, search OCR text to find source page
  for (const entry of entries) {
    if (entry.tocPageNumber !== null) continue; // Skip if already set

    const titleText = entry.title?.trim();
    if (!titleText || titleText.length < 5) continue;

    // PERF FIX: build the search regex ONCE per entry — it depends only on
    // the title. The previous version recompiled it inside the page loop,
    // doing O(pages) redundant split/escape/RegExp constructions per entry.

    // Get significant words from title (numbers with dots like "7.2.9" count as one word)
    const titleWords = titleText.split(/\s+/).slice(0, 8); // Use more words for better matching

    // Escape special regex characters but keep spaces for word matching
    const escapedWords = titleWords.map(word =>
      word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
    );

    // Create pattern allowing for flexible spacing and line breaks
    const searchPattern = escapedWords.join('[\\s\\S]{0,5}'); // Allow up to 5 chars between words
    const regex = new RegExp(searchPattern, 'i');

    // Pages are ordered ascending, so early pages (where a TOC usually
    // lives) are naturally tried first.
    for (const page of pages) {
      if (regex.test(page.ocr_text)) {
        entry.tocPageNumber = page.page_number;
        matchCount++;
        break; // Stop after first match
      }
    }
  }

  if (matchCount > 0) {
    console.log(`[TOC] Matched ${matchCount} PDF outline entries to source pages in OCR text`);
  }

  return entries;
}
|
||
|
||
/**
 * Build parent-child relationships for a hierarchical TOC.
 * Links each entry whose section key looks like "4.1.2" to the nearest
 * preceding entry whose key is the prefix "4.1".
 *
 * @param {Array<Object>} entries - Flat TOC entries
 * @returns {Array<Object>} Copies of the entries with id and parentId set
 */
function buildHierarchy(entries) {
  const enhanced = entries.map(entry => ({ ...entry, id: uuidv4(), parentId: null }));

  enhanced.forEach((entry, index) => {
    // Top-level entries and entries without a section key stay roots
    if (!entry.sectionKey || entry.level === 1) return;

    // Derive the parent's key by dropping the last dotted segment:
    // "4.1.2" -> "4.1"
    const parentKey = entry.sectionKey
      .split('.')
      .slice(0, -1)
      .join('.');

    // Scan backwards for the closest earlier entry carrying that key
    for (let j = index - 1; j >= 0; j--) {
      if (enhanced[j].sectionKey === parentKey) {
        entry.parentId = enhanced[j].id;
        break;
      }
    }
  });

  return enhanced;
}
|
||
|
||
/**
 * Extract PDF outline/bookmarks as fallback TOC.
 * Uses pdfjs-dist to read the PDF's built-in outline/bookmarks.
 *
 * @param {string} pdfPath - Absolute path to PDF file
 * @returns {Promise<Array<Object>>} Array of TOC entries with {title, page, level};
 *   empty array when the PDF has no outline or loading fails.
 */
async function extractPdfOutline(pdfPath) {
  try {
    const loadingTask = pdfjsLib.getDocument({ url: pdfPath });
    const pdfDoc = await loadingTask.promise;
    const outline = await pdfDoc.getOutline();

    if (!outline || outline.length === 0) {
      await pdfDoc.destroy?.();
      return [];
    }

    const results = [];

    // Depth-first walk over the outline tree, flattening into `results`.
    // (Removed the previous version's unused `parentKey` parameter.)
    async function walk(items, level = 1) {
      for (const item of items) {
        const title = (item.title || '').trim();
        let pageNum = null;

        // Try to resolve destination to page number
        if (item.dest) {
          try {
            const destArray = await pdfDoc.getDestination(item.dest);
            if (Array.isArray(destArray) && destArray.length > 0) {
              const pageRef = destArray[0];
              const pageIndex = await pdfDoc.getPageIndex(pageRef);
              pageNum = pageIndex + 1; // Convert 0-based to 1-based
            }
          } catch (err) {
            // Destination resolution is best-effort; fall through to the
            // URL-fragment fallback below.
          }
        }

        // Fallback: try URL fragment like #page=5
        if (!pageNum && item.url) {
          const m = String(item.url).match(/#page=(\d+)/i);
          if (m) pageNum = Number.parseInt(m[1], 10);
        }

        results.push({
          title: title || 'Untitled',
          page: Number.isFinite(pageNum) && pageNum >= 1 ? pageNum : null,
          level,
          _raw: { dest: !!item.dest, url: !!item.url, action: !!item.action }
        });

        // Recurse into children
        if (item.items && item.items.length) {
          await walk(item.items, level + 1);
        }
      }
    }

    await walk(outline, 1);
    await pdfDoc.destroy?.();

    return results;
  } catch (err) {
    console.warn('extractPdfOutline failed:', err?.message);
    return [];
  }
}
|
||
|
||
/**
 * Extract TOC from an entire document.
 *
 * Two-pass strategy:
 *   1. PRIORITY: read the PDF's embedded outline/bookmarks (Adobe approach).
 *   2. FALLBACK: scan OCR'd pages for TOC-looking text and parse entries.
 * On success the extracted entries replace any previous rows in document_toc.
 *
 * @param {string} documentId - ID of the document to process
 * @returns {Promise<{ success: boolean, entriesCount: number, pages: number[] }>}
 *   `pages` lists the OCR pages the TOC was found on (empty for the outline
 *   path); successful results also carry `source` ('pdf-outline' | 'ocr-extraction').
 */
export async function extractTocFromDocument(documentId) {
  const db = getDb();

  try {
    // Validate document exists
    const document = db.prepare(`
      SELECT id FROM documents WHERE id = ?
    `).get(documentId);

    if (!document) {
      console.error(`[TOC] Document not found: ${documentId}`);
      return {
        success: false,
        error: 'Document not found',
        entriesCount: 0,
        pages: []
      };
    }

    // Get total page count for the document
    const pageCountResult = db.prepare(`
      SELECT COUNT(*) as count
      FROM document_pages
      WHERE document_id = ?
    `).get(documentId);

    if (pageCountResult.count === 0) {
      console.error(`[TOC] No pages available for TOC extraction in document: ${documentId}`);
      return {
        success: false,
        error: 'No pages available for TOC extraction',
        entriesCount: 0,
        pages: []
      };
    }

    // PRIORITY: Try PDF outline FIRST (Adobe approach)
    console.log(`[TOC] Attempting PDF outline extraction first for document ${documentId}`);
    const doc = db.prepare('SELECT file_path FROM documents WHERE id = ?').get(documentId);

    if (doc?.file_path) {
      const outlineResults = await extractPdfOutline(doc.file_path);

      if (outlineResults && outlineResults.length > 0) {
        console.log(`[TOC] PDF outline found with ${outlineResults.length} entries, using it as primary TOC source`);

        // Convert simplified outline format to database format.
        // parentStack tracks the most recent entry at each outline depth so
        // children can be linked to their nearest enclosing ancestor.
        const outlineEntries = [];
        const parentStack = [];

        for (let i = 0; i < outlineResults.length; i++) {
          const result = outlineResults[i];
          const entryId = uuidv4();

          // Parent = nearest stacked entry exactly one level up
          let parentId = null;
          if (result.level > 1) {
            for (let j = parentStack.length - 1; j >= 0; j--) {
              if (parentStack[j].level === result.level - 1) {
                parentId = parentStack[j].id;
                break;
              }
            }
          }

          const entry = {
            id: entryId,
            title: result.title,
            sectionKey: null,
            pageStart: result.page || 1, // fall back to page 1 when unresolved
            level: result.level,
            parentId: parentId,
            orderIndex: i,
            tocPageNumber: null
          };

          outlineEntries.push(entry);

          // Pop stack entries at the same or deeper level before pushing
          // this entry, so the stack always holds one entry per open depth.
          while (parentStack.length > 0 && parentStack[parentStack.length - 1].level >= result.level) {
            parentStack.pop();
          }
          parentStack.push({ id: entryId, level: result.level });
        }

        // Match PDF outline entries to their source pages in OCR text
        matchEntriesToSourcePages(outlineEntries, documentId);

        // Save to database (replace any previously extracted TOC)
        db.prepare('DELETE FROM document_toc WHERE document_id = ?').run(documentId);

        const insertStmt = db.prepare(`
          INSERT INTO document_toc (
            id, document_id, title, section_key, page_start,
            level, parent_id, order_index, toc_page_number, created_at
          ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        `);

        const timestamp = Date.now();
        for (const entry of outlineEntries) {
          insertStmt.run(
            entry.id,
            documentId,
            entry.title,
            entry.sectionKey,
            entry.pageStart,
            entry.level,
            entry.parentId,
            entry.orderIndex,
            entry.tocPageNumber,
            timestamp
          );
        }

        return {
          success: true,
          entriesCount: outlineEntries.length,
          pages: [],
          source: 'pdf-outline',
          message: `Extracted ${outlineEntries.length} entries from PDF outline`
        };
      }
    }

    // FALLBACK: Try OCR-based TOC detection if PDF outline failed
    console.log(`[TOC] No PDF outline found, falling back to OCR-based TOC detection for document ${documentId}`);

    // Get all pages with OCR text
    const pages = db.prepare(`
      SELECT page_number, ocr_text
      FROM document_pages
      WHERE document_id = ? AND ocr_text IS NOT NULL
      ORDER BY page_number ASC
    `).all(documentId);

    if (pages.length === 0) {
      console.error(`[TOC] No OCR text found for document: ${documentId}`);
      return {
        success: false,
        error: 'No OCR text found',
        entriesCount: 0,
        pages: []
      };
    }

    // Find TOC pages via the heuristic detector
    const tocPages = [];
    for (const page of pages) {
      if (isTocPage(page.ocr_text)) {
        tocPages.push(page);
      }
    }

    // If no TOC pages found either, give up
    if (tocPages.length === 0) {
      console.log(`[TOC] No TOC pages detected via OCR either for document ${documentId}`);
      return {
        success: false,
        error: 'TOC detection failed: No PDF outline or OCR-detectable TOC found',
        entriesCount: 0,
        pages: []
      };
    }

    console.log(`[TOC] Found ${tocPages.length} TOC pages in document ${documentId}`);

    // Extract entries from all TOC pages
    let allEntries = [];
    for (const page of tocPages) {
      const entries = extractTocEntries(page.ocr_text, page.page_number);
      allEntries = allEntries.concat(entries);
    }

    if (allEntries.length === 0) {
      console.error(`[TOC] TOC parsing failed: No valid entries extracted from detected TOC pages in document ${documentId}`);
      return {
        success: false,
        error: 'TOC parsing failed: No valid entries extracted from detected TOC pages',
        entriesCount: 0,
        pages: tocPages.map(p => p.page_number)
      };
    }

    // Build hierarchy (assigns ids and parentIds from section keys)
    let hierarchicalEntries;
    try {
      hierarchicalEntries = buildHierarchy(allEntries);
    } catch (hierarchyError) {
      console.error(`[TOC] TOC parsing failed: Hierarchy building error in document ${documentId}:`, hierarchyError);
      return {
        success: false,
        error: `TOC parsing failed: Hierarchy building error - ${hierarchyError.message}`,
        entriesCount: 0,
        pages: tocPages.map(p => p.page_number)
      };
    }

    // Delete existing TOC entries for this document
    try {
      db.prepare('DELETE FROM document_toc WHERE document_id = ?').run(documentId);
    } catch (deleteError) {
      console.error(`[TOC] TOC parsing failed: Database cleanup error in document ${documentId}:`, deleteError);
      return {
        success: false,
        error: `TOC parsing failed: Database cleanup error - ${deleteError.message}`,
        entriesCount: 0,
        pages: tocPages.map(p => p.page_number)
      };
    }

    // Insert new TOC entries
    const insertStmt = db.prepare(`
      INSERT INTO document_toc (
        id, document_id, title, section_key, page_start,
        level, parent_id, order_index, toc_page_number, created_at
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `);

    const timestamp = Date.now();
    try {
      for (const entry of hierarchicalEntries) {
        insertStmt.run(
          entry.id,
          documentId,
          entry.title,
          entry.sectionKey,
          entry.pageStart,
          entry.level,
          entry.parentId,
          entry.orderIndex,
          entry.tocPageNumber,
          timestamp
        );
      }
    } catch (insertError) {
      console.error(`[TOC] TOC parsing failed: Database insertion error in document ${documentId}:`, insertError);
      return {
        success: false,
        error: `TOC parsing failed: Database insertion error - ${insertError.message}`,
        entriesCount: 0,
        pages: tocPages.map(p => p.page_number)
      };
    }

    console.log(`[TOC] Extracted ${hierarchicalEntries.length} TOC entries for document ${documentId}`);

    return {
      success: true,
      entriesCount: hierarchicalEntries.length,
      pages: tocPages.map(p => p.page_number),
      source: 'ocr-extraction'
    };

  } catch (error) {
    // Catch-all so the caller always gets a structured result, never a throw
    console.error(`[TOC] Unexpected extraction error for document ${documentId}:`, error);
    return {
      success: false,
      error: `Unexpected error during TOC extraction: ${error.message}`,
      entriesCount: 0,
      pages: []
    };
  }
}
|
||
|
||
/**
 * Get the stored TOC for a document as a flat, ordered list.
 * @param {string} documentId
 * @returns {Array<Object>} TOC entries ordered by order_index
 */
export function getDocumentToc(documentId) {
  // Flat list in insertion order; callers can turn it into a tree with
  // buildTocTree().
  return getDb()
    .prepare(`
      SELECT
        id, document_id, title, section_key, page_start,
        level, parent_id, order_index, toc_page_number
      FROM document_toc
      WHERE document_id = ?
      ORDER BY order_index ASC
    `)
    .all(documentId);
}
|
||
|
||
/**
 * Build a tree structure from flat TOC entries.
 * Entries whose parent_id is missing or unknown become roots.
 * @param {Array<Object>} entries - Flat entries carrying id / parent_id
 * @returns {Array<Object>} Root nodes, each with a children array
 */
export function buildTocTree(entries) {
  // Pass 1: wrap every entry in a node with an empty children list
  const nodesById = {};
  for (const entry of entries) {
    nodesById[entry.id] = { ...entry, children: [] };
  }

  // Pass 2: attach each node under its parent, or collect it as a root
  const roots = [];
  for (const entry of entries) {
    const node = nodesById[entry.id];
    const parent = entry.parent_id ? nodesById[entry.parent_id] : undefined;
    if (parent) {
      parent.children.push(node);
    } else {
      roots.push(node);
    }
  }

  return roots;
}
|
||
|
||
// Default export bundles the public API for consumers that prefer a single
// namespace object over the named exports above.
export default {
  extractTocFromDocument,
  getDocumentToc,
  buildTocTree
};
|