## Backend (server/)
- Express 5 API with security middleware (helmet, rate limiting)
- SQLite database with WAL mode (schema from docs/architecture/)
- Meilisearch integration with tenant tokens
- BullMQ + Redis background job queue
- OCR pipeline with Tesseract.js
- File safety validation (extension, MIME, size)
- 4 API route modules: upload, jobs, search, documents

## Frontend (client/)
- Vue 3 with Composition API (<script setup>)
- Vite 5 build system with HMR
- Tailwind CSS (Meilisearch-inspired design)
- UploadModal with drag-and-drop
- FigureZoom component (ported from lilian1)
- Meilisearch search integration with tenant tokens
- Job polling composable
- Clean SVG icons (no emojis)

## Code Extraction
- ✅ manuals.js → UploadModal.vue, useJobPolling.js
- ✅ figure-zoom.js → FigureZoom.vue
- ✅ service-worker.js → client/public/service-worker.js (TODO)
- ✅ glossary.json → Merged into Meilisearch synonyms
- ❌ Discarded: quiz.js, persona.js, gamification.js (Frank-AI junk)

## Documentation
- Complete extraction plan in docs/analysis/
- README with quick start guide
- Architecture summary in docs/architecture/

## Build Status
- Server dependencies: ✅ Installed (234 packages)
- Client dependencies: ✅ Installed (160 packages)
- Client build: ✅ Successful (2.63s)

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
291 lines
7.5 KiB
JavaScript
291 lines
7.5 KiB
JavaScript
/**
 * OCR Integration Example
 *
 * This example demonstrates the complete OCR pipeline workflow:
 * 1. Upload a PDF document
 * 2. Create OCR job in database
 * 3. Queue job for background processing
 * 4. Monitor job progress
 * 5. Search indexed content
 *
 * Usage: node examples/ocr-integration.js
 */
|
|
|
|
import { createHash } from 'crypto';
import { createReadStream, statSync } from 'fs';
import { pathToFileURL } from 'url';

import { v4 as uuidv4 } from 'uuid';

import { getDb } from '../config/db.js';
import { addOcrJob, getJobStatus } from '../services/queue.js';
import { searchPages } from '../services/search.js';
|
|
|
|
/**
 * Example 1: Complete document upload and OCR workflow.
 *
 * Hashes the sample PDF, inserts a `documents` row and a pending `ocr_jobs`
 * row, then hands the job to the background queue via `addOcrJob`.
 *
 * @returns {Promise<{documentId: string, jobId: string}>} IDs generated for
 *   the document row and the OCR job row.
 * @throws {Error} If the sample file cannot be read, or if a database or
 *   queue call fails.
 */
async function uploadAndProcessDocument() {
  console.log('=== Example 1: Upload and Process Document ===\n');

  const db = getDb();

  // Simulate uploaded file
  const filePath = './uploads/boat-manual.pdf';
  const fileStats = statSync(filePath);

  // Hash.update() only accepts strings and binary buffers, so the file is
  // streamed through the hash chunk by chunk instead of passing the stream
  // object itself (which throws a TypeError).
  const hash = createHash('sha256');
  for await (const chunk of createReadStream(filePath)) {
    hash.update(chunk);
  }
  const fileHash = hash.digest('hex');

  // Create document record
  const documentId = uuidv4();
  const now = Math.floor(Date.now() / 1000); // Unix timestamp in seconds

  db.prepare(`
    INSERT INTO documents (
      id, organization_id, entity_id, uploaded_by,
      title, document_type, file_path, file_name,
      file_size, file_hash, page_count,
      status, created_at, updated_at
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'processing', ?, ?)
  `).run(
    documentId,
    'org_demo_123', // Organization ID
    'boat_demo_456', // Boat/Entity ID
    'user_demo_789', // User ID
    'Prestige F4.9 Owner Manual',
    'owner-manual',
    filePath,
    'boat-manual.pdf',
    fileStats.size,
    fileHash,
    50, // Page count (would be detected from PDF)
    now,
    now
  );

  console.log(`✓ Document created: ${documentId}`);

  // Create OCR job in database
  const jobId = uuidv4();

  db.prepare(`
    INSERT INTO ocr_jobs (id, document_id, status, progress, created_at)
    VALUES (?, ?, 'pending', 0, ?)
  `).run(jobId, documentId, now);

  console.log(`✓ OCR job created: ${jobId}`);

  // Add job to BullMQ queue
  await addOcrJob(documentId, jobId, {
    filePath: filePath
  });

  console.log(`✓ Job queued for background processing`);

  return { documentId, jobId };
}
|
|
|
|
/**
 * Example 2: Monitor job progress.
 *
 * Logs the BullMQ state once, then polls the `ocr_jobs` row until its status
 * is `completed` or `failed`. The returned promise settles only after a
 * terminal status is seen, so awaiting it actually waits for the job.
 *
 * @param {string} jobId - ID of the `ocr_jobs` row to watch.
 * @param {object} [options]
 * @param {number} [options.intervalMs=2000] - Delay between polls, in ms.
 * @returns {Promise<object>} The last row read (`status`, `progress`, `error`).
 * @throws {Error} If no `ocr_jobs` row exists for `jobId`.
 */
async function monitorJobProgress(jobId, { intervalMs = 2000 } = {}) {
  console.log('\n=== Example 2: Monitor Job Progress ===\n');

  const db = getDb();

  // Also check BullMQ status
  const bullStatus = await getJobStatus(jobId);
  if (bullStatus) {
    console.log(`BullMQ State: ${bullStatus.state}`);
  }

  const selectJob = db.prepare(`
    SELECT status, progress, error FROM ocr_jobs WHERE id = ?
  `);

  // Poll until the job reaches a terminal state
  for (;;) {
    const job = selectJob.get(jobId);

    // A missing row would otherwise surface as an opaque TypeError below.
    if (!job) {
      throw new Error(`OCR job not found: ${jobId}`);
    }

    console.log(`Status: ${job.status} | Progress: ${job.progress}%`);

    if (job.status === 'completed') {
      console.log('✓ OCR processing completed!');
      return job;
    }

    if (job.status === 'failed') {
      console.error(`✗ Job failed: ${job.error}`);
      return job;
    }

    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
}
|
|
|
|
/**
 * Example 3: Search indexed content.
 *
 * Waits a fixed delay to give indexing time to finish, then runs each query
 * through `searchPages`, restricted to the given document, and logs up to
 * three hits per query with a text snippet of at most 100 characters.
 *
 * @param {string} documentId - Document whose pages are searched.
 * @param {object} [options]
 * @param {number} [options.indexingDelayMs=5000] - Delay before searching, in ms.
 * @param {string[]} [options.queries] - Search terms; defaults to the four
 *   sample boat-manual queries.
 * @returns {Promise<void>}
 */
async function searchDocumentContent(
  documentId,
  {
    indexingDelayMs = 5000,
    queries = [
      'bilge pump',
      'electrical system',
      'maintenance schedule',
      'safety equipment'
    ]
  } = {}
) {
  console.log('\n=== Example 3: Search Document Content ===\n');

  // Wait for indexing to complete
  await new Promise((resolve) => setTimeout(resolve, indexingDelayMs));

  // Search for specific content
  for (const query of queries) {
    console.log(`\nSearching for: "${query}"`);

    const results = await searchPages(query, {
      filter: `docId = "${documentId}"`,
      limit: 3
    });

    if (results.hits.length > 0) {
      console.log(`Found ${results.hits.length} matches:`);
      results.hits.forEach((hit, index) => {
        // Hits without a numeric confidence would otherwise print "NaN%".
        const confidence = typeof hit.ocrConfidence === 'number'
          ? `${(hit.ocrConfidence * 100).toFixed(0)}%`
          : 'n/a';

        // Only mark the snippet as truncated when text was actually cut off.
        const text = hit.text ?? '';
        const snippet = text.length > 100 ? `${text.substring(0, 100)}...` : text;

        console.log(` ${index + 1}. Page ${hit.pageNumber} (confidence: ${confidence})`);
        console.log(` "${snippet}"`);
      });
    } else {
      console.log(' No matches found');
    }
  }
}
|
|
|
|
/**
 * Example 4: Get document pages with OCR data.
 *
 * Reads the first 10 rows (by page number) from `document_pages` for the
 * document and logs each page's OCR confidence, text length and whether it
 * has a `search_indexed_at` value.
 *
 * @param {string} documentId - Document whose pages are listed.
 * @returns {Promise<void>}
 */
async function getDocumentPages(documentId) {
  console.log('\n=== Example 4: Get Document Pages ===\n');

  const db = getDb();

  const pages = db.prepare(`
    SELECT
      page_number,
      ocr_confidence,
      LENGTH(ocr_text) as text_length,
      ocr_completed_at,
      search_indexed_at
    FROM document_pages
    WHERE document_id = ?
    ORDER BY page_number
    LIMIT 10
  `).all(documentId);

  // The count reflects the rows returned, which the query caps at 10.
  console.log(`Document has ${pages.length} pages indexed:\n`);

  pages.forEach((page) => {
    // A NULL confidence would multiply out to a misleading "0%".
    const confidence = page.ocr_confidence == null
      ? 'n/a'
      : `${(page.ocr_confidence * 100).toFixed(0)}%`;

    // LENGTH(NULL) is NULL in SQL; report an empty page as 0 characters.
    const textLength = page.text_length ?? 0;

    console.log(`Page ${page.page_number}:`);
    console.log(` OCR Confidence: ${confidence}`);
    console.log(` Text Length: ${textLength} characters`);
    console.log(` Indexed: ${page.search_indexed_at ? '✓' : '✗'}`);
  });
}
|
|
|
|
/**
 * Example 5: Multi-vertical search.
 *
 * Runs three sample searches one after another (boating vertical, property
 * vertical, and a single organization) and logs the hit count of each.
 *
 * @returns {Promise<void>}
 */
async function multiVerticalSearch() {
  console.log('\n=== Example 5: Multi-Vertical Search ===\n');

  // Each entry: log label, search term, and the options given to searchPages.
  const scenarios = [
    {
      // Search across all boat documents
      label: 'Boat documents',
      term: 'engine maintenance',
      options: { filter: 'vertical = "boating"', limit: 5 },
    },
    {
      // Search property/condo documents
      label: 'Property documents',
      term: 'HVAC system',
      options: { filter: 'vertical = "property"', limit: 5 },
    },
    {
      // Search by organization
      label: 'Organization documents',
      term: 'safety',
      options: { filter: 'organizationId = "org_demo_123"', limit: 10 },
    },
  ];

  // Searches run sequentially, in the order listed above.
  for (const { label, term, options } of scenarios) {
    const response = await searchPages(term, options);
    console.log(`${label}: ${response.hits.length} results`);
  }
}
|
|
|
|
/**
 * Example 6: Advanced filtering and sorting.
 *
 * Shows a search that combines several filter clauses with a sort order,
 * followed by a filter-only search (empty query) on boat make and model.
 *
 * @returns {Promise<void>}
 */
async function advancedSearch() {
  console.log('\n=== Example 6: Advanced Search ===\n');

  // Search with multiple filters, combined into one AND expression
  const verticalClause = 'vertical = "boating"';
  const systemsClause = 'systems IN ["plumbing", "waste-management"]';
  const confidenceClause = 'ocrConfidence > 0.8';
  const plumbingOptions = {
    filter: `${verticalClause} AND ${systemsClause} AND ${confidenceClause}`,
    sort: ['pageNumber:asc'],
    limit: 10,
  };

  const plumbing = await searchPages('pump', plumbingOptions);
  console.log(`Found ${plumbing.hits.length} high-confidence plumbing pages`);

  // Search by boat make/model; the empty query leaves matching to the filter
  const prestigeOptions = {
    filter: 'boatMake = "Prestige" AND boatModel = "F4.9"',
    limit: 20,
  };

  const prestige = await searchPages('', prestigeOptions);
  console.log(`Found ${prestige.hits.length} Prestige F4.9 pages`);
}
|
|
|
|
/**
 * Run all examples in order.
 *
 * Executes examples 1-6 sequentially, passing the IDs created by the first
 * example to the later ones, then terminates the process: exit code 0 when
 * every example finished, 1 when any of them threw.
 *
 * @returns {Promise<void>}
 */
async function runExamples() {
  let exitCode = 0;

  try {
    console.log('NaviDocs OCR Integration Examples\n');
    console.log('===================================\n');

    // Example 1: Upload and process
    const created = await uploadAndProcessDocument();
    const { documentId, jobId } = created;

    // Example 2: Monitor progress
    await monitorJobProgress(jobId);

    // Example 3: Search content
    await searchDocumentContent(documentId);

    // Example 4: Get pages
    await getDocumentPages(documentId);

    // Example 5: Multi-vertical search
    await multiVerticalSearch();

    // Example 6: Advanced search
    await advancedSearch();

    console.log('\n✅ All examples completed!\n');
  } catch (error) {
    console.error('Error running examples:', error);
    exitCode = 1;
  }

  // Exit explicitly on both paths rather than waiting for the event loop to drain.
  process.exit(exitCode);
}
|
|
|
|
// Run if executed directly.
// import.meta.url is a percent-encoded file URL, so the script path is
// converted with pathToFileURL() before comparing. A hand-built
// `file://${path}` string never matches for paths containing spaces or other
// encoded characters, nor for Windows paths. The argv[1] check covers
// invocations that have no script path.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  runExamples();
}
|
|
|
|
// Export for use in other modules
|
|
export {
|
|
uploadAndProcessDocument,
|
|
monitorJobProgress,
|
|
searchDocumentContent,
|
|
getDocumentPages,
|
|
multiVerticalSearch,
|
|
advancedSearch
|
|
};
|