// navidocs/server/examples/ocr-integration.js

/**
 * OCR Integration Example
 *
 * This example demonstrates the complete OCR pipeline workflow:
 * 1. Upload a PDF document
 * 2. Create OCR job in database
 * 3. Queue job for background processing
 * 4. Monitor job progress
 * 5. Search indexed content
 *
 * Usage: node examples/ocr-integration.js
 */

import { v4 as uuidv4 } from 'uuid';
import { getDb } from '../config/db.js';
import { addOcrJob, getJobStatus } from '../services/queue.js';
import { searchPages } from '../services/search.js';
import { createReadStream, statSync } from 'fs';
import { createHash } from 'crypto';
import { pathToFileURL } from 'url';

/**
 * Stream a file through SHA-256 and return the hex digest.
 *
 * `Hash#update()` accepts only strings and binary buffers, so the file is fed
 * to the hash chunk by chunk rather than handing it the stream object itself.
 *
 * @param {string} filePath - Path of the file to hash.
 * @returns {Promise<string>} Hex-encoded SHA-256 digest of the file contents.
 */
async function computeFileHash(filePath) {
  const hash = createHash('sha256');
  for await (const chunk of createReadStream(filePath)) {
    hash.update(chunk);
  }
  return hash.digest('hex');
}

/**
 * Example 1: Complete document upload and OCR workflow
 *
 * Reads the size and SHA-256 hash of a fixed sample file
 * (`./uploads/boat-manual.pdf`), inserts a `documents` row with status
 * 'processing', inserts a matching `ocr_jobs` row with status 'pending' and
 * progress 0, then hands the job to the queue via `addOcrJob`.
 *
 * @returns {Promise<{documentId: string, jobId: string}>} The freshly
 *   generated identifiers of the document and its OCR job.
 * @throws If the sample file cannot be read, or if a database or queue call
 *   fails; errors are not caught here.
 */
async function uploadAndProcessDocument() {
  console.log('=== Example 1: Upload and Process Document ===\n');

  const db = getDb();

  // Simulate uploaded file
  const filePath = './uploads/boat-manual.pdf';
  const fileStats = statSync(filePath);
  const fileHash = await computeFileHash(filePath);

  // Create document record
  const documentId = uuidv4();
  // Timestamps are stored as whole seconds since the Unix epoch.
  const now = Math.floor(Date.now() / 1000);

  db.prepare(`
    INSERT INTO documents (
      id, organization_id, entity_id, uploaded_by,
      title, document_type, file_path, file_name,
      file_size, file_hash, page_count,
      status, created_at, updated_at
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'processing', ?, ?)
  `).run(
    documentId,
    'org_demo_123', // Organization ID
    'boat_demo_456', // Boat/Entity ID
    'user_demo_789', // User ID
    'Prestige F4.9 Owner Manual',
    'owner-manual',
    filePath,
    'boat-manual.pdf',
    fileStats.size,
    fileHash,
    50, // Page count (would be detected from PDF)
    now,
    now
  );

  console.log(`✓ Document created: ${documentId}`);

  // Create OCR job in database
  const jobId = uuidv4();

  db.prepare(`
    INSERT INTO ocr_jobs (id, document_id, status, progress, created_at)
    VALUES (?, ?, 'pending', 0, ?)
  `).run(jobId, documentId, now);

  console.log(`✓ OCR job created: ${jobId}`);

  // Add job to BullMQ queue
  await addOcrJob(documentId, jobId, { filePath });

  console.log(`✓ Job queued for background processing`);

  return { documentId, jobId };
}

/**
 * Example 2: Monitor job progress
 *
 * Starts a 2-second polling timer that reads the job's `status`, `progress`
 * and `error` columns from `ocr_jobs` and logs them. The timer stops itself
 * when the job reaches 'completed' or 'failed', when the row cannot be found,
 * or when the database read throws.
 *
 * Note that the returned promise resolves once the one-off BullMQ status
 * lookup has finished, NOT when the job finishes; polling continues in the
 * background after this function returns.
 *
 * @param {string} jobId - Identifier of the `ocr_jobs` row to watch; the same
 *   value is passed to `getJobStatus`.
 * @returns {Promise<void>}
 */
async function monitorJobProgress(jobId) {
  console.log('\n=== Example 2: Monitor Job Progress ===\n');

  const db = getDb();

  // Prepare once; the same statement is reused on every tick.
  const selectJob = db.prepare(`
      SELECT status, progress, error FROM ocr_jobs WHERE id = ?
    `);

  // Poll for progress every 2 seconds
  const checkProgress = setInterval(() => {
    let job;
    try {
      job = selectJob.get(jobId);
    } catch (error) {
      // A throwing timer callback would otherwise fire again every tick.
      console.error(`✗ Failed to read job ${jobId}:`, error);
      clearInterval(checkProgress);
      return;
    }

    if (!job) {
      // No such row: stop polling instead of dereferencing undefined.
      console.error(`✗ Job not found: ${jobId}`);
      clearInterval(checkProgress);
      return;
    }

    console.log(`Status: ${job.status} | Progress: ${job.progress}%`);

    if (job.status === 'completed') {
      console.log('✓ OCR processing completed!');
      clearInterval(checkProgress);
    } else if (job.status === 'failed') {
      console.error(`✗ Job failed: ${job.error}`);
      clearInterval(checkProgress);
    }
  }, 2000);

  // Also check BullMQ status
  const bullStatus = await getJobStatus(jobId);
  if (bullStatus) {
    console.log(`BullMQ State: ${bullStatus.state}`);
  }
}

/**
 * Example 3: Search indexed content
 *
 * Sleeps for a fixed five seconds (it does not check whether indexing has
 * actually finished), then runs four sample queries through `searchPages`,
 * each restricted to the given document and capped at three hits, and prints
 * the page number, OCR confidence and a text preview for every hit.
 *
 * @param {string} documentId - Document whose pages should be searched; it is
 *   interpolated into the `docId` filter expression.
 * @returns {Promise<void>}
 */
async function searchDocumentContent(documentId) {
  console.log('\n=== Example 3: Search Document Content ===\n');

  // Wait for indexing to complete
  await new Promise((done) => {
    setTimeout(done, 5000);
  });

  // Sample phrases to look up, one search call per phrase, in order.
  const phrases = [
    'bilge pump',
    'electrical system',
    'maintenance schedule',
    'safety equipment'
  ];

  for (const phrase of phrases) {
    console.log(`\nSearching for: "${phrase}"`);

    const response = await searchPages(phrase, {
      filter: `docId = "${documentId}"`,
      limit: 3
    });
    const matches = response.hits;

    if (!(matches.length > 0)) {
      console.log('  No matches found');
      continue;
    }

    console.log(`Found ${matches.length} matches:`);
    for (const [position, match] of matches.entries()) {
      const percent = (match.ocrConfidence * 100).toFixed(0);
      console.log(`  ${position + 1}. Page ${match.pageNumber} (confidence: ${percent}%)`);
      // Preview is the first 100 characters; the ellipsis is always appended.
      console.log(`     "${match.text.substring(0, 100)}..."`);
    }
  }
}

/**
 * Example 4: Get document pages with OCR data
 *
 * Reads up to ten rows from `document_pages` for the given document, ordered
 * by page number, and prints each page's OCR confidence (as a percentage),
 * OCR text length, and whether `search_indexed_at` is set.
 *
 * @param {string} documentId - Value matched against `document_id`.
 * @returns {Promise<void>}
 */
async function getDocumentPages(documentId) {
  console.log('\n=== Example 4: Get Document Pages ===\n');

  const pageQuery = getDb().prepare(`
    SELECT
      page_number,
      ocr_confidence,
      LENGTH(ocr_text) as text_length,
      ocr_completed_at,
      search_indexed_at
    FROM document_pages
    WHERE document_id = ?
    ORDER BY page_number
    LIMIT 10
  `);
  const rows = pageQuery.all(documentId);

  // The count reflects the rows returned, which the query caps at ten.
  console.log(`Document has ${rows.length} pages indexed:\n`);

  for (const row of rows) {
    const confidencePercent = (row.ocr_confidence * 100).toFixed(0);
    const indexedMark = row.search_indexed_at ? '✓' : '✗';

    console.log(`Page ${row.page_number}:`);
    console.log(`  OCR Confidence: ${confidencePercent}%`);
    console.log(`  Text Length: ${row.text_length} characters`);
    console.log(`  Indexed: ${indexedMark}`);
  }
}

/**
 * Example 5: Multi-vertical search
 *
 * Runs three independent `searchPages` calls one after another — one
 * filtered to the "boating" vertical, one to the "property" vertical, and one
 * to a demo organization — and prints the number of hits for each.
 *
 * @returns {Promise<void>}
 */
async function multiVerticalSearch() {
  console.log('\n=== Example 5: Multi-Vertical Search ===\n');

  // Each entry: the label printed, the query text, and the search options.
  const searches = [
    {
      // Search across all boat documents
      label: 'Boat documents',
      query: 'engine maintenance',
      options: { filter: 'vertical = "boating"', limit: 5 }
    },
    {
      // Search property/condo documents
      label: 'Property documents',
      query: 'HVAC system',
      options: { filter: 'vertical = "property"', limit: 5 }
    },
    {
      // Search by organization
      label: 'Organization documents',
      query: 'safety',
      options: { filter: 'organizationId = "org_demo_123"', limit: 10 }
    }
  ];

  // Sequential on purpose: each count is printed before the next search starts.
  for (const { label, query, options } of searches) {
    const outcome = await searchPages(query, options);
    console.log(`${label}: ${outcome.hits.length} results`);
  }
}

/**
 * Example 6: Advanced filtering and sorting
 *
 * Issues two `searchPages` calls and prints the hit count of each:
 * first the query "pump" with three filter clauses AND-ed together and
 * results sorted by ascending page number; then an empty query filtered by
 * boat make and model.
 *
 * @returns {Promise<void>}
 */
async function advancedSearch() {
  console.log('\n=== Example 6: Advanced Search ===\n');

  // Search with multiple filters
  const pumpFilter =
    'vertical = "boating"' +
    ' AND systems IN ["plumbing", "waste-management"]' +
    ' AND ocrConfidence > 0.8';

  const { hits: pumpHits } = await searchPages('pump', {
    filter: pumpFilter,
    sort: ['pageNumber:asc'],
    limit: 10
  });

  console.log(`Found ${pumpHits.length} high-confidence plumbing pages`);

  // Search by boat make/model
  const { hits: prestigeHits } = await searchPages('', {
    filter: 'boatMake = "Prestige" AND boatModel = "F4.9"',
    limit: 20
  });

  console.log(`Found ${prestigeHits.length} Prestige F4.9 pages`);
}

/**
 * Run all examples
 *
 * Executes examples 1 through 6 strictly in order, passing the identifiers
 * returned by example 1 to the examples that need them, and then terminates
 * the process: exit code 0 when every example finished, exit code 1 (after
 * logging the error) as soon as any of them throws.
 *
 * @returns {Promise<void>}
 */
async function runExamples() {
  let exitCode = 0;

  try {
    console.log('NaviDocs OCR Integration Examples\n');
    console.log('===================================\n');

    // Example 1: Upload and process
    const created = await uploadAndProcessDocument();
    const { documentId, jobId } = created;

    // Example 2: Monitor progress
    await monitorJobProgress(jobId);

    // Examples 3 and 4 both operate on the document created above.
    await searchDocumentContent(documentId);
    await getDocumentPages(documentId);

    // Examples 5 and 6 take no arguments.
    await multiVerticalSearch();
    await advancedSearch();

    console.log('\n✅ All examples completed!\n');
  } catch (error) {
    console.error('Error running examples:', error);
    exitCode = 1;
  }

  process.exit(exitCode);
}

// Run if executed directly.
// `import.meta.url` is a percent-encoded file URL, so the script path from
// argv is converted with pathToFileURL() rather than prefixed with "file://";
// a plain string prefix does not match on Windows or for paths containing
// characters such as spaces. The truthiness check covers launches where no
// script path is present in argv.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  runExamples();
}

// Export for use in other modules.
// Each example function is exposed as a named export so it can be imported
// and run on its own. runExamples() is not part of this list; within this
// file it is only called by the direct-execution check.
export {
  uploadAndProcessDocument,
  monitorJobProgress,
  searchDocumentContent,
  getDocumentPages,
  multiVerticalSearch,
  advancedSearch
};