navidocs/server/test-full-pipeline.js

#!/usr/bin/env node
/**
 * Test full image extraction and OCR pipeline
 */

import { extractImagesFromPage } from './workers/image-extractor.js';
import { extractTextFromImage } from './services/ocr.js';
import path from 'path';
import { fileURLToPath } from 'url';

// ES modules do not provide the CommonJS `__filename` / `__dirname` globals,
// so equivalents are derived from this module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Build the single-line preview of OCR text printed for each image.
 *
 * @param {string} text - Full OCR text.
 * @param {number} [maxLength=200] - Maximum number of characters kept from `text`.
 * @returns {string} The leading `maxLength` characters with newlines replaced
 *   by spaces, followed by `...` only when `text` was actually cut.
 */
function formatTextPreview(text, maxLength = 200) {
  const preview = text.substring(0, maxLength).replace(/\n/g, ' ');
  return text.length > maxLength ? `${preview}...` : preview;
}

/**
 * Run the image-extraction + OCR pipeline against one PDF and log the results.
 *
 * Extracts the images of page 1 with `extractImagesFromPage`, then runs
 * `extractTextFromImage` (language `'eng'`) on each image in turn, printing
 * its dimensions, OCR confidence, text length and a short text preview.
 *
 * An OCR failure on one image is logged and the remaining images are still
 * processed, but any such failure sets `process.exitCode` to 1 so the run is
 * not reported as a success. An error outside the per-image OCR step is logged
 * and terminates the process with exit code 1.
 *
 * @param {string} [testPdfPath] - PDF to process; defaults to the fixture path
 *   this script has always used.
 * @returns {Promise<void>}
 */
async function testFullPipeline(
  testPdfPath = '/home/setup/navidocs/test/data/05-versions-space.pdf',
) {
  console.log('=== Testing Full Image Extraction + OCR Pipeline ===\n');

  // Timestamp suffix gives every run its own document id.
  const documentId = `test_doc_${Date.now()}`;

  console.log(`Test PDF: ${testPdfPath}`);
  console.log(`Document ID: ${documentId}\n`);

  try {
    // Step 1: Extract images from page 1
    console.log('Step 1: Extracting images from page 1...');
    const images = await extractImagesFromPage(testPdfPath, 1, documentId);

    console.log(`✅ Extracted ${images.length} image(s)\n`);

    if (images.length === 0) {
      console.log('No images to process. Test complete.');
      return;
    }

    // Step 2: Run OCR on each extracted image
    console.log('Step 2: Running OCR on extracted images...\n');

    let ocrFailureCount = 0;

    for (const image of images) {
      console.log(`Processing image: ${image.relativePath}`);
      console.log(`  Dimensions: ${image.width}x${image.height}`);

      try {
        const ocrResult = await extractTextFromImage(image.path, 'eng');

        console.log(`  OCR Confidence: ${ocrResult.confidence.toFixed(2)}`);
        console.log(`  Text Length: ${ocrResult.text.length} characters`);
        console.log(`  Text Preview (first 200 chars):`);
        console.log(`    ${formatTextPreview(ocrResult.text)}`);
        console.log();
      } catch (ocrError) {
        // Keep going so one bad image does not hide the results for the
        // others, but remember the failure for the final status.
        ocrFailureCount += 1;
        console.error(`  ❌ OCR Error: ${ocrError.message}\n`);
      }
    }

    if (ocrFailureCount > 0) {
      console.error(`❌ OCR failed for ${ocrFailureCount} of ${images.length} image(s)`);
      // exitCode (rather than exit()) lets the run finish normally while
      // still signalling failure to the caller.
      process.exitCode = 1;
    }

    console.log('=== Full Pipeline Test Complete ===');
  } catch (error) {
    // console.error already prints the stack of an Error object, so it is
    // logged once; this also stays safe if a non-Error value was thrown.
    console.error('❌ Pipeline test failed:', error);
    process.exit(1);
  }
}

// Script entry point: start the run as soon as the module is loaded. The
// returned promise is deliberately not awaited.
void testFullPipeline();