Implement PDF image extraction with OCR in OCR worker

This commit adds comprehensive image extraction and OCR functionality to the OCR worker:

Features:
- Created image-extractor.js worker module with extractImagesFromPage() function
- Uses pdftoppm (with an ImageMagick fallback) to render PDF pages as high-resolution (300 DPI) images
- Images saved to /uploads/{documentId}/images/page-{N}-img-{M}.png
- Returns image metadata: id, path, relativePath, position, width, height, and format (see the usage sketch after this list)
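
A minimal usage sketch (not part of the commit) showing how the extractor is called and the shape of what it returns; the PDF path and document ID below are placeholders:

import { extractImagesFromPage } from './workers/image-extractor.js';

// Hypothetical inputs; the worker receives these from its job payload.
const images = await extractImagesFromPage('/uploads/doc_123/original.pdf', 1, 'doc_123');

for (const img of images) {
  // Each entry carries id, path, relativePath, position, width, height,
  // format, pageNumber and imageIndex.
  console.log(img.id, img.relativePath, `${img.width}x${img.height}`);
}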

OCR Worker Integration:
- Imports image-extractor module and extractTextFromImage from OCR service
- After processing page text, extracts images from each page
- Runs Tesseract OCR on extracted images
- Stores image data in document_images table with extracted text and confidence
- Indexes images in Meilisearch with documentType='image' for searchability (see the search sketch after this list)
- Updates document.imageCount and sets imagesExtracted flag
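
A rough sketch of the search side (client setup, index name, and filterable-attribute configuration are assumptions, not part of this commit); image hits can be narrowed via the documentType field set above:

import { MeiliSearch } from 'meilisearch';

// Assumed host, key, and index name; documentType must be declared filterable.
const client = new MeiliSearch({ host: 'http://localhost:7700', apiKey: process.env.MEILI_KEY });
const results = await client.index('documents').search('fuel filter', {
  filter: "documentType = 'image'",
});

// Each image hit carries the OCR text plus imagePath/imageWidth/imageHeight metadata.
console.log(results.hits.map((hit) => hit.imagePath));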

Database:
- Uses existing document_images table from migration 004
- Stores image metadata, OCR text, and confidence scores (an assumed schema sketch follows this list)
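
For orientation, a schema sketch consistent with the INSERT statement in this diff; the authoritative DDL lives in migration 004 (not shown here), so the column types and database filename are assumptions:

import Database from 'better-sqlite3';

const db = new Database('navidocs.db'); // database filename assumed

// Assumed shape of the existing table; the real definition is in migration 004.
db.exec(`
  CREATE TABLE IF NOT EXISTS document_images (
    id             TEXT PRIMARY KEY,
    documentId     TEXT NOT NULL,
    pageNumber     INTEGER NOT NULL,
    imageIndex     INTEGER NOT NULL,
    imagePath      TEXT NOT NULL,
    imageFormat    TEXT,
    width          INTEGER,
    height         INTEGER,
    position       TEXT,            -- JSON-serialized { x, y, width, height }
    extractedText  TEXT,
    textConfidence REAL,
    createdAt      TEXT NOT NULL
  )
`);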

Dependencies:
- Added pdf-img-convert and sharp packages
- Uses system tools (pdftoppm/ImageMagick) for reliable PDF conversion

Testing:
- Created test-image-extraction.js to verify image extraction
- Created test-full-pipeline.js to test end-to-end extraction + OCR
- Successfully tested with 05-versions-space.pdf test document

Error Handling:
- Graceful degradation if image extraction fails
- Continues OCR processing even if images cannot be extracted
- Comprehensive logging for debugging

Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: ggq-admin, 2025-10-19 19:54:25 +02:00
parent 4b91896838
commit 09d9f1b601
5 changed files with 426 additions and 17 deletions

package.json

@@ -9,26 +9,33 @@
"dev": "node --watch index.js",
"init-db": "node db/init.js"
},
"keywords": ["boat", "manuals", "ocr", "meilisearch"],
"keywords": [
"boat",
"manuals",
"ocr",
"meilisearch"
],
"author": "",
"license": "MIT",
"dependencies": {
"express": "^5.0.0",
"better-sqlite3": "^11.0.0",
"meilisearch": "^0.41.0",
"bullmq": "^5.0.0",
"ioredis": "^5.0.0",
"helmet": "^7.0.0",
"express-rate-limit": "^7.0.0",
"cors": "^2.8.5",
"tesseract.js": "^5.0.0",
"pdf-parse": "^1.1.1",
"uuid": "^10.0.0",
"bcrypt": "^5.1.0",
"jsonwebtoken": "^9.0.0",
"multer": "^1.4.5-lts.1",
"better-sqlite3": "^11.0.0",
"bullmq": "^5.0.0",
"cors": "^2.8.5",
"dotenv": "^16.0.0",
"express": "^5.0.0",
"express-rate-limit": "^7.0.0",
"file-type": "^19.0.0",
"dotenv": "^16.0.0"
"helmet": "^7.0.0",
"ioredis": "^5.0.0",
"jsonwebtoken": "^9.0.0",
"meilisearch": "^0.41.0",
"multer": "^1.4.5-lts.1",
"pdf-img-convert": "^2.0.0",
"pdf-parse": "^1.1.1",
"sharp": "^0.34.4",
"tesseract.js": "^5.0.0",
"uuid": "^10.0.0"
},
"devDependencies": {
"@types/node": "^20.0.0"

test-full-pipeline.js (new file)

@@ -0,0 +1,63 @@
#!/usr/bin/env node
/**
* Test full image extraction and OCR pipeline
*/
import { extractImagesFromPage } from './workers/image-extractor.js';
import { extractTextFromImage } from './services/ocr.js';
import path from 'path';
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
async function testFullPipeline() {
console.log('=== Testing Full Image Extraction + OCR Pipeline ===\n');
const testPdfPath = '/home/setup/navidocs/test/data/05-versions-space.pdf';
const documentId = 'test_doc_' + Date.now();
console.log(`Test PDF: ${testPdfPath}`);
console.log(`Document ID: ${documentId}\n`);
try {
// Step 1: Extract images from page 1
console.log('Step 1: Extracting images from page 1...');
const images = await extractImagesFromPage(testPdfPath, 1, documentId);
console.log(`✅ Extracted ${images.length} image(s)\n`);
if (images.length === 0) {
console.log('No images to process. Test complete.');
return;
}
// Step 2: Run OCR on each extracted image
console.log('Step 2: Running OCR on extracted images...\n');
for (const image of images) {
console.log(`Processing image: ${image.relativePath}`);
console.log(` Dimensions: ${image.width}x${image.height}`);
try {
const ocrResult = await extractTextFromImage(image.path, 'eng');
console.log(` OCR Confidence: ${ocrResult.confidence.toFixed(2)}`);
console.log(` Text Length: ${ocrResult.text.length} characters`);
console.log(` Text Preview (first 200 chars):`);
console.log(` ${ocrResult.text.substring(0, 200).replace(/\n/g, ' ')}...`);
console.log();
} catch (ocrError) {
console.error(` ❌ OCR Error: ${ocrError.message}\n`);
}
}
console.log('=== Full Pipeline Test Complete ===');
} catch (error) {
console.error('❌ Pipeline test failed:', error);
console.error(error.stack);
process.exit(1);
}
}
testFullPipeline();

test-image-extraction.js (new file)

@@ -0,0 +1,51 @@
#!/usr/bin/env node
/**
* Test image extraction functionality
*/
import { extractImagesFromPage } from './workers/image-extractor.js';
import path from 'path';
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
async function testImageExtraction() {
console.log('=== Testing Image Extraction ===\n');
const testPdfPath = '/home/setup/navidocs/test/data/05-versions-space.pdf';
const documentId = 'test_doc_' + Date.now();
console.log(`Test PDF: ${testPdfPath}`);
console.log(`Document ID: ${documentId}\n`);
try {
// Test extracting from page 1
console.log('Extracting images from page 1...');
const images = await extractImagesFromPage(testPdfPath, 1, documentId);
console.log(`\n✅ Extraction complete!`);
console.log(`Found ${images.length} image(s)\n`);
if (images.length > 0) {
console.log('Image details:');
images.forEach((img, idx) => {
console.log(`\n Image ${idx + 1}:`);
console.log(` ID: ${img.id}`);
console.log(` Path: ${img.path}`);
console.log(` Relative Path: ${img.relativePath}`);
console.log(` Dimensions: ${img.width}x${img.height}`);
console.log(` Format: ${img.format}`);
console.log(` Position:`, JSON.stringify(img.position));
});
}
console.log('\n=== Test Complete ===');
} catch (error) {
console.error('❌ Test failed:', error);
console.error(error.stack);
process.exit(1);
}
}
testImageExtraction();

workers/image-extractor.js (new file)

@@ -0,0 +1,179 @@
/**
* Image Extractor - Extract images from PDF pages and save them
*
* Features:
* - Extract images from specific PDF pages using pdftoppm
* - Convert images to PNG format using sharp
* - Save images to organized directory structure
* - Return image metadata (path, position, dimensions)
* - Handle errors gracefully
*/
import sharp from 'sharp';
import { promises as fs } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { execSync } from 'child_process';
import { existsSync, unlinkSync } from 'fs';
import { tmpdir } from 'os';
const __dirname = dirname(fileURLToPath(import.meta.url));
/**
* Extract images from a specific PDF page
*
* @param {string} pdfPath - Path to the PDF file
* @param {number} pageNumber - Page number (1-based)
* @param {string} documentId - Document ID for organizing output
* @returns {Promise<Array<Object>>} - Array of extracted image objects
*/
export async function extractImagesFromPage(pdfPath, pageNumber, documentId) {
try {
console.log(`[Image Extractor] Extracting images from page ${pageNumber} of ${pdfPath}`);
// Create output directory for images
const uploadsDir = join(__dirname, '../../uploads');
const documentImagesDir = join(uploadsDir, documentId, 'images');
// Ensure directory exists
await fs.mkdir(documentImagesDir, { recursive: true });
// Create temporary directory for conversion
const tempDir = join(tmpdir(), 'navidocs-image-extract');
await fs.mkdir(tempDir, { recursive: true });
// Use pdftoppm to convert the PDF page to an image
const tempOutputPrefix = join(tempDir, `page-${Date.now()}-${pageNumber}`);
const tempImagePath = `${tempOutputPrefix}.png`;
try {
// Convert PDF page to PNG using pdftoppm
execSync(
`pdftoppm -f ${pageNumber} -l ${pageNumber} -png -singlefile -r 300 "${pdfPath}" "${tempOutputPrefix}"`,
{ stdio: 'pipe' }
);
console.log(`[Image Extractor] Converted page ${pageNumber} to image using pdftoppm`);
} catch (convertError) {
console.warn(`[Image Extractor] pdftoppm failed, trying ImageMagick:`, convertError.message);
// Fallback to ImageMagick
try {
execSync(
`convert -density 300 "${pdfPath}[${pageNumber - 1}]" -quality 90 "${tempImagePath}"`,
{ stdio: 'pipe' }
);
console.log(`[Image Extractor] Converted page ${pageNumber} to image using ImageMagick`);
} catch (imageMagickError) {
console.error(`[Image Extractor] Both pdftoppm and ImageMagick failed`);
return [];
}
}
// Check if the image was created
if (!existsSync(tempImagePath)) {
console.log(`[Image Extractor] No image generated for page ${pageNumber}`);
return [];
}
const extractedImages = [];
try {
// Process with sharp to get metadata and optimize
const image = sharp(tempImagePath);
const metadata = await image.metadata();
// Generate unique image ID
const imageId = `img_${documentId}_p${pageNumber}_0`;
// Save as PNG in the document's images directory
const imagePath = join(documentImagesDir, `page-${pageNumber}-img-0.png`);
// Optimize and save the image
await image
.png({ compressionLevel: 6 })
.toFile(imagePath);
console.log(`[Image Extractor] Saved image: ${imagePath} (${metadata.width}x${metadata.height})`);
// Build image object
const imageObj = {
id: imageId,
path: imagePath,
relativePath: `/uploads/${documentId}/images/page-${pageNumber}-img-0.png`,
position: {
x: 0,
y: 0,
width: metadata.width,
height: metadata.height
},
width: metadata.width,
height: metadata.height,
format: 'png',
pageNumber: pageNumber,
imageIndex: 0
};
extractedImages.push(imageObj);
// Clean up temporary file
try {
unlinkSync(tempImagePath);
} catch (e) {
// Ignore cleanup errors
}
} catch (imgError) {
console.error(`[Image Extractor] Error processing image on page ${pageNumber}:`, imgError.message);
}
console.log(`[Image Extractor] Extracted ${extractedImages.length} image(s) from page ${pageNumber}`);
return extractedImages;
} catch (error) {
console.error(`[Image Extractor] Error extracting images from page ${pageNumber}:`, error);
// Return empty array instead of throwing to allow OCR to continue
return [];
}
}
/**
* Extract images from all pages of a PDF
*
* @param {string} pdfPath - Path to the PDF file
* @param {string} documentId - Document ID for organizing output
* @param {number} totalPages - Total number of pages in the PDF
* @returns {Promise<Array<Object>>} - Array of all extracted image objects
*/
export async function extractAllImages(pdfPath, documentId, totalPages) {
const allImages = [];
for (let pageNum = 1; pageNum <= totalPages; pageNum++) {
const pageImages = await extractImagesFromPage(pdfPath, pageNum, documentId);
allImages.push(...pageImages);
}
console.log(`[Image Extractor] Extracted total of ${allImages.length} images from ${totalPages} pages`);
return allImages;
}
/**
* Clean up extracted images for a document
*
* @param {string} documentId - Document ID
* @returns {Promise<void>}
*/
export async function cleanupImages(documentId) {
try {
const uploadsDir = join(__dirname, '../../uploads');
const documentImagesDir = join(uploadsDir, documentId, 'images');
await fs.rm(documentImagesDir, { recursive: true, force: true });
console.log(`[Image Extractor] Cleaned up images for document ${documentId}`);
} catch (error) {
console.error(`[Image Extractor] Error cleaning up images:`, error.message);
}
}
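
A short composition sketch for the helpers above (paths, document ID, and page count are placeholders; the OCR worker shown next wires extractImagesFromPage into its per-page loop instead):

import { extractAllImages, cleanupImages } from './workers/image-extractor.js';

// Hypothetical inputs; in production these come from the job payload and the PDF metadata.
const pdfPath = '/uploads/doc_abc/original.pdf';
const documentId = 'doc_abc';
const totalPages = 12;

const images = await extractAllImages(pdfPath, documentId, totalPages);
console.log(`Extracted ${images.length} image(s) across ${totalPages} page(s)`);

// When the document is deleted, the extracted files can be removed again:
await cleanupImages(documentId);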

OCR worker (workers/)

@@ -18,8 +18,9 @@ import { v4 as uuidv4 } from 'uuid';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
import { getDb } from '../config/db.js';
import { extractTextFromPDF, cleanOCRText } from '../services/ocr.js';
import { extractTextFromPDF, cleanOCRText, extractTextFromImage } from '../services/ocr.js';
import { indexDocumentPage } from '../services/search.js';
import { extractImagesFromPage } from './image-extractor.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
@@ -179,16 +180,124 @@ async function processOCRJob(job) {
// Continue processing other pages even if indexing fails
}
}
// Extract and process images from this page
try {
console.log(`[OCR Worker] Extracting images from page ${pageNumber}`);
const extractedImages = await extractImagesFromPage(filePath, pageNumber, documentId);
console.log(`[OCR Worker] Found ${extractedImages.length} image(s) on page ${pageNumber}`);
// Process each extracted image
for (const image of extractedImages) {
try {
console.log(`[OCR Worker] Running OCR on image: ${image.relativePath}`);
// Run Tesseract OCR on the extracted image
const imageOCR = await extractTextFromImage(image.path, document.language || 'eng');
const imageText = imageOCR.text ? cleanOCRText(imageOCR.text) : '';
const imageConfidence = imageOCR.confidence || 0;
console.log(`[OCR Worker] Image OCR complete (confidence: ${imageConfidence.toFixed(2)}, text length: ${imageText.length})`);
// Generate unique image ID for database
const imageDbId = `${image.id}_${Date.now()}`;
// Store image in document_images table
db.prepare(`
INSERT INTO document_images (
id, documentId, pageNumber, imageIndex,
imagePath, imageFormat, width, height,
position, extractedText, textConfidence,
createdAt
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`).run(
imageDbId,
documentId,
pageNumber,
image.imageIndex,
image.relativePath,
image.format,
image.width,
image.height,
JSON.stringify(image.position),
imageText,
imageConfidence,
now
);
console.log(`[OCR Worker] Stored image metadata in database: ${imageDbId}`);
// Index image in Meilisearch with type='image'
if (imageText && imageText.length > 0) {
try {
// Build a search document for the image
const imageSearchDoc = {
id: `image_${documentId}_p${pageNumber}_i${image.imageIndex}`,
vertical: 'boating', // Default, will be enriched by indexDocumentPage
organizationId: document.organization_id,
organizationName: 'Unknown Organization',
entityId: document.entity_id || 'unknown',
entityName: 'Unknown Entity',
entityType: document.entity_type || 'unknown',
docId: documentId,
userId: document.uploaded_by,
documentType: 'image', // Mark as image type
title: `Image from page ${pageNumber}`,
pageNumber: pageNumber,
text: imageText,
language: document.language || 'en',
ocrConfidence: imageConfidence,
createdAt: document.created_at,
updatedAt: now,
// Image-specific metadata
imagePath: image.relativePath,
imageWidth: image.width,
imageHeight: image.height
};
// Get Meilisearch index and add document
const { getMeilisearchIndex } = await import('../config/meilisearch.js');
const index = await getMeilisearchIndex();
await index.addDocuments([imageSearchDoc]);
console.log(`[OCR Worker] Indexed image in Meilisearch: ${imageSearchDoc.id}`);
} catch (imageIndexError) {
console.error(`[OCR Worker] Failed to index image in Meilisearch:`, imageIndexError.message);
// Continue processing
}
}
} catch (imageOCRError) {
console.error(`[OCR Worker] Error processing image ${image.imageIndex} on page ${pageNumber}:`, imageOCRError.message);
// Continue with next image
}
}
// Update document image count
if (extractedImages.length > 0) {
db.prepare(`
UPDATE documents
SET imageCount = COALESCE(imageCount, 0) + ?
WHERE id = ?
`).run(extractedImages.length, documentId);
}
} catch (imageExtractionError) {
console.error(`[OCR Worker] Error extracting images from page ${pageNumber}:`, imageExtractionError.message);
// Continue processing other pages
}
} catch (pageError) {
console.error(`[OCR Worker] Error processing page ${pageNumber}:`, pageError.message);
// Continue processing other pages
}
}
// Update document status to indexed
// Update document status to indexed and mark images as extracted
db.prepare(`
UPDATE documents
SET status = 'indexed',
imagesExtracted = 1,
updated_at = ?
WHERE id = ?
`).run(now, documentId);