navidocs/server/test-image-extraction.js
ggq-admin 09d9f1b601 Implement PDF image extraction with OCR in OCR worker
This commit adds comprehensive image extraction and OCR functionality to the OCR worker:

Features:
- Created image-extractor.js worker module with extractImagesFromPage() function
- Uses pdftoppm (with ImageMagick fallback) to convert PDF pages to high-res images
- Images saved to /uploads/{documentId}/images/page-{N}-img-{M}.png
- Returns image metadata: id, path, position, width, height

OCR Worker Integration:
- Imports image-extractor module and extractTextFromImage from OCR service
- After processing page text, extracts images from each page
- Runs Tesseract OCR on extracted images
- Stores image data in document_images table with extracted text and confidence
- Indexes images in Meilisearch with type='image' for searchability
- Updates document.imageCount and sets imagesExtracted flag

Database:
- Uses existing document_images table from migration 004
- Stores image metadata, OCR text, and confidence scores

Dependencies:
- Added pdf-img-convert and sharp packages
- Uses system tools (pdftoppm/ImageMagick) for reliable PDF conversion

Testing:
- Created test-image-extraction.js to verify image extraction
- Created test-full-pipeline.js to test end-to-end extraction + OCR
- Successfully tested with 05-versions-space.pdf test document

Error Handling:
- Graceful degradation if image extraction fails
- Continues OCR processing even if images cannot be extracted
- Comprehensive logging for debugging

Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-19 19:54:25 +02:00

51 lines
1.5 KiB
JavaScript

#!/usr/bin/env node
/**
* Test image extraction functionality
*/
import { extractImagesFromPage } from './workers/image-extractor.js';
import path from 'path';
import { fileURLToPath } from 'url';
// ES modules do not provide __filename/__dirname, so derive them from this
// module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * Smoke test: run `extractImagesFromPage` on page 1 of a PDF and print the
 * metadata (id, path, relativePath, dimensions, format, position) of every
 * image it returns.
 *
 * The sample PDF is located relative to this script rather than through an
 * absolute path, so the test works from any checkout location.
 *
 * @param {string} [testPdfPath] - PDF to extract from. Defaults to the first
 *   command-line argument, or, when none is given, to the sample document
 *   `../test/data/05-versions-space.pdf` resolved against this script's directory.
 * @returns {Promise<void>} Resolves once the report has been printed. On any
 *   failure the error is logged and the process exits with status 1.
 */
async function testImageExtraction(
  testPdfPath = process.argv[2] ??
    path.resolve(__dirname, '../test/data/05-versions-space.pdf'),
) {
  console.log('=== Testing Image Extraction ===\n');

  // A timestamped id keeps each run's document id distinct from earlier runs.
  const documentId = `test_doc_${Date.now()}`;

  console.log(`Test PDF: ${testPdfPath}`);
  console.log(`Document ID: ${documentId}\n`);

  try {
    // Test extracting from page 1
    console.log('Extracting images from page 1...');
    const images = await extractImagesFromPage(testPdfPath, 1, documentId);

    console.log(`\n✅ Extraction complete!`);
    console.log(`Found ${images.length} image(s)\n`);

    if (images.length > 0) {
      console.log('Image details:');
      for (const [idx, img] of images.entries()) {
        console.log(`\n  Image ${idx + 1}:`);
        console.log(`    ID: ${img.id}`);
        console.log(`    Path: ${img.path}`);
        console.log(`    Relative Path: ${img.relativePath}`);
        console.log(`    Dimensions: ${img.width}x${img.height}`);
        console.log(`    Format: ${img.format}`);
        console.log(`    Position:`, JSON.stringify(img.position));
      }
    }

    console.log('\n=== Test Complete ===');
  } catch (error) {
    console.error('❌ Test failed:', error);
    console.error(error.stack);
    // Exit non-zero so shells and CI see the failure.
    process.exit(1);
  }
}
// Script entry point: run the test immediately. Errors are caught and reported
// inside testImageExtraction, which exits with status 1 on failure.
testImageExtraction();