This commit adds comprehensive image extraction and OCR functionality to the OCR worker:
Features:
- Created image-extractor.js worker module with extractImagesFromPage() function
- Uses pdftoppm (with ImageMagick fallback) to convert PDF pages to high-res images
- Images saved to /uploads/{documentId}/images/page-{N}-img-{M}.png
- Returns image metadata: id, path, position, width, height
OCR Worker Integration:
- Imports image-extractor module and extractTextFromImage from OCR service
- After processing page text, extracts images from each page
- Runs Tesseract OCR on extracted images
- Stores image data in document_images table with extracted text and confidence
- Indexes images in Meilisearch with type='image' for searchability
- Updates document.imageCount and sets imagesExtracted flag
Database:
- Uses existing document_images table from migration 004
- Stores image metadata, OCR text, and confidence scores
Dependencies:
- Added pdf-img-convert and sharp packages
- Uses system tools (pdftoppm/ImageMagick) for reliable PDF conversion
Testing:
- Created test-image-extraction.js to verify image extraction
- Created test-full-pipeline.js to test end-to-end extraction + OCR
- Successfully tested with 05-versions-space.pdf test document
Error Handling:
- Graceful degradation if image extraction fails
- Continues OCR processing even if images cannot be extracted
- Comprehensive logging for debugging
Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
63 lines
2 KiB
JavaScript
63 lines
2 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
|
|
* Test full image extraction and OCR pipeline
|
|
*/
|
|
|
|
import { extractImagesFromPage } from './workers/image-extractor.js';
|
|
import { extractTextFromImage } from './services/ocr.js';
|
|
import path from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
|
|
// ES modules do not define __filename/__dirname, so derive them from this
// module's own URL (import.meta.url).
const __filename = fileURLToPath(import.meta.url);
|
|
const __dirname = path.dirname(__filename);
|
|
|
|
/**
 * Smoke-test the image extraction + OCR pipeline on page 1 of a PDF.
 *
 * Calls `extractImagesFromPage` for page 1 under a freshly generated
 * document id, then runs `extractTextFromImage` (language 'eng') on every
 * returned image and prints its dimensions, OCR confidence, text length and
 * a preview of at most 200 characters.
 *
 * An OCR failure on a single image is logged and the loop moves on to the
 * next image. Any other failure is logged and terminates the process with
 * exit status 1.
 *
 * @param {string} [testPdfPath] - PDF to run the pipeline against. Defaults
 *   to the test document this script was originally written for.
 * @returns {Promise<void>} Resolves once every extracted image was processed
 *   (or immediately after extraction when the page yields no images).
 */
async function testFullPipeline(testPdfPath = '/home/setup/navidocs/test/data/05-versions-space.pdf') {
  // Maximum number of OCR characters echoed back in the preview line.
  const previewLength = 200;

  console.log('=== Testing Full Image Extraction + OCR Pipeline ===\n');

  // Timestamp-based id so repeated runs do not share an output location.
  const documentId = `test_doc_${Date.now()}`;

  console.log(`Test PDF: ${testPdfPath}`);
  console.log(`Document ID: ${documentId}\n`);

  try {
    // Step 1: Extract images from page 1
    console.log('Step 1: Extracting images from page 1...');
    const images = await extractImagesFromPage(testPdfPath, 1, documentId);

    console.log(`✅ Extracted ${images.length} image(s)\n`);

    if (images.length === 0) {
      console.log('No images to process. Test complete.');
      return;
    }

    // Step 2: Run OCR on each extracted image
    console.log('Step 2: Running OCR on extracted images...\n');

    // Sequential on purpose: keeps the per-image log output grouped.
    for (const image of images) {
      console.log(`Processing image: ${image.relativePath}`);
      console.log(` Dimensions: ${image.width}x${image.height}`);

      try {
        const ocrResult = await extractTextFromImage(image.path, 'eng');

        console.log(` OCR Confidence: ${ocrResult.confidence.toFixed(2)}`);
        console.log(` Text Length: ${ocrResult.text.length} characters`);
        console.log(` Text Preview (first 200 chars):`);

        // Flatten newlines so the preview stays on one line, and only show
        // an ellipsis when the text was actually cut off.
        const preview = ocrResult.text.substring(0, previewLength).replace(/\n/g, ' ');
        const ellipsis = ocrResult.text.length > previewLength ? '...' : '';
        console.log(` ${preview}${ellipsis}`);
        console.log();
      } catch (ocrError) {
        // One unreadable image must not abort the remaining images.
        console.error(` ❌ OCR Error: ${ocrError.message}\n`);
      }
    }

    console.log('=== Full Pipeline Test Complete ===');
  } catch (error) {
    console.error('❌ Pipeline test failed:', error);
    console.error(error.stack);
    // Non-zero exit status so shells/CI can detect the failure.
    process.exit(1);
  }
}
|
|
|
|
// Script entry point: start the pipeline test. testFullPipeline catches and
// reports its own failures, so the returned promise is not awaited here.
testFullPipeline();
|