navidocs/server/workers/image-extractor.js
ggq-admin 09d9f1b601 Implement PDF image extraction with OCR in OCR worker
This commit adds comprehensive image extraction and OCR functionality to the OCR worker:

Features:
- Created image-extractor.js worker module with extractImagesFromPage() function
- Uses pdftoppm (with ImageMagick fallback) to convert PDF pages to high-res images
- Images saved to /uploads/{documentId}/images/page-{N}-img-{M}.png
- Returns image metadata: id, path, position, width, height

OCR Worker Integration:
- Imports image-extractor module and extractTextFromImage from OCR service
- After processing page text, extracts images from each page
- Runs Tesseract OCR on extracted images
- Stores image data in document_images table with extracted text and confidence
- Indexes images in Meilisearch with type='image' for searchability
- Updates document.imageCount and sets imagesExtracted flag

Database:
- Uses existing document_images table from migration 004
- Stores image metadata, OCR text, and confidence scores

Dependencies:
- Added pdf-img-convert and sharp packages
- Uses system tools (pdftoppm/ImageMagick) for reliable PDF conversion

Testing:
- Created test-image-extraction.js to verify image extraction
- Created test-full-pipeline.js to test end-to-end extraction + OCR
- Successfully tested with 05-versions-space.pdf test document

Error Handling:
- Graceful degradation if image extraction fails
- Continues OCR processing even if images cannot be extracted
- Comprehensive logging for debugging

Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-19 19:54:25 +02:00

179 lines
5.8 KiB
JavaScript

/**
* Image Extractor - Render PDF pages to PNG images and save them
*
* Features:
* - Render a specific PDF page to a single full-page image using pdftoppm
*   (ImageMagick `convert` is used as a fallback)
* - Re-encode the rendered page as PNG using sharp
* - Save images under uploads/{documentId}/images/
* - Return image metadata (path, position, dimensions)
* - Log failures and return an empty result instead of throwing
*/
import sharp from 'sharp';
import { promises as fs } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { execFileSync, execSync } from 'child_process';
import { existsSync, unlinkSync } from 'fs';
import { tmpdir } from 'os';
// ES modules do not provide __dirname; derive this file's directory from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
/**
 * Render one PDF page to a PNG file, trying pdftoppm first and ImageMagick second.
 *
 * Arguments are passed as an argv array (no shell is spawned), so characters
 * such as quotes, `$` or backticks in the PDF path are never interpreted as
 * shell syntax.
 *
 * @param {string} pdfPath - Path to the PDF file
 * @param {number} page - Page number (1-based)
 * @param {string} outputPrefix - Output path without the ".png" extension
 * @returns {boolean} true if one of the converters exited successfully
 */
function renderPageToPng(pdfPath, page, outputPrefix) {
  const pageArg = String(page);

  try {
    // -singlefile makes pdftoppm write exactly `${outputPrefix}.png`
    execFileSync(
      'pdftoppm',
      ['-f', pageArg, '-l', pageArg, '-png', '-singlefile', '-r', '300', pdfPath, outputPrefix],
      { stdio: 'pipe' }
    );
    console.log(`[Image Extractor] Converted page ${page} to image using pdftoppm`);
    return true;
  } catch (convertError) {
    console.warn(`[Image Extractor] pdftoppm failed, trying ImageMagick:`, convertError.message);
  }

  try {
    // ImageMagick page indexes are 0-based, hence `page - 1`
    execFileSync(
      'convert',
      ['-density', '300', `${pdfPath}[${page - 1}]`, '-quality', '90', `${outputPrefix}.png`],
      { stdio: 'pipe' }
    );
    console.log(`[Image Extractor] Converted page ${page} to image using ImageMagick`);
    return true;
  } catch (imageMagickError) {
    console.error(
      `[Image Extractor] Both pdftoppm and ImageMagick failed:`,
      imageMagickError.message
    );
    return false;
  }
}

/**
 * Render a specific PDF page to a PNG and store it in the document's images directory.
 *
 * The page is rendered at 300 DPI into a private temporary directory, re-encoded
 * with sharp, and written to `uploads/{documentId}/images/page-{N}-img-0.png`.
 * The temporary directory is always removed, whether or not processing succeeds.
 *
 * This function never throws: invalid arguments and any failure are logged and
 * reported as an empty array so that the caller's OCR pipeline can continue.
 *
 * @param {string} pdfPath - Path to the PDF file
 * @param {number} pageNumber - Page number (1-based)
 * @param {string} documentId - Document ID for organizing output; must be a
 *   single path segment (no path separators, not "." or "..")
 * @returns {Promise<Array<Object>>} - Array with zero or one image object
 *   (id, path, relativePath, position, width, height, format, pageNumber, imageIndex)
 */
export async function extractImagesFromPage(pdfPath, pageNumber, documentId) {
  // Accept numeric strings for backward compatibility, but work with a real integer.
  const page = typeof pageNumber === 'string' ? Number(pageNumber) : pageNumber;
  if (!Number.isInteger(page) || page < 1) {
    console.error(`[Image Extractor] Invalid page number: ${String(pageNumber)}`);
    return [];
  }

  if (typeof pdfPath !== 'string' || pdfPath === '') {
    console.error(`[Image Extractor] Invalid PDF path`);
    return [];
  }

  // documentId becomes a directory name; refuse anything that could escape uploads/.
  const isSafeDocumentId =
    typeof documentId === 'string' &&
    documentId !== '' &&
    documentId !== '.' &&
    documentId !== '..' &&
    !documentId.includes('/') &&
    !documentId.includes('\\') &&
    !documentId.includes('\0');
  if (!isSafeDocumentId) {
    console.error(`[Image Extractor] Invalid document ID: ${String(documentId)}`);
    return [];
  }

  let tempDir = null;

  try {
    console.log(`[Image Extractor] Extracting images from page ${page} of ${pdfPath}`);

    // Create output directory for images
    const uploadsDir = join(__dirname, '../../uploads');
    const documentImagesDir = join(uploadsDir, documentId, 'images');
    await fs.mkdir(documentImagesDir, { recursive: true });

    // A unique directory per call avoids name collisions between concurrent jobs
    tempDir = await fs.mkdtemp(join(tmpdir(), 'navidocs-image-extract-'));
    const tempOutputPrefix = join(tempDir, `page-${page}`);
    const tempImagePath = `${tempOutputPrefix}.png`;

    // A path starting with "-" would be parsed as a command-line option
    const pdfArg = pdfPath.startsWith('-') ? `./${pdfPath}` : pdfPath;

    if (!renderPageToPng(pdfArg, page, tempOutputPrefix)) {
      return [];
    }

    // Check if the image was created
    if (!existsSync(tempImagePath)) {
      console.log(`[Image Extractor] No image generated for page ${page}`);
      return [];
    }

    const extractedImages = [];

    try {
      // Process with sharp to get metadata and optimize
      const image = sharp(tempImagePath);
      const metadata = await image.metadata();

      const fileName = `page-${page}-img-0.png`;
      const imagePath = join(documentImagesDir, fileName);

      await image
        .png({ compressionLevel: 6 })
        .toFile(imagePath);

      console.log(`[Image Extractor] Saved image: ${imagePath} (${metadata.width}x${metadata.height})`);

      extractedImages.push({
        id: `img_${documentId}_p${page}_0`,
        path: imagePath,
        relativePath: `/uploads/${documentId}/images/${fileName}`,
        // The rendered image covers the whole page, so it starts at the origin
        position: {
          x: 0,
          y: 0,
          width: metadata.width,
          height: metadata.height
        },
        width: metadata.width,
        height: metadata.height,
        format: 'png',
        pageNumber: page,
        imageIndex: 0
      });
    } catch (imgError) {
      console.error(`[Image Extractor] Error processing image on page ${page}:`, imgError.message);
    }

    console.log(`[Image Extractor] Extracted ${extractedImages.length} image(s) from page ${page}`);
    return extractedImages;
  } catch (error) {
    console.error(`[Image Extractor] Error extracting images from page ${page}:`, error);
    // Return empty array instead of throwing to allow OCR to continue
    return [];
  } finally {
    // Always remove the temporary render, including on failure paths
    if (tempDir !== null) {
      try {
        await fs.rm(tempDir, { recursive: true, force: true });
      } catch (cleanupError) {
        console.warn(`[Image Extractor] Could not remove temp directory ${tempDir}:`, cleanupError.message);
      }
    }
  }
}
/**
 * Run page-level extraction for every page of a PDF, one page after another,
 * and collect the results into a single flat list.
 *
 * @param {string} pdfPath - Path to the PDF file
 * @param {string} documentId - Document ID for organizing output
 * @param {number} totalPages - Total number of pages in the PDF
 * @returns {Promise<Array<Object>>} - Image objects of all pages, in page order
 */
export async function extractAllImages(pdfPath, documentId, totalPages) {
  const collected = [];
  let currentPage = 1;

  // Pages are processed strictly in sequence (1..totalPages)
  while (currentPage <= totalPages) {
    const found = await extractImagesFromPage(pdfPath, currentPage, documentId);
    for (const imageEntry of found) {
      collected.push(imageEntry);
    }
    currentPage += 1;
  }

  console.log(`[Image Extractor] Extracted total of ${collected.length} images from ${totalPages} pages`);
  return collected;
}
/**
 * Remove the extracted-images directory of a document.
 *
 * Deletes `uploads/{documentId}/images` recursively. A missing directory is
 * not an error (`force: true`). Failures are logged, never thrown.
 *
 * Because the delete is recursive, the document ID is validated first: it must
 * be a single path segment, otherwise a value such as "" or ".." would make the
 * delete target a directory outside the document's own folder.
 *
 * @param {string} documentId - Document ID
 * @returns {Promise<void>}
 */
export async function cleanupImages(documentId) {
  const isSafeDocumentId =
    typeof documentId === 'string' &&
    documentId !== '' &&
    documentId !== '.' &&
    documentId !== '..' &&
    !documentId.includes('/') &&
    !documentId.includes('\\') &&
    !documentId.includes('\0');

  if (!isSafeDocumentId) {
    console.error(
      `[Image Extractor] Error cleaning up images:`,
      `Invalid document ID: ${String(documentId)}`
    );
    return;
  }

  try {
    const uploadsDir = join(__dirname, '../../uploads');
    const documentImagesDir = join(uploadsDir, documentId, 'images');
    await fs.rm(documentImagesDir, { recursive: true, force: true });
    console.log(`[Image Extractor] Cleaned up images for document ${documentId}`);
  } catch (error) {
    console.error(`[Image Extractor] Error cleaning up images:`, error.message);
  }
}