This commit adds comprehensive image extraction and OCR functionality to the OCR worker:
Features:
- Created image-extractor.js worker module with extractImagesFromPage() function
- Uses pdftoppm (with ImageMagick fallback) to convert PDF pages to high-res images
- Images saved to /uploads/{documentId}/images/page-{N}-img-{M}.png
- Returns image metadata: id, path, position, width, height
OCR Worker Integration:
- Imports image-extractor module and extractTextFromImage from OCR service
- After processing page text, extracts images from each page
- Runs Tesseract OCR on extracted images
- Stores image data in document_images table with extracted text and confidence
- Indexes images in Meilisearch with type='image' for searchability
- Updates document.imageCount and sets imagesExtracted flag
Database:
- Uses existing document_images table from migration 004
- Stores image metadata, OCR text, and confidence scores
Dependencies:
- Added pdf-img-convert and sharp packages
- Uses system tools (pdftoppm/ImageMagick) for reliable PDF conversion
Testing:
- Created test-image-extraction.js to verify image extraction
- Created test-full-pipeline.js to test end-to-end extraction + OCR
- Successfully tested with 05-versions-space.pdf test document
Error Handling:
- Graceful degradation if image extraction fails
- Continues OCR processing even if images cannot be extracted
- Comprehensive logging for debugging
Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
179 lines
5.8 KiB
JavaScript
179 lines
5.8 KiB
JavaScript
/**
|
|
* Image Extractor - Extract images from PDF pages and save them
|
|
*
|
|
* Features:
|
|
* - Extract images from specific PDF pages using pdftoppm
|
|
* - Convert images to PNG format using sharp
|
|
* - Save images to organized directory structure
|
|
* - Return image metadata (path, position, dimensions)
|
|
* - Handle errors gracefully
|
|
*/
|
|
|
|
import { execFileSync, execSync } from 'child_process';
import { existsSync, unlinkSync, promises as fs } from 'fs';
import { tmpdir } from 'os';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';

import sharp from 'sharp';
|
|
|
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
|
|
/**
 * Extract images from a specific PDF page.
 *
 * Renders the page to a single high-resolution (300 DPI) PNG via
 * `pdftoppm`, falling back to ImageMagick's `convert`, then optimizes
 * the result with sharp and saves it under the document's images
 * directory. Never throws: any failure returns an empty array so the
 * caller's OCR pipeline can continue.
 *
 * @param {string} pdfPath - Path to the PDF file
 * @param {number} pageNumber - Page number (1-based)
 * @param {string} documentId - Document ID for organizing output
 * @returns {Promise<Array<Object>>} - Array of extracted image objects
 *   ({ id, path, relativePath, position, width, height, format,
 *   pageNumber, imageIndex }); empty if extraction fails
 */
export async function extractImagesFromPage(pdfPath, pageNumber, documentId) {
  try {
    console.log(`[Image Extractor] Extracting images from page ${pageNumber} of ${pdfPath}`);

    // Create output directory for images
    const uploadsDir = join(__dirname, '../../uploads');
    const documentImagesDir = join(uploadsDir, documentId, 'images');

    // Ensure directory exists
    await fs.mkdir(documentImagesDir, { recursive: true });

    // Create temporary directory for conversion
    const tempDir = join(tmpdir(), 'navidocs-image-extract');
    await fs.mkdir(tempDir, { recursive: true });

    // Unique temp prefix so concurrent extractions don't collide
    const tempOutputPrefix = join(tempDir, `page-${Date.now()}-${pageNumber}`);
    const tempImagePath = `${tempOutputPrefix}.png`;

    try {
      // Convert PDF page to PNG using pdftoppm.
      // execFileSync with an argument vector (no shell) avoids the
      // quoting/shell-injection hazards of interpolating pdfPath into
      // a command string (paths containing `"`, `$`, backticks, etc.).
      execFileSync(
        'pdftoppm',
        [
          '-f', String(pageNumber),
          '-l', String(pageNumber),
          '-png',
          '-singlefile',
          '-r', '300',
          pdfPath,
          tempOutputPrefix
        ],
        { stdio: 'pipe' }
      );

      console.log(`[Image Extractor] Converted page ${pageNumber} to image using pdftoppm`);
    } catch (convertError) {
      console.warn(`[Image Extractor] pdftoppm failed, trying ImageMagick:`, convertError.message);

      // Fallback to ImageMagick (its page index is 0-based)
      try {
        execFileSync(
          'convert',
          [
            '-density', '300',
            `${pdfPath}[${pageNumber - 1}]`,
            '-quality', '90',
            tempImagePath
          ],
          { stdio: 'pipe' }
        );

        console.log(`[Image Extractor] Converted page ${pageNumber} to image using ImageMagick`);
      } catch (imageMagickError) {
        console.error(`[Image Extractor] Both pdftoppm and ImageMagick failed`);
        return [];
      }
    }

    // Check if the image was created
    if (!existsSync(tempImagePath)) {
      console.log(`[Image Extractor] No image generated for page ${pageNumber}`);
      return [];
    }

    const extractedImages = [];

    try {
      // Process with sharp to get metadata and optimize
      const image = sharp(tempImagePath);
      const metadata = await image.metadata();

      // Generate unique image ID
      const imageId = `img_${documentId}_p${pageNumber}_0`;

      // Save as PNG in the document's images directory
      const imagePath = join(documentImagesDir, `page-${pageNumber}-img-0.png`);

      // Optimize and save the image
      await image
        .png({ compressionLevel: 6 })
        .toFile(imagePath);

      console.log(`[Image Extractor] Saved image: ${imagePath} (${metadata.width}x${metadata.height})`);

      // Build image object. The "image" is the whole rendered page, so
      // position covers the full page (x/y fixed at 0).
      const imageObj = {
        id: imageId,
        path: imagePath,
        relativePath: `/uploads/${documentId}/images/page-${pageNumber}-img-0.png`,
        position: {
          x: 0,
          y: 0,
          width: metadata.width,
          height: metadata.height
        },
        width: metadata.width,
        height: metadata.height,
        format: 'png',
        pageNumber: pageNumber,
        imageIndex: 0
      };

      extractedImages.push(imageObj);
    } catch (imgError) {
      console.error(`[Image Extractor] Error processing image on page ${pageNumber}:`, imgError.message);
    } finally {
      // Always remove the temp file — the original only cleaned up on
      // the success path and leaked it when sharp processing failed.
      try {
        unlinkSync(tempImagePath);
      } catch (e) {
        // Ignore cleanup errors
      }
    }

    console.log(`[Image Extractor] Extracted ${extractedImages.length} image(s) from page ${pageNumber}`);

    return extractedImages;
  } catch (error) {
    console.error(`[Image Extractor] Error extracting images from page ${pageNumber}:`, error);

    // Return empty array instead of throwing to allow OCR to continue
    return [];
  }
}
|
|
|
|
/**
 * Extract images from every page of a PDF.
 *
 * Pages are processed one at a time (sequentially), since each page
 * conversion shells out to an external tool.
 *
 * @param {string} pdfPath - Path to the PDF file
 * @param {string} documentId - Document ID for organizing output
 * @param {number} totalPages - Total number of pages in the PDF
 * @returns {Promise<Array<Object>>} - Array of all extracted image objects
 */
export async function extractAllImages(pdfPath, documentId, totalPages) {
  const collected = [];

  for (let page = 1; page <= totalPages; page += 1) {
    const imagesOnPage = await extractImagesFromPage(pdfPath, page, documentId);
    for (const img of imagesOnPage) {
      collected.push(img);
    }
  }

  console.log(`[Image Extractor] Extracted total of ${collected.length} images from ${totalPages} pages`);

  return collected;
}
|
|
|
|
/**
 * Clean up extracted images for a document.
 *
 * Recursively removes the document's images directory. Best-effort:
 * any error is logged and swallowed rather than propagated.
 *
 * @param {string} documentId - Document ID
 * @returns {Promise<void>}
 */
export async function cleanupImages(documentId) {
  try {
    const documentImagesDir = join(__dirname, '../../uploads', documentId, 'images');

    // force: true makes removal a no-op when the directory is absent
    await fs.rm(documentImagesDir, { recursive: true, force: true });

    console.log(`[Image Extractor] Cleaned up images for document ${documentId}`);
  } catch (error) {
    console.error(`[Image Extractor] Error cleaning up images:`, error.message);
  }
}
|