/**
 * Hybrid OCR Service
 *
 * Intelligently chooses between multiple OCR engines:
 * 1. Google Drive OCR (if configured) - Highest quality
 * 2. Google Cloud Vision API (if configured) - High quality, more control
 *    (NOTE: no Vision implementation is wired into this module; requesting
 *    `google-vision` logs a warning and runs Tesseract)
 * 3. Tesseract (fallback) - Local, free, always available
 *
 * Configuration via .env:
 * - PREFERRED_OCR_ENGINE=google-drive|google-vision|tesseract
 *   (unset or empty means `auto`)
 * - GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json
 */

import { extractTextFromPDF as extractWithTesseract } from './ocr.js';
import { extractTextFromPDFGoogleDrive, isGoogleDriveConfigured } from './ocr-google-drive.js';

const PREFERRED_ENGINE = process.env.PREFERRED_OCR_ENGINE || 'auto';

/**
 * Map a requested engine name onto an engine this module can actually run.
 *
 * Anything that cannot be honoured (Google Drive not configured, unknown
 * name) resolves to `'tesseract'` and emits a warning, so the caller always
 * logs and executes the same engine.
 *
 * @param {string} requestedEngine - `auto`, `google-drive`, `tesseract`, or any other string
 * @returns {string} Either `'google-drive'` or `'tesseract'`
 */
function resolveEngine(requestedEngine) {
  switch (requestedEngine) {
    case 'auto':
      // Auto-select best available engine
      return isGoogleDriveConfigured() ? 'google-drive' : 'tesseract';

    case 'google-drive':
      if (!isGoogleDriveConfigured()) {
        console.warn('[OCR Hybrid] Google Drive requested but not configured, falling back to Tesseract');
        return 'tesseract';
      }
      return 'google-drive';

    case 'tesseract':
      return 'tesseract';

    default:
      // Previously an unknown name was logged as the active engine while
      // Tesseract ran silently; make the substitution explicit instead.
      console.warn(`[OCR Hybrid] Unsupported OCR engine "${requestedEngine}", falling back to Tesseract`);
      return 'tesseract';
  }
}

/**
 * Extract text from PDF using the best available OCR engine
 *
 * If Google Drive is selected and fails, Tesseract is tried once as a
 * fallback. A Tesseract failure is propagated to the caller without retry.
 *
 * @param {string} pdfPath - Path to PDF file
 * @param {Object} options - Configuration options (forwarded unchanged to the engine)
 * @param {string} options.language - Language code (eng, spa, fra, etc.)
 * @param {Function} options.onProgress - Progress callback
 * @param {string} options.forceEngine - Force specific engine (google-drive, tesseract)
 * @returns {Promise<Array<Object>>} OCR results as returned by the selected
 *   engine (presumably one entry per page - confirm against the engine modules)
 */
export async function extractTextFromPDF(pdfPath, options = {}) {
  const { forceEngine } = options;
  const selectedEngine = resolveEngine(forceEngine || PREFERRED_ENGINE);

  console.log(`[OCR Hybrid] Using ${selectedEngine} engine for ${pdfPath}`);

  // Tesseract is the last resort: there is nothing to fall back to, so let
  // its errors reach the caller directly.
  if (selectedEngine !== 'google-drive') {
    return extractWithTesseract(pdfPath, options);
  }

  try {
    return await extractWithGoogleDrive(pdfPath, options);
  } catch (error) {
    // If preferred engine fails, fallback to Tesseract
    console.warn(`[OCR Hybrid] ${selectedEngine} failed, falling back to Tesseract:`, error.message);
    return extractWithTesseract(pdfPath, options);
  }
}

/**
 * Wrapper for Google Drive OCR with error handling
 *
 * Logs the average of the `confidence` field across results, logs any
 * failure, and rethrows so the caller can decide on a fallback.
 *
 * @param {string} pdfPath - Path to PDF file
 * @param {Object} options - Options forwarded to the Google Drive engine
 * @returns {Promise<Array<Object>>} Results from the Google Drive engine
 */
async function extractWithGoogleDrive(pdfPath, options) {
  try {
    const results = await extractTextFromPDFGoogleDrive(pdfPath, options);

    // Log quality metrics. Skip the average for an empty result set, which
    // would otherwise be 0 / 0 and log "NaN".
    if (results.length > 0) {
      const avgConfidence = results.reduce((sum, r) => sum + r.confidence, 0) / results.length;
      console.log(`[Google Drive OCR] Completed with avg confidence: ${avgConfidence.toFixed(2)}`);
    } else {
      console.warn('[Google Drive OCR] Completed with no results');
    }

    return results;
  } catch (error) {
    console.error('[Google Drive OCR] Error:', error.message);
    throw error;
  }
}

/**
 * Get information about available OCR engines
 *
 * @returns {Object} - Status of each engine, keyed by engine name; each entry
 *   has `available`, `quality`, `speed`, `cost` and `notes`
 */
export function getAvailableEngines() {
  return {
    tesseract: {
      available: true,
      quality: 'good',
      speed: 'fast',
      cost: 'free',
      notes: 'Always available, runs locally'
    },
    'google-drive': {
      available: isGoogleDriveConfigured(),
      quality: 'excellent',
      speed: 'medium',
      cost: 'free (within quotas)',
      notes: 'Requires Google Cloud credentials'
    }
  };
}

/**
 * Recommend best OCR engine for a given document
 *
 * @param {Object} [documentInfo] - Document metadata; a missing value is treated as `{}`
 * @param {number} [documentInfo.pageCount=1] - Number of pages
 * @param {number} [documentInfo.fileSize=0] - File size in bytes
 * @returns {string} - Recommended engine name (`'google-drive'` or `'tesseract'`)
 */
export function recommendEngine(documentInfo) {
  // Tolerate a missing argument: both fields already have defaults.
  const { pageCount = 1, fileSize = 0 } = documentInfo || {};

  // For large documents (> 50 pages or > 10 MiB), prefer local Tesseract to avoid API quotas
  if (pageCount > 50 || fileSize > 10 * 1024 * 1024) {
    return 'tesseract';
  }

  // For smaller documents, prefer Google Drive for quality
  if (isGoogleDriveConfigured()) {
    return 'google-drive';
  }

  return 'tesseract';
}