Fixed:
- Price: €800K-€1.5M, Sunseeker added
- Agent 1: Joe Trader persona + actual sale ads research
- Ignored meilisearch binary + data/ (too large for GitHub)
- SESSION_DEBUG_BLOCKERS.md created
Ready for Session 1 launch.
🤖 Generated with Claude Code
276 lines
8.4 KiB
JavaScript
276 lines
8.4 KiB
JavaScript
/**
 * Hybrid OCR Service
 *
 * Intelligently chooses between multiple OCR engines:
 * 1. Remote OCR Worker - Offloads OCR to a dedicated Proxmox server
 * 2. Google Cloud Vision API - Best quality, fastest, real OCR API
 * 3. Google Drive OCR - Good quality, uses Docs conversion
 * 4. Tesseract - Local, free, always available
 *
 * Configuration via .env:
 * - PREFERRED_OCR_ENGINE=remote-ocr|google-vision|google-drive|tesseract|auto
 * - USE_REMOTE_OCR=true (to enable the remote OCR worker)
 * - OCR_WORKER_URL=http://fr-antibes.duckdns.org/naviocr
 * - GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json
 *
 * RECOMMENDATION: Use remote-ocr for offloading or google-vision for production!
 */
|
|
|
|
import { extractTextFromPDF as extractWithTesseract } from './ocr.js';
|
|
import {
|
|
extractTextFromPDF as extractWithRemoteOCR,
|
|
checkRemoteOCRHealth,
|
|
getOCRWorkerInfo
|
|
} from './ocr-client.js';
|
|
|
|
// Engine requested through the environment; "auto" lets the service choose.
// The value is trimmed and lower-cased so that settings such as "Tesseract"
// or " auto " still match the lower-case engine names used in this module.
// An unset or blank value means "auto".
const PREFERRED_ENGINE =
  (process.env.PREFERRED_OCR_ENGINE || '').trim().toLowerCase() || 'auto';

// Remote OCR is opt-in: only the word "true" (any casing, surrounding
// whitespace ignored) enables it.
const USE_REMOTE_OCR =
  (process.env.USE_REMOTE_OCR || '').trim().toLowerCase() === 'true';

// Lazy-load Google services to avoid dependency errors if not installed.
// Each slot holds: null = import not attempted yet, false = import failed,
// anything else = the imported module.
let googleDriveModule = null;
let googleVisionModule = null;
|
|
|
|
/**
 * Import the Google Drive OCR module on first use and cache the outcome.
 *
 * The import is attempted only while the cache slot is still `null`; a failed
 * import is remembered as `false`, so it is neither retried nor logged again.
 *
 * @returns {Promise<Object|false>} The imported module, or `false` when the
 *   import failed.
 */
async function loadGoogleDrive() {
  if (googleDriveModule === null) {
    try {
      googleDriveModule = await import('./ocr-google-drive.js');
    } catch (error) {
      // Remember the failure first so the cache is consistent even if logging throws.
      googleDriveModule = false;
      // Still best-effort (no throw), but say why the engine is unavailable
      // instead of hiding a missing dependency or a broken module.
      console.warn('[OCR Hybrid] Google Drive OCR module could not be loaded:', error.message);
    }
  }
  return googleDriveModule;
}
|
|
|
|
/**
 * Import the Google Cloud Vision OCR module on first use and cache the outcome.
 *
 * The import is attempted only while the cache slot is still `null`; a failed
 * import is remembered as `false`, so it is neither retried nor logged again.
 *
 * @returns {Promise<Object|false>} The imported module, or `false` when the
 *   import failed.
 */
async function loadGoogleVision() {
  if (googleVisionModule === null) {
    try {
      googleVisionModule = await import('./ocr-google-vision.js');
    } catch (error) {
      // Remember the failure first so the cache is consistent even if logging throws.
      googleVisionModule = false;
      // Still best-effort (no throw), but say why the engine is unavailable
      // instead of hiding a missing dependency or a broken module.
      console.warn('[OCR Hybrid] Google Vision OCR module could not be loaded:', error.message);
    }
  }
  return googleVisionModule;
}
|
|
|
|
/**
 * Report whether Google Drive OCR looks configured.
 *
 * The Drive module is loaded lazily, so it cannot be consulted from this
 * synchronous check. The GOOGLE_APPLICATION_CREDENTIALS setting is used as
 * the signal instead; without it this function could only ever answer
 * `false`, which made the google-drive engine unselectable.
 *
 * NOTE(review): assumes Drive OCR authenticates through the same credentials
 * file as the Vision API - confirm against ocr-google-drive.js.
 *
 * @returns {boolean} `true` when GOOGLE_APPLICATION_CREDENTIALS is set to a
 *   non-blank value.
 */
function isGoogleDriveConfigured() {
  const credentialsPath = process.env.GOOGLE_APPLICATION_CREDENTIALS;
  return typeof credentialsPath === 'string' && credentialsPath.trim() !== '';
}
|
|
|
|
/**
 * Report whether Google Cloud Vision OCR looks configured.
 *
 * The Vision module is loaded lazily, so it cannot be consulted from this
 * synchronous check. The GOOGLE_APPLICATION_CREDENTIALS setting is used as
 * the signal instead; without it this function could only ever answer
 * `false`, which made the google-vision engine unselectable.
 *
 * Only the presence of the setting is checked, not that the file exists or
 * that the credentials are valid.
 *
 * @returns {boolean} `true` when GOOGLE_APPLICATION_CREDENTIALS is set to a
 *   non-blank value.
 */
function isVisionConfigured() {
  const credentialsPath = process.env.GOOGLE_APPLICATION_CREDENTIALS;
  return typeof credentialsPath === 'string' && credentialsPath.trim() !== '';
}
|
|
|
|
/**
 * Extract text from PDF using the best available OCR engine
 *
 * The engine is `options.forceEngine` when given, otherwise PREFERRED_ENGINE.
 * "auto" picks by priority: Remote OCR > Vision API > Drive API > Tesseract.
 * A requested engine that is disabled, not configured or not recognised is
 * replaced by the next usable one (ultimately Tesseract). If the chosen
 * engine throws and it was not Tesseract, Tesseract is tried once.
 *
 * @param {string} pdfPath - Path to PDF file
 * @param {Object} options - Configuration options (passed unchanged to the chosen engine)
 * @param {string} options.language - Language code (eng, spa, fra, etc.)
 * @param {Function} options.onProgress - Progress callback
 * @param {string} options.forceEngine - Force specific engine
 *   (remote-ocr, google-vision, google-drive, tesseract, auto)
 * @returns {Promise<Array<{pageNumber: number, text: string, confidence: number}>>}
 * @throws {Error} Whatever Tesseract throws, when it fails as the chosen
 *   engine or as the fallback.
 */
export async function extractTextFromPDF(pdfPath, options = {}) {
  const { forceEngine } = options;
  const engine = forceEngine || PREFERRED_ENGINE;
  const knownEngines = ['remote-ocr', 'google-vision', 'google-drive', 'tesseract'];

  // Determine which engine to use
  let selectedEngine = 'tesseract'; // Default fallback

  if (engine === 'auto') {
    // Auto-select best available engine
    // Priority: Remote OCR > Vision API > Drive API > Tesseract
    if (USE_REMOTE_OCR) {
      selectedEngine = 'remote-ocr';
    } else if (isVisionConfigured()) {
      selectedEngine = 'google-vision';
    } else if (isGoogleDriveConfigured()) {
      selectedEngine = 'google-drive';
    }
  } else if (engine === 'remote-ocr' && !USE_REMOTE_OCR) {
    console.warn('[OCR Hybrid] Remote OCR requested but not enabled, falling back');
    if (isVisionConfigured()) {
      selectedEngine = 'google-vision';
    } else if (isGoogleDriveConfigured()) {
      selectedEngine = 'google-drive';
    }
  } else if (engine === 'google-vision' && !isVisionConfigured()) {
    console.warn('[OCR Hybrid] Google Vision requested but not configured, falling back');
    if (isGoogleDriveConfigured()) {
      selectedEngine = 'google-drive';
    }
  } else if (engine === 'google-drive' && !isGoogleDriveConfigured()) {
    // selectedEngine already holds the Tesseract default
    console.warn('[OCR Hybrid] Google Drive requested but not configured, falling back to Tesseract');
  } else if (knownEngines.includes(engine)) {
    selectedEngine = engine;
  } else {
    // An unrecognised name must not become selectedEngine: it would be logged
    // as the engine in use and, because it differs from 'tesseract', a failed
    // Tesseract run would be repeated by the fallback below.
    console.warn(`[OCR Hybrid] Unknown OCR engine "${engine}", falling back to Tesseract`);
  }

  console.log(`[OCR Hybrid] Using ${selectedEngine} engine for ${pdfPath}`);

  // Execute OCR with selected engine
  try {
    switch (selectedEngine) {
      case 'remote-ocr':
        return await extractWithRemote(pdfPath, options);

      case 'google-vision':
        return await extractWithVision(pdfPath, options);

      case 'google-drive':
        return await extractWithGoogleDrive(pdfPath, options);

      case 'tesseract':
      default:
        return await extractWithTesseract(pdfPath, options);
    }
  } catch (error) {
    // If preferred engine fails, fallback to Tesseract
    if (selectedEngine !== 'tesseract') {
      console.warn(`[OCR Hybrid] ${selectedEngine} failed, falling back to Tesseract:`, error.message);
      return await extractWithTesseract(pdfPath, options);
    }
    throw error;
  }
}
|
|
|
|
/**
 * Wrapper for Remote OCR Worker with error handling
 *
 * Runs the remote worker, logs the mean page confidence as a quality metric
 * and returns the worker's results untouched. Any failure is logged and
 * re-thrown to the caller.
 *
 * @param {string} pdfPath - Path to PDF file
 * @param {Object} options - Passed unchanged to the remote OCR client
 * @returns {Promise<Array>} The page results produced by the remote worker
 * @throws {Error} Whatever the remote OCR client throws
 */
async function extractWithRemote(pdfPath, options) {
  try {
    const results = await extractWithRemoteOCR(pdfPath, options);

    // Log quality metrics
    if (results.length === 0) {
      // A mean over zero pages is 0 / 0 = NaN, so report the empty result instead.
      console.warn('[Remote OCR] Completed but returned no pages');
    } else {
      const avgConfidence = results.reduce((sum, r) => sum + r.confidence, 0) / results.length;
      console.log(`[Remote OCR] Completed with avg confidence: ${avgConfidence.toFixed(2)}`);
    }

    return results;
  } catch (error) {
    console.error('[Remote OCR] Error:', error.message);
    throw error;
  }
}
|
|
|
|
/**
 * Wrapper for Google Cloud Vision OCR with error handling
 *
 * Loads the Vision module on demand, runs it, logs the mean page confidence
 * as a quality metric and returns the module's results untouched. OCR
 * failures are logged and re-thrown to the caller.
 *
 * @param {string} pdfPath - Path to PDF file
 * @param {Object} options - Passed unchanged to the Vision module
 * @returns {Promise<Array>} The page results produced by the Vision module
 * @throws {Error} When the Vision module cannot be loaded, or whatever the
 *   module itself throws
 */
async function extractWithVision(pdfPath, options) {
  const visionModule = await loadGoogleVision();
  if (!visionModule) {
    throw new Error('Google Vision module not available');
  }

  try {
    const results = await visionModule.extractTextFromPDFVision(pdfPath, options);

    // Log quality metrics
    if (results.length === 0) {
      // A mean over zero pages is 0 / 0 = NaN, so report the empty result instead.
      console.warn('[Google Vision OCR] Completed but returned no pages');
    } else {
      const avgConfidence = results.reduce((sum, r) => sum + r.confidence, 0) / results.length;
      console.log(`[Google Vision OCR] Completed with avg confidence: ${avgConfidence.toFixed(2)}`);
    }

    return results;
  } catch (error) {
    console.error('[Google Vision OCR] Error:', error.message);
    throw error;
  }
}
|
|
|
|
/**
 * Wrapper for Google Drive OCR with error handling
 *
 * Loads the Drive module on demand, runs it, logs the mean page confidence
 * as a quality metric and returns the module's results untouched. OCR
 * failures are logged and re-thrown to the caller.
 *
 * @param {string} pdfPath - Path to PDF file
 * @param {Object} options - Passed unchanged to the Drive module
 * @returns {Promise<Array>} The page results produced by the Drive module
 * @throws {Error} When the Drive module cannot be loaded, or whatever the
 *   module itself throws
 */
async function extractWithGoogleDrive(pdfPath, options) {
  const driveModule = await loadGoogleDrive();
  if (!driveModule) {
    throw new Error('Google Drive module not available');
  }

  try {
    const results = await driveModule.extractTextFromPDFGoogleDrive(pdfPath, options);

    // Log quality metrics
    if (results.length === 0) {
      // A mean over zero pages is 0 / 0 = NaN, so report the empty result instead.
      console.warn('[Google Drive OCR] Completed but returned no pages');
    } else {
      const avgConfidence = results.reduce((sum, r) => sum + r.confidence, 0) / results.length;
      console.log(`[Google Drive OCR] Completed with avg confidence: ${avgConfidence.toFixed(2)}`);
    }

    return results;
  } catch (error) {
    console.error('[Google Drive OCR] Error:', error.message);
    throw error;
  }
}
|
|
|
|
/**
 * Describe every OCR engine this service knows about.
 *
 * Availability is looked up at call time (remote worker info plus the two
 * Google configuration checks); the remaining fields are fixed descriptions.
 *
 * @returns {Object} - Status of each engine, keyed by engine name
 */
export function getAvailableEngines() {
  const { enabled: remoteEnabled, url: remoteUrl } = getOCRWorkerInfo();
  const visionReady = isVisionConfigured();
  const driveReady = isGoogleDriveConfigured();

  const remoteOcr = {
    available: remoteEnabled,
    quality: 'good',
    speed: 'fast',
    cost: 'free',
    notes: 'Offloads OCR to dedicated Proxmox server, saves local CPU',
    handwriting: false,
    pageByPage: true,
    boundingBoxes: false,
    url: remoteUrl
  };

  const googleVision = {
    available: visionReady,
    quality: 'excellent',
    speed: 'fast',
    cost: '$1.50/1000 pages (1000/month free)',
    notes: 'RECOMMENDED: Real OCR API, fastest, most accurate',
    handwriting: true,
    pageByPage: true,
    boundingBoxes: true
  };

  const googleDrive = {
    available: driveReady,
    quality: 'excellent',
    speed: 'slow',
    cost: 'free (unlimited)',
    notes: 'Workaround using Docs conversion, slower',
    handwriting: true,
    pageByPage: false,
    boundingBoxes: false
  };

  const tesseract = {
    available: true,
    quality: 'good',
    speed: 'fast',
    cost: 'free',
    notes: 'Local, private, no handwriting support',
    handwriting: false,
    pageByPage: true,
    boundingBoxes: false
  };

  return {
    'remote-ocr': remoteOcr,
    'google-vision': googleVision,
    'google-drive': googleDrive,
    tesseract
  };
}
|
|
|
|
/**
 * Recommend best OCR engine for a given document
 *
 * Documents over 100 pages or over 20 MiB always get Tesseract, to avoid
 * Vision API costs. Anything smaller gets the first configured engine in the
 * order Vision API, Drive API, then Tesseract.
 *
 * @param {Object} [documentInfo] - Document metadata; when omitted or null the
 *   defaults below apply
 * @param {number} [documentInfo.pageCount=1] - Number of pages
 * @param {number} [documentInfo.fileSize=0] - File size in bytes
 * @returns {string} - Recommended engine name
 */
export function recommendEngine(documentInfo) {
  // Tolerate a missing argument: destructuring undefined/null would throw.
  const { pageCount = 1, fileSize = 0 } = documentInfo || {};

  // For large documents, use Tesseract to save on Vision API costs
  if (pageCount > 100 || fileSize > 20 * 1024 * 1024) {
    return 'tesseract';
  }

  // Below the size limits, prefer the Vision API when it is configured
  if (isVisionConfigured()) {
    return 'google-vision';
  }

  // Otherwise use the Drive API when it is configured
  if (isGoogleDriveConfigured()) {
    return 'google-drive';
  }

  return 'tesseract';
}
|