navidocs/server/services/ocr-client.js
Danny Stocker 58b344aa31 FINAL: P0 blockers fixed + Joe Trader + ignore binaries
Fixed:
- Price: €800K-€1.5M, Sunseeker added
- Agent 1: Joe Trader persona + actual sale ads research
- Ignored meilisearch binary + data/ (too large for GitHub)
- SESSION_DEBUG_BLOCKERS.md created

Ready for Session 1 launch.

🤖 Generated with Claude Code
2025-11-13 01:29:59 +01:00

120 lines
3.2 KiB
JavaScript

/**
* OCR Client - Forward OCR requests to remote worker
*
* This service calls the remote OCR worker (naviocr) instead of
* running OCR locally. This offloads CPU-intensive processing.
*/
import { readFileSync } from 'fs';
import { readFile } from 'fs/promises';
import { basename } from 'path';
import FormData from 'form-data';
import logger from '../utils/logger.js';
/**
 * Parse a strictly positive base-10 integer, falling back when the input is
 * missing, non-numeric, zero or negative.
 *
 * @param {string|undefined} raw - Raw value (typically from an environment variable)
 * @param {number} fallback - Value returned when `raw` is not a positive integer
 * @returns {number}
 */
function parsePositiveInt(raw, fallback) {
  const parsed = Number.parseInt(raw, 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

// Base URL of the remote OCR worker; request paths are appended to it.
// NOTE(review): the built-in default is a plain-HTTP endpoint — confirm that
// is acceptable for the documents being uploaded, or override via env.
const OCR_WORKER_URL = process.env.OCR_WORKER_URL || 'http://fr-antibes.duckdns.org/naviocr';
// Upload/processing timeout in milliseconds. An invalid value would make
// AbortSignal.timeout() throw (or abort instantly for 0), so fall back instead.
const OCR_WORKER_TIMEOUT = parsePositiveInt(process.env.OCR_WORKER_TIMEOUT, 300000); // 5 minutes
// Remote OCR is opt-in: only the exact string 'true' enables it.
const USE_REMOTE_OCR = process.env.USE_REMOTE_OCR === 'true';
/**
 * Extract text from a PDF by uploading it to the remote OCR worker.
 *
 * The file is read into memory, posted as multipart/form-data to
 * `${OCR_WORKER_URL}/ocr` together with the language code, and the worker's
 * JSON reply is unwrapped into the list of pages.
 *
 * @param {string} pdfPath - Absolute path to PDF file
 * @param {Object} [options] - OCR options
 * @param {string} [options.language='eng'] - Language code
 * @param {Function} [options.onProgress] - Progress callback; invoked once,
 *   as `onProgress(totalPages, totalPages)`, after the worker has answered
 * @returns {Promise<Array<{pageNumber: number, text: string, confidence: number}>>}
 * @throws {Error} If remote OCR is disabled, the file cannot be read, the
 *   request times out, or the worker reports a failure. Errors raised while
 *   talking to the worker carry the underlying error as `cause`.
 */
export async function extractTextFromPDF(pdfPath, options = {}) {
  const { language = 'eng', onProgress } = options;

  if (!USE_REMOTE_OCR) {
    throw new Error('Remote OCR is not enabled. Set USE_REMOTE_OCR=true');
  }

  try {
    logger.info(`Remote OCR: Sending ${pdfPath} to ${OCR_WORKER_URL}`);

    // Read asynchronously so a large PDF does not block the event loop.
    const pdfBuffer = await readFile(pdfPath);

    // Create form data with file and language
    const formData = new FormData();
    formData.append('file', pdfBuffer, {
      filename: basename(pdfPath),
      contentType: 'application/pdf'
    });
    formData.append('language', language);

    // The `form-data` package produces a legacy stream that the built-in
    // fetch does not consume as a request body. Every part appended above is
    // a Buffer or string, so the payload can be serialized up front; the
    // headers carry the matching multipart boundary.
    const response = await fetch(`${OCR_WORKER_URL}/ocr`, {
      method: 'POST',
      body: formData.getBuffer(),
      headers: formData.getHeaders(),
      signal: AbortSignal.timeout(OCR_WORKER_TIMEOUT)
    });

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`OCR worker returned ${response.status}: ${errorText}`);
    }

    const result = await response.json();
    if (!result.success) {
      throw new Error(result.error || 'OCR processing failed');
    }

    logger.info(`Remote OCR: Completed ${result.totalPages} pages`);

    // Call progress callback with final count
    if (onProgress && result.totalPages) {
      onProgress(result.totalPages, result.totalPages);
    }

    return result.pages;
  } catch (error) {
    logger.error('Remote OCR error:', error);

    // AbortSignal.timeout() aborts with a DOMException named 'TimeoutError';
    // 'AbortError' is kept for runtimes/polyfills that report it that way.
    if (error.name === 'TimeoutError' || error.name === 'AbortError') {
      throw new Error(`OCR worker timeout after ${OCR_WORKER_TIMEOUT}ms`, { cause: error });
    }
    throw new Error(`Remote OCR failed: ${error.message}`, { cause: error });
  }
}
/**
 * Probe the remote OCR worker's health endpoint.
 *
 * Requests `${OCR_WORKER_URL}/health` with a 5 second timeout. This function
 * never throws: a failed request, a non-OK HTTP status or an unreadable body
 * all resolve to `false`.
 *
 * @returns {Promise<boolean>} `true` only when the worker answers with an OK
 *   status and a JSON body whose `status` field equals 'ok'
 */
export async function checkRemoteOCRHealth() {
  try {
    const healthUrl = `${OCR_WORKER_URL}/health`;
    const reply = await fetch(healthUrl, { signal: AbortSignal.timeout(5000) });

    if (reply.ok) {
      const payload = await reply.json();
      return payload.status === 'ok';
    }

    return false;
  } catch (err) {
    // Treat any failure as "unavailable"; log only the message.
    logger.warn('Remote OCR health check failed:', err.message);
    return false;
  }
}
/**
 * Report the remote OCR worker settings this module is using.
 *
 * @returns {{enabled: boolean, url: string, timeout: number}} A fresh object
 *   holding the values of USE_REMOTE_OCR, OCR_WORKER_URL and
 *   OCR_WORKER_TIMEOUT (milliseconds)
 */
export function getOCRWorkerInfo() {
  const workerInfo = {};
  workerInfo.enabled = USE_REMOTE_OCR;
  workerInfo.url = OCR_WORKER_URL;
  workerInfo.timeout = OCR_WORKER_TIMEOUT;
  return workerInfo;
}