diff --git a/server/services/ocr.js b/server/services/ocr.js index efe2e2a..9404d53 100644 --- a/server/services/ocr.js +++ b/server/services/ocr.js @@ -172,15 +172,19 @@ async function convertPDFPageToImage(pdfPath, pageNumber) { */ async function runTesseractOCR(imagePath, language = 'eng') { try { - const worker = await Tesseract.createWorker(language); + // Use local system tesseract command (faster and more reliable) + const result = execSync( + `tesseract "${imagePath}" stdout -l ${language} --psm 1`, + { encoding: 'utf8', maxBuffer: 10 * 1024 * 1024 } // 10MB buffer + ); - const { data } = await worker.recognize(imagePath); - - await worker.terminate(); + // Tesseract doesn't provide confidence via stdout, so we'll estimate based on output + const text = result.trim(); + const confidence = text.length > 0 ? 0.85 : 0.0; // Rough estimate return { - text: data.text, - confidence: data.confidence / 100 // Convert to 0-1 range + text, + confidence }; } catch (error) { console.error('Tesseract OCR error:', error); diff --git a/server/workers/ocr-worker.js b/server/workers/ocr-worker.js index 342785a..a54ad5f 100644 --- a/server/workers/ocr-worker.js +++ b/server/workers/ocr-worker.js @@ -231,7 +231,7 @@ async function processOCRJob(job) { * Create and start the OCR worker */ export function createOCRWorker() { - const worker = new Worker('ocr-jobs', processOCRJob, { + const worker = new Worker('ocr-processing', processOCRJob, { connection, concurrency: parseInt(process.env.OCR_CONCURRENCY || '2'), // Process 2 documents at a time limiter: { diff --git a/test-manual.pdf b/test-manual.pdf new file mode 100644 index 0000000..1dca354 Binary files /dev/null and b/test-manual.pdf differ