fix: Switch to local system tesseract command for OCR

- Replace Tesseract.js with local tesseract CLI due to CDN 404 issues - Fix queue name mismatch (ocr-processing vs ocr-jobs) - Local tesseract uses pre-installed training data - Faster and more reliable than downloading from CDN \ud83e\udd16 Generated with Claude Code Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-19 04:48:18 +02:00 · 2025-10-19 04:48:18 +02:00 · af02363299
commit af02363299
parent 09892de4a3
3 changed files with 11 additions and 7 deletions
--- a/server/services/ocr.js
+++ b/server/services/ocr.js
@ -172,15 +172,19 @@ async function convertPDFPageToImage(pdfPath, pageNumber) {
 */
 async function runTesseractOCR(imagePath, language = 'eng') {
  try {
-    const worker = await Tesseract.createWorker(language);
+    // Use local system tesseract command (faster and more reliable)
+    const result = execSync(
+      `tesseract "${imagePath}" stdout -l ${language} --psm 1`,
+      { encoding: 'utf8', maxBuffer: 10 * 1024 * 1024 } // 10MB buffer
+    );

-    const { data } = await worker.recognize(imagePath);
-
-    await worker.terminate();
+    // Tesseract doesn't provide confidence via stdout, so we'll estimate based on output
+    const text = result.trim();
+    const confidence = text.length > 0 ? 0.85 : 0.0; // Rough estimate

    return {
-      text: data.text,
-      confidence: data.confidence / 100 // Convert to 0-1 range
+      text,
+      confidence
    };
  } catch (error) {
    console.error('Tesseract OCR error:', error);
--- a/server/workers/ocr-worker.js
+++ b/server/workers/ocr-worker.js
@ -231,7 +231,7 @@ async function processOCRJob(job) {
 * Create and start the OCR worker
 */
 export function createOCRWorker() {
-  const worker = new Worker('ocr-jobs', processOCRJob, {
+  const worker = new Worker('ocr-processing', processOCRJob, {
    connection,
    concurrency: parseInt(process.env.OCR_CONCURRENCY || '2'), // Process 2 documents at a time
    limiter: {
--- a/test-manual.pdf
+++ b/test-manual.pdf