diff --git a/server/services/ocr.js b/server/services/ocr.js index 9404d53..64e0906 100644 --- a/server/services/ocr.js +++ b/server/services/ocr.js @@ -172,9 +172,12 @@ async function convertPDFPageToImage(pdfPath, pageNumber) { */ async function runTesseractOCR(imagePath, language = 'eng') { try { + // Ensure language code is 'eng' not 'en' for tesseract + const tessLang = language === 'en' ? 'eng' : language; + // Use local system tesseract command (faster and more reliable) const result = execSync( - `tesseract "${imagePath}" stdout -l ${language} --psm 1`, + `TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata tesseract "${imagePath}" stdout -l ${tessLang} --psm 1`, { encoding: 'utf8', maxBuffer: 10 * 1024 * 1024 } // 10MB buffer ); diff --git a/test/data/05-versions-space.pdf b/test/data/05-versions-space.pdf new file mode 100644 index 0000000..1dca354 Binary files /dev/null and b/test/data/05-versions-space.pdf differ diff --git a/test/data/05-versions-space.pdf.txt b/test/data/05-versions-space.pdf.txt new file mode 100644 index 0000000..8196059 --- /dev/null +++ b/test/data/05-versions-space.pdf.txt @@ -0,0 +1,13 @@ + + +NaviDocs Test Manual + +Page 1 + +Bilge Pump Maintenance + +The bilge pump is located in the aft compartment. +Regular maintenance is required every 6 months. + +Electrical System +Check the battery connections regularly. \ No newline at end of file