fix: Switch to local system tesseract command for OCR
- Replace Tesseract.js with local tesseract CLI due to CDN 404 issues
- Fix queue name mismatch (ocr-processing vs ocr-jobs)
- Local tesseract uses pre-installed training data
- Faster and more reliable than downloading from CDN

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
09892de4a3
commit
af02363299
3 changed files with 11 additions and 7 deletions
|
|
@ -172,15 +172,19 @@ async function convertPDFPageToImage(pdfPath, pageNumber) {
|
||||||
*/
|
*/
|
||||||
async function runTesseractOCR(imagePath, language = 'eng') {
|
async function runTesseractOCR(imagePath, language = 'eng') {
|
||||||
try {
|
try {
|
||||||
const worker = await Tesseract.createWorker(language);
|
// Use local system tesseract command (faster and more reliable)
|
||||||
|
const result = execSync(
|
||||||
|
`tesseract "${imagePath}" stdout -l ${language} --psm 1`,
|
||||||
|
{ encoding: 'utf8', maxBuffer: 10 * 1024 * 1024 } // 10MB buffer
|
||||||
|
);
|
||||||
|
|
||||||
const { data } = await worker.recognize(imagePath);
|
// Tesseract doesn't provide confidence via stdout, so we'll estimate based on output
|
||||||
|
const text = result.trim();
|
||||||
await worker.terminate();
|
const confidence = text.length > 0 ? 0.85 : 0.0; // Rough estimate
|
||||||
|
|
||||||
return {
|
return {
|
||||||
text: data.text,
|
text,
|
||||||
confidence: data.confidence / 100 // Convert to 0-1 range
|
confidence
|
||||||
};
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Tesseract OCR error:', error);
|
console.error('Tesseract OCR error:', error);
|
||||||
|
|
|
||||||
|
|
@ -231,7 +231,7 @@ async function processOCRJob(job) {
|
||||||
* Create and start the OCR worker
|
* Create and start the OCR worker
|
||||||
*/
|
*/
|
||||||
export function createOCRWorker() {
|
export function createOCRWorker() {
|
||||||
const worker = new Worker('ocr-jobs', processOCRJob, {
|
const worker = new Worker('ocr-processing', processOCRJob, {
|
||||||
connection,
|
connection,
|
||||||
concurrency: parseInt(process.env.OCR_CONCURRENCY || '2'), // Process 2 documents at a time
|
concurrency: parseInt(process.env.OCR_CONCURRENCY || '2'), // Process 2 documents at a time
|
||||||
limiter: {
|
limiter: {
|
||||||
|
|
|
||||||
BIN
test-manual.pdf
Normal file
BIN
test-manual.pdf
Normal file
Binary file not shown.
Loading…
Add table
Reference in a new issue