/**
 * Native PDF Text Extraction using pdfjs-dist
 * Extracts text directly from PDF without OCR
 *
 * Performance: 36x faster than Tesseract for text-based PDFs
 * Use case: Extract native text from PDFs before attempting OCR
 */

import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
import { readFileSync } from 'fs';

/**
 * Open a PDF, run `callback` against the loaded document, and always release
 * the document afterwards.
 *
 * @template T
 * @param {string} pdfPath - Absolute path to PDF file
 * @param {(pdf: object) => Promise<T>} callback - Work to perform on the loaded document
 * @returns {Promise<T>} Whatever `callback` resolves to
 * @throws If the file cannot be read, the PDF cannot be parsed, or `callback` rejects
 */
async function withPdfDocument(pdfPath, callback) {
  const data = new Uint8Array(readFileSync(pdfPath));
  const loadingTask = pdfjsLib.getDocument({ data });
  try {
    const pdf = await loadingTask.promise;
    return await callback(pdf);
  } finally {
    // Destroying the loading task also tears down the document it produced,
    // so nothing stays alive after we return -- even on failure.
    await loadingTask.destroy();
  }
}

/**
 * Read the text of one page from an already-loaded document.
 *
 * @param {object} pdf - Loaded pdfjs document
 * @param {number} pageNumber - Page number (1-indexed)
 * @returns {Promise<string>} Text items joined by single spaces, trimmed
 */
async function readPageText(pdf, pageNumber) {
  const page = await pdf.getPage(pageNumber);
  try {
    const textContent = await page.getTextContent();
    return textContent.items
      .map((item) => item.str)
      .join(' ')
      .trim();
  } finally {
    // Release per-page resources early so long documents do not accumulate them.
    page.cleanup();
  }
}

/**
 * Extract native text from each page of a PDF
 *
 * @param {string} pdfPath - Absolute path to PDF file
 * @returns {Promise<string[]>} Array of page texts (index 0 = page 1)
 * @throws If the file cannot be read or the PDF cannot be parsed
 */
export async function extractNativeTextPerPage(pdfPath) {
  return withPdfDocument(pdfPath, async (pdf) => {
    const pageTexts = [];
    // Pages are read sequentially to keep the output in page order.
    for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
      pageTexts.push(await readPageText(pdf, pageNum));
    }
    return pageTexts;
  });
}

/**
 * Check if PDF has substantial native text
 *
 * Stops reading pages as soon as the threshold is reached. Any failure while
 * reading is logged and reported as `false` rather than thrown.
 *
 * @param {string} pdfPath - Absolute path to PDF file
 * @param {number} minChars - Minimum character threshold (default: 100)
 * @returns {Promise<boolean>} True if PDF has native text
 */
export async function hasNativeText(pdfPath, minChars = 100) {
  try {
    return await withPdfDocument(pdfPath, async (pdf) => {
      let totalChars = 0;
      for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
        const pageText = await readPageText(pdf, pageNum);
        totalChars += pageText.length;
        if (totalChars >= minChars) {
          return true;
        }
      }
      // Also covers zero-page documents and non-positive thresholds.
      return totalChars >= minChars;
    });
  } catch (error) {
    console.error('[PDF Text Extractor] Error checking native text:', error?.message ?? error);
    return false;
  }
}

/**
 * Extract native text from a single page
 *
 * @param {string} pdfPath - Absolute path to PDF file
 * @param {number} pageNumber - Page number (1-indexed)
 * @returns {Promise<string>} Page text content
 * @throws {RangeError} If `pageNumber` is not an integer within 1..page count
 * @throws If the file cannot be read or the PDF cannot be parsed
 */
export async function extractPageText(pdfPath, pageNumber) {
  if (!Number.isInteger(pageNumber) || pageNumber < 1) {
    throw new RangeError(`pageNumber must be a positive integer, got ${String(pageNumber)}`);
  }

  return withPdfDocument(pdfPath, async (pdf) => {
    if (pageNumber > pdf.numPages) {
      throw new RangeError(
        `pageNumber ${pageNumber} is out of range (document has ${pdf.numPages} pages)`,
      );
    }
    return readPageText(pdf, pageNumber);
  });
}