// navidocs/server/services/pdf-text-extractor.js

/**
 * Native PDF Text Extraction using pdfjs-dist
 * Extracts text directly from PDF without OCR
 *
 * Performance: 36x faster than Tesseract for text-based PDFs
 * Use case: Extract native text from PDFs before attempting OCR
 */

import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
import { readFileSync } from 'fs';

/**
 * Extract the native (embedded) text of every page in a PDF.
 *
 * The file is read fully into memory and parsed with pdfjs-dist. For each page,
 * the `str` of every text item is joined with a single space and the result is
 * trimmed. The pdf.js loading task is destroyed before this function settles
 * (on success or on failure) so the parsed document's resources are released.
 *
 * @param {string} pdfPath - Path to the PDF file (callers are expected to pass an absolute path)
 * @returns {Promise<string[]>} Array of page texts (index 0 = page 1)
 * @throws {Error} If the file cannot be read or pdf.js fails to load a page
 */
export async function extractNativeTextPerPage(pdfPath) {
  const data = new Uint8Array(readFileSync(pdfPath));
  const loadingTask = pdfjsLib.getDocument({ data });

  try {
    const pdf = await loadingTask.promise;
    const pageTexts = [];

    // Pages are processed sequentially; pdf.js page numbers are 1-based.
    for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();
      const pageText = textContent.items.map((item) => item.str).join(' ');
      pageTexts.push(pageText.trim());
    }

    return pageTexts;
  } finally {
    // Always tear the document down, even when loading or extraction failed,
    // otherwise every call leaves a parsed document alive.
    await loadingTask.destroy();
  }
}

/**
 * Decide whether a PDF carries enough embedded text to be worth using directly.
 *
 * The per-page texts are concatenated (no separator) and the combined length is
 * compared against `minChars`. Any failure while extracting is logged and
 * treated as "no native text".
 *
 * @param {string} pdfPath - Absolute path to PDF file
 * @param {number} minChars - Minimum character threshold (default: 100)
 * @returns {Promise<boolean>} True if the combined text length is at least `minChars`; false otherwise or on error
 */
export async function hasNativeText(pdfPath, minChars = 100) {
  try {
    const perPage = await extractNativeTextPerPage(pdfPath);
    const combinedLength = perPage.join('').length;

    return minChars <= combinedLength;
  } catch (error) {
    // Best-effort check: report the problem and fall back to "no text".
    console.error('[PDF Text Extractor] Error checking native text:', error.message);
    return false;
  }
}

/**
 * Extract native text from a single page of a PDF.
 *
 * `pageNumber` is validated before the file is touched, so an invalid request
 * fails fast without reading or parsing the PDF. The pdf.js loading task is
 * destroyed before this function settles (on success or on failure) so the
 * parsed document's resources are released.
 *
 * @param {string} pdfPath - Path to the PDF file (callers are expected to pass an absolute path)
 * @param {number} pageNumber - Page number (1-indexed)
 * @returns {Promise<string>} Page text content: item strings joined by single spaces, trimmed
 * @throws {TypeError} If `pageNumber` is not an integer
 * @throws {RangeError} If `pageNumber` is less than 1 or greater than the page count
 * @throws {Error} If the file cannot be read or pdf.js fails to load the page
 */
export async function extractPageText(pdfPath, pageNumber) {
  if (!Number.isInteger(pageNumber)) {
    throw new TypeError(`pageNumber must be an integer, received: ${String(pageNumber)}`);
  }
  if (pageNumber < 1) {
    throw new RangeError(`pageNumber must be >= 1, received: ${pageNumber}`);
  }

  const data = new Uint8Array(readFileSync(pdfPath));
  const loadingTask = pdfjsLib.getDocument({ data });

  try {
    const pdf = await loadingTask.promise;

    if (pageNumber > pdf.numPages) {
      throw new RangeError(
        `pageNumber ${pageNumber} is out of range; document has ${pdf.numPages} page(s)`
      );
    }

    const page = await pdf.getPage(pageNumber);
    const textContent = await page.getTextContent();
    const pageText = textContent.items.map((item) => item.str).join(' ');

    return pageText.trim();
  } finally {
    // Always tear the document down, even when loading or extraction failed,
    // otherwise every call leaves a parsed document alive.
    await loadingTask.destroy();
  }
}