// navidocs/server/routes/quick-ocr.js

/**
 * Quick OCR Route - POST /api/upload/quick-ocr
 * Extracts the embedded text of a PDF's first page (no OCR engine is invoked)
 * and derives metadata from it for form auto-fill
 */

import express from 'express';
import multer from 'multer';
import pdfParse from 'pdf-parse';
import { tmpdir } from 'os';
import { join } from 'path';
import { writeFileSync, unlinkSync, readFileSync } from 'fs';
import { v4 as uuidv4 } from 'uuid';

const router = express.Router();

// Upload size cap (bytes) used when MAX_FILE_SIZE is unset or not a number: 50MB.
const DEFAULT_MAX_FILE_SIZE = 52428800;

// Parse with an explicit radix. parseInt yields NaN for a non-numeric value,
// which is not a usable size limit, so fall back to the default in that case.
const configuredMaxFileSize = Number.parseInt(process.env.MAX_FILE_SIZE || '', 10);

// Configure multer for memory storage: the uploaded file is exposed to the
// route handler as an in-memory buffer (file.buffer).
const upload = multer({
  storage: multer.memoryStorage(),
  limits: {
    fileSize: Number.isNaN(configuredMaxFileSize)
      ? DEFAULT_MAX_FILE_SIZE
      : configuredMaxFileSize
  }
});

/**
 * Extract form auto-fill metadata from the text of a document's first page.
 *
 * All fields are best-effort heuristics:
 * - boatYear:  first standalone 4-digit year in the range 1990-2030
 * - boatMake:  first entry of the built-in manufacturer list that occurs as a
 *              whole word (case-insensitive)
 * - boatModel: first model-like token (F4.9, 630, S45, ...) found from 50
 *              characters before the make to 100 characters after its start,
 *              skipping the token already used as the year
 * - title:     first line longer than 3 characters (joined with the next line
 *              when shorter than 15), otherwise the cleaned-up filename
 * - boatName:  leading capitalized segment of the filename unless it contains
 *              a generic word (Manual, Owner, ...), otherwise the capitalized
 *              word(s) immediately preceding the make in the text
 *
 * @param {string} ocrText - Text extracted from the document; non-string
 *   values are treated as empty text.
 * @param {string} [filename=''] - Original upload filename, used as fallback.
 * @returns {{title: string, boatName: string, boatMake: string,
 *   boatModel: string, boatYear: (number|null)}} Extracted metadata; fields
 *   that could not be determined stay '' (or null for boatYear).
 */
function extractMetadata(ocrText, filename = '') {
  // Tolerate null/undefined/non-string input instead of throwing on .match().
  const text = typeof ocrText === 'string' ? ocrText : '';
  const safeFilename = typeof filename === 'string' ? filename : '';

  const metadata = {
    title: '',
    boatName: '',
    boatMake: '',
    boatModel: '',
    boatYear: null
  };

  // Remove .pdf extension from filename
  const cleanFilename = safeFilename.replace(/\.pdf$/i, '');

  // Common boat manufacturers
  const boatMakes = [
    'Prestige', 'Ferretti', 'Sunseeker', 'Princess', 'Azimut', 'Beneteau',
    'Jeanneau', 'Bavaria', 'Catalina', 'Hunter', 'Lagoon', 'Fountaine Pajot',
    'Sea Ray', 'Boston Whaler', 'Grady-White', 'Chris-Craft', 'Tiara',
    'Viking', 'Hatteras', 'Ocean Alexander', 'Grand Banks'
  ];

  // Extract year (look for 4-digit years 1990-2030)
  const yearMatch = text.match(/\b(19[9][0-9]|20[0-2][0-9]|2030)\b/);
  if (yearMatch) {
    metadata.boatYear = Number.parseInt(yearMatch[1], 10);
  }
  const yearIndex = yearMatch ? yearMatch.index : -1;

  // Extract boat make (case-insensitive). Keep the index of the whole-word
  // match itself: a separate substring search could land on a longer word
  // (e.g. "Hunters") and anchor the model/name windows in the wrong place.
  let makeIndex = -1;
  for (const make of boatMakes) {
    const found = new RegExp(`\\b${make}\\b`, 'i').exec(text);
    if (found) {
      metadata.boatMake = make;
      makeIndex = found.index;
      break;
    }
  }

  // Extract model (usually alphanumeric, near the make)
  if (metadata.boatMake) {
    const windowStart = Math.max(0, makeIndex - 50);
    const nearMake = text.substring(windowStart, makeIndex + 100);

    // Common model patterns: F4.9, 630, S45, etc.
    for (const candidate of nearMake.matchAll(/\b([A-Z]?[0-9]{2,4}(?:\.[0-9])?)\b/g)) {
      // A 4-digit year also looks like a model number; do not report the
      // very token that was already taken as the year.
      if (windowStart + candidate.index === yearIndex) {
        continue;
      }
      metadata.boatModel = candidate[1];
      break;
    }
  }

  // Extract title from first few lines
  const lines = text.split('\n').map(l => l.trim()).filter(l => l.length > 3);
  if (lines.length > 0) {
    // Use the first substantial line as title
    let titleLine = lines[0];

    // If first line is very short, try combining with second line
    if (titleLine.length < 15 && lines.length > 1) {
      titleLine = `${titleLine} ${lines[1]}`;
    }

    // Clean up title (remove excessive whitespace, special chars)
    metadata.title = titleLine
      .replace(/\s+/g, ' ')
      .replace(/[^\w\s\-(),.]/g, '')
      .substring(0, 100)
      .trim();
  }

  // If no title found in the text, use filename
  if (!metadata.title && cleanFilename) {
    metadata.title = cleanFilename
      .replace(/[_-]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();
  }

  // Extract boat name from filename
  // Look for pattern: BoatName_Something or BoatName-Something
  if (!metadata.boatName && cleanFilename) {
    const filenameMatch = cleanFilename.match(/^([A-Z][a-zA-Z0-9\s]+?)(?:[_-]|$)/);
    if (filenameMatch) {
      const potentialName = filenameMatch[1].trim();
      // Only use if it's not a common word like "Manual", "Owner", etc.
      const commonWords = ['Manual', 'Owner', 'Service', 'Document', 'Guide', 'Book'];
      if (!commonWords.some(word => potentialName.toLowerCase().includes(word.toLowerCase()))) {
        metadata.boatName = potentialName;
      }
    }
  }

  // Look for boat name in the text: capitalized word(s) right before the make
  if (!metadata.boatName && metadata.boatMake) {
    const beforeMake = text.substring(Math.max(0, makeIndex - 100), makeIndex);
    const nameMatch = beforeMake.match(/\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s*$/);
    if (nameMatch) {
      metadata.boatName = nameMatch[1].trim();
    }
  }

  return metadata;
}

/**
 * POST /api/upload/quick-ocr
 * Extract the embedded text of the first page and return metadata derived
 * from it. No OCR engine is run, so a scanned PDF without a text layer
 * yields empty text and only filename-based metadata.
 *
 * Responses:
 * - 200 { success: true, metadata: {...}, ocrText: '...' } (ocrText is the
 *   first 500 characters of the extracted text)
 * - 400 { error } when no file was uploaded or its mimetype is not
 *   application/pdf
 * - 500 { error, message } when text extraction fails
 *
 * @body {File} file - PDF file
 * @returns {Object} { success: true, metadata: {...}, ocrText: '...' }
 */
router.post('/', upload.single('file'), async (req, res) => {
  try {
    const file = req.file;

    if (!file) {
      return res.status(400).json({ error: 'No file uploaded' });
    }

    if (file.mimetype !== 'application/pdf') {
      return res.status(400).json({ error: 'Only PDF files are supported' });
    }

    console.log(`[Quick OCR] Extracting embedded text from ${file.originalname}`);

    // Fast text extraction (no OCR) - works if PDF has embedded text.
    // The in-memory upload buffer is parsed directly: writing it to a temp
    // file only to read it straight back added two blocking disk operations
    // and a cleanup obligation without changing the bytes handed to the parser.
    const pdfData = await pdfParse(file.buffer, {
      max: 1 // Only parse first page
    });

    const firstPageText = pdfData.text || '';

    console.log(`[Quick OCR] Text extraction completed (fast)`);
    console.log(`[Quick OCR] Text length: ${firstPageText.length} characters`);

    // Extract metadata
    const metadata = extractMetadata(firstPageText, file.originalname);

    console.log(`[Quick OCR] Extracted metadata:`, metadata);

    res.json({
      success: true,
      metadata,
      ocrText: firstPageText.substring(0, 500) // Return first 500 chars for debugging
    });
  } catch (error) {
    console.error('[Quick OCR] Error:', error);

    res.status(500).json({
      error: 'Quick OCR failed',
      message: error.message
    });
  }
});

export default router;