navidocs/server/routes/quick-ocr.js
Danny Stocker 58b344aa31 FINAL: P0 blockers fixed + Joe Trader + ignore binaries
Fixed:
- Price: €800K-€1.5M, Sunseeker added
- Agent 1: Joe Trader persona + actual sale ads research
- Ignored meilisearch binary + data/ (too large for GitHub)
- SESSION_DEBUG_BLOCKERS.md created

Ready for Session 1 launch.

🤖 Generated with Claude Code
2025-11-13 01:29:59 +01:00

211 lines
6.2 KiB
JavaScript

/**
* Quick OCR Route - POST /api/upload/quick-ocr
* Extracts the embedded text of a PDF's first page (no OCR pass is run) and derives metadata for form auto-fill
*/
import express from 'express';
import multer from 'multer';
import pdfParse from 'pdf-parse';
import { tmpdir } from 'os';
import { join } from 'path';
import { writeFileSync, unlinkSync, readFileSync } from 'fs';
import { v4 as uuidv4 } from 'uuid';
const router = express.Router();

// Upload size cap in bytes. MAX_FILE_SIZE overrides the 50 MiB default; it is
// parsed as base-10, and anything that is not a positive integer (unset,
// empty, non-numeric, zero, negative) falls back to the default so multer is
// never handed NaN as a limit.
const DEFAULT_MAX_FILE_SIZE = 52428800; // 50MB
const configuredMaxFileSize = Number.parseInt(process.env.MAX_FILE_SIZE || '', 10);

// Configure multer for memory storage: the uploaded file is exposed to the
// route handler as an in-memory buffer.
const upload = multer({
  storage: multer.memoryStorage(),
  limits: {
    fileSize:
      Number.isFinite(configuredMaxFileSize) && configuredMaxFileSize > 0
        ? configuredMaxFileSize
        : DEFAULT_MAX_FILE_SIZE
  }
});
/**
 * Extract boat metadata from extracted document text, for form auto-fill.
 *
 * Heuristics applied, in order:
 * - Year: first 4-digit year in the range 1990-2030.
 * - Make: first entry of a fixed manufacturer list found as a whole word
 *   (case-insensitive).
 * - Model: first model-like token (e.g. 630, S45, F4.9) in a window of 50
 *   characters before to 100 characters after the make, skipping a token
 *   that is just the detected year.
 * - Title: first substantial line of text (joined with the second line when
 *   very short), falling back to the filename.
 * - Boat name: leading segment of the filename, else a capitalised phrase
 *   immediately before the make; generic document words (Manual, Owner, ...)
 *   are rejected in both cases.
 *
 * @param {string} ocrText - Text extracted from the document. Non-string
 *   values are treated as empty text.
 * @param {string} [filename=''] - Original upload filename, used as a
 *   fallback source for the title and boat name.
 * @returns {{title: string, boatName: string, boatMake: string,
 *   boatModel: string, boatYear: (number|null)}} Extracted fields; anything
 *   not detected stays '' (or null for boatYear).
 */
function extractMetadata(ocrText, filename = '') {
  // Tolerate null/undefined input instead of throwing on the first .match().
  const text = typeof ocrText === 'string' ? ocrText : '';
  const sourceName = typeof filename === 'string' ? filename : '';

  const metadata = {
    title: '',
    boatName: '',
    boatMake: '',
    boatModel: '',
    boatYear: null
  };

  // Remove .pdf extension from filename
  const cleanFilename = sourceName.replace(/\.pdf$/i, '');

  // Common boat manufacturers
  const boatMakes = [
    'Prestige', 'Ferretti', 'Sunseeker', 'Princess', 'Azimut', 'Beneteau',
    'Jeanneau', 'Bavaria', 'Catalina', 'Hunter', 'Lagoon', 'Fountaine Pajot',
    'Sea Ray', 'Boston Whaler', 'Grady-White', 'Chris-Craft', 'Tiara',
    'Viking', 'Hatteras', 'Ocean Alexander', 'Grand Banks'
  ];

  // Words that indicate a document label rather than a boat name.
  const commonWords = ['Manual', 'Owner', 'Service', 'Document', 'Guide', 'Book'];
  const isGenericPhrase = (phrase) => {
    const lowered = phrase.toLowerCase();
    return commonWords.some((word) => lowered.includes(word.toLowerCase()));
  };

  // Extract year (look for 4-digit years 1990-2030)
  const yearMatch = text.match(/\b(19[9][0-9]|20[0-2][0-9]|2030)\b/);
  if (yearMatch) {
    metadata.boatYear = Number.parseInt(yearMatch[1], 10);
  }

  // Extract boat make (case-insensitive, whole word). The position of the
  // whole-word match is kept so later lookups are anchored on the same
  // occurrence that was detected, not on an earlier substring hit such as
  // "Hunters".
  let makeIndex = -1;
  for (const make of boatMakes) {
    const makeMatch = new RegExp(`\\b${make}\\b`, 'i').exec(text);
    if (makeMatch) {
      metadata.boatMake = make;
      makeIndex = makeMatch.index;
      break;
    }
  }

  // Extract model (usually alphanumeric, near the make)
  if (metadata.boatMake) {
    const nearMake = text.substring(Math.max(0, makeIndex - 50), makeIndex + 100);
    const detectedYear = metadata.boatYear === null ? null : String(metadata.boatYear);
    // Model patterns: optional letter + 2-4 digits with optional decimal
    // (630, S45, 45.1), or letter + single digit + decimal (F4.9).
    const modelPattern = /\b([A-Z]?[0-9]{2,4}(?:\.[0-9])?|[A-Z][0-9]\.[0-9])\b/g;
    for (const candidate of nearMake.matchAll(modelPattern)) {
      // A token equal to the detected year is the year, not the model.
      if (candidate[1] !== detectedYear) {
        metadata.boatModel = candidate[1];
        break;
      }
    }
  }

  // Extract title from first few lines
  const lines = text
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 3);
  if (lines.length > 0) {
    // Use the first substantial line as title
    let titleLine = lines[0];
    // If first line is very short, try combining with second line
    if (titleLine.length < 15 && lines.length > 1) {
      titleLine = `${titleLine} ${lines[1]}`;
    }
    // Clean up title (remove excessive whitespace, special chars)
    metadata.title = titleLine
      .replace(/\s+/g, ' ')
      .replace(/[^\w\s\-(),.]/g, '')
      .substring(0, 100)
      .trim();
  }

  // If no title found in the text, use filename
  if (!metadata.title && cleanFilename) {
    metadata.title = cleanFilename
      .replace(/[_-]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();
  }

  // Extract boat name from filename.
  // Look for pattern: BoatName_Something or BoatName-Something
  if (!metadata.boatName && cleanFilename) {
    const filenameMatch = cleanFilename.match(/^([A-Z][a-zA-Z0-9\s]+?)(?:[_-]|$)/);
    if (filenameMatch) {
      const potentialName = filenameMatch[1].trim();
      // Only use if it's not a common word like "Manual", "Owner", etc.
      if (!isGenericPhrase(potentialName)) {
        metadata.boatName = potentialName;
      }
    }
  }

  // Otherwise look for a capitalised phrase directly before the make,
  // applying the same generic-word filter as for filename-derived names.
  if (!metadata.boatName && metadata.boatMake) {
    const beforeMake = text.substring(Math.max(0, makeIndex - 100), makeIndex);
    const nameMatch = beforeMake.match(/\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s*$/);
    if (nameMatch) {
      const potentialName = nameMatch[1].trim();
      if (!isGenericPhrase(potentialName)) {
        metadata.boatName = potentialName;
      }
    }
  }

  return metadata;
}
/**
 * POST /api/upload/quick-ocr
 * Extract the embedded text of the first PDF page and return metadata
 * derived from it. No OCR pass is run, so a PDF without embedded text
 * yields empty text and only filename-derived metadata.
 *
 * @body {File} file - PDF file (multipart field name "file")
 * @returns {Object} 200: { success: true, metadata: {...}, ocrText: '...' }
 *   400: { error } when no file was uploaded or it is not application/pdf
 *   500: { error: 'Quick OCR failed', message } when text extraction throws
 */
router.post('/', upload.single('file'), async (req, res) => {
  try {
    const file = req.file;
    if (!file) {
      return res.status(400).json({ error: 'No file uploaded' });
    }
    if (file.mimetype !== 'application/pdf') {
      return res.status(400).json({ error: 'Only PDF files are supported' });
    }

    console.log(`[Quick OCR] Extracting embedded text from ${file.originalname}`);

    // Fast text extraction (no OCR) - works if PDF has embedded text.
    // The upload is already held in memory as file.buffer, so it is parsed
    // directly; no temp file is written, which avoids blocking disk I/O and
    // leaves nothing to clean up on either the success or error path.
    const pdfData = await pdfParse(file.buffer, {
      max: 1 // Only parse first page
    });
    const firstPageText = pdfData.text || '';
    console.log(`[Quick OCR] Text extraction completed (fast)`);
    console.log(`[Quick OCR] Text length: ${firstPageText.length} characters`);

    // Extract metadata
    const metadata = extractMetadata(firstPageText, file.originalname);
    console.log(`[Quick OCR] Extracted metadata:`, metadata);

    return res.json({
      success: true,
      metadata,
      ocrText: firstPageText.substring(0, 500) // Return first 500 chars for debugging
    });
  } catch (error) {
    console.error('[Quick OCR] Error:', error);
    return res.status(500).json({
      error: 'Quick OCR failed',
      message: error.message
    });
  }
});
export default router;