This commit addresses multiple critical fixes and adds new functionality for the NaviDocs local testing environment (port 8083): Search Fixes: - Fixed search to use backend /api/search instead of direct Meilisearch - Resolves network accessibility issue when accessing from external IPs - Search now works from http://172.29.75.55:8083/search PDF Text Selection: - Added PDF.js text layer for selectable text - Imported pdf_viewer.css for proper text layer styling - Changed text layer opacity to 1 for better interaction - Added user-select: text for improved text selection - Pink selection highlight (rgba(255, 92, 178, 0.3)) Database Cleanup: - Created cleanup scripts to remove 20 duplicate documents - Removed 753 orphaned entries from Meilisearch index - Cleaned 17 document folders from filesystem - Kept only newest version of each document - Scripts: clean-duplicates.js, clean-meilisearch-orphans.js Auto-Fill Feature: - New /api/upload/quick-ocr endpoint for first-page OCR - Automatically extracts metadata from PDFs on file selection - Detects: boat make, model, year, name, and document title - Checks both OCR text and filename for boat name - Auto-fills upload form with extracted data - Shows loading indicator during metadata extraction - Graceful fallback to filename if OCR fails Tenant Management: - Updated organization ID to use boat name as tenant - Falls back to "Liliane 1" for single-tenant setup - Each boat becomes a unique tenant in the system Files Changed: - client/src/views/DocumentView.vue - Text layer implementation - client/src/composables/useSearch.js - Backend API integration - client/src/components/UploadModal.vue - Auto-fill feature - server/routes/quick-ocr.js - OCR endpoint (new) - server/index.js - Route registration - server/scripts/* - Cleanup utilities (new) Testing: All features tested on local deployment at http://172.29.75.55:8083 - Backend: http://localhost:8001 - Frontend: http://localhost:8083 - Meilisearch: http://localhost:7700 🤖 
Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
217 lines
6.4 KiB
JavaScript
217 lines
6.4 KiB
JavaScript
/**
|
|
* Quick OCR Route - POST /api/upload/quick-ocr
|
|
* OCR first page of PDF and extract metadata for form auto-fill
|
|
*/
|
|
|
|
import express from 'express';
|
|
import multer from 'multer';
|
|
import { extractTextFromPDF } from '../services/ocr.js';
|
|
import { tmpdir } from 'os';
|
|
import { join } from 'path';
|
|
import { writeFileSync, unlinkSync } from 'fs';
|
|
import { v4 as uuidv4 } from 'uuid';
|
|
|
|
// Router for the quick-OCR endpoint; it is this module's default export.
const router = express.Router();
|
|
|
|
// Configure multer for in-memory storage: the uploaded PDF arrives as a Buffer
// on `req.file.buffer` instead of being written to disk by multer.
const DEFAULT_MAX_FILE_SIZE = 52428800; // 50MB

// Parse MAX_FILE_SIZE explicitly as base 10 and fall back to the default when
// it is unset, empty, non-numeric, or not positive, so a bad environment value
// never reaches multer as NaN.
const parsedMaxFileSize = Number.parseInt(process.env.MAX_FILE_SIZE ?? '', 10);
const maxFileSize =
  Number.isFinite(parsedMaxFileSize) && parsedMaxFileSize > 0
    ? parsedMaxFileSize
    : DEFAULT_MAX_FILE_SIZE;

const upload = multer({
  storage: multer.memoryStorage(),
  limits: {
    fileSize: maxFileSize,
  },
});
|
|
|
|
/**
 * Extract document/boat metadata from OCR text, using the uploaded filename
 * as a fallback source.
 *
 * Heuristics applied:
 * - Year: first 4-digit year in the range 1990-2030.
 * - Make: first entry of a fixed manufacturer list found as a whole word
 *   (case-insensitive).
 * - Model: first token near the make shaped like `F4.9`, `S45` or `630`;
 *   tokens that look like a year are skipped so "2021 Prestige 630" yields 630.
 * - Title: first substantial OCR line (joined with the second line when the
 *   first is very short), otherwise the filename.
 * - Boat name: leading capitalised part of the filename, otherwise the
 *   capitalised word(s) immediately preceding the make in the OCR text.
 *
 * @param {string} ocrText - OCR text of the page; non-string values are treated as empty.
 * @param {string} [filename=''] - Original upload filename; non-string values are treated as empty.
 * @returns {{title: string, boatName: string, boatMake: string, boatModel: string, boatYear: (number|null)}}
 *   Extracted fields; anything not detected stays '' (or null for boatYear).
 */
function extractMetadata(ocrText, filename = '') {
  // Tolerate missing or non-string inputs instead of throwing on `.match`/`.replace`.
  const text = typeof ocrText === 'string' ? ocrText : '';
  const safeFilename = typeof filename === 'string' ? filename : '';

  const metadata = {
    title: '',
    boatName: '',
    boatMake: '',
    boatModel: '',
    boatYear: null,
  };

  // Remove .pdf extension from filename
  const cleanFilename = safeFilename.replace(/\.pdf$/i, '');

  // Common boat manufacturers (none contain regex metacharacters that need escaping)
  const boatMakes = [
    'Prestige', 'Ferretti', 'Sunseeker', 'Princess', 'Azimut', 'Beneteau',
    'Jeanneau', 'Bavaria', 'Catalina', 'Hunter', 'Lagoon', 'Fountaine Pajot',
    'Sea Ray', 'Boston Whaler', 'Grady-White', 'Chris-Craft', 'Tiara',
    'Viking', 'Hatteras', 'Ocean Alexander', 'Grand Banks',
  ];

  // Extract year (first 4-digit year in 1990-2030)
  const yearMatch = text.match(/\b(199[0-9]|20[0-2][0-9]|2030)\b/);
  if (yearMatch) {
    metadata.boatYear = Number.parseInt(yearMatch[1], 10);
  }

  // Extract boat make (case-insensitive, whole word). The match position is
  // kept so later steps look around the same occurrence the regex accepted,
  // rather than re-searching with a plain substring lookup.
  let makeIndex = -1;
  for (const make of boatMakes) {
    const makeMatch = new RegExp(`\\b${make}\\b`, 'i').exec(text);
    if (makeMatch) {
      metadata.boatMake = make;
      makeIndex = makeMatch.index;
      break;
    }
  }

  // Extract model (usually alphanumeric, near the make)
  if (metadata.boatMake) {
    const nearMake = text.substring(Math.max(0, makeIndex - 50), makeIndex + 100);

    // Model shapes: letter + 1-4 digits (F4.9, S45) or 2-4 bare digits (630),
    // each with an optional single decimal digit.
    const modelPattern = /\b([A-Z][0-9]{1,4}(?:\.[0-9])?|[0-9]{2,4}(?:\.[0-9])?)\b/g;
    // Same range as the year detection above; such tokens are not models.
    const yearLike = /^(?:199[0-9]|20[0-2][0-9]|2030)$/;

    for (const [, candidate] of nearMake.matchAll(modelPattern)) {
      if (yearLike.test(candidate)) {
        continue;
      }
      metadata.boatModel = candidate;
      break;
    }
  }

  // Extract title from the first substantial lines (more than 3 characters)
  const lines = text
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 3);

  if (lines.length > 0) {
    let titleLine = lines[0];

    // If first line is very short, combine it with the second line
    if (titleLine.length < 15 && lines.length > 1) {
      titleLine = `${titleLine} ${lines[1]}`;
    }

    // Collapse whitespace, drop special characters, cap at 100 characters
    metadata.title = titleLine
      .replace(/\s+/g, ' ')
      .replace(/[^\w\s\-(),.]/g, '')
      .substring(0, 100)
      .trim();
  }

  // If no title found in OCR, use the filename with separators turned into spaces
  if (!metadata.title && cleanFilename) {
    metadata.title = cleanFilename
      .replace(/[_-]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();
  }

  // Boat name from filename: BoatName_Something or BoatName-Something
  if (cleanFilename) {
    const filenameMatch = cleanFilename.match(/^([A-Z][a-zA-Z0-9\s]+?)(?:[_-]|$)/);
    if (filenameMatch) {
      const potentialName = filenameMatch[1].trim();
      // Reject generic document words such as "Manual" or "Owner"
      const commonWords = ['Manual', 'Owner', 'Service', 'Document', 'Guide', 'Book'];
      const loweredName = potentialName.toLowerCase();
      const isGeneric = commonWords.some((word) => loweredName.includes(word.toLowerCase()));
      if (!isGeneric) {
        metadata.boatName = potentialName;
      }
    }
  }

  // Otherwise look for a capitalised word (or two) directly before the make
  if (!metadata.boatName && metadata.boatMake) {
    const beforeMake = text.substring(Math.max(0, makeIndex - 100), makeIndex);
    const nameMatch = beforeMake.match(/\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s*$/);
    if (nameMatch) {
      metadata.boatName = nameMatch[1].trim();
    }
  }

  return metadata;
}
|
|
|
|
/**
 * POST /api/upload/quick-ocr
 * Runs OCR on an uploaded PDF and returns metadata extracted from the first
 * page's text, for auto-filling the upload form.
 *
 * Responses:
 * - 400 `{ error }` when no file was uploaded or the file is not a PDF.
 * - 200 `{ success: true, metadata, ocrText, confidence }` where `ocrText` is
 *   the first 500 characters of the first page's text.
 * - 500 `{ error: 'Quick OCR failed', message }` when processing throws.
 *
 * @body {File} file - PDF file (multipart field name `file`)
 * @returns {Object} { success: true, metadata: {...}, ocrText: '...', confidence: number }
 */
router.post('/', upload.single('file'), async (req, res) => {
  let tempFilePath = null;

  try {
    const file = req.file;

    if (!file) {
      return res.status(400).json({ error: 'No file uploaded' });
    }

    if (file.mimetype !== 'application/pdf') {
      return res.status(400).json({ error: 'Only PDF files are supported' });
    }

    // Save to temp file (OCR service needs file path)
    tempFilePath = join(tmpdir(), `quick-ocr-${uuidv4()}.pdf`);
    writeFileSync(tempFilePath, file.buffer);

    console.log(`[Quick OCR] Processing first page of ${file.originalname}`);

    const ocrResults = await extractTextFromPDF(tempFilePath, {
      language: 'eng',
      // NOTE(review): this callback is a no-op and does not stop OCR after
      // page 1. Whether extractTextFromPDF supports a page limit is not
      // visible from this file - confirm against the OCR service.
      onProgress: () => {},
    });

    // Only the first page's result is used; tolerate a missing result set.
    const firstPage = ocrResults?.[0];
    const firstPageText = firstPage?.text || '';
    const confidence = firstPage?.confidence || 0;

    console.log(`[Quick OCR] First page OCR completed (confidence: ${confidence.toFixed(2)})`);
    console.log(`[Quick OCR] Text length: ${firstPageText.length} characters`);

    const metadata = extractMetadata(firstPageText, file.originalname);

    console.log(`[Quick OCR] Extracted metadata:`, metadata);

    return res.json({
      success: true,
      metadata,
      ocrText: firstPageText.substring(0, 500), // Return first 500 chars for debugging
      confidence,
    });
  } catch (error) {
    console.error('[Quick OCR] Error:', error);

    return res.status(500).json({
      error: 'Quick OCR failed',
      message: error.message,
    });
  } finally {
    // Single cleanup point for both success and failure. Cleanup is
    // best-effort: a failure is logged but never changes the response.
    if (tempFilePath) {
      try {
        unlinkSync(tempFilePath);
      } catch (e) {
        console.warn('[Quick OCR] Failed to clean up temp file:', e.message);
      }
    }
  }
});
|
|
|
|
// Default export: the router carrying the POST '/' quick-OCR handler.
export default router;
|