Merge branch 'image-extraction-api'

2025-10-19 20:00:20 +02:00 · 2025-10-19 20:00:20 +02:00 · c2902cae6f
commit c2902cae6f
parent 09d9f1b601 19d90f50ca
3 changed files with 454 additions and 0 deletions
--- a/server/index.js
+++ b/server/index.js
@ -82,12 +82,14 @@ import uploadRoutes from './routes/upload.js';
 import jobsRoutes from './routes/jobs.js';
 import searchRoutes from './routes/search.js';
 import documentsRoutes from './routes/documents.js';
+import imagesRoutes from './routes/images.js';

 // API routes
 app.use('/api/upload', uploadRoutes);
 app.use('/api/jobs', jobsRoutes);
 app.use('/api/search', searchRoutes);
 app.use('/api/documents', documentsRoutes);
+app.use('/api', imagesRoutes); // mounted at /api: this router defines both /documents/... and /images/... paths

 // Error handling
 app.use((err, req, res, next) => {
--- a/server/routes/images.js
+++ b/server/routes/images.js
@ -0,0 +1,341 @@
+/**
+ * Images Route - API endpoints for image retrieval
+ * Handles serving extracted images from documents
+ */
+
+import express from 'express';
+import { getDb } from '../db/db.js';
+import path from 'path';
+import fs from 'fs';
+import rateLimit from 'express-rate-limit';
+
+const router = express.Router();
+
+/**
+ * Read an integer setting from the environment.
+ * The value is parsed as base 10; an unset, empty or non-numeric value yields
+ * the fallback instead of passing NaN on to the rate limiter.
+ *
+ * @param {string} name - Environment variable name
+ * @param {number} fallback - Value used when the variable is unusable
+ * @returns {number} Parsed integer or the fallback
+ */
+const readIntFromEnv = (name, fallback) => {
+  const parsed = Number.parseInt(process.env[name] || '', 10);
+  return Number.isNaN(parsed) ? fallback : parsed;
+};
+
+// Rate limiter for image endpoints (more permissive than general API)
+const imageLimiter = rateLimit({
+  windowMs: readIntFromEnv('IMAGE_RATE_LIMIT_WINDOW_MS', 60000), // 1 minute
+  max: readIntFromEnv('IMAGE_RATE_LIMIT_MAX_REQUESTS', 200),
+  standardHeaders: true,
+  legacyHeaders: false,
+  message: 'Too many image requests, please try again later'
+});
+
+/**
+ * Decide whether a user may read a given document.
+ *
+ * The document is looked up first; then a single query checks three possible
+ * grants: membership in the document's organization, being the uploader, or
+ * an explicit share.
+ *
+ * @param {string} documentId - ID of the requested document
+ * @param {string} userId - ID of the requesting user
+ * @param {Object} db - Database handle exposing prepare(sql).get(...params)
+ * @returns {Promise<Object>} { hasAccess: true, document } when access is granted,
+ *   otherwise { hasAccess: false, error, status } with status 404 (document
+ *   not found) or 403 (no grant matched)
+ */
+async function verifyDocumentAccess(documentId, userId, db) {
+  const documentQuery = db.prepare('SELECT id, organization_id FROM documents WHERE id = ?');
+  const record = documentQuery.get(documentId);
+
+  if (!record) {
+    return { hasAccess: false, error: 'Document not found', status: 404 };
+  }
+
+  const grantQuery = db.prepare(`
+    SELECT 1 FROM user_organizations WHERE user_id = ? AND organization_id = ?
+    UNION SELECT 1 FROM documents WHERE id = ? AND uploaded_by = ?
+    UNION SELECT 1 FROM document_shares WHERE document_id = ? AND shared_with = ?
+  `);
+  // One (user, target) pair per SELECT in the UNION, in the same order
+  const grantParams = [userId, record.organization_id, documentId, userId, documentId, userId];
+  const grant = grantQuery.get(...grantParams);
+
+  return grant
+    ? { hasAccess: true, document: record }
+    : { hasAccess: false, error: 'Access denied', status: 403 };
+}
+
+/**
+ * GET /api/documents/:id/images
+ * List metadata for every image extracted from one document
+ *
+ * @param {string} id - Document UUID
+ * @returns {Object} { documentId, imageCount, images } with images ordered by
+ *   page number, then by image index
+ */
+router.get('/documents/:id/images', async (req, res) => {
+  try {
+    const documentId = req.params.id;
+
+    // Reject anything that is not shaped like a UUID before touching the database
+    const uuidPattern = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
+    if (!uuidPattern.test(documentId)) {
+      return res.status(400).json({ error: 'Invalid document ID format' });
+    }
+
+    // TODO: Authentication middleware should provide req.user
+    const userId = req.user?.id || 'test-user-id';
+    const db = getDb();
+
+    // Stop here when the access check reports a failure (it supplies status and message)
+    const access = await verifyDocumentAccess(documentId, userId, db);
+    if (!access.hasAccess) {
+      const { status, error: reason } = access;
+      return res.status(status).json({ error: reason });
+    }
+
+    // Fetch every image row belonging to the document
+    const rows = db.prepare(`
+      SELECT
+        id,
+        documentId,
+        pageNumber,
+        imageIndex,
+        imagePath,
+        imageFormat,
+        width,
+        height,
+        position,
+        extractedText,
+        textConfidence,
+        anchorTextBefore,
+        anchorTextAfter,
+        createdAt
+      FROM document_images
+      WHERE documentId = ?
+      ORDER BY pageNumber ASC, imageIndex ASC
+    `).all(documentId);
+
+    // Build the response shape: position is decoded from its stored JSON text,
+    // imagePath is not included, and imageUrl points at the streaming endpoint
+    const images = [];
+    for (const row of rows) {
+      let position = null;
+      if (row.position) {
+        position = JSON.parse(row.position);
+      }
+
+      images.push({
+        id: row.id,
+        documentId: row.documentId,
+        pageNumber: row.pageNumber,
+        imageIndex: row.imageIndex,
+        imageFormat: row.imageFormat,
+        width: row.width,
+        height: row.height,
+        position,
+        extractedText: row.extractedText,
+        textConfidence: row.textConfidence,
+        anchorTextBefore: row.anchorTextBefore,
+        anchorTextAfter: row.anchorTextAfter,
+        createdAt: row.createdAt,
+        imageUrl: `/api/images/${row.id}`
+      });
+    }
+
+    const imageCount = images.length;
+    console.log(`Retrieved ${imageCount} images for document ${documentId}`);
+
+    res.json({ documentId, imageCount, images });
+
+  } catch (error) {
+    console.error('Get document images error:', error);
+    const payload = {
+      error: 'Failed to retrieve images',
+      message: error.message
+    };
+    res.status(500).json(payload);
+  }
+});
+
+/**
+ * GET /api/documents/:id/pages/:pageNum/images
+ * Get images for a specific page of a document
+ *
+ * Responds with 400 for a malformed document ID or page number, with the
+ * status reported by the access check when access is refused, and with 404
+ * when the page does not exist in the document.
+ *
+ * @param {string} id - Document UUID
+ * @param {number} pageNum - Page number (1-based, decimal digits only)
+ * @returns {Object} Array of image metadata for the page
+ */
+router.get('/documents/:id/pages/:pageNum/images', async (req, res) => {
+  try {
+    const { id, pageNum } = req.params;
+
+    // Validate UUID format
+    const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
+    if (!uuidRegex.test(id)) {
+      return res.status(400).json({ error: 'Invalid document ID format' });
+    }
+
+    // Validate page number. Only plain decimal digits are accepted: parseInt on
+    // its own would silently turn inputs such as "1abc", "2.7" or "0x2" into a
+    // valid-looking page number.
+    const pageNumber = /^\d+$/.test(pageNum) ? Number.parseInt(pageNum, 10) : NaN;
+    if (!Number.isSafeInteger(pageNumber) || pageNumber < 1) {
+      return res.status(400).json({ error: 'Invalid page number' });
+    }
+
+    // TODO: Authentication middleware should provide req.user
+    const userId = req.user?.id || 'test-user-id';
+    const db = getDb();
+
+    // Verify document access
+    const accessCheck = await verifyDocumentAccess(id, userId, db);
+    if (!accessCheck.hasAccess) {
+      return res.status(accessCheck.status).json({ error: accessCheck.error });
+    }
+
+    // Verify page exists, so a missing page is reported as 404 rather than as
+    // an empty image list
+    const page = db.prepare(`
+      SELECT id, page_number, document_id
+      FROM document_pages
+      WHERE document_id = ? AND page_number = ?
+    `).get(id, pageNumber);
+
+    if (!page) {
+      return res.status(404).json({
+        error: 'Page not found',
+        message: `Page ${pageNumber} does not exist in this document`
+      });
+    }
+
+    // Get images for the specific page
+    const images = db.prepare(`
+      SELECT
+        id,
+        documentId,
+        pageNumber,
+        imageIndex,
+        imagePath,
+        imageFormat,
+        width,
+        height,
+        position,
+        extractedText,
+        textConfidence,
+        anchorTextBefore,
+        anchorTextAfter,
+        createdAt
+      FROM document_images
+      WHERE documentId = ? AND pageNumber = ?
+      ORDER BY imageIndex ASC
+    `).all(id, pageNumber);
+
+    // Format response: position is decoded from its stored JSON text, imagePath
+    // is not included, and imageUrl points at the streaming endpoint
+    const formattedImages = images.map(img => ({
+      id: img.id,
+      documentId: img.documentId,
+      pageNumber: img.pageNumber,
+      imageIndex: img.imageIndex,
+      imageFormat: img.imageFormat,
+      width: img.width,
+      height: img.height,
+      position: img.position ? JSON.parse(img.position) : null,
+      extractedText: img.extractedText,
+      textConfidence: img.textConfidence,
+      anchorTextBefore: img.anchorTextBefore,
+      anchorTextAfter: img.anchorTextAfter,
+      createdAt: img.createdAt,
+      imageUrl: `/api/images/${img.id}`
+    }));
+
+    console.log(`Retrieved ${formattedImages.length} images for document ${id} page ${pageNumber}`);
+
+    res.json({
+      documentId: id,
+      pageNumber: pageNumber,
+      imageCount: formattedImages.length,
+      images: formattedImages
+    });
+
+  } catch (error) {
+    console.error('Get page images error:', error);
+    res.status(500).json({
+      error: 'Failed to retrieve page images',
+      message: error.message
+    });
+  }
+});
+
+/**
+ * GET /api/images/:imageId
+ * Serve image file as PNG/JPEG stream
+ *
+ * The stored path is served only when it resolves to a location inside the
+ * upload directory. Because access is checked per user, the response is
+ * marked cacheable by the requesting client only.
+ *
+ * @param {string} imageId - Image UUID
+ * @returns {Stream} Image file stream with proper Content-Type
+ */
+router.get('/images/:imageId', imageLimiter, async (req, res) => {
+  try {
+    const { imageId } = req.params;
+
+    // Validate UUID format
+    const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
+    if (!uuidRegex.test(imageId)) {
+      return res.status(400).json({ error: 'Invalid image ID format' });
+    }
+
+    // TODO: Authentication middleware should provide req.user
+    const userId = req.user?.id || 'test-user-id';
+    const db = getDb();
+
+    // Get image metadata
+    const image = db.prepare(`
+      SELECT
+        id,
+        documentId,
+        imagePath,
+        imageFormat
+      FROM document_images
+      WHERE id = ?
+    `).get(imageId);
+
+    if (!image) {
+      return res.status(404).json({ error: 'Image not found' });
+    }
+
+    // Verify document access
+    const accessCheck = await verifyDocumentAccess(image.documentId, userId, db);
+    if (!accessCheck.hasAccess) {
+      return res.status(accessCheck.status).json({ error: accessCheck.error });
+    }
+
+    // Security check: ensure file is within expected directory.
+    // Both paths are made absolute, and containment is decided with
+    // path.relative instead of a string prefix test, so a sibling directory
+    // whose name merely starts with the upload directory's name is rejected.
+    // The check runs before any filesystem access so that paths outside the
+    // upload directory are never probed.
+    const uploadDir = path.resolve(
+      process.env.UPLOAD_DIR || path.join(path.dirname(process.cwd()), 'uploads')
+    );
+    const absPath = path.resolve(image.imagePath);
+    const relativeToUploads = path.relative(uploadDir, absPath);
+    const isInsideUploadDir =
+      relativeToUploads !== '' &&
+      relativeToUploads !== '..' &&
+      !relativeToUploads.startsWith(`..${path.sep}`) &&
+      !path.isAbsolute(relativeToUploads); // absolute result: different drive/root
+
+    if (!isInsideUploadDir) {
+      console.error(`Security violation: Path traversal attempt - ${absPath}`);
+      console.error(`Expected base directory: ${uploadDir}`);
+      console.error(`Actual file path: ${absPath}`);
+      return res.status(403).json({ error: 'Access denied' });
+    }
+
+    // Verify file exists
+    if (!fs.existsSync(absPath)) {
+      console.error(`Image file not found: ${absPath}`);
+      return res.status(404).json({
+        error: 'Image file not found',
+        message: 'The image file is missing from storage'
+      });
+    }
+
+    // Set Content-Type based on image format
+    const contentType = image.imageFormat === 'jpeg' || image.imageFormat === 'jpg'
+      ? 'image/jpeg'
+      : 'image/png';
+
+    // Set headers. "private" because the response depends on the caller's
+    // access rights, so shared caches must not store it.
+    res.setHeader('Content-Type', contentType);
+    res.setHeader('Cache-Control', 'private, max-age=31536000'); // Cache for 1 year
+    res.setHeader('Content-Disposition', `inline; filename="image-${imageId}.${image.imageFormat}"`);
+
+    // Stream the file
+    const fileStream = fs.createReadStream(absPath);
+
+    fileStream.on('error', (error) => {
+      console.error('File stream error:', error);
+      if (res.headersSent) {
+        // Part of the image is already on the wire; abort the response so the
+        // client is not left waiting for the rest
+        res.destroy();
+        return;
+      }
+      // The image headers were set above; drop them so the error response is
+      // not cached for a year or presented as an image download
+      res.removeHeader('Cache-Control');
+      res.removeHeader('Content-Disposition');
+      res.status(500).json({
+        error: 'Failed to stream image',
+        message: error.message
+      });
+    });
+
+    // pipe() does not close the source when the destination goes away, so
+    // release the file handle once the response is closed
+    res.on('close', () => fileStream.destroy());
+
+    fileStream.pipe(res);
+
+    console.log(`Serving image ${imageId} (${contentType}) from ${absPath}`);
+
+  } catch (error) {
+    console.error('Serve image error:', error);
+    if (!res.headersSent) {
+      // Same reasoning as in the stream error handler: never send an error
+      // with the image caching headers attached
+      res.removeHeader('Cache-Control');
+      res.removeHeader('Content-Disposition');
+      res.status(500).json({
+        error: 'Failed to serve image',
+        message: error.message
+      });
+    }
+  }
+});
+
+export default router;
--- a/test-image-endpoints.sh
+++ b/test-image-endpoints.sh
@ -0,0 +1,111 @@
+#!/bin/bash
+
+# Test Image API Endpoints
+#
+# Smoke test for the image retrieval API. The HTTP status of every request is
+# compared with the status the endpoint is expected to return, and the script
+# exits non-zero when any check fails.
+#
+# Requires curl, python3 and file; the server must be reachable at BASE_URL.
+# DOCUMENT_ID and BASE_URL can be overridden from the environment.
+
+# Document ID from seeded test data
+DOCUMENT_ID="${DOCUMENT_ID:-14f402ec-9e78-48ca-9657-1fce387f307b}"
+BASE_URL="${BASE_URL:-http://localhost:3001}"
+
+FAILURES=0
+BODY_FILE=$(mktemp)
+IMAGE_FILE=$(mktemp)
+trap 'rm -f "$BODY_FILE" "$IMAGE_FILE"' EXIT
+
+# Print the outcome of one check and count failures.
+#   $1 - HTTP status actually returned (curl reports 000 when no response arrived)
+#   $2 - HTTP status the test expects
+#   $3 - label printed when the check passes
+report_status() {
+    if [ "$1" = "$2" ]; then
+        echo "Status: $3 ✓"
+    else
+        echo "Status: FAILED ✗ (expected HTTP $2, got HTTP $1)"
+        FAILURES=$((FAILURES + 1))
+    fi
+}
+
+# GET a JSON endpoint, pretty-print the response body and check the status code.
+#   $1 - URL
+#   $2 - expected HTTP status
+#   $3 - label printed when the check passes
+check_json_endpoint() {
+    local url="$1"
+    local expected="$2"
+    local label="$3"
+    local status
+
+    echo "Request: GET ${url}"
+    echo ""
+    # Empty the body file first so a failed connection cannot show a stale body
+    : > "$BODY_FILE"
+    status=$(curl -s -o "$BODY_FILE" -w '%{http_code}' "$url")
+    # Fall back to the raw body when it is not valid JSON
+    python3 -m json.tool < "$BODY_FILE" 2>/dev/null || cat "$BODY_FILE"
+    echo ""
+    report_status "$status" "$expected" "$label"
+    echo ""
+}
+
+echo "============================================"
+echo "Testing NaviDocs Image Retrieval API"
+echo "============================================"
+echo ""
+
+# Test 1: Get all images for a document
+echo "Test 1: GET /api/documents/:id/images"
+echo "----------------------------------------"
+check_json_endpoint "${BASE_URL}/api/documents/${DOCUMENT_ID}/images" 200 "SUCCESS"
+
+# Test 2: Get images for a specific page
+echo "Test 2: GET /api/documents/:id/pages/:pageNum/images"
+echo "-----------------------------------------------------"
+check_json_endpoint "${BASE_URL}/api/documents/${DOCUMENT_ID}/pages/1/images" 200 "SUCCESS"
+
+# Test 3: Get images for page 2
+echo "Test 3: GET /api/documents/:id/pages/2/images"
+echo "----------------------------------------------"
+check_json_endpoint "${BASE_URL}/api/documents/${DOCUMENT_ID}/pages/2/images" 200 "SUCCESS"
+
+# Test 4: Serve an image file
+echo "Test 4: GET /api/images/:imageId (Serve image stream)"
+echo "------------------------------------------------------"
+# Extract first image ID from the document images response (empty when there is none)
+IMAGE_ID=$(curl -s "${BASE_URL}/api/documents/${DOCUMENT_ID}/images" | python3 -c "import sys, json; images = json.load(sys.stdin).get('images') or []; print(images[0]['id'] if images else '')" 2>/dev/null)
+if [ -z "$IMAGE_ID" ]; then
+    echo "Status: FAILED ✗ (no image ID available to request)"
+    FAILURES=$((FAILURES + 1))
+else
+    echo "Request: GET ${BASE_URL}/api/images/${IMAGE_ID}"
+    echo "Image ID: ${IMAGE_ID}"
+    echo ""
+
+    # Test that the image is served with correct headers
+    HTTP_RESPONSE=$(curl -sI "${BASE_URL}/api/images/${IMAGE_ID}")
+    echo "Response Headers:"
+    echo "$HTTP_RESPONSE" | grep -iE "(HTTP|Content-Type|Content-Disposition|Cache-Control)"
+    echo ""
+
+    # Download the actual image. curl writes error bodies to the output file as
+    # well, so the status code decides success, not the existence of the file.
+    IMAGE_STATUS=$(curl -s -o "$IMAGE_FILE" -w '%{http_code}' "${BASE_URL}/api/images/${IMAGE_ID}")
+    FILE_SIZE=$(wc -c < "$IMAGE_FILE" | tr -d ' ')
+    FILE_TYPE=$(file -b "$IMAGE_FILE")
+    echo "Downloaded file size: ${FILE_SIZE} bytes"
+    echo "File type: ${FILE_TYPE}"
+    report_status "$IMAGE_STATUS" 200 "SUCCESS"
+fi
+echo ""
+
+# Test 5: Error handling - Invalid document ID
+echo "Test 5: Error Handling - Invalid document ID"
+echo "---------------------------------------------"
+check_json_endpoint "${BASE_URL}/api/documents/invalid-uuid/images" 400 "ERROR HANDLED CORRECTLY"
+
+# Test 6: Error handling - Non-existent document
+echo "Test 6: Error Handling - Non-existent document"
+echo "-----------------------------------------------"
+FAKE_UUID="00000000-0000-0000-0000-000000000000"
+check_json_endpoint "${BASE_URL}/api/documents/${FAKE_UUID}/images" 404 "ERROR HANDLED CORRECTLY"
+
+# Test 7: Error handling - Non-existent page
+echo "Test 7: Error Handling - Non-existent page"
+echo "-------------------------------------------"
+check_json_endpoint "${BASE_URL}/api/documents/${DOCUMENT_ID}/pages/999/images" 404 "ERROR HANDLED CORRECTLY"
+
+echo "============================================"
+echo "All tests completed!"
+echo "============================================"
+
+# Signal failure to the caller when any check did not match its expected status
+if [ "$FAILURES" -gt 0 ]; then
+    echo "${FAILURES} check(s) failed"
+    exit 1
+fi