Merge Session 2: Multi-format upload (JPG, DOCX, XLSX, TXT, MD)
commit bf76d0c2bf

5 changed files with 255 additions and 19 deletions
@@ -32,19 +32,19 @@
 <svg class="w-16 h-16 mx-auto text-white/50 mb-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
   <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M7 16a4 4 0 01-.88-7.903A5 5 0 1115.9 6L16 6a5 5 0 011 9.9M15 13l-3-3m0 0l-3 3m3-3v12" />
 </svg>
-<p class="text-lg text-white mb-2">Drag and drop your PDF here</p>
+<p class="text-lg text-white mb-2">Drag and drop your document here</p>
 <p class="text-sm text-white/70 mb-4">or</p>
 <label class="btn btn-outline cursor-pointer">
   Browse Files
   <input
     ref="fileInput"
     type="file"
-    accept="application/pdf"
+    accept=".pdf,.jpg,.jpeg,.png,.webp,.docx,.xlsx,.txt,.md"
     class="hidden"
     @change="handleFileSelect"
   />
 </label>
-<p class="text-xs text-white/70 mt-4">Maximum file size: 50MB</p>
+<p class="text-xs text-white/70 mt-4">Supported: PDF, Images (JPG/PNG), Word, Excel, Text/Markdown • Max: 50MB</p>
 </div>

 <!-- Selected File Preview -->
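
Note: the widened accept attribute only filters the browser's file picker; the server-side allow-list in file-safety.js further down is the real gate. Below is a minimal sketch of the kind of pre-upload check the component's handler could run, assuming nothing beyond a standard File object — checkSelectedFile and both constants are illustrative and do not appear in this commit.

// Hypothetical client-side guard that mirrors the new accept list; not part of this commit.
const CLIENT_ALLOWED = ['.pdf', '.jpg', '.jpeg', '.png', '.webp', '.docx', '.xlsx', '.txt', '.md'];
const MAX_BYTES = 50 * 1024 * 1024; // matches the 50MB limit shown in the dropzone copy

function checkSelectedFile(file) {
  // Derive the extension from the File's name; empty string if there is none.
  const ext = file.name.includes('.') ? '.' + file.name.split('.').pop().toLowerCase() : '';
  if (!CLIENT_ALLOWED.includes(ext)) {
    return { ok: false, error: `Unsupported file type: ${ext || '(none)'}` };
  }
  if (file.size > MAX_BYTES) {
    return { ok: false, error: 'File exceeds the 50MB limit' };
  }
  return { ok: true };
}
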
@@ -32,6 +32,7 @@
     "ioredis": "^5.0.0",
     "jsonwebtoken": "^9.0.2",
     "lru-cache": "^11.2.2",
+    "mammoth": "^1.8.0",
     "meilisearch": "^0.41.0",
     "multer": "^1.4.5-lts.1",
     "pdf-img-convert": "^2.0.0",
@@ -39,7 +40,8 @@
     "pdfjs-dist": "^5.4.394",
     "sharp": "^0.34.4",
     "tesseract.js": "^5.0.0",
-    "uuid": "^10.0.0"
+    "uuid": "^10.0.0",
+    "xlsx": "^0.18.5"
   },
   "devDependencies": {
     "@types/node": "^20.0.0"

server/services/document-processor.js (new file, 186 lines)

@@ -0,0 +1,186 @@
/**
 * Document Processor Service
 * Routes file processing to appropriate handler based on file type
 */

import { extractTextFromPDF } from './ocr.js';
import { getFileCategory } from './file-safety.js';
import { readFileSync } from 'fs';
import mammoth from 'mammoth';
import XLSX from 'xlsx';
import Tesseract from 'tesseract.js';

/**
 * Process document with appropriate handler based on file type
 * @param {string} filePath - Path to uploaded file
 * @param {Object} options - Processing options
 * @param {string} options.language - OCR language (default: 'eng')
 * @param {Function} options.onProgress - Progress callback
 * @returns {Promise<Array>} Array of page results with text and metadata
 */
export async function processDocument(filePath, options = {}) {
  const category = getFileCategory(filePath);

  console.log(`[Document Processor] Processing ${category}: ${filePath}`);

  switch (category) {
    case 'pdf':
      return await extractTextFromPDF(filePath, options);

    case 'image':
      return await processImageFile(filePath, options);

    case 'word':
      return await processWordDocument(filePath, options);

    case 'excel':
      return await processExcelDocument(filePath, options);

    case 'text':
      return await processTextFile(filePath, options);

    default:
      throw new Error(`Unsupported file type: ${category}`);
  }
}

/**
 * Process image file with Tesseract OCR
 * @param {string} imagePath - Path to image file
 * @param {Object} options - Processing options
 * @returns {Promise<Array>} OCR results
 */
async function processImageFile(imagePath, options = {}) {
  const { language = 'eng', onProgress } = options;

  console.log('[Image Processor] Running OCR on image...');

  try {
    const worker = await Tesseract.createWorker(language, 1, {
      logger: onProgress ? (m) => {
        if (m.status === 'recognizing text') {
          onProgress({ progress: m.progress * 100 });
        }
      } : undefined
    });

    const { data } = await worker.recognize(imagePath);
    await worker.terminate();

    console.log(`[Image Processor] OCR complete. Confidence: ${data.confidence}%`);

    return [{
      pageNumber: 1,
      text: data.text,
      confidence: data.confidence / 100, // Convert to 0-1 range
      method: 'tesseract-ocr'
    }];
  } catch (error) {
    console.error('[Image Processor] OCR failed:', error);
    throw new Error(`Image OCR failed: ${error.message}`);
  }
}

/**
 * Process Word document with Mammoth
 * @param {string} docPath - Path to DOCX file
 * @param {Object} options - Processing options
 * @returns {Promise<Array>} Extracted text
 */
async function processWordDocument(docPath, options = {}) {
  console.log('[Word Processor] Extracting text from DOCX...');

  try {
    const result = await mammoth.extractRawText({ path: docPath });
    const text = result.value;

    if (result.messages.length > 0) {
      console.log('[Word Processor] Extraction warnings:', result.messages);
    }

    console.log(`[Word Processor] Extracted ${text.length} characters`);

    return [{
      pageNumber: 1,
      text: text,
      confidence: 0.99,
      method: 'native-extraction'
    }];
  } catch (error) {
    console.error('[Word Processor] Extraction failed:', error);
    throw new Error(`Word document processing failed: ${error.message}`);
  }
}

/**
 * Process Excel document with XLSX
 * @param {string} xlsPath - Path to XLSX file
 * @param {Object} options - Processing options
 * @returns {Promise<Array>} Extracted data from all sheets
 */
async function processExcelDocument(xlsPath, options = {}) {
  console.log('[Excel Processor] Reading workbook...');

  try {
    const workbook = XLSX.readFile(xlsPath);
    const sheets = [];

    workbook.SheetNames.forEach((sheetName, idx) => {
      const worksheet = workbook.Sheets[sheetName];

      // Convert to CSV for text-based indexing
      const csvText = XLSX.utils.sheet_to_csv(worksheet);

      // Also get JSON for structured data (optional)
      const jsonData = XLSX.utils.sheet_to_json(worksheet, { header: 1 });

      sheets.push({
        pageNumber: idx + 1,
        text: csvText,
        confidence: 0.99,
        method: 'native-extraction',
        sheetName: sheetName,
        metadata: {
          rowCount: jsonData.length,
          columnCount: jsonData[0]?.length || 0
        }
      });
    });

    console.log(`[Excel Processor] Extracted ${sheets.length} sheets`);
    return sheets;
  } catch (error) {
    console.error('[Excel Processor] Reading failed:', error);
    throw new Error(`Excel document processing failed: ${error.message}`);
  }
}

/**
 * Process plain text file
 * @param {string} txtPath - Path to text file
 * @param {Object} options - Processing options
 * @returns {Promise<Array>} Text content
 */
async function processTextFile(txtPath, options = {}) {
  console.log('[Text Processor] Reading text file...');

  try {
    const text = readFileSync(txtPath, 'utf-8');

    console.log(`[Text Processor] Read ${text.length} characters`);

    return [{
      pageNumber: 1,
      text: text,
      confidence: 1.0,
      method: 'native-extraction'
    }];
  } catch (error) {
    console.error('[Text Processor] Reading failed:', error);
    throw new Error(`Text file processing failed: ${error.message}`);
  }
}

export default {
  processDocument
};
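
Every handler above returns the same page-shaped array — pageNumber, text, confidence, method, plus sheetName/metadata for spreadsheets — which is what lets callers stay format-agnostic. A minimal consumption sketch follows, assuming an ESM context with top-level await and a made-up upload path; the aggregation is illustrative and not code from this commit.

import { processDocument } from './server/services/document-processor.js';

// Hypothetical caller: the path below is an assumption for illustration only.
const pages = await processDocument('./uploads/report.xlsx', {
  language: 'eng',
  onProgress: ({ progress }) => console.log(`progress ${progress.toFixed(0)}%`)
});

// Flatten pages/sheets into one text blob and an average confidence score.
const fullText = pages.map((p) => p.text).join('\n\n');
const avgConfidence = pages.reduce((sum, p) => sum + p.confidence, 0) / pages.length;
console.log(`${pages.length} page(s)/sheet(s), avg confidence ${(avgConfidence * 100).toFixed(1)}%`);
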
@@ -7,8 +7,29 @@ import { fileTypeFromBuffer } from 'file-type';
 import path from 'path';

 const MAX_FILE_SIZE = parseInt(process.env.MAX_FILE_SIZE || '52428800'); // 50MB default
-const ALLOWED_EXTENSIONS = ['.pdf'];
-const ALLOWED_MIME_TYPES = ['application/pdf'];
+// Documents
+const ALLOWED_EXTENSIONS = [
+  '.pdf',
+  '.doc', '.docx',
+  '.xls', '.xlsx',
+  '.txt', '.md',
+  // Images
+  '.jpg', '.jpeg', '.png', '.webp'
+];
+
+const ALLOWED_MIME_TYPES = [
+  'application/pdf',
+  'application/msword',
+  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+  'application/vnd.ms-excel',
+  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+  'text/plain',
+  'text/markdown',
+  'image/jpeg',
+  'image/png',
+  'image/webp'
+];

 /**
  * Validate file safety and format
@@ -37,26 +58,35 @@ export async function validateFile(file) {
   if (!ALLOWED_EXTENSIONS.includes(ext)) {
     return {
       valid: false,
-      error: `File extension ${ext} not allowed. Only PDF files are accepted.`
+      error: `File extension ${ext} not allowed. Accepted types: PDF, JPG, PNG, DOCX, XLSX, TXT, MD`
     };
   }

   // Check MIME type via file-type (magic number detection)
+  // Note: Text files (.txt, .md) may not be detected by file-type
   try {
     const detectedType = await fileTypeFromBuffer(file.buffer);

-    // PDF files should be detected
-    if (!detectedType || !ALLOWED_MIME_TYPES.includes(detectedType.mime)) {
+    // Skip MIME check for text files (they don't have magic numbers)
+    const textExtensions = ['.txt', '.md'];
+    const isTextFile = textExtensions.includes(ext);
+
+    // For binary files (PDF, images, Office), verify MIME type
+    if (!isTextFile && detectedType && !ALLOWED_MIME_TYPES.includes(detectedType.mime)) {
       return {
         valid: false,
-        error: 'File is not a valid PDF document (MIME type mismatch)'
+        error: `File type mismatch: detected ${detectedType.mime}, expected ${ext} file`
       };
     }
   } catch (error) {
-    return {
-      valid: false,
-      error: 'Unable to verify file type'
-    };
+    // Ignore MIME detection errors for text files
+    const textExtensions = ['.txt', '.md'];
+    if (!textExtensions.includes(ext)) {
+      return {
+        valid: false,
+        error: 'Unable to verify file type'
+      };
+    }
   }

   // Check for null bytes (potential attack vector)
@@ -97,7 +127,25 @@ export function sanitizeFilename(filename) {
   return sanitized;
 }

+/**
+ * Get file category based on extension
+ * @param {string} filename - Filename to categorize
+ * @returns {string} Category: 'pdf', 'word', 'excel', 'text', 'image', or 'unknown'
+ */
+export function getFileCategory(filename) {
+  const ext = path.extname(filename).toLowerCase();
+
+  if (['.pdf'].includes(ext)) return 'pdf';
+  if (['.doc', '.docx'].includes(ext)) return 'word';
+  if (['.xls', '.xlsx'].includes(ext)) return 'excel';
+  if (['.txt', '.md'].includes(ext)) return 'text';
+  if (['.jpg', '.jpeg', '.png', '.webp'].includes(ext)) return 'image';
+
+  return 'unknown';
+}
+
 export default {
   validateFile,
-  sanitizeFilename
+  sanitizeFilename,
+  getFileCategory
 };
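
For context on how these pieces line up at upload time: validateFile screens the in-memory buffer (extension plus magic-number check), and getFileCategory picks the branch that processDocument dispatches to. The route below is only a sketch under assumptions — memory-storage multer, a hypothetical /api/upload path, and no persistence or job queueing — not the project's actual upload route.

import express from 'express';
import multer from 'multer';
import { validateFile, sanitizeFilename, getFileCategory } from './server/services/file-safety.js';

// Sketch only: the route path, memory storage, and the response shape are assumptions.
const upload = multer({ storage: multer.memoryStorage(), limits: { fileSize: 50 * 1024 * 1024 } });
const app = express();

app.post('/api/upload', upload.single('file'), async (req, res) => {
  if (!req.file) {
    return res.status(400).json({ error: 'No file uploaded' });
  }

  const check = await validateFile(req.file);   // extension + MIME (magic number) checks from file-safety.js
  if (!check.valid) {
    return res.status(400).json({ error: check.error });
  }

  const filename = sanitizeFilename(req.file.originalname);
  const category = getFileCategory(filename);   // 'pdf' | 'word' | 'excel' | 'text' | 'image' | 'unknown'

  // In the real pipeline the saved file is handed to the OCR worker,
  // which calls processDocument(filePath, ...) as shown in the next hunk.
  res.json({ filename, category, queued: true });
});
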
@@ -18,7 +18,7 @@ import { v4 as uuidv4 } from 'uuid';
 import { dirname, join } from 'path';
 import { fileURLToPath } from 'url';
 import { getDb } from '../config/db.js';
-import { extractTextFromPDF } from '../services/ocr-hybrid.js';
+import { processDocument } from '../services/document-processor.js';
 import { cleanOCRText, extractTextFromImage } from '../services/ocr.js';
 import { indexDocumentPage } from '../services/search.js';
 import { extractImagesFromPage } from './image-extractor.js';

@@ -92,10 +92,10 @@ async function processOCRJob(job) {
     console.log(`[OCR Worker] Progress: ${currentProgress}% (page ${pageNum}/${total})`);
   };

-  // Extract text from PDF using OCR service
-  console.log(`[OCR Worker] Extracting text from ${filePath}`);
+  // Process document using multi-format processor
+  console.log(`[OCR Worker] Processing document from ${filePath}`);

-  const ocrResults = await extractTextFromPDF(filePath, {
+  const ocrResults = await processDocument(filePath, {
     language: document.language || 'eng',
     onProgress: updateProgress
   });