diff --git a/client/src/components/UploadModal.vue b/client/src/components/UploadModal.vue index 5fd0d9d..54a8bc6 100644 --- a/client/src/components/UploadModal.vue +++ b/client/src/components/UploadModal.vue @@ -32,19 +32,19 @@ -

Drag and drop your PDF here

+

Drag and drop your document here

or

-

Maximum file size: 50MB

+

Supported: PDF, Images (JPG/PNG), Word, Excel, Text/Markdown • Max: 50MB

diff --git a/server/package.json b/server/package.json
index fd7acdb..33991bf 100644
--- a/server/package.json
+++ b/server/package.json
@@ -32,13 +32,15 @@
     "ioredis": "^5.0.0",
     "jsonwebtoken": "^9.0.2",
     "lru-cache": "^11.2.2",
+    "mammoth": "^1.8.0",
     "meilisearch": "^0.41.0",
     "multer": "^1.4.5-lts.1",
     "pdf-img-convert": "^2.0.0",
     "pdf-parse": "^1.1.1",
     "sharp": "^0.34.4",
     "tesseract.js": "^5.0.0",
-    "uuid": "^10.0.0"
+    "uuid": "^10.0.0",
+    "xlsx": "^0.18.5"
   },
   "devDependencies": {
     "@types/node": "^20.0.0"
diff --git a/server/services/document-processor.js b/server/services/document-processor.js
new file mode 100644
index 0000000..c51a926
--- /dev/null
+++ b/server/services/document-processor.js
@@ -0,0 +1,207 @@
+/**
+ * Document Processor Service
+ * Routes file processing to appropriate handler based on file type
+ */
+
+// PDFs keep using the hybrid extractor that the OCR worker imported before
+// multi-format routing was added, so existing PDF handling is unchanged.
+import { extractTextFromPDF } from './ocr-hybrid.js';
+import { getFileCategory } from './file-safety.js';
+import { readFileSync } from 'fs';
+import mammoth from 'mammoth';
+import XLSX from 'xlsx';
+import Tesseract from 'tesseract.js';
+
+/**
+ * Process document with appropriate handler based on file type
+ * @param {string} filePath - Path to uploaded file
+ * @param {Object} options - Processing options
+ * @param {string} options.language - OCR language (default: 'eng')
+ * @param {Function} options.onProgress - Progress callback
+ * @returns {Promise<Array>} Array of page results with text and metadata
+ * @throws {Error} If the file category has no handler or the handler fails
+ */
+export async function processDocument(filePath, options = {}) {
+  const category = getFileCategory(filePath);
+
+  console.log(`[Document Processor] Processing ${category}: ${filePath}`);
+
+  switch (category) {
+    case 'pdf':
+      return await extractTextFromPDF(filePath, options);
+
+    case 'image':
+      return await processImageFile(filePath, options);
+
+    case 'word':
+      return await processWordDocument(filePath, options);
+
+    case 'excel':
+      return await processExcelDocument(filePath, options);
+
+    case 'text':
+      return await processTextFile(filePath, options);
+
+    default:
+      throw new Error(`Unsupported file type: ${category}`);
+  }
+}
+
+/**
+ * Process image file with Tesseract OCR
+ * @param {string} imagePath - Path to image file
+ * @param {Object} options - Processing options
+ * @param {string} [options.language='eng'] - OCR language
+ * @param {Function} [options.onProgress] - Called with { progress } while text is recognized
+ * @returns {Promise<Array>} Single-element array with the OCR result
+ * @throws {Error} If OCR fails; the original error is attached as `cause`
+ */
+async function processImageFile(imagePath, options = {}) {
+  const { language = 'eng', onProgress } = options;
+
+  console.log('[Image Processor] Running OCR on image...');
+
+  let worker;
+  try {
+    // NOTE(review): the OCR worker's progress callback logs pageNum/total;
+    // confirm it accepts the { progress } object passed below.
+    worker = await Tesseract.createWorker(language, 1, {
+      logger: onProgress ? (m) => {
+        if (m.status === 'recognizing text') {
+          onProgress({ progress: m.progress * 100 });
+        }
+      } : undefined
+    });
+
+    const { data } = await worker.recognize(imagePath);
+
+    console.log(`[Image Processor] OCR complete. Confidence: ${data.confidence}%`);
+
+    return [{
+      pageNumber: 1,
+      text: data.text,
+      confidence: data.confidence / 100, // Convert to 0-1 range
+      method: 'tesseract-ocr'
+    }];
+  } catch (error) {
+    console.error('[Image Processor] OCR failed:', error);
+    throw new Error(`Image OCR failed: ${error.message}`, { cause: error });
+  } finally {
+    // Always release the worker, including when recognize() rejects
+    if (worker) {
+      await worker.terminate().catch((terminateError) => {
+        console.error('[Image Processor] Failed to terminate OCR worker:', terminateError);
+      });
+    }
+  }
+}
+
+/**
+ * Process Word document with Mammoth
+ * @param {string} docPath - Path to DOCX file
+ * @param {Object} options - Processing options
+ * @returns {Promise<Array>} Single-element array with the extracted text
+ * @throws {Error} If extraction fails; the original error is attached as `cause`
+ */
+async function processWordDocument(docPath, options = {}) {
+  console.log('[Word Processor] Extracting text from DOCX...');
+
+  try {
+    const result = await mammoth.extractRawText({ path: docPath });
+    const text = result.value;
+
+    if (result.messages.length > 0) {
+      console.log('[Word Processor] Extraction warnings:', result.messages);
+    }
+
+    console.log(`[Word Processor] Extracted ${text.length} characters`);
+
+    return [{
+      pageNumber: 1,
+      text: text,
+      confidence: 0.99,
+      method: 'native-extraction'
+    }];
+  } catch (error) {
+    console.error('[Word Processor] Extraction failed:', error);
+    throw new Error(`Word document processing failed: ${error.message}`, { cause: error });
+  }
+}
+
+/**
+ * Process Excel document with XLSX
+ * @param {string} xlsPath - Path to XLSX file
+ * @param {Object} options - Processing options
+ * @returns {Promise<Array>} One result per sheet, in workbook order
+ * @throws {Error} If reading fails; the original error is attached as `cause`
+ */
+async function processExcelDocument(xlsPath, options = {}) {
+  console.log('[Excel Processor] Reading workbook...');
+
+  try {
+    const workbook = XLSX.readFile(xlsPath);
+
+    const sheets = workbook.SheetNames.map((sheetName, idx) => {
+      const worksheet = workbook.Sheets[sheetName];
+
+      // Convert to CSV for text-based indexing
+      const csvText = XLSX.utils.sheet_to_csv(worksheet);
+
+      // Array-of-arrays view, used only to size the sheet
+      const rows = XLSX.utils.sheet_to_json(worksheet, { header: 1 });
+
+      // Take the widest row: the first row (e.g. a short title row) can be
+      // narrower than the data rows below it
+      const columnCount = rows.reduce((widest, row) => Math.max(widest, row.length), 0);
+
+      return {
+        pageNumber: idx + 1,
+        text: csvText,
+        confidence: 0.99,
+        method: 'native-extraction',
+        sheetName: sheetName,
+        metadata: {
+          rowCount: rows.length,
+          columnCount
+        }
+      };
+    });
+
+    console.log(`[Excel Processor] Extracted ${sheets.length} sheets`);
+    return sheets;
+  } catch (error) {
+    console.error('[Excel Processor] Reading failed:', error);
+    throw new Error(`Excel document processing failed: ${error.message}`, { cause: error });
+  }
+}
+
+/**
+ * Process plain text file
+ * @param {string} txtPath - Path to text file
+ * @param {Object} options - Processing options
+ * @returns {Promise<Array>} Single-element array with the file content
+ * @throws {Error} If reading fails; the original error is attached as `cause`
+ */
+async function processTextFile(txtPath, options = {}) {
+  console.log('[Text Processor] Reading text file...');
+
+  try {
+    const text = readFileSync(txtPath, 'utf-8');
+
+    console.log(`[Text Processor] Read ${text.length} characters`);
+
+    return [{
+      pageNumber: 1,
+      text: text,
+      confidence: 1.0,
+      method: 'native-extraction'
+    }];
+  } catch (error) {
+    console.error('[Text Processor] Reading failed:', error);
+    throw new Error(`Text file processing failed: ${error.message}`, { cause: error });
+  }
+}
+
+export default {
+  processDocument
+};
diff --git a/server/services/file-safety.js b/server/services/file-safety.js
index b0f7079..d854734
100644
--- a/server/services/file-safety.js
+++ b/server/services/file-safety.js
@@ -7,8 +7,29 @@ import { fileTypeFromBuffer } from 'file-type';
 import path from 'path';
 
 const MAX_FILE_SIZE = parseInt(process.env.MAX_FILE_SIZE || '52428800'); // 50MB default
-const ALLOWED_EXTENSIONS = ['.pdf'];
-const ALLOWED_MIME_TYPES = ['application/pdf'];
+
+// Documents
+const ALLOWED_EXTENSIONS = [
+  '.pdf',
+  '.doc', '.docx',
+  '.xls', '.xlsx',
+  '.txt', '.md',
+  // Images
+  '.jpg', '.jpeg', '.png', '.webp'
+];
+
+const ALLOWED_MIME_TYPES = [
+  'application/pdf',
+  'application/msword',
+  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+  'application/vnd.ms-excel',
+  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+  'text/plain',
+  'text/markdown',
+  'image/jpeg',
+  'image/png',
+  'image/webp'
+];
 
 /**
  * Validate file safety and format
@@ -37,26 +58,35 @@ export async function validateFile(file) {
   if (!ALLOWED_EXTENSIONS.includes(ext)) {
     return {
       valid: false,
-      error: `File extension ${ext} not allowed. Only PDF files are accepted.`
+      error: `File extension ${ext} not allowed. Accepted types: ${ALLOWED_EXTENSIONS.join(', ')}`
     };
   }
 
   // Check MIME type via file-type (magic number detection)
+  // Note: Text files (.txt, .md) may not be detected by file-type
   try {
     const detectedType = await fileTypeFromBuffer(file.buffer);
 
-    // PDF files should be detected
-    if (!detectedType || !ALLOWED_MIME_TYPES.includes(detectedType.mime)) {
+    // Skip MIME check for text files (they don't have magic numbers)
+    const textExtensions = ['.txt', '.md'];
+    const isTextFile = textExtensions.includes(ext);
+
+    // Binary files (PDF, images, Office) must be detected AND allowed; fail closed when undetected
+    if (!isTextFile && (!detectedType || !ALLOWED_MIME_TYPES.includes(detectedType.mime))) {
       return {
         valid: false,
-        error: 'File is not a valid PDF document (MIME type mismatch)'
+        error: `File type mismatch: detected ${detectedType?.mime ?? 'unknown'}, expected ${ext} file`
       };
     }
   } catch (error) {
-    return {
-      valid: false,
-      error: 'Unable to verify file type'
-    };
+    // Ignore MIME detection errors for text files
+    const textExtensions = ['.txt', '.md'];
+    if (!textExtensions.includes(ext)) {
+      return {
+        valid: false,
+        error: 'Unable to verify file type'
+      };
+    }
   }
 
   // Check for null bytes (potential attack vector)
@@ -97,7 +127,25 @@ export function sanitizeFilename(filename) {
   return sanitized;
 }
 
+/**
+ * Get file category based on extension
+ * @param {string} filename - Filename to categorize
+ * @returns {string} Category: 'pdf', 'word', 'excel', 'text', 'image', or 'unknown'
+ */
+export function getFileCategory(filename) {
+  const ext = path.extname(filename).toLowerCase();
+
+  if (['.pdf'].includes(ext)) return 'pdf';
+  if (['.doc', '.docx'].includes(ext)) return 'word';
+  if (['.xls', '.xlsx'].includes(ext)) return 'excel';
+  if (['.txt', '.md'].includes(ext)) return 'text';
+  if (['.jpg', '.jpeg', '.png', '.webp'].includes(ext)) return 'image';
+
+  return 'unknown';
+}
+
 export default {
   validateFile,
-  sanitizeFilename
+  sanitizeFilename,
+  getFileCategory
 };
diff --git
a/server/workers/ocr-worker.js b/server/workers/ocr-worker.js index 8577b50..c447dc5 100644 --- a/server/workers/ocr-worker.js +++ b/server/workers/ocr-worker.js @@ -18,7 +18,7 @@ import { v4 as uuidv4 } from 'uuid'; import { dirname, join } from 'path'; import { fileURLToPath } from 'url'; import { getDb } from '../config/db.js'; -import { extractTextFromPDF } from '../services/ocr-hybrid.js'; +import { processDocument } from '../services/document-processor.js'; import { cleanOCRText, extractTextFromImage } from '../services/ocr.js'; import { indexDocumentPage } from '../services/search.js'; import { extractImagesFromPage } from './image-extractor.js'; @@ -92,10 +92,10 @@ async function processOCRJob(job) { console.log(`[OCR Worker] Progress: ${currentProgress}% (page ${pageNum}/${total})`); }; - // Extract text from PDF using OCR service - console.log(`[OCR Worker] Extracting text from ${filePath}`); + // Process document using multi-format processor + console.log(`[OCR Worker] Processing document from ${filePath}`); - const ocrResults = await extractTextFromPDF(filePath, { + const ocrResults = await processDocument(filePath, { language: document.language || 'eng', onProgress: updateProgress });