diff --git a/client/src/components/UploadModal.vue b/client/src/components/UploadModal.vue
index 5fd0d9d..54a8bc6 100644
--- a/client/src/components/UploadModal.vue
+++ b/client/src/components/UploadModal.vue
@@ -32,19 +32,19 @@
-
Drag and drop your PDF here
+ Drag and drop your document here
or
- Maximum file size: 50MB
+ Supported: PDF, Images (JPG/PNG), Word, Excel, Text/Markdown • Max: 50MB
diff --git a/server/package.json b/server/package.json
index fd7acdb..33991bf 100644
--- a/server/package.json
+++ b/server/package.json
@@ -32,13 +32,15 @@
"ioredis": "^5.0.0",
"jsonwebtoken": "^9.0.2",
"lru-cache": "^11.2.2",
+ "mammoth": "^1.8.0",
"meilisearch": "^0.41.0",
"multer": "^1.4.5-lts.1",
"pdf-img-convert": "^2.0.0",
"pdf-parse": "^1.1.1",
"sharp": "^0.34.4",
"tesseract.js": "^5.0.0",
- "uuid": "^10.0.0"
+ "uuid": "^10.0.0",
+ "xlsx": "^0.18.5"
},
"devDependencies": {
"@types/node": "^20.0.0"
diff --git a/server/services/document-processor.js b/server/services/document-processor.js
new file mode 100644
index 0000000..c51a926
--- /dev/null
+++ b/server/services/document-processor.js
@@ -0,0 +1,186 @@
+/**
+ * Document Processor Service
+ * Routes file processing to appropriate handler based on file type
+ */
+
+import { extractTextFromPDF } from './ocr.js';
+import { getFileCategory } from './file-safety.js';
+import { readFileSync } from 'fs';
+import mammoth from 'mammoth';
+import XLSX from 'xlsx';
+import Tesseract from 'tesseract.js';
+
+/**
+ * Process document with appropriate handler based on file type
+ * @param {string} filePath - Path to uploaded file
+ * @param {Object} options - Processing options
+ * @param {string} options.language - OCR language (default: 'eng')
+ * @param {Function} options.onProgress - Progress callback
+ * @returns {Promise} Array of page results with text and metadata
+ */
+export async function processDocument(filePath, options = {}) {
+ const category = getFileCategory(filePath);
+
+ console.log(`[Document Processor] Processing ${category}: ${filePath}`);
+
+ switch (category) {
+ case 'pdf':
+ return await extractTextFromPDF(filePath, options);
+
+ case 'image':
+ return await processImageFile(filePath, options);
+
+ case 'word':
+ return await processWordDocument(filePath, options);
+
+ case 'excel':
+ return await processExcelDocument(filePath, options);
+
+ case 'text':
+ return await processTextFile(filePath, options);
+
+ default:
+ throw new Error(`Unsupported file type: ${category}`);
+ }
+}
+
+/**
+ * Process image file with Tesseract OCR
+ * @param {string} imagePath - Path to image file
+ * @param {Object} options - Processing options
+ * @returns {Promise} OCR results
+ */
+async function processImageFile(imagePath, options = {}) {
+ const { language = 'eng', onProgress } = options;
+
+ console.log('[Image Processor] Running OCR on image...');
+
+ try {
+ const worker = await Tesseract.createWorker(language, 1, {
+ logger: onProgress ? (m) => {
+ if (m.status === 'recognizing text') {
+ onProgress({ progress: m.progress * 100 });
+ }
+ } : undefined
+ });
+
+ const { data } = await worker.recognize(imagePath);
+ await worker.terminate();
+
+ console.log(`[Image Processor] OCR complete. Confidence: ${data.confidence}%`);
+
+ return [{
+ pageNumber: 1,
+ text: data.text,
+ confidence: data.confidence / 100, // Convert to 0-1 range
+ method: 'tesseract-ocr'
+ }];
+ } catch (error) {
+ console.error('[Image Processor] OCR failed:', error);
+ throw new Error(`Image OCR failed: ${error.message}`);
+ }
+}
+
+/**
+ * Process Word document with Mammoth
+ * @param {string} docPath - Path to DOCX file
+ * @param {Object} options - Processing options
+ * @returns {Promise} Extracted text
+ */
+async function processWordDocument(docPath, options = {}) {
+ console.log('[Word Processor] Extracting text from DOCX...');
+
+ try {
+ const result = await mammoth.extractRawText({ path: docPath });
+ const text = result.value;
+
+ if (result.messages.length > 0) {
+ console.log('[Word Processor] Extraction warnings:', result.messages);
+ }
+
+ console.log(`[Word Processor] Extracted ${text.length} characters`);
+
+ return [{
+ pageNumber: 1,
+ text: text,
+ confidence: 0.99,
+ method: 'native-extraction'
+ }];
+ } catch (error) {
+ console.error('[Word Processor] Extraction failed:', error);
+ throw new Error(`Word document processing failed: ${error.message}`);
+ }
+}
+
+/**
+ * Process Excel document with XLSX
+ * @param {string} xlsPath - Path to XLSX file
+ * @param {Object} options - Processing options
+ * @returns {Promise} Extracted data from all sheets
+ */
+async function processExcelDocument(xlsPath, options = {}) {
+ console.log('[Excel Processor] Reading workbook...');
+
+ try {
+ const workbook = XLSX.readFile(xlsPath);
+ const sheets = [];
+
+ workbook.SheetNames.forEach((sheetName, idx) => {
+ const worksheet = workbook.Sheets[sheetName];
+
+ // Convert to CSV for text-based indexing
+ const csvText = XLSX.utils.sheet_to_csv(worksheet);
+
+ // Also get JSON for structured data (optional)
+ const jsonData = XLSX.utils.sheet_to_json(worksheet, { header: 1 });
+
+ sheets.push({
+ pageNumber: idx + 1,
+ text: csvText,
+ confidence: 0.99,
+ method: 'native-extraction',
+ sheetName: sheetName,
+ metadata: {
+ rowCount: jsonData.length,
+ columnCount: jsonData[0]?.length || 0
+ }
+ });
+ });
+
+ console.log(`[Excel Processor] Extracted ${sheets.length} sheets`);
+ return sheets;
+ } catch (error) {
+ console.error('[Excel Processor] Reading failed:', error);
+ throw new Error(`Excel document processing failed: ${error.message}`);
+ }
+}
+
+/**
+ * Process plain text file
+ * @param {string} txtPath - Path to text file
+ * @param {Object} options - Processing options
+ * @returns {Promise} Text content
+ */
+async function processTextFile(txtPath, options = {}) {
+ console.log('[Text Processor] Reading text file...');
+
+ try {
+ const text = readFileSync(txtPath, 'utf-8');
+
+ console.log(`[Text Processor] Read ${text.length} characters`);
+
+ return [{
+ pageNumber: 1,
+ text: text,
+ confidence: 1.0,
+ method: 'native-extraction'
+ }];
+ } catch (error) {
+ console.error('[Text Processor] Reading failed:', error);
+ throw new Error(`Text file processing failed: ${error.message}`);
+ }
+}
+
+// Default export: an object exposing processDocument, which is also
+// available as a named export of this module.
+export default {
+ processDocument
+};
diff --git a/server/services/file-safety.js b/server/services/file-safety.js
index b0f7079..d854734 100644
--- a/server/services/file-safety.js
+++ b/server/services/file-safety.js
@@ -7,8 +7,29 @@ import { fileTypeFromBuffer } from 'file-type';
import path from 'path';
const MAX_FILE_SIZE = parseInt(process.env.MAX_FILE_SIZE || '52428800'); // 50MB default
-const ALLOWED_EXTENSIONS = ['.pdf'];
-const ALLOWED_MIME_TYPES = ['application/pdf'];
+
+// File extensions accepted by validateFile's extension check.
+const ALLOWED_EXTENSIONS = [
+ // Documents
+ '.pdf',
+ '.doc', '.docx',
+ '.xls', '.xlsx',
+ '.txt', '.md',
+ // Images
+ '.jpg', '.jpeg', '.png', '.webp'
+];
+
+// MIME types accepted when file-type detects a type from the file's bytes.
+// validateFile skips this check for .txt/.md files.
+// NOTE(review): legacy .doc/.xls files are compound-file containers; confirm
+// that file-type reports one of the MIME types below for them (it may report
+// a generic container type instead), otherwise they are rejected as a mismatch.
+// NOTE(review): 'text/plain' and 'text/markdown' look unreachable through
+// magic-number detection - verify whether they are needed here.
+const ALLOWED_MIME_TYPES = [
+ 'application/pdf',
+ 'application/msword',
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+ 'application/vnd.ms-excel',
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+ 'text/plain',
+ 'text/markdown',
+ 'image/jpeg',
+ 'image/png',
+ 'image/webp'
+];
/**
* Validate file safety and format
@@ -37,26 +58,35 @@ export async function validateFile(file) {
if (!ALLOWED_EXTENSIONS.includes(ext)) {
return {
valid: false,
- error: `File extension ${ext} not allowed. Only PDF files are accepted.`
+ error: `File extension ${ext} not allowed. Accepted types: PDF, JPG, PNG, DOCX, XLSX, TXT, MD`
};
}
// Check MIME type via file-type (magic number detection)
+ // Note: Text files (.txt, .md) may not be detected by file-type
try {
const detectedType = await fileTypeFromBuffer(file.buffer);
- // PDF files should be detected
- if (!detectedType || !ALLOWED_MIME_TYPES.includes(detectedType.mime)) {
+ // Skip MIME check for text files (they don't have magic numbers)
+ const textExtensions = ['.txt', '.md'];
+ const isTextFile = textExtensions.includes(ext);
+
+ // For binary files (PDF, images, Office), verify MIME type
+ if (!isTextFile && detectedType && !ALLOWED_MIME_TYPES.includes(detectedType.mime)) {
return {
valid: false,
- error: 'File is not a valid PDF document (MIME type mismatch)'
+ error: `File type mismatch: detected ${detectedType.mime}, expected ${ext} file`
};
}
} catch (error) {
- return {
- valid: false,
- error: 'Unable to verify file type'
- };
+ // Ignore MIME detection errors for text files
+ const textExtensions = ['.txt', '.md'];
+ if (!textExtensions.includes(ext)) {
+ return {
+ valid: false,
+ error: 'Unable to verify file type'
+ };
+ }
}
// Check for null bytes (potential attack vector)
@@ -97,7 +127,25 @@ export function sanitizeFilename(filename) {
return sanitized;
}
+/**
+ * Get file category based on extension
+ * @param {string} filename - Filename to categorize
+ * @returns {string} Category: 'pdf', 'word', 'excel', 'text', 'image', or 'unknown'
+ */
+export function getFileCategory(filename) {
+ const ext = path.extname(filename).toLowerCase();
+
+ if (['.pdf'].includes(ext)) return 'pdf';
+ if (['.doc', '.docx'].includes(ext)) return 'word';
+ if (['.xls', '.xlsx'].includes(ext)) return 'excel';
+ if (['.txt', '.md'].includes(ext)) return 'text';
+ if (['.jpg', '.jpeg', '.png', '.webp'].includes(ext)) return 'image';
+
+ return 'unknown';
+}
+
export default {
validateFile,
- sanitizeFilename
+ sanitizeFilename,
+ getFileCategory
};
diff --git a/server/workers/ocr-worker.js b/server/workers/ocr-worker.js
index 8577b50..c447dc5 100644
--- a/server/workers/ocr-worker.js
+++ b/server/workers/ocr-worker.js
@@ -18,7 +18,7 @@ import { v4 as uuidv4 } from 'uuid';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
import { getDb } from '../config/db.js';
-import { extractTextFromPDF } from '../services/ocr-hybrid.js';
+import { processDocument } from '../services/document-processor.js';
import { cleanOCRText, extractTextFromImage } from '../services/ocr.js';
import { indexDocumentPage } from '../services/search.js';
import { extractImagesFromPage } from './image-extractor.js';
@@ -92,10 +92,10 @@ async function processOCRJob(job) {
console.log(`[OCR Worker] Progress: ${currentProgress}% (page ${pageNum}/${total})`);
};
- // Extract text from PDF using OCR service
- console.log(`[OCR Worker] Extracting text from ${filePath}`);
+ // Process document using multi-format processor
+ console.log(`[OCR Worker] Processing document from ${filePath}`);
- const ocrResults = await extractTextFromPDF(filePath, {
+ const ocrResults = await processDocument(filePath, {
language: document.language || 'eng',
onProgress: updateProgress
});