navidocs/server/services/document-processor.js
Claude f0096a6bd6
Feature: Multi-format upload support (JPG, PNG, WebP, DOCX, XLSX, TXT, MD)
Implements multi-format document upload capability expanding beyond PDFs.

Changes:
- server/package.json: Add mammoth (DOCX) and xlsx (Excel) dependencies
- server/services/file-safety.js: Expand allowed file types and MIME types
  - Added getFileCategory() function to classify file types
  - Support for images, Office docs, and text files
  - Flexible MIME validation for text files
- server/services/document-processor.js: NEW routing service
  - processImageFile(): Tesseract OCR for JPG/PNG/WebP
  - processWordDocument(): Mammoth for DOCX text extraction
  - processExcelDocument(): XLSX for spreadsheet data extraction
  - processTextFile(): Native reading for TXT/MD files
  - Unified interface with processDocument() router
- server/workers/ocr-worker.js: Switch from extractTextFromPDF to processDocument
  - Now handles all file types through unified processor
- client/src/components/UploadModal.vue: Update UI for multi-format
  - File input accepts all new file types
  - Updated help text to show supported formats

Supported formats: PDF, JPG, PNG, WebP, DOCX, XLSX, TXT, MD
Text extraction methods: Native (Office/text), Tesseract OCR (images), PDF.js (PDFs)
Search indexing: All file types processed and indexed in Meilisearch

Session: Cloud Session 2 - Multi-Format Upload Support
Branch: feature/multiformat
Status: Complete - Ready for testing
2025-11-13 12:54:44 +00:00

186 lines
5.3 KiB
JavaScript

/**
* Document Processor Service
* Routes file processing to appropriate handler based on file type
*/
import { extractTextFromPDF } from './ocr.js';
import { getFileCategory } from './file-safety.js';
import { readFileSync } from 'fs';
import mammoth from 'mammoth';
import XLSX from 'xlsx';
import Tesseract from 'tesseract.js';
/**
 * Single entry point for text extraction.
 *
 * Looks up the file's category via getFileCategory(filePath) and delegates to
 * the handler registered for that category. The `options` object is forwarded
 * to the handler unchanged.
 *
 * @param {string} filePath - Path to uploaded file
 * @param {Object} [options] - Processing options, passed through to the handler
 * @param {string} [options.language] - OCR language (the image handler falls back to 'eng')
 * @param {Function} [options.onProgress] - Progress callback
 * @returns {Promise<Array>} Array of page results with text and metadata
 * @throws {Error} `Unsupported file type: <category>` when no handler is registered
 */
export async function processDocument(filePath, options = {}) {
  const category = getFileCategory(filePath);
  console.log(`[Document Processor] Processing ${category}: ${filePath}`);

  // A Map (rather than a plain object) so that category strings such as
  // "constructor" can never resolve to an inherited property.
  const handlersByCategory = new Map([
    ['pdf', extractTextFromPDF],
    ['image', processImageFile],
    ['word', processWordDocument],
    ['excel', processExcelDocument],
    ['text', processTextFile]
  ]);

  const handler = handlersByCategory.get(category);
  if (handler === undefined) {
    throw new Error(`Unsupported file type: ${category}`);
  }

  return await handler(filePath, options);
}
/**
 * Process image file with Tesseract OCR.
 *
 * Creates a Tesseract worker for the requested language, recognizes the image
 * and returns the result as a single page. The worker is always terminated
 * before this function settles, including when recognition fails, so a failed
 * OCR run does not leave a worker behind.
 *
 * @param {string} imagePath - Path to image file
 * @param {Object} [options] - Processing options
 * @param {string} [options.language='eng'] - OCR language passed to Tesseract
 * @param {Function} [options.onProgress] - Called with `{ progress }` (0-100)
 *   while Tesseract reports the 'recognizing text' status
 * @returns {Promise<Array>} One-element array:
 *   `{ pageNumber: 1, text, confidence (0-1), method: 'tesseract-ocr' }`
 * @throws {Error} `Image OCR failed: <reason>`; the underlying error is attached as `cause`
 */
async function processImageFile(imagePath, options = {}) {
  const { language = 'eng', onProgress } = options;
  console.log('[Image Processor] Running OCR on image...');

  // Only set `logger` when there is a callback to feed. Passing an explicit
  // `logger: undefined` can override the library's own default logger.
  const workerOptions = {};
  if (typeof onProgress === 'function') {
    workerOptions.logger = (m) => {
      if (m.status === 'recognizing text') {
        onProgress({ progress: m.progress * 100 });
      }
    };
  }

  let worker;
  try {
    // Second argument is the OCR engine mode; the value 1 is kept from the
    // original call (LSTM-only per tesseract.js OEM constants - confirm).
    worker = await Tesseract.createWorker(language, 1, workerOptions);
    const { data } = await worker.recognize(imagePath);
    console.log(`[Image Processor] OCR complete. Confidence: ${data.confidence}%`);
    return [{
      pageNumber: 1,
      text: data.text,
      confidence: data.confidence / 100, // Convert to 0-1 range
      method: 'tesseract-ocr'
    }];
  } catch (error) {
    console.error('[Image Processor] OCR failed:', error);
    throw new Error(`Image OCR failed: ${error.message}`, { cause: error });
  } finally {
    // Release the worker on both the success and the failure path.
    if (worker) {
      try {
        await worker.terminate();
      } catch (terminateError) {
        // A cleanup failure must not mask the OCR result or the OCR error.
        console.error('[Image Processor] Failed to terminate OCR worker:', terminateError);
      }
    }
  }
}
/**
 * Extract the raw text of a Word document using Mammoth.
 *
 * The whole document is returned as a single page. Any messages Mammoth
 * reports during extraction are logged but do not fail the call.
 *
 * @param {string} docPath - Path to DOCX file
 * @param {Object} [options] - Processing options (currently unused)
 * @returns {Promise<Array>} One-element array:
 *   `{ pageNumber: 1, text, confidence: 0.99, method: 'native-extraction' }`
 * @throws {Error} `Word document processing failed: <reason>`
 */
async function processWordDocument(docPath, options = {}) {
  console.log('[Word Processor] Extracting text from DOCX...');

  try {
    const extraction = await mammoth.extractRawText({ path: docPath });
    const extractedText = extraction.value;
    const warnings = extraction.messages;

    if (warnings.length > 0) {
      console.log('[Word Processor] Extraction warnings:', warnings);
    }
    console.log(`[Word Processor] Extracted ${extractedText.length} characters`);

    const page = {
      pageNumber: 1,
      text: extractedText,
      confidence: 0.99,
      method: 'native-extraction'
    };
    return [page];
  } catch (error) {
    console.error('[Word Processor] Extraction failed:', error);
    throw new Error(`Word document processing failed: ${error.message}`);
  }
}
/**
 * Extract the contents of an Excel workbook, one result per sheet.
 *
 * Each sheet is rendered to CSV for text indexing. Pages are numbered from 1
 * in the order of the workbook's SheetNames list.
 *
 * @param {string} xlsPath - Path to XLSX file
 * @param {Object} [options] - Processing options (currently unused)
 * @returns {Promise<Array>} One entry per sheet:
 *   `{ pageNumber, text (CSV), confidence: 0.99, method: 'native-extraction',
 *      sheetName, metadata: { rowCount, columnCount } }`
 * @throws {Error} `Excel document processing failed: <reason>`
 */
async function processExcelDocument(xlsPath, options = {}) {
  console.log('[Excel Processor] Reading workbook...');

  try {
    const workbook = XLSX.readFile(xlsPath);

    const pages = workbook.SheetNames.map((sheetName, index) => {
      const sheet = workbook.Sheets[sheetName];

      // CSV is the searchable text representation of the sheet.
      const text = XLSX.utils.sheet_to_csv(sheet);

      // Array-of-arrays form, used only to derive the size metadata.
      const rows = XLSX.utils.sheet_to_json(sheet, { header: 1 });
      const firstRow = rows[0];

      return {
        pageNumber: index + 1,
        text,
        confidence: 0.99,
        method: 'native-extraction',
        sheetName,
        metadata: {
          rowCount: rows.length,
          // Column count is taken from the first row; 0 for an empty sheet.
          columnCount: firstRow?.length || 0
        }
      };
    });

    console.log(`[Excel Processor] Extracted ${pages.length} sheets`);
    return pages;
  } catch (error) {
    console.error('[Excel Processor] Reading failed:', error);
    throw new Error(`Excel document processing failed: ${error.message}`);
  }
}
/**
 * Process plain text file (TXT/MD).
 *
 * Reads the file as UTF-8 and returns its content as a single page. A leading
 * byte-order mark is removed so it does not end up in the extracted text.
 *
 * @param {string} txtPath - Path to text file
 * @param {Object} [options] - Processing options (currently unused)
 * @returns {Promise<Array>} One-element array:
 *   `{ pageNumber: 1, text, confidence: 1.0, method: 'native-extraction' }`
 * @throws {Error} `Text file processing failed: <reason>`; the underlying error is attached as `cause`
 */
async function processTextFile(txtPath, options = {}) {
  console.log('[Text Processor] Reading text file...');
  try {
    const raw = readFileSync(txtPath, 'utf-8');
    // Node's UTF-8 decoding keeps a leading BOM as U+FEFF; drop it so the
    // invisible character is not glued onto the first word of the text.
    const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
    console.log(`[Text Processor] Read ${text.length} characters`);
    return [{
      pageNumber: 1,
      text: text,
      confidence: 1.0,
      method: 'native-extraction'
    }];
  } catch (error) {
    console.error('[Text Processor] Reading failed:', error);
    throw new Error(`Text file processing failed: ${error.message}`, { cause: error });
  }
}
// Default export: an object exposing processDocument, so the router is also
// reachable through the module's default import in addition to the named export.
export default {
processDocument
};