navidocs/server/services/file-safety.js
Claude f0096a6bd6
Feature: Multi-format upload support (JPG, PNG, DOCX, XLSX, TXT, MD)
Implements multi-format document upload capability expanding beyond PDFs.

Changes:
- server/package.json: Add mammoth (DOCX) and xlsx (Excel) dependencies
- server/services/file-safety.js: Expand allowed file types and MIME types
  - Added getFileCategory() function to classify file types
  - Support for images, Office docs, and text files
  - Flexible MIME validation for text files
- server/services/document-processor.js: NEW routing service
  - processImageFile(): Tesseract OCR for JPG/PNG/WebP
  - processWordDocument(): Mammoth for DOCX text extraction
  - processExcelDocument(): XLSX for spreadsheet data extraction
  - processTextFile(): Native reading for TXT/MD files
  - Unified interface with processDocument() router
- server/workers/ocr-worker.js: Switch from extractTextFromPDF to processDocument
  - Now handles all file types through unified processor
- client/src/components/UploadModal.vue: Update UI for multi-format
  - File input accepts all new file types
  - Updated help text to show supported formats

Supported formats: PDF, JPG, PNG, WebP, DOCX, XLSX, TXT, MD
Text extraction methods: Native (Office/text), Tesseract OCR (images), PDF.js (PDFs)
Search indexing: All file types processed and indexed in Meilisearch

Session: Cloud Session 2 - Multi-Format Upload Support
Branch: feature/multiformat
Status: Complete - Ready for testing
2025-11-13 12:54:44 +00:00

151 lines
4.1 KiB
JavaScript

/**
* File Safety Validation Service
* Validates uploaded files for security and format compliance
*/
import { fileTypeFromBuffer } from 'file-type';
import path from 'path';
// Upload size limit in bytes. Taken from the MAX_FILE_SIZE environment
// variable; falls back to 50 MiB when the variable is unset, empty, not a
// number, or not positive. Without the fallback a malformed value would parse
// to NaN, and every `size > NaN` comparison is false, which would silently
// disable the limit.
const DEFAULT_MAX_FILE_SIZE = 52428800;
const parsedMaxFileSize = Number.parseInt(process.env.MAX_FILE_SIZE || '', 10);
const MAX_FILE_SIZE =
  Number.isFinite(parsedMaxFileSize) && parsedMaxFileSize > 0
    ? parsedMaxFileSize
    : DEFAULT_MAX_FILE_SIZE;

// Lower-case file extensions (including the leading dot) accepted for upload.
const ALLOWED_EXTENSIONS = Object.freeze([
  // Documents
  '.pdf',
  '.doc', '.docx',
  '.xls', '.xlsx',
  '.txt', '.md',
  // Images
  '.jpg', '.jpeg', '.png', '.webp'
]);

// MIME types accepted from content sniffing.
const ALLOWED_MIME_TYPES = Object.freeze([
  'application/pdf',
  'application/msword',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'application/vnd.ms-excel',
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
  // Legacy .doc/.xls files are OLE compound files; magic-number detection
  // reports the container type rather than the Office-specific MIME type.
  'application/x-cfb',
  'text/plain',
  'text/markdown',
  'image/jpeg',
  'image/png',
  'image/webp'
]);
// Extensions treated as plain text. Plain text has no magic number, so these
// files are exempt from content sniffing.
const TEXT_EXTENSIONS = ['.txt', '.md'];

// MIME types that content sniffing may report for each binary extension.
// Legacy .doc/.xls files are OLE compound files, which file-type reports as
// 'application/x-cfb' rather than an Office-specific type.
const DETECTED_MIME_TYPES_BY_EXTENSION = new Map([
  ['.pdf', ['application/pdf']],
  ['.doc', ['application/msword', 'application/x-cfb']],
  ['.docx', ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']],
  ['.xls', ['application/vnd.ms-excel', 'application/x-cfb']],
  ['.xlsx', ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']],
  ['.jpg', ['image/jpeg']],
  ['.jpeg', ['image/jpeg']],
  ['.png', ['image/png']],
  ['.webp', ['image/webp']]
]);

/**
 * Validate an uploaded file for safety and format compliance.
 *
 * Checks run in this order, and the first failure is returned:
 *   1. a file object was provided
 *   2. the original filename is a string without null bytes
 *   3. the size does not exceed MAX_FILE_SIZE
 *   4. the extension is in ALLOWED_EXTENSIONS
 *   5. for non-text files, the type detected from the buffer contents is one
 *      of the types expected for that extension
 *
 * @param {Object} file - Multer file object
 * @param {Buffer} file.buffer - File buffer for MIME type detection
 * @param {string} file.originalname - Original filename
 * @param {number} file.size - File size in bytes
 * @returns {Promise<{valid: boolean, error?: string}>}
 */
export async function validateFile(file) {
  if (!file) {
    return { valid: false, error: 'No file provided' };
  }

  // Reject unusable filenames before the name is parsed: a null byte can hide
  // the real extension, and a non-string would make path.extname throw.
  const { originalname } = file;
  if (typeof originalname !== 'string' || originalname.includes('\0')) {
    return { valid: false, error: 'Invalid filename' };
  }

  if (file.size > MAX_FILE_SIZE) {
    return {
      valid: false,
      error: `File size exceeds maximum allowed size of ${MAX_FILE_SIZE / 1024 / 1024}MB`
    };
  }

  const ext = path.extname(originalname).toLowerCase();
  if (!ALLOWED_EXTENSIONS.includes(ext)) {
    return {
      valid: false,
      error: `File extension ${ext} not allowed. Accepted types: PDF, JPG, PNG, DOCX, XLSX, TXT, MD`
    };
  }

  // Text files cannot be verified by magic number, so the extension check is
  // the only format check applied to them.
  if (TEXT_EXTENSIONS.includes(ext)) {
    return { valid: true };
  }

  let detectedType;
  try {
    detectedType = await fileTypeFromBuffer(file.buffer);
  } catch {
    return { valid: false, error: 'Unable to verify file type' };
  }

  // A binary upload with no recognizable signature cannot be verified; treat
  // it the same way as a detection failure instead of letting it through.
  if (!detectedType) {
    return { valid: false, error: 'Unable to verify file type' };
  }

  // The detected type must match what this extension is supposed to contain,
  // not merely appear somewhere in the global allow-list (otherwise a PNG
  // renamed to .pdf would pass). The global list is only a fallback for an
  // allowed extension that has no specific mapping.
  const acceptedMimeTypes = DETECTED_MIME_TYPES_BY_EXTENSION.get(ext) ?? ALLOWED_MIME_TYPES;
  if (!acceptedMimeTypes.includes(detectedType.mime)) {
    return {
      valid: false,
      error: `File type mismatch: detected ${detectedType.mime}, expected ${ext} file`
    };
  }

  return { valid: true };
}
/**
 * Sanitize a filename for safe storage.
 *
 * Path separators and every character outside [a-zA-Z0-9._-] become '_',
 * null bytes are removed, names that are empty or consist only of dots are
 * rewritten with underscores, and the base name and extension are each
 * length-capped so the result never exceeds 255 characters.
 *
 * @param {string} filename - Original filename
 * @returns {string} Sanitized filename
 */
export function sanitizeFilename(filename) {
  const maxNameLength = 200;
  // Caps the extension too, so name + extension stays within 255 characters.
  const maxExtLength = 55;

  let sanitized = filename
    .replace(/[\/\\]/g, '_')
    .replace(/\0/g, '')
    .replace(/[^a-zA-Z0-9._-]/g, '_');

  // '', '.' and '..' resolve to the storage directory or its parent when
  // joined to it, so they must never be returned as a filename.
  if (/^\.*$/.test(sanitized)) {
    sanitized = sanitized.replace(/\./g, '_') || '_';
  }

  const ext = path.extname(sanitized);
  const name = path.basename(sanitized, ext);
  if (name.length > maxNameLength || ext.length > maxExtLength) {
    sanitized = name.slice(0, maxNameLength) + ext.slice(0, maxExtLength);
  }

  return sanitized;
}
// Lookup table from lower-case extension to its file category.
const CATEGORY_BY_EXTENSION = new Map([
  ['.pdf', 'pdf'],
  ['.doc', 'word'],
  ['.docx', 'word'],
  ['.xls', 'excel'],
  ['.xlsx', 'excel'],
  ['.txt', 'text'],
  ['.md', 'text'],
  ['.jpg', 'image'],
  ['.jpeg', 'image'],
  ['.png', 'image'],
  ['.webp', 'image']
]);

/**
 * Classify a file by its extension (case-insensitive).
 * @param {string} filename - Filename to categorize
 * @returns {string} One of 'pdf', 'word', 'excel', 'text', 'image', or
 *   'unknown' when the extension is missing or not recognized
 */
export function getFileCategory(filename) {
  const extension = path.extname(filename).toLowerCase();
  return CATEGORY_BY_EXTENSION.get(extension) ?? 'unknown';
}
// Default export: the service's public functions bundled into one object for
// callers that import the module as a whole.
const fileSafetyService = { validateFile, sanitizeFilename, getFileCategory };

export default fileSafetyService;