Merge Session 2: Multi-format upload (JPG, DOCX, XLSX, TXT, MD)
commit bf76d0c2bf

5 changed files with 255 additions and 19 deletions
@@ -32,19 +32,19 @@
 <svg class="w-16 h-16 mx-auto text-white/50 mb-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
   <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M7 16a4 4 0 01-.88-7.903A5 5 0 1115.9 6L16 6a5 5 0 011 9.9M15 13l-3-3m0 0l-3 3m3-3v12" />
 </svg>
-<p class="text-lg text-white mb-2">Drag and drop your PDF here</p>
+<p class="text-lg text-white mb-2">Drag and drop your document here</p>
 <p class="text-sm text-white/70 mb-4">or</p>
 <label class="btn btn-outline cursor-pointer">
   Browse Files
   <input
     ref="fileInput"
     type="file"
-    accept="application/pdf"
+    accept=".pdf,.jpg,.jpeg,.png,.webp,.docx,.xlsx,.txt,.md"
     class="hidden"
     @change="handleFileSelect"
   />
 </label>
-<p class="text-xs text-white/70 mt-4">Maximum file size: 50MB</p>
+<p class="text-xs text-white/70 mt-4">Supported: PDF, Images (JPG/PNG), Word, Excel, Text/Markdown • Max: 50MB</p>
 </div>

 <!-- Selected File Preview -->
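
Note: the widened accept attribute only filters the browser's file picker; the server-side allow-list in file-safety.js further down is the real gate. Below is a minimal sketch of the kind of pre-upload check the component's handler could run, assuming nothing beyond a standard File object — checkSelectedFile and both constants are illustrative and do not appear in this commit.

// Hypothetical client-side guard that mirrors the new accept list; not part of this commit.
const CLIENT_ALLOWED = ['.pdf', '.jpg', '.jpeg', '.png', '.webp', '.docx', '.xlsx', '.txt', '.md'];
const MAX_BYTES = 50 * 1024 * 1024; // matches the 50MB limit shown in the dropzone copy

function checkSelectedFile(file) {
  // Derive the extension from the File's name; empty string if there is none.
  const ext = file.name.includes('.') ? '.' + file.name.split('.').pop().toLowerCase() : '';
  if (!CLIENT_ALLOWED.includes(ext)) {
    return { ok: false, error: `Unsupported file type: ${ext || '(none)'}` };
  }
  if (file.size > MAX_BYTES) {
    return { ok: false, error: 'File exceeds the 50MB limit' };
  }
  return { ok: true };
}
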
@@ -32,6 +32,7 @@
     "ioredis": "^5.0.0",
     "jsonwebtoken": "^9.0.2",
     "lru-cache": "^11.2.2",
+    "mammoth": "^1.8.0",
     "meilisearch": "^0.41.0",
     "multer": "^1.4.5-lts.1",
     "pdf-img-convert": "^2.0.0",
@@ -39,7 +40,8 @@
     "pdfjs-dist": "^5.4.394",
     "sharp": "^0.34.4",
     "tesseract.js": "^5.0.0",
-    "uuid": "^10.0.0"
+    "uuid": "^10.0.0",
+    "xlsx": "^0.18.5"
   },
   "devDependencies": {
     "@types/node": "^20.0.0"

server/services/document-processor.js (new file, 186 lines)

@@ -0,0 +1,186 @@
/**
 * Document Processor Service
 * Routes file processing to appropriate handler based on file type
 */

import { extractTextFromPDF } from './ocr.js';
import { getFileCategory } from './file-safety.js';
import { readFileSync } from 'fs';
import mammoth from 'mammoth';
import XLSX from 'xlsx';
import Tesseract from 'tesseract.js';

/**
 * Process document with appropriate handler based on file type
 * @param {string} filePath - Path to uploaded file
 * @param {Object} options - Processing options
 * @param {string} options.language - OCR language (default: 'eng')
 * @param {Function} options.onProgress - Progress callback
 * @returns {Promise<Array>} Array of page results with text and metadata
 */
export async function processDocument(filePath, options = {}) {
  const category = getFileCategory(filePath);

  console.log(`[Document Processor] Processing ${category}: ${filePath}`);

  switch (category) {
    case 'pdf':
      return await extractTextFromPDF(filePath, options);

    case 'image':
      return await processImageFile(filePath, options);

    case 'word':
      return await processWordDocument(filePath, options);

    case 'excel':
      return await processExcelDocument(filePath, options);

    case 'text':
      return await processTextFile(filePath, options);

    default:
      throw new Error(`Unsupported file type: ${category}`);
  }
}

/**
 * Process image file with Tesseract OCR
 * @param {string} imagePath - Path to image file
 * @param {Object} options - Processing options
 * @returns {Promise<Array>} OCR results
 */
async function processImageFile(imagePath, options = {}) {
  const { language = 'eng', onProgress } = options;

  console.log('[Image Processor] Running OCR on image...');

  try {
    const worker = await Tesseract.createWorker(language, 1, {
      logger: onProgress ? (m) => {
        if (m.status === 'recognizing text') {
          onProgress({ progress: m.progress * 100 });
        }
      } : undefined
    });

    const { data } = await worker.recognize(imagePath);
    await worker.terminate();

    console.log(`[Image Processor] OCR complete. Confidence: ${data.confidence}%`);

    return [{
      pageNumber: 1,
      text: data.text,
      confidence: data.confidence / 100, // Convert to 0-1 range
      method: 'tesseract-ocr'
    }];
  } catch (error) {
    console.error('[Image Processor] OCR failed:', error);
    throw new Error(`Image OCR failed: ${error.message}`);
  }
}

/**
 * Process Word document with Mammoth
 * @param {string} docPath - Path to DOCX file
 * @param {Object} options - Processing options
 * @returns {Promise<Array>} Extracted text
 */
async function processWordDocument(docPath, options = {}) {
  console.log('[Word Processor] Extracting text from DOCX...');

  try {
    const result = await mammoth.extractRawText({ path: docPath });
    const text = result.value;

    if (result.messages.length > 0) {
      console.log('[Word Processor] Extraction warnings:', result.messages);
    }

    console.log(`[Word Processor] Extracted ${text.length} characters`);

    return [{
      pageNumber: 1,
      text: text,
      confidence: 0.99,
      method: 'native-extraction'
    }];
  } catch (error) {
    console.error('[Word Processor] Extraction failed:', error);
    throw new Error(`Word document processing failed: ${error.message}`);
  }
}

/**
 * Process Excel document with XLSX
 * @param {string} xlsPath - Path to XLSX file
 * @param {Object} options - Processing options
 * @returns {Promise<Array>} Extracted data from all sheets
 */
async function processExcelDocument(xlsPath, options = {}) {
  console.log('[Excel Processor] Reading workbook...');

  try {
    const workbook = XLSX.readFile(xlsPath);
    const sheets = [];

    workbook.SheetNames.forEach((sheetName, idx) => {
      const worksheet = workbook.Sheets[sheetName];

      // Convert to CSV for text-based indexing
      const csvText = XLSX.utils.sheet_to_csv(worksheet);

      // Also get JSON for structured data (optional)
      const jsonData = XLSX.utils.sheet_to_json(worksheet, { header: 1 });

      sheets.push({
        pageNumber: idx + 1,
        text: csvText,
        confidence: 0.99,
        method: 'native-extraction',
        sheetName: sheetName,
        metadata: {
          rowCount: jsonData.length,
          columnCount: jsonData[0]?.length || 0
        }
      });
    });

    console.log(`[Excel Processor] Extracted ${sheets.length} sheets`);
    return sheets;
  } catch (error) {
    console.error('[Excel Processor] Reading failed:', error);
    throw new Error(`Excel document processing failed: ${error.message}`);
  }
}

/**
 * Process plain text file
 * @param {string} txtPath - Path to text file
 * @param {Object} options - Processing options
 * @returns {Promise<Array>} Text content
 */
async function processTextFile(txtPath, options = {}) {
  console.log('[Text Processor] Reading text file...');

  try {
    const text = readFileSync(txtPath, 'utf-8');

    console.log(`[Text Processor] Read ${text.length} characters`);

    return [{
      pageNumber: 1,
      text: text,
      confidence: 1.0,
      method: 'native-extraction'
    }];
  } catch (error) {
    console.error('[Text Processor] Reading failed:', error);
    throw new Error(`Text file processing failed: ${error.message}`);
  }
}

export default {
  processDocument
};
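
Every handler above returns the same page-shaped array — pageNumber, text, confidence, method, plus sheetName/metadata for spreadsheets — which is what lets callers stay format-agnostic. A minimal consumption sketch follows, assuming an ESM context with top-level await and a made-up upload path; the aggregation is illustrative and not code from this commit.

import { processDocument } from './server/services/document-processor.js';

// Hypothetical caller: the path below is an assumption for illustration only.
const pages = await processDocument('./uploads/report.xlsx', {
  language: 'eng',
  onProgress: ({ progress }) => console.log(`progress ${progress.toFixed(0)}%`)
});

// Flatten pages/sheets into one text blob and an average confidence score.
const fullText = pages.map((p) => p.text).join('\n\n');
const avgConfidence = pages.reduce((sum, p) => sum + p.confidence, 0) / pages.length;
console.log(`${pages.length} page(s)/sheet(s), avg confidence ${(avgConfidence * 100).toFixed(1)}%`);
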
@@ -7,8 +7,29 @@ import { fileTypeFromBuffer } from 'file-type';
 import path from 'path';

 const MAX_FILE_SIZE = parseInt(process.env.MAX_FILE_SIZE || '52428800'); // 50MB default
-const ALLOWED_EXTENSIONS = ['.pdf'];
-const ALLOWED_MIME_TYPES = ['application/pdf'];
+// Documents
+const ALLOWED_EXTENSIONS = [
+  '.pdf',
+  '.doc', '.docx',
+  '.xls', '.xlsx',
+  '.txt', '.md',
+  // Images
+  '.jpg', '.jpeg', '.png', '.webp'
+];
+
+const ALLOWED_MIME_TYPES = [
+  'application/pdf',
+  'application/msword',
+  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+  'application/vnd.ms-excel',
+  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+  'text/plain',
+  'text/markdown',
+  'image/jpeg',
+  'image/png',
+  'image/webp'
+];

 /**
  * Validate file safety and format
@@ -37,26 +58,35 @@ export async function validateFile(file) {
   if (!ALLOWED_EXTENSIONS.includes(ext)) {
     return {
       valid: false,
-      error: `File extension ${ext} not allowed. Only PDF files are accepted.`
+      error: `File extension ${ext} not allowed. Accepted types: PDF, JPG, PNG, DOCX, XLSX, TXT, MD`
     };
   }

   // Check MIME type via file-type (magic number detection)
+  // Note: Text files (.txt, .md) may not be detected by file-type
   try {
     const detectedType = await fileTypeFromBuffer(file.buffer);

-    // PDF files should be detected
-    if (!detectedType || !ALLOWED_MIME_TYPES.includes(detectedType.mime)) {
+    // Skip MIME check for text files (they don't have magic numbers)
+    const textExtensions = ['.txt', '.md'];
+    const isTextFile = textExtensions.includes(ext);
+
+    // For binary files (PDF, images, Office), verify MIME type
+    if (!isTextFile && detectedType && !ALLOWED_MIME_TYPES.includes(detectedType.mime)) {
       return {
         valid: false,
-        error: 'File is not a valid PDF document (MIME type mismatch)'
+        error: `File type mismatch: detected ${detectedType.mime}, expected ${ext} file`
       };
     }
   } catch (error) {
-    return {
-      valid: false,
-      error: 'Unable to verify file type'
-    };
+    // Ignore MIME detection errors for text files
+    const textExtensions = ['.txt', '.md'];
+    if (!textExtensions.includes(ext)) {
+      return {
+        valid: false,
+        error: 'Unable to verify file type'
+      };
+    }
   }

   // Check for null bytes (potential attack vector)
@@ -97,7 +127,25 @@ export function sanitizeFilename(filename) {
   return sanitized;
 }

+/**
+ * Get file category based on extension
+ * @param {string} filename - Filename to categorize
+ * @returns {string} Category: 'pdf', 'word', 'excel', 'text', 'image', or 'unknown'
+ */
+export function getFileCategory(filename) {
+  const ext = path.extname(filename).toLowerCase();
+
+  if (['.pdf'].includes(ext)) return 'pdf';
+  if (['.doc', '.docx'].includes(ext)) return 'word';
+  if (['.xls', '.xlsx'].includes(ext)) return 'excel';
+  if (['.txt', '.md'].includes(ext)) return 'text';
+  if (['.jpg', '.jpeg', '.png', '.webp'].includes(ext)) return 'image';
+
+  return 'unknown';
+}
+
 export default {
   validateFile,
-  sanitizeFilename
+  sanitizeFilename,
+  getFileCategory
 };
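
For context on how these pieces line up at upload time: validateFile screens the in-memory buffer (extension plus magic-number check), and getFileCategory picks the branch that processDocument dispatches to. The route below is only a sketch under assumptions — memory-storage multer, a hypothetical /api/upload path, and no persistence or job queueing — not the project's actual upload route.

import express from 'express';
import multer from 'multer';
import { validateFile, sanitizeFilename, getFileCategory } from './server/services/file-safety.js';

// Sketch only: the route path, memory storage, and the response shape are assumptions.
const upload = multer({ storage: multer.memoryStorage(), limits: { fileSize: 50 * 1024 * 1024 } });
const app = express();

app.post('/api/upload', upload.single('file'), async (req, res) => {
  if (!req.file) {
    return res.status(400).json({ error: 'No file uploaded' });
  }

  const check = await validateFile(req.file);   // extension + MIME (magic number) checks from file-safety.js
  if (!check.valid) {
    return res.status(400).json({ error: check.error });
  }

  const filename = sanitizeFilename(req.file.originalname);
  const category = getFileCategory(filename);   // 'pdf' | 'word' | 'excel' | 'text' | 'image' | 'unknown'

  // In the real pipeline the saved file is handed to the OCR worker,
  // which calls processDocument(filePath, ...) as shown in the next hunk.
  res.json({ filename, category, queued: true });
});
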
@@ -18,7 +18,7 @@ import { v4 as uuidv4 } from 'uuid';
 import { dirname, join } from 'path';
 import { fileURLToPath } from 'url';
 import { getDb } from '../config/db.js';
-import { extractTextFromPDF } from '../services/ocr-hybrid.js';
+import { processDocument } from '../services/document-processor.js';
 import { cleanOCRText, extractTextFromImage } from '../services/ocr.js';
 import { indexDocumentPage } from '../services/search.js';
 import { extractImagesFromPage } from './image-extractor.js';

@@ -92,10 +92,10 @@ async function processOCRJob(job) {
     console.log(`[OCR Worker] Progress: ${currentProgress}% (page ${pageNum}/${total})`);
   };

-  // Extract text from PDF using OCR service
-  console.log(`[OCR Worker] Extracting text from ${filePath}`);
+  // Process document using multi-format processor
+  console.log(`[OCR Worker] Processing document from ${filePath}`);

-  const ocrResults = await extractTextFromPDF(filePath, {
+  const ocrResults = await processDocument(filePath, {
     language: document.language || 'eng',
     onProgress: updateProgress
   });