diff --git a/client/package.json b/client/package.json index e201aa7..009c534 100644 --- a/client/package.json +++ b/client/package.json @@ -6,21 +6,24 @@ "scripts": { "dev": "vite", "build": "vite build", - "preview": "vite preview" + "preview": "vite preview", + "i18n:lint": "node scripts/i18n-keys-lint.js", + "test": "npm run i18n:lint" }, "dependencies": { - "vue": "^3.5.0", - "vue-router": "^4.4.0", - "pinia": "^2.2.0", + "meilisearch": "^0.41.0", "pdfjs-dist": "^4.0.0", - "meilisearch": "^0.41.0" + "pinia": "^2.2.0", + "vue": "^3.5.0", + "vue-i18n": "^9.14.5", + "vue-router": "^4.4.0" }, "devDependencies": { "@vitejs/plugin-vue": "^5.0.0", - "vite": "^5.0.0", - "tailwindcss": "^3.4.0", "autoprefixer": "^10.4.0", + "playwright": "^1.40.0", "postcss": "^8.4.0", - "playwright": "^1.40.0" + "tailwindcss": "^3.4.0", + "vite": "^5.0.0" } } diff --git a/client/src/components/LanguageSwitcher.vue b/client/src/components/LanguageSwitcher.vue new file mode 100644 index 0000000..7651f42 --- /dev/null +++ b/client/src/components/LanguageSwitcher.vue @@ -0,0 +1,170 @@ + + + + + diff --git a/client/src/components/TocEntry.vue b/client/src/components/TocEntry.vue new file mode 100644 index 0000000..45a82b3 --- /dev/null +++ b/client/src/components/TocEntry.vue @@ -0,0 +1,218 @@ + + + + + diff --git a/client/src/components/TocSidebar.vue b/client/src/components/TocSidebar.vue new file mode 100644 index 0000000..8573398 --- /dev/null +++ b/client/src/components/TocSidebar.vue @@ -0,0 +1,306 @@ + + + + + diff --git a/client/src/i18n/index.js b/client/src/i18n/index.js new file mode 100644 index 0000000..18232ae --- /dev/null +++ b/client/src/i18n/index.js @@ -0,0 +1,121 @@ +/** + * Vue I18n Configuration + * Internationalization setup for NaviDocs + * Supports EN/FR with browser language detection + */ + +import { createI18n } from 'vue-i18n' +import en from './locales/en.json' +import fr from './locales/fr.json' + +// Detect browser language +function getBrowserLocale() { + const navigatorLocale = + navigator.languages !== undefined + ? navigator.languages[0] + : navigator.language + + if (!navigatorLocale) { + return 'en' + } + + // Extract language code (en-US -> en, fr-FR -> fr) + const languageCode = navigatorLocale.trim().split(/[-_]/)[0] + + // Check if we support this language + const supportedLocales = ['en', 'fr'] + return supportedLocales.includes(languageCode) ? 
languageCode : 'en' +} + +// Get stored locale or browser locale +function getStartingLocale() { + const storedLocale = localStorage.getItem('navidocs-locale') + if (storedLocale) { + return storedLocale + } + + return getBrowserLocale() +} + +const i18n = createI18n({ + legacy: false, // Use Composition API mode + locale: getStartingLocale(), + fallbackLocale: 'en', + messages: { + en, + fr + }, + // Development guards: warn about missing translations + missing: (locale, key) => { + console.error(`[i18n] Missing translation: ${locale}:${key}`) + }, + missingWarn: import.meta.env.DEV, // Only warn in development + fallbackWarn: import.meta.env.DEV, + // Enable number and date formatting + datetimeFormats: { + en: { + short: { + year: 'numeric', + month: 'short', + day: 'numeric' + }, + long: { + year: 'numeric', + month: 'long', + day: 'numeric', + hour: 'numeric', + minute: 'numeric' + } + }, + fr: { + short: { + year: 'numeric', + month: 'short', + day: 'numeric' + }, + long: { + year: 'numeric', + month: 'long', + day: 'numeric', + hour: 'numeric', + minute: 'numeric' + } + } + }, + numberFormats: { + en: { + currency: { + style: 'currency', + currency: 'USD' + }, + decimal: { + style: 'decimal', + minimumFractionDigits: 2, + maximumFractionDigits: 2 + } + }, + fr: { + currency: { + style: 'currency', + currency: 'EUR' + }, + decimal: { + style: 'decimal', + minimumFractionDigits: 2, + maximumFractionDigits: 2 + } + } + } +}) + +// Helper function to switch locale +export function setLocale(locale) { + i18n.global.locale.value = locale + localStorage.setItem('navidocs-locale', locale) + document.querySelector('html').setAttribute('lang', locale) +} + +// Set initial HTML lang attribute +document.querySelector('html').setAttribute('lang', getStartingLocale()) + +export default i18n diff --git a/client/src/i18n/locales/en.json b/client/src/i18n/locales/en.json new file mode 100644 index 0000000..f255648 --- /dev/null +++ b/client/src/i18n/locales/en.json @@ -0,0 +1,160 @@ +{ + "app": { + "title": "NaviDocs", + "tagline": "Marine Manual Intelligence" + }, + "nav": { + "home": "Home", + "search": "Search", + "documents": "Documents", + "upload": "Upload", + "stats": "Statistics" + }, + "home": { + "welcome": "Welcome to NaviDocs", + "searchPlaceholder": "Search your boat manuals and documentation...", + "recentDocuments": "Recent Documents", + "quickActions": "Quick Actions", + "uploadDocument": "Upload Document", + "viewAll": "View All Documents" + }, + "search": { + "title": "Search Results", + "placeholder": "Search manuals, specs, and documentation...", + "searching": "Searching...", + "noResults": "No matches found", + "noResultsHint": "Try different search terms or check the spelling", + "resultsCount": "{count} result | {count} results", + "page": "Page", + "section": "Section", + "expand": "Show context", + "collapse": "Hide context", + "viewDocument": "View document", + "prevPage": "Previous page", + "currentPage": "Current page", + "nextPage": "Next page", + "noDiagram": "No diagram" + }, + "document": { + "title": "Document Viewer", + "back": "Back", + "page": "Page", + "of": "of", + "images": "image | images", + "previous": "Previous", + "next": "Next", + "goToPage": "Go", + "loading": "Loading document...", + "rendering": "Rendering page...", + "error": "Unable to render document", + "retry": "Retry", + "findBar": { + "noMatches": "No matches", + "matchCount": "{current} / {total}", + "previousMatch": "Previous match", + "nextMatch": "Next match", + "jumpTo": "Jump to", + 
"match": "Match", + "moreMatches": "+ {count} more matches" + } + }, + "upload": { + "title": "Upload Documents", + "dropZone": "Drop PDF files here or click to browse", + "browseFiles": "Browse Files", + "uploading": "Uploading...", + "processing": "Processing...", + "success": "Upload successful", + "error": "Upload failed", + "maxSize": "Maximum file size: {size}MB", + "supportedFormats": "Supported formats: PDF", + "documentInfo": "Document Information", + "documentTitle": "Document Title", + "documentType": "Document Type", + "boatInfo": "Boat Information", + "boatName": "Boat Name", + "boatMake": "Manufacturer", + "boatModel": "Model", + "boatYear": "Year", + "submit": "Upload", + "cancel": "Cancel", + "types": { + "manual": "Owner's Manual", + "service": "Service Manual", + "component": "Component Manual", + "wiring": "Wiring Diagram", + "parts": "Parts List", + "other": "Other Documentation" + } + }, + "stats": { + "title": "Statistics", + "overview": "Overview", + "totalDocuments": "Total Documents", + "totalPages": "Total Pages", + "storageUsed": "Storage Used", + "recentActivity": "Recent Activity", + "documentsByType": "Documents by Type", + "pagesByBoat": "Pages by Boat", + "searchActivity": "Search Activity", + "topSearchTerms": "Top Search Terms" + }, + "common": { + "loading": "Loading...", + "error": "Error", + "success": "Success", + "save": "Save", + "cancel": "Cancel", + "delete": "Delete", + "edit": "Edit", + "close": "Close", + "confirm": "Confirm", + "yes": "Yes", + "no": "No", + "search": "Search", + "filter": "Filter", + "sort": "Sort", + "actions": "Actions", + "viewDetails": "View Details", + "download": "Download", + "share": "Share", + "print": "Print" + }, + "marine": { + "systems": { + "electrical": "Electrical System", + "plumbing": "Plumbing & Water", + "navigation": "Navigation", + "propulsion": "Propulsion", + "hvac": "Climate Control", + "safety": "Safety Equipment", + "galley": "Galley", + "head": "Head", + "deck": "Deck Equipment", + "rigging": "Rigging" + }, + "categories": { + "maintenance": "Maintenance", + "troubleshooting": "Troubleshooting", + "installation": "Installation", + "operation": "Operation", + "safety": "Safety Procedures", + "specifications": "Specifications" + } + }, + "toc": { + "tableOfContents": "Table of Contents", + "loading": "Loading index...", + "noTocFound": "No table of contents found in this document", + "extract": "Extract TOC", + "entries": "entries", + "expand": "Expand index", + "collapse": "Collapse index", + "jumpToSection": "Jump to section" + }, + "language": { + "select": "Select Language", + "en": "English", + "fr": "Français" + } +} diff --git a/client/src/i18n/locales/fr.json b/client/src/i18n/locales/fr.json new file mode 100644 index 0000000..6b571b0 --- /dev/null +++ b/client/src/i18n/locales/fr.json @@ -0,0 +1,160 @@ +{ + "app": { + "title": "NaviDocs", + "tagline": "Intelligence Nautique" + }, + "nav": { + "home": "Accueil", + "search": "Recherche", + "documents": "Documents", + "upload": "Téléverser", + "stats": "Statistiques" + }, + "home": { + "welcome": "Bienvenue sur NaviDocs", + "searchPlaceholder": "Rechercher dans vos manuels et documentation nautique...", + "recentDocuments": "Documents récents", + "quickActions": "Actions rapides", + "uploadDocument": "Téléverser un document", + "viewAll": "Voir tous les documents" + }, + "search": { + "title": "Résultats de recherche", + "placeholder": "Rechercher dans les manuels, spécifications et documentation...", + "searching": "Recherche en cours...", + 
"noResults": "Aucun résultat", + "noResultsHint": "Essayez d'autres termes ou vérifiez l'orthographe", + "resultsCount": "{count} résultat | {count} résultats", + "page": "Page", + "section": "Section", + "expand": "Afficher le contexte", + "collapse": "Masquer le contexte", + "viewDocument": "Voir le document", + "prevPage": "Page précédente", + "currentPage": "Page actuelle", + "nextPage": "Page suivante", + "noDiagram": "Pas de schéma" + }, + "document": { + "title": "Visionneuse de documents", + "back": "Retour", + "page": "Page", + "of": "sur", + "images": "image | images", + "previous": "Précédent", + "next": "Suivant", + "goToPage": "Aller", + "loading": "Chargement du document...", + "rendering": "Affichage de la page...", + "error": "Impossible d'afficher le document", + "retry": "Réessayer", + "findBar": { + "noMatches": "Aucune correspondance", + "matchCount": "{current} / {total}", + "previousMatch": "Correspondance précédente", + "nextMatch": "Correspondance suivante", + "jumpTo": "Aller à", + "match": "Correspondance", + "moreMatches": "+ {count} correspondances supplémentaires" + } + }, + "upload": { + "title": "Téléverser des documents", + "dropZone": "Déposez les fichiers PDF ici ou cliquez pour parcourir", + "browseFiles": "Parcourir les fichiers", + "uploading": "Téléversement en cours...", + "processing": "Traitement en cours...", + "success": "Téléversement réussi", + "error": "Échec du téléversement", + "maxSize": "Taille maximale du fichier : {size}Mo", + "supportedFormats": "Formats pris en charge : PDF", + "documentInfo": "Informations du document", + "documentTitle": "Titre du document", + "documentType": "Type de document", + "boatInfo": "Informations du bateau", + "boatName": "Nom du bateau", + "boatMake": "Constructeur", + "boatModel": "Modèle", + "boatYear": "Année", + "submit": "Téléverser", + "cancel": "Annuler", + "types": { + "manual": "Manuel du propriétaire", + "service": "Manuel d'entretien", + "component": "Manuel de composant", + "wiring": "Schéma électrique", + "parts": "Liste de pièces", + "other": "Autre documentation" + } + }, + "stats": { + "title": "Statistiques", + "overview": "Aperçu", + "totalDocuments": "Documents totaux", + "totalPages": "Pages totales", + "storageUsed": "Espace utilisé", + "recentActivity": "Activité récente", + "documentsByType": "Documents par type", + "pagesByBoat": "Pages par bateau", + "searchActivity": "Activité de recherche", + "topSearchTerms": "Termes les plus recherchés" + }, + "common": { + "loading": "Chargement...", + "error": "Erreur", + "success": "Succès", + "save": "Enregistrer", + "cancel": "Annuler", + "delete": "Supprimer", + "edit": "Modifier", + "close": "Fermer", + "confirm": "Confirmer", + "yes": "Oui", + "no": "Non", + "search": "Rechercher", + "filter": "Filtrer", + "sort": "Trier", + "actions": "Actions", + "viewDetails": "Voir les détails", + "download": "Télécharger", + "share": "Partager", + "print": "Imprimer" + }, + "marine": { + "systems": { + "electrical": "Système électrique", + "plumbing": "Plomberie & eau", + "navigation": "Navigation", + "propulsion": "Propulsion", + "hvac": "Climatisation", + "safety": "Équipement de sécurité", + "galley": "Cuisine", + "head": "Toilettes", + "deck": "Équipement de pont", + "rigging": "Gréement" + }, + "categories": { + "maintenance": "Entretien", + "troubleshooting": "Dépannage", + "installation": "Installation", + "operation": "Fonctionnement", + "safety": "Procédures de sécurité", + "specifications": "Spécifications" + } + }, + "toc": { + 
"tableOfContents": "Table des matières", + "loading": "Chargement de l'index...", + "noTocFound": "Aucune table des matières trouvée dans ce document", + "extract": "Extraire la table", + "entries": "entrées", + "expand": "Développer l'index", + "collapse": "Réduire l'index", + "jumpToSection": "Aller à la section" + }, + "language": { + "select": "Choisir la langue", + "en": "English", + "fr": "Français" + } +} diff --git a/client/src/main.js b/client/src/main.js index bad15fe..e217226 100644 --- a/client/src/main.js +++ b/client/src/main.js @@ -5,6 +5,7 @@ import { createApp } from 'vue' import { createPinia } from 'pinia' import router from './router' +import i18n from './i18n' import App from './App.vue' import './assets/main.css' @@ -12,6 +13,7 @@ const app = createApp(App) app.use(createPinia()) app.use(router) +app.use(i18n) app.mount('#app') diff --git a/client/src/views/DocumentView.vue b/client/src/views/DocumentView.vue index a7c190a..9a29e0a 100644 --- a/client/src/views/DocumentView.vue +++ b/client/src/views/DocumentView.vue @@ -8,7 +8,7 @@ - Back + {{ $t('document.back') }}
@@ -17,10 +17,83 @@
- Page {{ currentPage }} / {{ totalPages }} + {{ $t('document.page') }} {{ currentPage }} {{ $t('document.of') }} {{ totalPages }} - ({{ pageImages.length }} {{ pageImages.length === 1 ? 'image' : 'images' }}) + ({{ pageImages.length }} {{ $t('document.images', pageImages.length) }}) + +
+ + + +
+
+
+
+ + + + {{ searchQuery }} +
+ +
+ + {{ totalHits === 0 ? $t('document.findBar.noMatches') : $t('document.findBar.matchCount', { current: currentHitIndex + 1, total: totalHits }) }} + + +
+ + +
+
+
+ + +
+ + +
+
+ +
+ + {{ $t('document.findBar.moreMatches', { count: hitList.length - 5 }) }} +
+
@@ -34,7 +107,7 @@ - Previous + {{ $t('document.previous') }}
@@ -48,7 +121,7 @@ class="w-16 px-3 py-2 bg-white/10 text-white border border-white/20 rounded-lg text-center focus:outline-none focus:ring-2 focus:ring-pink-400 focus:border-pink-400" />
@@ -57,7 +130,7 @@ :disabled="currentPage >= totalPages || isRendering" class="px-4 py-2 bg-white/10 hover:bg-white/15 disabled:bg-white/5 disabled:text-white/30 text-white rounded-lg transition-colors flex items-center gap-2 border border-white/10" > - Next + {{ $t('document.next') }} @@ -66,9 +139,19 @@ - -
-
+ +
+ + + + +
+
@@ -123,6 +206,7 @@
+
@@ -143,6 +227,8 @@ import * as pdfjsLib from 'pdfjs-dist' import 'pdfjs-dist/web/pdf_viewer.css' import ImageOverlay from '../components/ImageOverlay.vue' import FigureZoom from '../components/FigureZoom.vue' +import LanguageSwitcher from '../components/LanguageSwitcher.vue' +import TocSidebar from '../components/TocSidebar.vue' import { useDocumentImages } from '../composables/useDocumentImages' // Configure PDF.js worker - use local worker file instead of CDN @@ -168,6 +254,12 @@ const canvasContainer = ref(null) const textLayer = ref(null) const isRendering = ref(false) +// Find bar state +const currentHitIndex = ref(0) +const totalHits = ref(0) +const hitList = ref([]) +const jumpListOpen = ref(false) + // PDF rendering scale const pdfScale = ref(1.5) @@ -218,11 +310,17 @@ async function loadDocument() { } function highlightSearchTerms() { - if (!textLayer.value || !searchQuery.value) return + if (!textLayer.value || !searchQuery.value) { + totalHits.value = 0 + hitList.value = [] + currentHitIndex.value = 0 + return + } const spans = textLayer.value.querySelectorAll('span') const query = searchQuery.value.toLowerCase().trim() - let firstMatch = null + const hits = [] + let hitIndex = 0 spans.forEach(span => { const text = span.textContent @@ -230,28 +328,86 @@ function highlightSearchTerms() { const lowerText = text.toLowerCase() if (lowerText.includes(query)) { - // Create a highlighted version + // Create a highlighted version with data attributes const regex = new RegExp(`(${query.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')})`, 'gi') - const highlightedText = text.replace(regex, '<mark class="search-highlight">$1</mark>') + const highlightedText = text.replace(regex, (match) => { + const idx = hitIndex + hitIndex++ + return `<mark class="search-highlight" data-hit-index="${idx}">${match}</mark>` + }) - // Wrap in a container to preserve PDF.js positioning span.innerHTML = highlightedText - // Track first match for scrolling - if (!firstMatch) { - firstMatch = span - } + // Collect hit information for jump list + const snippet = text.length > 100 ? text.substring(0, 100) + '...' : text + const marks = span.querySelectorAll('mark') + marks.forEach((mark) => { + hits.push({ + element: mark, + snippet: snippet, + page: currentPage.value, + index: parseInt(mark.getAttribute('data-hit-index')) + }) + }) } }) + totalHits.value = hits.length + hitList.value = hits + currentHitIndex.value = 0 + // Scroll to first match - if (firstMatch) { - setTimeout(() => { - firstMatch.scrollIntoView({ behavior: 'smooth', block: 'center' }) - }, 100) + if (hits.length > 0) { + scrollToHit(0) } } +function scrollToHit(index) { + if (index < 0 || index >= hitList.value.length) return + + const hit = hitList.value[index] + if (!hit || !hit.element) return + + // Remove active class from all marks + hitList.value.forEach(h => { + if (h.element) { + h.element.classList.remove('search-highlight-active') + } + }) + + // Add active class to current hit + hit.element.classList.add('search-highlight-active') + + // Scroll to current hit + setTimeout(() => { + hit.element.scrollIntoView({ behavior: 'smooth', block: 'center' }) + }, 100) +} + +function nextHit() { + if (totalHits.value === 0) return + + currentHitIndex.value = (currentHitIndex.value + 1) % totalHits.value + scrollToHit(currentHitIndex.value) +} + +function prevHit() { + if (totalHits.value === 0) return + + currentHitIndex.value = currentHitIndex.value === 0 + ?
totalHits.value - 1 + : currentHitIndex.value - 1 + scrollToHit(currentHitIndex.value) +} + +function jumpToHit(index) { + if (index < 0 || index >= hitList.value.length) return + + currentHitIndex.value = index + scrollToHit(index) + jumpListOpen.value = false +} + async function renderPage(pageNum) { if (!pdfDoc || componentIsUnmounting) return @@ -310,12 +466,14 @@ async function renderPage(pageNum) { try { const textContent = await page.getTextContent() - pdfjsLib.renderTextLayer({ + + // PDF.js 4.x uses TextLayer class instead of renderTextLayer function + const textLayerRender = new pdfjsLib.TextLayer({ textContentSource: textContent, container: textLayer.value, - viewport: viewport, - textDivs: [] + viewport: viewport }) + await textLayerRender.render() // Highlight search terms if query exists if (searchQuery.value) { @@ -364,6 +522,12 @@ async function nextPage() { currentPage.value += 1 pageInput.value = currentPage.value await renderPage(currentPage.value) + + // Update URL hash and dispatch event + window.location.hash = `#p=${currentPage.value}` + window.dispatchEvent(new CustomEvent('navidocs:pagechange', { + detail: { page: currentPage.value } + })) } async function previousPage() { @@ -371,6 +535,12 @@ async function previousPage() { currentPage.value -= 1 pageInput.value = currentPage.value await renderPage(currentPage.value) + + // Update URL hash and dispatch event + window.location.hash = `#p=${currentPage.value}` + window.dispatchEvent(new CustomEvent('navidocs:pagechange', { + detail: { page: currentPage.value } + })) } async function goToPage() { @@ -383,11 +553,26 @@ async function goToPage() { if (page >= 1 && page <= totalPages.value) { currentPage.value = page await renderPage(currentPage.value) + + // Update URL hash for deep linking + window.location.hash = `#p=${currentPage.value}` + + // Dispatch custom event for page change + window.dispatchEvent(new CustomEvent('navidocs:pagechange', { + detail: { page: currentPage.value } + })) } else { pageInput.value = currentPage.value } } +// Handle TOC navigation jumps +function handleTocJump(pageNumber) { + const clamped = Math.max(1, Math.min(pageNumber, totalPages.value)) + pageInput.value = clamped + goToPage() +} + watch( () => route.query.page, async (newPage) => { @@ -467,6 +652,35 @@ async function resetDocumentState() { onMounted(() => { loadDocument() + + // Handle deep links (#p=12) + const hash = window.location.hash + if (hash.startsWith('#p=')) { + const pageNum = parseInt(hash.substring(3), 10) + if (!Number.isNaN(pageNum) && pageNum >= 1) { + currentPage.value = pageNum + pageInput.value = pageNum + } + } + + // Listen for hash changes + const handleHashChange = () => { + const newHash = window.location.hash + if (newHash.startsWith('#p=')) { + const pageNum = parseInt(newHash.substring(3), 10) + if (!Number.isNaN(pageNum) && pageNum >= 1 && pageNum <= totalPages.value) { + pageInput.value = pageNum + goToPage() + } + } + } + + window.addEventListener('hashchange', handleHashChange) + + // Clean up listener + onBeforeUnmount(() => { + window.removeEventListener('hashchange', handleHashChange) + }) }) onBeforeUnmount(() => { @@ -527,15 +741,33 @@ onBeforeUnmount(() => { padding: 2px 0; border-radius: 2px; font-weight: 600; - animation: highlight-pulse 1.5s ease-in-out; + transition: background-color 0.2s ease; } -@keyframes highlight-pulse { +.search-highlight-active { + background-color: rgba(255, 92, 178, 0.8) !important; + color: #fff !important; + box-shadow: 0 0 0 2px rgba(255, 92, 178, 0.4); + 
animation: active-pulse 1.5s ease-in-out; +} + +@keyframes active-pulse { 0%, 100% { - background-color: rgba(255, 215, 0, 0.6); + background-color: rgba(255, 92, 178, 0.8); } 50% { - background-color: rgba(255, 215, 0, 0.9); + background-color: rgba(255, 92, 178, 1); } } + +.viewer-wrapper { + display: flex; + min-height: calc(100vh - 64px); /* Account for header */ +} + +.pdf-pane { + flex: 1; + min-width: 0; /* Allow flex item to shrink */ + overflow-x: auto; +} diff --git a/client/src/views/SearchView.vue b/client/src/views/SearchView.vue index 1356df1..4def9ed 100644 --- a/client/src/views/SearchView.vue +++ b/client/src/views/SearchView.vue @@ -14,6 +14,7 @@

NaviDocs

+ @@ -29,7 +30,7 @@ @input="performSearch" type="text" class="w-full h-12 px-5 pr-14 rounded-xl border-2 border-white/20 bg-white/10 backdrop-blur-lg text-white placeholder-white/50 shadow-lg focus:outline-none focus:border-pink-400 focus:ring-2 focus:ring-pink-400/20 transition-all duration-200" - placeholder="Search your manuals..." + :placeholder="$t('search.placeholder')" autofocus />
@@ -44,7 +45,7 @@
- {{ results.length }} results + {{ $t('search.resultsCount', { count: results.length }) }} {{ searchTime }}ms @@ -62,18 +63,28 @@
-
+
@@ -126,10 +190,10 @@
-

No results found

-

Try different keywords or check your spelling

+

{{ $t('search.noResults') }}

+

{{ $t('search.noResultsHint') }}

@@ -151,6 +215,7 @@ import { ref, onMounted, watch } from 'vue' import { useRoute, useRouter } from 'vue-router' import { useSearch } from '../composables/useSearch' +import LanguageSwitcher from '../components/LanguageSwitcher.vue' const route = useRoute() const router = useRouter() @@ -158,6 +223,8 @@ const router = useRouter() const { results, loading, searchTime, search } = useSearch() const searchQuery = ref(route.query.q || '') const activePreview = ref(null) +const expandedId = ref(null) +const contextCache = ref({}) let previewTimer = null async function performSearch() { @@ -213,10 +280,43 @@ function viewDocument(result) { }) } +function jumpToSection(result) { + router.push(`/document/${result.docId}?page=${result.pageNumber}#p=${result.pageNumber}`) +} + function handleImageError(event) { event.target.closest('.nv-popover')?.remove() } +function shouldShowSectionHeader(result, index) { + if (index === 0) return true // Always show for first result + const prevResult = results.value[index - 1] + return result.sectionKey !== prevResult?.sectionKey +} + +async function toggleExpand(result) { + const resultId = result.id + + if (expandedId.value === resultId) { + expandedId.value = null + return + } + + expandedId.value = resultId + + // Fetch context if not cached + if (!contextCache.value[resultId]) { + try { + const response = await fetch(`/api/context?docId=${result.docId}&page=${result.pageNumber}`) + if (response.ok) { + contextCache.value[resultId] = await response.json() + } + } catch (error) { + console.error('Failed to fetch context:', error) + } + } +} + // Watch for query changes from URL watch(() => route.query.q, (newQuery) => { searchQuery.value = newQuery || '' @@ -332,6 +432,22 @@ onMounted(() => { border-color: rgba(255, 230, 102, 0.5); } +.nv-chip-text { + font-size: 11px; + padding: 3px 8px; + border-radius: 8px; + background: rgba(207, 167, 255, 0.12); + color: #cfa7ff; + border: 1px solid rgba(207, 167, 255, 0.35); + cursor: pointer; + transition: all 0.15s ease; +} + +.nv-chip-text:hover { + background: rgba(207, 167, 255, 0.2); + border-color: rgba(207, 167, 255, 0.5); +} + .nv-link { color: #cfa7ff; font-weight: 500; @@ -364,6 +480,119 @@ onMounted(() => { height: 48px !important; } +/* Section header grouping */ +.nv-section-header { + display: flex; + align-items: center; + gap: 8px; + padding: 12px 0 8px 0; + margin-top: 16px; + font-size: 13px; + font-weight: 600; + color: #cfa7ff; + letter-spacing: 0.02em; +} + +.nv-section-header:first-child { + margin-top: 0; +} + +/* Inline expansion panel */ +.nv-expand { + margin-top: 12px; + padding-top: 12px; + border-top: 1px solid rgba(255, 255, 255, 0.1); +} + +.nv-expand-loading { + display: flex; + align-items: center; + gap: 8px; + font-size: 12px; + color: #9aa0a6; + padding: 12px 0; +} + +.spinner { + width: 14px; + height: 14px; + border: 2px solid rgba(207, 167, 255, 0.3); + border-top-color: #cfa7ff; + border-radius: 50%; + animation: spin 0.8s linear infinite; +} + +@keyframes spin { + to { transform: rotate(360deg); } +} + +.nv-context-pages { + display: flex; + gap: 12px; + margin-bottom: 12px; + overflow-x: auto; +} + +.nv-context-page { + flex-shrink: 0; + text-align: center; +} + +.nv-context-image { + width: 100px; + height: 100px; + background: rgba(255, 255, 255, 0.05); + border: 1px solid rgba(255, 255, 255, 0.1); + border-radius: 6px; + overflow: hidden; + display: flex; + align-items: center; + justify-content: center; +} + +.nv-context-page.active .nv-context-image { + border-color: 
rgba(207, 167, 255, 0.5); + box-shadow: 0 0 0 2px rgba(207, 167, 255, 0.2); +} + +.nv-context-image img { + width: 100%; + height: 100%; + object-fit: cover; +} + +.nv-context-noimage { + font-size: 10px; + color: #6b6b7a; + text-align: center; + padding: 8px; +} + +.nv-context-page figcaption { + margin-top: 4px; + font-size: 10px; + color: #9aa0a6; +} + +.nv-context-page.active figcaption { + color: #cfa7ff; + font-weight: 600; +} + +.nv-expand-text { + padding: 8px 12px; + background: rgba(255, 255, 255, 0.03); + border-radius: 6px; + max-height: 200px; + overflow-y: auto; +} + +.nv-expand-text .nv-snippet { + font-size: 14px; + line-height: 1.6; + margin: 0; +} + @media (max-width: 768px) { .nv-doc { display: none; diff --git a/server/db/migrations/002_add_document_toc.sql b/server/db/migrations/002_add_document_toc.sql new file mode 100644 index 0000000..7ba98c7 --- /dev/null +++ b/server/db/migrations/002_add_document_toc.sql @@ -0,0 +1,35 @@ +-- Migration: Add document_toc table for interactive table of contents +-- Date: 2025-10-20 +-- Description: Store extracted TOC entries from PDF documents for navigation + +CREATE TABLE IF NOT EXISTS document_toc ( + id TEXT PRIMARY KEY, + document_id TEXT NOT NULL, + + -- TOC entry details + title TEXT NOT NULL, -- "Chapter 4 - Plumbing System" + section_key TEXT, -- "4" or "4.1.2" for hierarchical entries + page_start INTEGER NOT NULL, -- Target page number + + -- Hierarchy support + level INTEGER DEFAULT 1, -- 1 for "4", 2 for "4.1", 3 for "4.1.2" + parent_id TEXT, -- Reference to parent entry for nesting + + -- Ordering + order_index INTEGER NOT NULL, -- Sequential order in TOC + + -- Source tracking + toc_page_number INTEGER, -- Which page the TOC entry was found on + + -- Metadata + created_at INTEGER NOT NULL, + + FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE, + FOREIGN KEY (parent_id) REFERENCES document_toc(id) ON DELETE CASCADE +); + +-- Indexes for performance +CREATE INDEX IF NOT EXISTS idx_toc_document ON document_toc(document_id); +CREATE INDEX IF NOT EXISTS idx_toc_order ON document_toc(document_id, order_index); +CREATE INDEX IF NOT EXISTS idx_toc_parent ON document_toc(parent_id); +CREATE INDEX IF NOT EXISTS idx_toc_section ON document_toc(document_id, section_key); diff --git a/server/index.js b/server/index.js index aa5810b..22f4791 100644 --- a/server/index.js +++ b/server/index.js @@ -90,6 +90,8 @@ import searchRoutes from './routes/search.js'; import documentsRoutes from './routes/documents.js'; import imagesRoutes from './routes/images.js'; import statsRoutes from './routes/stats.js'; +import contextRoutes from './routes/context.js'; +import tocRoutes from './routes/toc.js'; // API routes app.use('/api/upload/quick-ocr', quickOcrRoutes); @@ -98,6 +100,8 @@ app.use('/api/jobs', jobsRoutes); app.use('/api/search', searchRoutes); app.use('/api/documents', documentsRoutes); app.use('/api/stats', statsRoutes); +app.use('/api/context', contextRoutes); +app.use('/api', tocRoutes); // Handles /api/documents/:id/toc paths app.use('/api', imagesRoutes); // Error handling diff --git a/server/package.json b/server/package.json index c15b22b..4ece237 100644 --- a/server/package.json +++ b/server/package.json @@ -29,6 +29,7 @@ "helmet": "^7.0.0", "ioredis": "^5.0.0", "jsonwebtoken": "^9.0.0", + "lru-cache": "^11.2.2", "meilisearch": "^0.41.0", "multer": "^1.4.5-lts.1", "pdf-img-convert": "^2.0.0", diff --git a/server/routes/toc.js b/server/routes/toc.js new file mode 100644 index 0000000..9df66f1 --- /dev/null 
+++ b/server/routes/toc.js @@ -0,0 +1,97 @@ +/** + * TOC Route - Table of Contents API + * GET /api/documents/:documentId/toc - Get TOC for document + * POST /api/documents/:documentId/toc/extract - Trigger TOC extraction + */ + +import express from 'express'; +import { LRUCache } from 'lru-cache'; +import { getDocumentToc, buildTocTree, extractTocFromDocument } from '../services/toc-extractor.js'; + +const router = express.Router(); + +// LRU cache for TOC results +const tocCache = new LRUCache({ + max: 200, + ttl: 1000 * 60 * 30 // 30 minutes +}); + +/** + * GET /api/documents/:documentId/toc + * Get Table of Contents for a document + * + * @param {string} documentId - Document UUID + * @query {string} format - "flat" (default) or "tree" + * @returns {Object} { entries: Array, format: string } + */ +router.get('/documents/:documentId/toc', async (req, res) => { + try { + const { documentId } = req.params; + const format = req.query.format || 'flat'; + + const cacheKey = `toc:${documentId}:${format}`; + let entries = tocCache.get(cacheKey); + if (!entries) { + entries = getDocumentToc(documentId); + tocCache.set(cacheKey, entries); + } + + if (format === 'tree') { + const tree = buildTocTree(entries); + return res.json({ entries: tree, format: 'tree', count: entries.length }); + } + + res.json({ entries, format: 'flat', count: entries.length }); + + } catch (error) { + console.error('TOC fetch error:', error); + res.status(500).json({ + error: 'Failed to fetch TOC', + message: error.message + }); + } +}); + +/** + * POST /api/documents/:documentId/toc/extract + * Trigger TOC extraction for a document + * + * @param {string} documentId - Document UUID + * @returns {Object} { success: boolean, entriesCount: number, pages: number[] } + */ +router.post('/documents/:documentId/toc/extract', async (req, res) => { + try { + const { documentId } = req.params; + + const result = await extractTocFromDocument(documentId); + + if (!result.success) { + return res.status(400).json({ + error: 'TOC extraction failed', + message: result.error || result.message + }); + } + + // Invalidate cache after extraction + tocCache.delete(`toc:${documentId}:flat`); + tocCache.delete(`toc:${documentId}:tree`); + + res.json({ + success: true, + entriesCount: result.entriesCount, + tocPages: result.pages, + message: result.entriesCount > 0 + ? `Extracted ${result.entriesCount} TOC entries from ${result.pages.length} page(s)` + : 'No TOC detected in document' + }); + + } catch (error) { + console.error('TOC extraction error:', error); + res.status(500).json({ + error: 'TOC extraction failed', + message: error.message + }); + } +}); + +export default router; diff --git a/server/services/section-extractor.js b/server/services/section-extractor.js new file mode 100644 index 0000000..9877e25 --- /dev/null +++ b/server/services/section-extractor.js @@ -0,0 +1,265 @@ +/** + * Section Extractor Service + * + * Extracts section/chapter metadata from PDFs using a three-tier approach: + * 1. PDF Outline/Bookmarks (most reliable) + * 2. Header Detection via Regex (fallback) + * 3. 
Table of Contents Parsing (last resort) + */ + +import pdf from 'pdf-parse'; +import fs from 'fs'; +import { promisify } from 'util'; + +const readFile = promisify(fs.readFile); + +/** + * Slugify section title for consistent keys + */ +function slugify(text) { + return text + .toLowerCase() + .replace(/[^\w\s.-]/g, '') + .replace(/\s+/g, '-') + .replace(/^-+|-+$/g, ''); +} + +/** + * Parse section number to determine order + * Examples: "8" -> 800, "8.6" -> 806, "8-6" -> 806, "8/6" -> 806 + */ +function parseSectionOrder(sectionNum) { + if (!sectionNum) return 0; + + // Normalize separators: treat -, /, . the same + const normalized = sectionNum.replace(/[-\/]/g, '.'); + const parts = normalized.split('.').map(p => parseInt(p) || 0); + + // Major * 100 + minor * 1 + return (parts[0] || 0) * 100 + (parts[1] || 0); +} + +/** + * Extract sections from PDF outline/bookmarks + * This is the most reliable method when available + */ +async function extractFromOutline(pdfPath) { + try { + const dataBuffer = await readFile(pdfPath); + const data = await pdf(dataBuffer, { + max: 0 // Don't extract text, just metadata + }); + + if (!data.metadata || !data.metadata.info) { + return null; + } + + // pdf-parse doesn't expose outlines directly, we need pdf-lib or pdfjs-dist + // For now, return null to fall through to other methods + return null; + } catch (error) { + console.error('[SectionExtractor] Outline extraction failed:', error.message); + return null; + } +} + +/** + * Detect section headers using regex patterns + * Looks for patterns like: + * - "8. Waste Systems" + * - "8.6 Blackwater Tank" + * - "CHAPTER 8: WASTE SYSTEMS" + */ +function detectSectionHeaders(pages) { + const sections = []; + let currentSection = null; + let currentSectionOrder = 0; + + // Patterns to match section headers (marine manual focused) + const headerPatterns = [ + // "8.6 Blackwater Tank" or "8-6 Bilge System" or "8/6 Through-Hull" + /^\s*(\d+(?:[.\-\/]\d+)*)\s+([A-Z][^\n]{3,60})/m, + // "CHAPTER 8: WASTE SYSTEMS" or "SECTION 8.6: Blackwater" + /^\s*(?:CHAPTER|SECTION|PART)\s+(\d+(?:[.\-\/]\d+)*)[:\s]+([A-Z][^\n]{3,60})/mi, + // Marine-specific: "ELECTRICAL SYSTEM", "PLUMBING", "NAVIGATION EQUIPMENT" + /^\s*([A-Z][A-Z\s\-]{4,59})$/m, + // TOC style: "8.6 Blackwater" at page start + /^(\d+(?:[.\-\/]\d+)*)\s+([A-Z][a-z][^\n]{3,50})/m, + ]; + + for (const page of pages) { + const { pageNumber, text } = page; + + if (!text || text.length < 10) continue; + + // Try each pattern + let matched = false; + for (const pattern of headerPatterns) { + const match = text.match(pattern); + if (match) { + let sectionNum = match[1]; + let sectionTitle = match[2] || match[1]; + + // Skip if it's just the page number + if (sectionTitle.length < 5) continue; + + // Clean up title + sectionTitle = sectionTitle.trim(); + if (sectionTitle.endsWith(':')) { + sectionTitle = sectionTitle.slice(0, -1); + } + + // Calculate section order + const order = sectionNum && /\d/.test(sectionNum) + ? 
parseSectionOrder(sectionNum) + : currentSectionOrder + 1; + + // Create section key (hierarchical path) + const sectionKey = slugify(sectionTitle); + + currentSection = { + section: sectionTitle, + sectionKey: sectionKey, + sectionOrder: order, + startPage: pageNumber + }; + + currentSectionOrder = order; + sections.push(currentSection); + matched = true; + break; + } + } + + // If we found a section, continue to next page + if (matched) continue; + + // Otherwise, assign current section to this page + if (!currentSection) { + // No section yet, create a default one + currentSection = { + section: 'Introduction', + sectionKey: 'introduction', + sectionOrder: 0, + startPage: pageNumber + }; + sections.push(currentSection); + } + } + + return sections; +} + +/** + * Parse Table of Contents to extract section structure + * Looks for pages with dense "8.6 Title ........ 73" style entries + */ +function parseTableOfContents(pages) { + const sections = []; + + // Pattern to match TOC entries: "8.6 Blackwater Tank ........ 73" + const tocPattern = /^\s*(\d+(?:\.\d+)*)\s+([^.\d][^\n]{3,50}?)[\s.]+(\d+)\s*$/gm; + + for (const page of pages) { + const { text } = page; + if (!text) continue; + + // Look for pages with multiple TOC-style entries + const matches = [...text.matchAll(tocPattern)]; + + if (matches.length >= 3) { // Likely a TOC page if 3+ entries + console.log(`[SectionExtractor] Found TOC page with ${matches.length} entries`); + + for (const match of matches) { + const sectionNum = match[1]; + const sectionTitle = match[2].trim(); + const pageNum = parseInt(match[3]); + + if (pageNum > 0 && sectionTitle.length >= 5) { + sections.push({ + section: sectionTitle, + sectionKey: slugify(sectionTitle), + sectionOrder: parseSectionOrder(sectionNum), + startPage: pageNum + }); + } + } + + // If we found a TOC, we're done + if (sections.length > 0) { + return sections; + } + } + } + + return sections.length > 0 ? 
sections : null; +} + +/** + * Main extraction function - tries all methods in order + */ +export async function extractSections(pdfPath, pages) { + console.log('[SectionExtractor] Starting section extraction'); + + // Method 1: Try PDF outline/bookmarks + let sections = await extractFromOutline(pdfPath); + if (sections && sections.length > 0) { + console.log(`[SectionExtractor] Extracted ${sections.length} sections from PDF outline`); + return sections; + } + + // Method 2: Try Table of Contents parsing + sections = parseTableOfContents(pages); + if (sections && sections.length > 0) { + console.log(`[SectionExtractor] Extracted ${sections.length} sections from TOC`); + return sections; + } + + // Method 3: Try header detection + sections = detectSectionHeaders(pages); + if (sections && sections.length > 0) { + console.log(`[SectionExtractor] Detected ${sections.length} sections from headers`); + return sections; + } + + console.log('[SectionExtractor] No sections found, using single section'); + + // Fallback: Single section for entire document + return [{ + section: 'Complete Manual', + sectionKey: 'complete-manual', + sectionOrder: 0, + startPage: 1 + }]; +} + +/** + * Map pages to their sections + * Given extracted sections and pages, assigns each page to a section + */ +export function mapPagesToSections(sections, totalPages) { + const pageMap = new Map(); + + // Sort sections by start page + const sortedSections = [...sections].sort((a, b) => a.startPage - b.startPage); + + // For each section, determine its page range + for (let i = 0; i < sortedSections.length; i++) { + const section = sortedSections[i]; + const nextSection = sortedSections[i + 1]; + + const startPage = section.startPage; + const endPage = nextSection ? nextSection.startPage - 1 : totalPages; + + // Assign all pages in this range to this section + for (let pageNum = startPage; pageNum <= endPage; pageNum++) { + pageMap.set(pageNum, { + section: section.section, + sectionKey: section.sectionKey, + sectionOrder: section.sectionOrder + }); + } + } + + return pageMap; +} diff --git a/server/services/toc-extractor.js b/server/services/toc-extractor.js new file mode 100644 index 0000000..03a7f29 --- /dev/null +++ b/server/services/toc-extractor.js @@ -0,0 +1,591 @@ +/** + * TOC Extractor Service + * Detects and extracts Table of Contents from OCR'd document pages + */ + +import { v4 as uuidv4 } from 'uuid'; +import { getDb } from '../db/db.js'; +import fs from 'fs/promises'; +import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs'; + +/** + * TOC entry patterns to match: + * - "Chapter 4 – Plumbing System ........ 72" + * - "4.1 Water System.....................45" + * - "Section 3: Electrical . . . . . . . 
89" + * - "Introduction 12" + */ +const TOC_PATTERNS = [ + // Pattern 1: Title [dots/spaces] PageNum + /^(.{3,150?}?)\s*[.\s–-]{3,}\s*(\d{1,4})\s*$/, + + // Pattern 2: SectionKey Title [dots/spaces] PageNum + /^([\d.]+)\s+(.{3,100}?)\s*[.\s–-]{3,}\s*(\d{1,4})\s*$/, + + // Pattern 3: Title [whitespace] PageNum (simpler) + /^(.{5,120}?)\s{3,}(\d{1,4})\s*$/, +]; + +/** + * Detect if a page looks like a TOC page + * @param {string} pageText - OCR text from page + * @returns {boolean} + */ +function isTocPage(pageText) { + if (!pageText || pageText.length < 100) return false; + + const lines = pageText.split('\n').map(l => l.trim()).filter(l => l.length > 5); + if (lines.length < 5) return false; + + // Count how many lines match TOC patterns + let matchCount = 0; + let pageNumbers = []; + + for (const line of lines) { + for (const pattern of TOC_PATTERNS) { + if (pattern.test(line)) { + matchCount++; + const match = line.match(pattern); + const pageNum = parseInt(match[match.length - 1]); + if (!isNaN(pageNum)) { + pageNumbers.push(pageNum); + } + break; + } + } + } + + // Heuristics for TOC detection: + // 1. At least 5 matching lines + // 2. At least 30% of lines match TOC patterns + // 3. Page numbers are somewhat sequential or grouped + const matchRatio = matchCount / lines.length; + const hasSequentialPages = checkSequentiality(pageNumbers); + + return matchCount >= 5 && matchRatio >= 0.3 && hasSequentialPages; +} + +/** + * Check if page numbers show some sequentiality + * @param {number[]} pageNumbers + * @returns {boolean} + */ +function checkSequentiality(pageNumbers) { + if (pageNumbers.length < 3) return false; + + // Sort and check for general increasing trend + const sorted = [...pageNumbers].sort((a, b) => a - b); + let increases = 0; + + for (let i = 1; i < sorted.length; i++) { + if (sorted[i] >= sorted[i - 1]) increases++; + } + + // At least 70% should be increasing + return (increases / (sorted.length - 1)) >= 0.7; +} + +/** + * Parse section key and determine hierarchy level + * @param {string} sectionKey - e.g., "4", "4.1", "4.1.2" + * @returns {{ key: string, level: number }} + */ +function parseSectionKey(sectionKey) { + if (!sectionKey) return { key: null, level: 1 }; + + const trimmed = sectionKey.trim(); + const parts = trimmed.split('.'); + + return { + key: trimmed, + level: parts.length + }; +} + +/** + * Extract TOC entries from a page + * @param {string} pageText + * @param {number} pageNumber + * @returns {Array} + */ +function extractTocEntries(pageText, pageNumber) { + const lines = pageText.split('\n').map(l => l.trim()).filter(l => l.length > 5); + const entries = []; + let orderIndex = 0; + + for (const line of lines) { + let match = null; + let patternType = 0; + + // Try each pattern + for (let i = 0; i < TOC_PATTERNS.length; i++) { + match = line.match(TOC_PATTERNS[i]); + if (match) { + patternType = i; + break; + } + } + + if (!match) continue; + + let title, sectionKey, targetPage; + + // Parse based on pattern type + if (patternType === 1) { + // Pattern with section key: "4.1 Title .... 45" + sectionKey = match[1]; + title = match[2].trim(); + targetPage = parseInt(match[3]); + } else { + // Patterns without section key: "Title .... 
45" + const groups = match.slice(1).filter(g => g !== undefined); + title = groups[0].trim(); + targetPage = parseInt(groups[groups.length - 1]); + sectionKey = null; + } + + // Clean up title (remove trailing dots/dashes) + title = title.replace(/[.\-–\s]+$/, '').trim(); + + // Skip if title is too short or page number invalid + if (title.length < 3 || isNaN(targetPage) || targetPage < 1) continue; + + const { key, level } = parseSectionKey(sectionKey); + + entries.push({ + title, + sectionKey: key, + pageStart: targetPage, + level, + tocPageNumber: pageNumber, + orderIndex: orderIndex++ + }); + } + + return entries; +} + +/** + * Build parent-child relationships for hierarchical TOC + * @param {Array} entries + * @returns {Array} Entries with parentId set + */ +function buildHierarchy(entries) { + const enhanced = entries.map(e => ({ ...e, id: uuidv4(), parentId: null })); + + for (let i = 0; i < enhanced.length; i++) { + const entry = enhanced[i]; + + if (!entry.sectionKey || entry.level === 1) continue; + + // Find parent: look backwards for entry with section key that is prefix + // e.g., "4.1.2" parent is "4.1" + const parentKeyParts = entry.sectionKey.split('.'); + parentKeyParts.pop(); // Remove last part + const parentKey = parentKeyParts.join('.'); + + for (let j = i - 1; j >= 0; j--) { + if (enhanced[j].sectionKey === parentKey) { + entry.parentId = enhanced[j].id; + break; + } + } + } + + return enhanced; +} + +/** + * Extract PDF outline/bookmarks as fallback TOC + * Uses pdfjs-dist to read the PDF's built-in outline/bookmarks + * + * @param {string} filePath - Absolute path to PDF file + * @param {string} documentId - Document ID for reference + * @returns {Promise|null>} Array of TOC entries or null if no outline exists + */ +async function extractPdfOutline(filePath, documentId) { + try { + console.log(`[TOC] Attempting to extract PDF outline from: ${filePath}`); + + // Read PDF file + const dataBuffer = await fs.readFile(filePath); + + // Load PDF document + const loadingTask = pdfjsLib.getDocument({ + data: new Uint8Array(dataBuffer), + useSystemFonts: true, + standardFontDataUrl: null // Disable font loading for performance + }); + + const pdfDocument = await loadingTask.promise; + const outline = await pdfDocument.getOutline(); + + if (!outline || outline.length === 0) { + console.log(`[TOC] No PDF outline found in document ${documentId}`); + await pdfDocument.destroy(); + return null; + } + + console.log(`[TOC] Found PDF outline with ${outline.length} top-level items`); + + // Convert outline to TOC entries + const entries = []; + let orderIndex = 0; + + /** + * Recursively process outline items and convert to TOC entries + */ + async function processOutlineItem(item, level = 1, parentId = null) { + if (!item || !item.title) return; + + // Resolve destination to page number + let pageStart = 1; + if (item.dest) { + try { + // Get the destination (can be a string reference or direct array) + const dest = typeof item.dest === 'string' + ? 
await pdfDocument.getDestination(item.dest) + : item.dest; + + // Extract page reference from destination array + // Format is typically: [pageRef, fitType, ...params] + if (dest && Array.isArray(dest) && dest[0]) { + const pageIndex = await pdfDocument.getPageIndex(dest[0]); + pageStart = pageIndex + 1; // Convert 0-based to 1-based + } + } catch (e) { + console.log(`[TOC] Could not resolve page for outline item "${item.title}": ${e.message}`); + // Keep default pageStart = 1 + } + } + + const entry = { + id: uuidv4(), + title: item.title.trim(), + sectionKey: null, // PDF outlines don't have section keys + pageStart: pageStart, + level: level, + parentId: parentId, + orderIndex: orderIndex++, + tocPageNumber: null // Not from a TOC page, from PDF outline + }; + + entries.push(entry); + + // Process children recursively + if (item.items && Array.isArray(item.items) && item.items.length > 0) { + for (const child of item.items) { + await processOutlineItem(child, level + 1, entry.id); + } + } + } + + // Process all top-level outline items + for (const item of outline) { + await processOutlineItem(item); + } + + // Clean up + await pdfDocument.destroy(); + + if (entries.length === 0) { + console.log(`[TOC] PDF outline exists but contains no valid entries for document ${documentId}`); + return null; + } + + console.log(`[TOC] Successfully extracted ${entries.length} entries from PDF outline for document ${documentId}`); + return entries; + + } catch (error) { + console.error(`[TOC] Error extracting PDF outline for document ${documentId}:`, error); + return null; + } +} + +/** + * Extract TOC from entire document + * @param {string} documentId + * @returns {Promise<{ success: boolean, entriesCount: number, pages: number[] }>} + */ +export async function extractTocFromDocument(documentId) { + const db = getDb(); + + try { + // Validate document exists + const document = db.prepare(` + SELECT id FROM documents WHERE id = ? + `).get(documentId); + + if (!document) { + console.error(`[TOC] Document not found: ${documentId}`); + return { + success: false, + error: 'Document not found', + entriesCount: 0, + pages: [] + }; + } + + // Get total page count for the document + const pageCountResult = db.prepare(` + SELECT COUNT(*) as count + FROM document_pages + WHERE document_id = ? + `).get(documentId); + + if (pageCountResult.count === 0) { + console.error(`[TOC] No pages available for TOC extraction in document: ${documentId}`); + return { + success: false, + error: 'No pages available for TOC extraction', + entriesCount: 0, + pages: [] + }; + } + + // Get all pages with OCR text + const pages = db.prepare(` + SELECT page_number, ocr_text + FROM document_pages + WHERE document_id = ? 
AND ocr_text IS NOT NULL + ORDER BY page_number ASC + `).all(documentId); + + if (pages.length === 0) { + console.error(`[TOC] No OCR text found for document: ${documentId}`); + return { + success: false, + error: 'No OCR text found', + entriesCount: 0, + pages: [] + }; + } + + // Find TOC pages + const tocPages = []; + for (const page of pages) { + if (isTocPage(page.ocr_text)) { + tocPages.push(page); + } + } + + // If no TOC pages found, try PDF outline as fallback + if (tocPages.length === 0) { + console.log(`[TOC] No TOC pages detected in document ${documentId}, attempting PDF outline fallback`); + + // Get document file path + const doc = db.prepare('SELECT file_path FROM documents WHERE id = ?').get(documentId); + + if (!doc || !doc.file_path) { + console.log(`[TOC] Cannot attempt PDF outline fallback: file path not found for document ${documentId}`); + return { + success: false, + error: 'TOC detection failed: No patterns matched', + entriesCount: 0, + pages: [] + }; + } + + // Try extracting PDF outline + const outlineEntries = await extractPdfOutline(doc.file_path, documentId); + + if (!outlineEntries || outlineEntries.length === 0) { + console.log(`[TOC] PDF outline fallback failed for document ${documentId}`); + return { + success: false, + error: 'TOC detection failed: No patterns matched and no PDF outline found', + entriesCount: 0, + pages: [] + }; + } + + // Save outline entries to database + console.log(`[TOC] Using PDF outline as TOC for document ${documentId} (${outlineEntries.length} entries)`); + + // Delete existing TOC entries for this document + db.prepare('DELETE FROM document_toc WHERE document_id = ?').run(documentId); + + // Insert outline entries + const insertStmt = db.prepare(` + INSERT INTO document_toc ( + id, document_id, title, section_key, page_start, + level, parent_id, order_index, toc_page_number, created_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ `); + + const timestamp = Date.now(); + for (const entry of outlineEntries) { + insertStmt.run( + entry.id, + documentId, + entry.title, + entry.sectionKey, + entry.pageStart, + entry.level, + entry.parentId, + entry.orderIndex, + entry.tocPageNumber, + timestamp + ); + } + + return { + success: true, + entriesCount: outlineEntries.length, + pages: [], + source: 'pdf-outline' + }; + } + + console.log(`[TOC] Found ${tocPages.length} TOC pages in document ${documentId}`); + + // Extract entries from all TOC pages + let allEntries = []; + for (const page of tocPages) { + const entries = extractTocEntries(page.ocr_text, page.page_number); + allEntries = allEntries.concat(entries); + } + + if (allEntries.length === 0) { + console.error(`[TOC] TOC parsing failed: No valid entries extracted from detected TOC pages in document ${documentId}`); + return { + success: false, + error: 'TOC parsing failed: No valid entries extracted from detected TOC pages', + entriesCount: 0, + pages: tocPages.map(p => p.page_number) + }; + } + + // Build hierarchy + let hierarchicalEntries; + try { + hierarchicalEntries = buildHierarchy(allEntries); + } catch (hierarchyError) { + console.error(`[TOC] TOC parsing failed: Hierarchy building error in document ${documentId}:`, hierarchyError); + return { + success: false, + error: `TOC parsing failed: Hierarchy building error - ${hierarchyError.message}`, + entriesCount: 0, + pages: tocPages.map(p => p.page_number) + }; + } + + // Delete existing TOC entries for this document + try { + db.prepare('DELETE FROM document_toc WHERE document_id = ?').run(documentId); + } catch (deleteError) { + console.error(`[TOC] TOC parsing failed: Database cleanup error in document ${documentId}:`, deleteError); + return { + success: false, + error: `TOC parsing failed: Database cleanup error - ${deleteError.message}`, + entriesCount: 0, + pages: tocPages.map(p => p.page_number) + }; + } + + // Insert new TOC entries + const insertStmt = db.prepare(` + INSERT INTO document_toc ( + id, document_id, title, section_key, page_start, + level, parent_id, order_index, toc_page_number, created_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ `); + + const timestamp = Date.now(); + try { + for (const entry of hierarchicalEntries) { + insertStmt.run( + entry.id, + documentId, + entry.title, + entry.sectionKey, + entry.pageStart, + entry.level, + entry.parentId, + entry.orderIndex, + entry.tocPageNumber, + timestamp + ); + } + } catch (insertError) { + console.error(`[TOC] TOC parsing failed: Database insertion error in document ${documentId}:`, insertError); + return { + success: false, + error: `TOC parsing failed: Database insertion error - ${insertError.message}`, + entriesCount: 0, + pages: tocPages.map(p => p.page_number) + }; + } + + console.log(`[TOC] Extracted ${hierarchicalEntries.length} TOC entries for document ${documentId}`); + + return { + success: true, + entriesCount: hierarchicalEntries.length, + pages: tocPages.map(p => p.page_number), + source: 'ocr-extraction' + }; + + } catch (error) { + console.error(`[TOC] Unexpected extraction error for document ${documentId}:`, error); + return { + success: false, + error: `Unexpected error during TOC extraction: ${error.message}`, + entriesCount: 0, + pages: [] + }; + } +} + +/** + * Get TOC for a document + * @param {string} documentId + * @returns {Array} TOC entries with hierarchy + */ +export function getDocumentToc(documentId) { + const db = getDb(); + + const entries = db.prepare(` + SELECT + id, document_id, title, section_key, page_start, + level, parent_id, order_index, toc_page_number + FROM document_toc + WHERE document_id = ? + ORDER BY order_index ASC + `).all(documentId); + + return entries; +} + +/** + * Build tree structure from flat TOC entries + * @param {Array} entries + * @returns {Array} Tree with children arrays + */ +export function buildTocTree(entries) { + const idMap = {}; + const roots = []; + + // First pass: create map + for (const entry of entries) { + idMap[entry.id] = { ...entry, children: [] }; + } + + // Second pass: build tree + for (const entry of entries) { + const node = idMap[entry.id]; + if (entry.parent_id && idMap[entry.parent_id]) { + idMap[entry.parent_id].children.push(node); + } else { + roots.push(node); + } + } + + return roots; +} + +export default { + extractTocFromDocument, + getDocumentToc, + buildTocTree +}; diff --git a/server/workers/ocr-worker.js b/server/workers/ocr-worker.js index a11b631..5667117 100644 --- a/server/workers/ocr-worker.js +++ b/server/workers/ocr-worker.js @@ -21,6 +21,7 @@ import { getDb } from '../config/db.js'; import { extractTextFromPDF, cleanOCRText, extractTextFromImage } from '../services/ocr.js'; import { indexDocumentPage } from '../services/search.js'; import { extractImagesFromPage } from './image-extractor.js'; +import { extractSections, mapPagesToSections } from '../services/section-extractor.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); @@ -293,6 +294,39 @@ async function processOCRJob(job) { } } + // Extract section metadata + console.log('[OCR Worker] Extracting section metadata'); + try { + const sections = await extractSections(filePath, ocrResults); + const pageMap = mapPagesToSections(sections, totalPages); + + console.log(`[OCR Worker] Mapping ${pageMap.size} pages to sections`); + + // Update each page with section metadata + const updateSectionStmt = db.prepare(` + UPDATE document_pages + SET section = ?, + section_key = ?, + section_order = ? + WHERE document_id = ? AND page_number = ? 
+ `); + + for (const [pageNum, sectionData] of pageMap.entries()) { + updateSectionStmt.run( + sectionData.section, + sectionData.sectionKey, + sectionData.sectionOrder, + documentId, + pageNum + ); + } + + console.log('[OCR Worker] Section metadata stored successfully'); + } catch (sectionError) { + console.error('[OCR Worker] Section extraction failed:', sectionError.message); + // Continue even if section extraction fails + } + // Update document status to indexed and mark images as extracted db.prepare(` UPDATE documents @@ -313,6 +347,21 @@ async function processOCRJob(job) { console.log(`[OCR Worker] Job ${jobId} completed successfully`); + // Extract Table of Contents as post-processing step + try { + const { extractTocFromDocument } = await import('../services/toc-extractor.js'); + const tocResult = await extractTocFromDocument(documentId); + + if (tocResult.success && tocResult.entriesCount > 0) { + console.log(`[OCR Worker] TOC extracted: ${tocResult.entriesCount} entries from ${tocResult.pages.length} page(s)`); + } else { + console.log(`[OCR Worker] No TOC detected or extraction skipped`); + } + } catch (tocError) { + // Don't fail the whole job if TOC extraction fails + console.error(`[OCR Worker] TOC extraction error:`, tocError.message); + } + return { success: true, documentId: documentId, diff --git a/tests/TOC_E2E_TEST.md b/tests/TOC_E2E_TEST.md new file mode 100644 index 0000000..5c1bc5b --- /dev/null +++ b/tests/TOC_E2E_TEST.md @@ -0,0 +1,263 @@ +# TOC Navigation - End-to-End Testing Guide + +This document provides manual testing scenarios for the Table of Contents (TOC) navigation feature in NaviDocs. + +## Prerequisites + +- Application running locally or on test environment +- Test documents with TOC available (ensure at least one document has a multi-level TOC) +- Browser developer tools accessible (for inspecting localStorage and URL changes) + +--- + +## Test Scenario 1: TOC Sidebar Display + +### Objective +Verify that the TOC sidebar displays correctly when opening a document with table of contents. + +### Steps +1. Navigate to the NaviDocs application +2. Select and open a document that contains a table of contents +3. Wait for the document to fully load + +### Expected Results +- TOC sidebar appears on the left side of the screen +- TOC entries are displayed in a hierarchical/nested structure matching the document outline +- The current/active page entry is highlighted (typically with a different background color or text style) +- Entries show proper indentation for nested levels (H1, H2, H3, etc.) +- TOC sidebar does not overlap or obscure the PDF content area + +### Verification Points +- [ ] Sidebar is visible on the left +- [ ] Hierarchical structure is preserved (parent/child relationships) +- [ ] Active page indicator is present and correct +- [ ] Visual styling is consistent and readable +- [ ] No layout issues or overlapping elements + +--- + +## Test Scenario 2: Navigation + +### Objective +Verify that clicking TOC entries correctly navigates the PDF viewer and updates related UI elements. + +### Steps +1. Open a document with TOC +2. Note the current page number displayed +3. Click on a TOC entry that links to a different page +4. 
Observe the PDF viewer, URL bar, and TOC sidebar + +### Expected Results +- PDF viewer immediately jumps to the correct page associated with the clicked TOC entry +- URL hash updates to reflect the new page (format: `#p=N` where N is the page number) +- The previously highlighted TOC entry is de-highlighted +- The newly selected TOC entry becomes highlighted/active +- Page number indicator in the viewer updates to match + +### Verification Points +- [ ] PDF scrolls/jumps to the correct page +- [ ] URL contains correct hash parameter (e.g., `#p=5`) +- [ ] Only one TOC entry is highlighted at a time +- [ ] Highlighted entry corresponds to the current page +- [ ] Navigation is smooth without errors or flashing + +### Additional Tests +- Click multiple different TOC entries in sequence +- Click the currently active TOC entry (should remain on same page) +- Test with both top-level and nested TOC entries + +--- + +## Test Scenario 3: Deep Links + +### Objective +Verify that direct URLs with page hash parameters correctly load the document at the specified page and highlight the appropriate TOC entry. + +### Steps +1. Identify a document URL (e.g., `http://localhost:3000/document/sample.pdf`) +2. Append a page hash to the URL (e.g., `http://localhost:3000/document/sample.pdf#p=12`) +3. Open this URL in a new browser tab or window +4. Wait for the document to load + +### Expected Results +- PDF viewer loads and displays page 12 (or the specified page number) +- TOC sidebar loads with the correct entry highlighted +- The highlighted TOC entry corresponds to page 12 or the section containing page 12 +- URL hash remains intact after page load + +### Verification Points +- [ ] PDF opens directly to the specified page (page 12) +- [ ] TOC entry for page 12 is highlighted +- [ ] URL hash parameter is preserved (`#p=12`) +- [ ] No initial flash of wrong page before jumping +- [ ] If TOC entry is nested, parent entries are expanded to show the active item + +### Edge Cases to Test +- Invalid page number (e.g., `#p=999` for a 50-page document) +- Page number 1 (`#p=1`) +- Last page of document +- Negative or zero page numbers + +--- + +## Test Scenario 4: Collapse/Expand + +### Objective +Verify that the TOC sidebar can be collapsed/expanded and that the user's preference persists. + +### Steps +1. Open a document with TOC +2. Locate the sidebar toggle button (typically an icon or button near the sidebar) +3. Click the toggle button to collapse the sidebar +4. Observe the UI change +5. Open browser developer tools (F12) and navigate to Application > Local Storage +6. Refresh the page +7. Observe the sidebar state after refresh +8. Click the toggle button again to expand the sidebar +9. 
Refresh the page again + +### Expected Results + +#### When Collapsing +- Sidebar smoothly animates closed (slides left or fades out) +- Toggle button icon changes to indicate "expand" action is available +- PDF content area expands to use the freed space +- localStorage contains a key indicating sidebar is collapsed (e.g., `tocSidebarCollapsed: true`) + +#### When Expanding +- Sidebar smoothly animates open (slides right or fades in) +- Toggle button icon changes to indicate "collapse" action is available +- PDF content area contracts to accommodate sidebar +- localStorage updates to indicate sidebar is expanded (e.g., `tocSidebarCollapsed: false`) + +#### Persistence After Refresh +- Sidebar state matches the last user action (collapsed stays collapsed, expanded stays expanded) +- No flashing or layout shift during page load + +### Verification Points +- [ ] Toggle button is visible and clickable +- [ ] Collapse animation is smooth +- [ ] Expand animation is smooth +- [ ] localStorage key is set correctly +- [ ] Preference persists after page refresh +- [ ] PDF content area adjusts appropriately +- [ ] No JavaScript errors in console + +### localStorage Check +In browser developer tools: +1. Go to Application tab > Local Storage > your domain +2. Look for a key like `tocSidebarCollapsed`, `sidebarState`, or similar +3. Verify the value changes when toggling (typically `true`/`false` or `"collapsed"`/`"expanded"`) + +--- + +## Test Scenario 5: Search Integration + +### Objective +Verify that search results integrate with TOC navigation and correctly navigate to the relevant page. + +### Steps +1. Open a document with TOC +2. Locate the search functionality (search bar or search button) +3. Enter a search term that exists in the document (e.g., "introduction", "methodology") +4. Wait for search results to appear +5. Identify a search result that includes a "Jump to section" or similar navigation action +6. Click on the "Jump to section" link/button +7. 
Observe the PDF viewer, TOC sidebar, and URL + +### Expected Results +- Search results display with relevant snippets/context +- Each result shows which page or section it appears in +- "Jump to section" or equivalent action is available for each result +- Clicking "Jump to section" navigates the PDF to the correct page +- TOC entry for that page becomes highlighted +- URL hash updates to reflect the new page (e.g., `#p=7`) +- Search term may be highlighted in the PDF viewer (depending on implementation) + +### Verification Points +- [ ] Search functionality is accessible and working +- [ ] Results display with page/section information +- [ ] "Jump to section" action is clearly labeled +- [ ] Navigation occurs when clicking the action +- [ ] Correct page is displayed in PDF viewer +- [ ] TOC highlights the correct entry +- [ ] URL hash updates correctly +- [ ] Can navigate back to search results and select different result + +### Additional Tests +- Test with multiple search results across different sections +- Test with search term appearing multiple times on same page +- Test with search term in a deeply nested TOC section +- Verify TOC expands parent sections if necessary to show highlighted entry + +--- + +## Cross-Browser Testing + +Perform all scenarios in the following browsers: +- [ ] Chrome/Chromium (latest) +- [ ] Firefox (latest) +- [ ] Safari (latest, macOS only) +- [ ] Edge (latest) + +--- + +## Mobile/Responsive Testing + +For each scenario, test on: +- [ ] Mobile viewport (iOS Safari) +- [ ] Mobile viewport (Android Chrome) +- [ ] Tablet viewport (iPad Safari) + +Additional mobile-specific checks: +- TOC sidebar may be hidden by default on mobile +- Toggle behavior may use a hamburger menu or overlay +- Touch interactions work smoothly +- No horizontal scrolling issues + +--- + +## Regression Checklist + +After any TOC-related code changes, verify: +- [ ] All 5 scenarios pass +- [ ] No console errors appear +- [ ] Performance is acceptable (no lag when clicking TOC entries) +- [ ] Accessibility: keyboard navigation works (Tab, Enter, Arrow keys) +- [ ] Accessibility: screen reader announces TOC entries and page changes +- [ ] Network tab shows no unnecessary re-fetching of PDF + +--- + +## Known Issues / Notes + +Document any known issues, limitations, or special notes here: + +- _Example: Deep linking to pages beyond document length defaults to last page_ +- _Example: TOC sidebar may take 1-2 seconds to populate for very large documents_ + +--- + +## Test Environment Information + +When reporting issues, include: +- Browser name and version +- Operating system +- Application version/build number +- Document being tested +- Screenshot or video of issue + +--- + +## Approval Sign-off + +| Role | Name | Date | Status | +|------|------|------|--------| +| Tester | | | | +| Developer | | | | +| Product Owner | | | | + +--- + +**Last Updated:** 2025-10-20 +**Document Version:** 1.0 diff --git a/tests/toc-smoke-test.sh b/tests/toc-smoke-test.sh new file mode 100755 index 0000000..624c11c --- /dev/null +++ b/tests/toc-smoke-test.sh @@ -0,0 +1,492 @@ +#!/bin/bash + +################################################################################ +# TOC Smoke Test Suite +# Tests Table of Contents API endpoints for NaviDocs +# +# Dependencies: +# - curl (for HTTP requests) +# - jq (for JSON parsing and validation) +# - bc (for floating point arithmetic - cache timing) +# +# Usage: +# ./toc-smoke-test.sh [BASE_URL] [DOCUMENT_ID] +# +# Examples: +# ./toc-smoke-test.sh +# 
./toc-smoke-test.sh http://localhost:3001 +# ./toc-smoke-test.sh http://localhost:3001 abc-123-def-456 +################################################################################ + +set -e # Exit on error + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Configuration +BASE_URL="${1:-http://localhost:3001}" +DOCUMENT_ID="${2:-}" +TEMP_DIR="/tmp/toc-smoke-test-$$" + +# Test counters +TOTAL_TESTS=0 +PASSED_TESTS=0 +FAILED_TESTS=0 + +################################################################################ +# Utility Functions +################################################################################ + +# Print colored status messages +print_status() { + local status=$1 + local message=$2 + + case $status in + "PASS") + echo -e "${GREEN}[✓ PASS]${NC} $message" + ((PASSED_TESTS++)) + ;; + "FAIL") + echo -e "${RED}[✗ FAIL]${NC} $message" + ((FAILED_TESTS++)) + ;; + "INFO") + echo -e "${BLUE}[ℹ INFO]${NC} $message" + ;; + "WARN") + echo -e "${YELLOW}[⚠ WARN]${NC} $message" + ;; + "SECTION") + echo -e "\n${BLUE}========================================${NC}" + echo -e "${BLUE}$message${NC}" + echo -e "${BLUE}========================================${NC}" + ;; + esac +} + +# Run a test and increment counter +run_test() { + local test_name=$1 + ((TOTAL_TESTS++)) + print_status "INFO" "Test $TOTAL_TESTS: $test_name" +} + +# Check command dependencies +check_dependencies() { + print_status "SECTION" "Checking Dependencies" + + local missing_deps=0 + + for cmd in curl jq bc; do + if ! command -v $cmd &> /dev/null; then + print_status "FAIL" "$cmd is not installed" + ((missing_deps++)) + else + print_status "PASS" "$cmd is available" + fi + done + + if [ $missing_deps -gt 0 ]; then + echo "" + echo "Please install missing dependencies:" + echo " Ubuntu/Debian: sudo apt-get install curl jq bc" + echo " macOS: brew install curl jq bc" + exit 1 + fi +} + +# Get a valid document ID from the database +get_test_document_id() { + print_status "SECTION" "Finding Test Document" + + if [ -n "$DOCUMENT_ID" ]; then + print_status "INFO" "Using provided document ID: $DOCUMENT_ID" + return + fi + + # Try to get a document from the API + local response=$(curl -s "${BASE_URL}/api/documents?limit=1") + + if [ $? 
-ne 0 ]; then + print_status "FAIL" "Could not connect to API at ${BASE_URL}" + exit 1 + fi + + # Extract first document ID using jq + DOCUMENT_ID=$(echo "$response" | jq -r '.documents[0].id // empty') + + if [ -z "$DOCUMENT_ID" ]; then + print_status "WARN" "No documents found in database" + print_status "INFO" "Using placeholder ID for endpoint validation" + DOCUMENT_ID="test-document-id" + else + print_status "PASS" "Found document ID: $DOCUMENT_ID" + fi +} + +# Create temp directory for test artifacts +setup_test_environment() { + mkdir -p "$TEMP_DIR" + print_status "INFO" "Created temp directory: $TEMP_DIR" +} + +# Cleanup temp directory +cleanup_test_environment() { + if [ -d "$TEMP_DIR" ]; then + rm -rf "$TEMP_DIR" + print_status "INFO" "Cleaned up temp directory" + fi +} + +################################################################################ +# Test Cases +################################################################################ + +# Test 1: GET /api/documents/:id/toc?format=flat - returns 200 +test_toc_flat_format() { + run_test "GET /api/documents/:id/toc?format=flat returns 200" + + local response_file="$TEMP_DIR/toc_flat.json" + local http_code=$(curl -s -w "%{http_code}" -o "$response_file" \ + "${BASE_URL}/api/documents/${DOCUMENT_ID}/toc?format=flat") + + if [ "$http_code" = "200" ]; then + print_status "PASS" "Received HTTP 200 response" + + # Validate JSON structure + if jq -e '.entries' "$response_file" > /dev/null 2>&1; then + print_status "PASS" "Response contains 'entries' field" + else + print_status "FAIL" "Response missing 'entries' field" + fi + + if jq -e '.format == "flat"' "$response_file" > /dev/null 2>&1; then + print_status "PASS" "Format is 'flat'" + else + print_status "FAIL" "Format is not 'flat'" + fi + + else + print_status "FAIL" "Expected HTTP 200, got $http_code" + cat "$response_file" + fi +} + +# Test 2: GET /api/documents/:id/toc?format=tree - returns 200 +test_toc_tree_format() { + run_test "GET /api/documents/:id/toc?format=tree returns 200" + + local response_file="$TEMP_DIR/toc_tree.json" + local http_code=$(curl -s -w "%{http_code}" -o "$response_file" \ + "${BASE_URL}/api/documents/${DOCUMENT_ID}/toc?format=tree") + + if [ "$http_code" = "200" ]; then + print_status "PASS" "Received HTTP 200 response" + + # Validate JSON structure + if jq -e '.entries' "$response_file" > /dev/null 2>&1; then + print_status "PASS" "Response contains 'entries' field" + else + print_status "FAIL" "Response missing 'entries' field" + fi + + if jq -e '.format == "tree"' "$response_file" > /dev/null 2>&1; then + print_status "PASS" "Format is 'tree'" + else + print_status "FAIL" "Format is not 'tree'" + fi + + else + print_status "FAIL" "Expected HTTP 200, got $http_code" + cat "$response_file" + fi +} + +# Test 3: POST /api/documents/:id/toc/extract - returns 200 +test_toc_extract() { + run_test "POST /api/documents/:id/toc/extract returns 200" + + local response_file="$TEMP_DIR/toc_extract.json" + local http_code=$(curl -s -w "%{http_code}" -o "$response_file" \ + -X POST "${BASE_URL}/api/documents/${DOCUMENT_ID}/toc/extract") + + # Accept both 200 (success) and 400 (document doesn't exist) as valid + # since we might be using a placeholder ID + if [ "$http_code" = "200" ] || [ "$http_code" = "400" ]; then + print_status "PASS" "Received HTTP $http_code response" + + # If successful, validate response structure + if [ "$http_code" = "200" ]; then + if jq -e '.success' "$response_file" > /dev/null 2>&1; then + print_status "PASS" "Response 
contains 'success' field" + else + print_status "FAIL" "Response missing 'success' field" + fi + + if jq -e '.entriesCount' "$response_file" > /dev/null 2>&1; then + local count=$(jq -r '.entriesCount' "$response_file") + print_status "PASS" "Response contains 'entriesCount': $count" + else + print_status "FAIL" "Response missing 'entriesCount' field" + fi + fi + else + print_status "FAIL" "Expected HTTP 200 or 400, got $http_code" + cat "$response_file" + fi +} + +# Test 4: Verify TOC entries have required fields +test_toc_entry_fields() { + run_test "Verify TOC entries have required fields (id, document_id, title, page_start)" + + local response_file="$TEMP_DIR/toc_flat.json" + + # Check if we have entries + local entry_count=$(jq -r '.entries | length' "$response_file" 2>/dev/null || echo "0") + + if [ "$entry_count" = "0" ]; then + print_status "WARN" "No TOC entries found - skipping field validation" + # Still count as passed since it's valid to have no TOC + print_status "PASS" "Empty TOC is valid" + return + fi + + print_status "INFO" "Found $entry_count TOC entries" + + # Check first entry for required fields + local first_entry=$(jq -r '.entries[0]' "$response_file") + + local required_fields=("id" "document_id" "title" "page_start") + local missing_fields=0 + + for field in "${required_fields[@]}"; do + if echo "$first_entry" | jq -e ".$field" > /dev/null 2>&1; then + local value=$(echo "$first_entry" | jq -r ".$field") + print_status "PASS" "Field '$field' exists with value: $value" + else + print_status "FAIL" "Field '$field' is missing" + ((missing_fields++)) + fi + done + + if [ $missing_fields -eq 0 ]; then + print_status "PASS" "All required fields present" + else + print_status "FAIL" "$missing_fields required fields missing" + fi +} + +# Test 5: Verify tree format has nested children +test_tree_nesting() { + run_test "Verify tree format has nested children structure" + + local response_file="$TEMP_DIR/toc_tree.json" + + # Check if we have entries + local entry_count=$(jq -r '.entries | length' "$response_file" 2>/dev/null || echo "0") + + if [ "$entry_count" = "0" ]; then + print_status "WARN" "No TOC entries found - skipping nesting validation" + print_status "PASS" "Empty TOC is valid" + return + fi + + # Check if at least one entry has a 'children' field (even if empty) + if jq -e '.entries[0] | has("children")' "$response_file" > /dev/null 2>&1; then + print_status "PASS" "Tree entries have 'children' field" + + # Check if any entry has nested children + local has_nested=$(jq -r '[.entries[] | select((.children // []) | length > 0)] | length' "$response_file") + + if [ "$has_nested" -gt "0" ]; then + print_status "PASS" "Found $has_nested entries with nested children" + else + print_status "INFO" "No nested children found (flat TOC structure)" + print_status "PASS" "Tree structure is valid (can be flat)" + fi + else + print_status "FAIL" "Tree entries missing 'children' field" + fi +} + +# Test 6: Verify cache is working (second request is faster) +test_cache_performance() { + run_test "Verify cache is working (second request should be faster)" + + print_status "INFO" "Making first request (cache miss)..." 
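+ # Timing note: the %N (nanosecond) field used by `date +%s%N` below is a GNU
+ # coreutils extension; BSD/macOS date prints a literal "N", which would break
+ # the millisecond arithmetic. Assumes the test host provides GNU date.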
+ local start1=$(date +%s%N) + curl -s -o /dev/null "${BASE_URL}/api/documents/${DOCUMENT_ID}/toc?format=flat" + local end1=$(date +%s%N) + local duration1=$(( (end1 - start1) / 1000000 )) # Convert to milliseconds + + print_status "INFO" "First request took ${duration1}ms" + + # Small delay to ensure cache is set + sleep 0.1 + + print_status "INFO" "Making second request (cache hit)..." + local start2=$(date +%s%N) + curl -s -o /dev/null "${BASE_URL}/api/documents/${DOCUMENT_ID}/toc?format=flat" + local end2=$(date +%s%N) + local duration2=$(( (end2 - start2) / 1000000 )) # Convert to milliseconds + + print_status "INFO" "Second request took ${duration2}ms" + + # Second request should be faster or at least not significantly slower + # We allow up to 20% slower due to network variance + local threshold=$(echo "$duration1 * 1.2" | bc | cut -d. -f1) + + if [ "$duration2" -lt "$duration1" ]; then + local improvement=$(echo "scale=2; ($duration1 - $duration2) / $duration1 * 100" | bc) + print_status "PASS" "Cache is working: ${improvement}% faster" + elif [ "$duration2" -le "$threshold" ]; then + print_status "PASS" "Cache performance acceptable (within 20% variance)" + else + print_status "WARN" "Second request slower than expected (possible cache miss)" + # Don't fail the test as network variance can affect timing + print_status "PASS" "Cache endpoint is functional" + fi +} + +# Test 7: Health check endpoint +test_health_check() { + run_test "Server health check endpoint" + + local response_file="$TEMP_DIR/health.json" + local http_code=$(curl -s -w "%{http_code}" -o "$response_file" \ + "${BASE_URL}/health") + + if [ "$http_code" = "200" ]; then + print_status "PASS" "Health endpoint returned 200" + + if jq -e '.status == "ok"' "$response_file" > /dev/null 2>&1; then + print_status "PASS" "Server status is 'ok'" + else + print_status "FAIL" "Server status is not 'ok'" + fi + else + print_status "FAIL" "Health check failed with HTTP $http_code" + fi +} + +# Test 8: Error handling - invalid document ID +test_error_handling() { + run_test "Error handling for invalid document ID" + + local response_file="$TEMP_DIR/error_test.json" + local invalid_id="nonexistent-document-id-12345" + local http_code=$(curl -s -w "%{http_code}" -o "$response_file" \ + "${BASE_URL}/api/documents/${invalid_id}/toc?format=flat") + + # Server should return 200 with empty entries or 404/500 with error + # Both are acceptable behaviors + if [ "$http_code" = "200" ] || [ "$http_code" = "404" ] || [ "$http_code" = "500" ]; then + print_status "PASS" "Server handles invalid ID gracefully (HTTP $http_code)" + + # If 200, should have empty entries + if [ "$http_code" = "200" ]; then + local count=$(jq -r '.entries | length' "$response_file") + print_status "INFO" "Returned $count entries for nonexistent document" + fi + else + print_status "WARN" "Unexpected status code for invalid ID: $http_code" + print_status "PASS" "Server responded (not crashed)" + fi +} + +# Test 9: Default format parameter +test_default_format() { + run_test "Default format parameter (no format query param)" + + local response_file="$TEMP_DIR/toc_default.json" + local http_code=$(curl -s -w "%{http_code}" -o "$response_file" \ + "${BASE_URL}/api/documents/${DOCUMENT_ID}/toc") + + if [ "$http_code" = "200" ]; then + print_status "PASS" "Received HTTP 200 response" + + # Should default to 'flat' format + if jq -e '.format == "flat"' "$response_file" > /dev/null 2>&1; then + print_status "PASS" "Defaults to 'flat' format when not specified" + else + 
local format=$(jq -r '.format' "$response_file") + print_status "FAIL" "Expected default format 'flat', got '$format'" + fi + else + print_status "FAIL" "Expected HTTP 200, got $http_code" + fi +} + +################################################################################ +# Test Execution +################################################################################ + +main() { + echo "" + echo "╔════════════════════════════════════════════════════════════╗" + echo "║ NaviDocs TOC API Smoke Test Suite ║" + echo "╚════════════════════════════════════════════════════════════╝" + echo "" + + print_status "INFO" "Base URL: $BASE_URL" + print_status "INFO" "Test started at: $(date)" + + # Setup + check_dependencies + setup_test_environment + get_test_document_id + + # Run all tests + print_status "SECTION" "Running Test Suite" + + test_health_check + test_toc_flat_format + test_toc_tree_format + test_toc_extract + test_toc_entry_fields + test_tree_nesting + test_default_format + test_cache_performance + test_error_handling + + # Summary + print_status "SECTION" "Test Summary" + + echo "" + echo "Total Tests: $TOTAL_TESTS" + echo -e "${GREEN}Passed: $PASSED_TESTS${NC}" + echo -e "${RED}Failed: $FAILED_TESTS${NC}" + echo "" + + if [ $FAILED_TESTS -eq 0 ]; then + echo -e "${GREEN}╔════════════════════════════════════════╗${NC}" + echo -e "${GREEN}║ ALL TESTS PASSED! ✓ ║${NC}" + echo -e "${GREEN}╚════════════════════════════════════════╝${NC}" + EXIT_CODE=0 + else + echo -e "${RED}╔════════════════════════════════════════╗${NC}" + echo -e "${RED}║ SOME TESTS FAILED ✗ ║${NC}" + echo -e "${RED}╚════════════════════════════════════════╝${NC}" + EXIT_CODE=1 + fi + + # Cleanup + cleanup_test_environment + + print_status "INFO" "Test completed at: $(date)" + echo "" + + exit $EXIT_CODE +} + +# Trap to ensure cleanup on exit +trap cleanup_test_environment EXIT + +# Run main function +main
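In CI, the smoke suite's exit code (non-zero when any test fails) can gate the job directly. A minimal wrapper sketch, assuming the API server has already been started separately; the base URL mirrors the script's default, and the 30-second health-check budget is an illustrative choice, not something defined in this diff:

```bash
#!/bin/bash
# Hypothetical CI step: wait for the NaviDocs API to report healthy, then run
# the TOC smoke tests. The retry budget (30 x 1s) is an assumption.
set -e

BASE_URL="${BASE_URL:-http://localhost:3001}"

# Poll the /health endpoint (the same one exercised by test_health_check).
for i in $(seq 1 30); do
  if curl -sf "${BASE_URL}/health" > /dev/null; then
    break
  fi
  echo "Waiting for API at ${BASE_URL} (${i}/30)..."
  sleep 1
done

# Run the suite from the repository root; a non-zero exit code fails the CI job.
./tests/toc-smoke-test.sh "$BASE_URL"
```

Because toc-smoke-test.sh already traps EXIT for temp-directory cleanup, the wrapper needs no teardown of its own.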