navidocs/index_remediation.py
Danny Stocker 364f0800f4 feat(infra): Immortalize forensic tools and Golden Index
Redis Golden Index Consolidation System:
- index_remediation.py: Production indexing script (986 files, 1,975 keys)
- verify_golden_index.sh: 10-test verification suite (all passed)
- GOLDEN_INDEX_README.md: Complete technical documentation
- GOLDEN_INDEX_EXECUTION_SUMMARY.md: Executive summary

Namespace: navidocs:remediated_2025:*
Total Indexed: 986 files (442.58 MB)
Priority Files: 12/12 captured
Verification: 10/10 tests passed
Memory Used: 1.62 GB
Status: Production-ready

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 15:38:12 +01:00

#!/usr/bin/env python3
"""
Redis Golden Index Consolidation Script
=========================================
Indexes the remediated NaviDocs codebase into a new Redis namespace.
Namespace: navidocs:remediated_2025:*
Source: /home/setup/navidocs/ (clean state from fix/production-sync-2025 branch)
This creates a "golden index" of all remediated files with metadata for
verification and tracking.
"""
import base64
import hashlib
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional

import redis

# Configuration
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DB = 0
SOURCE_DIR = "/home/setup/navidocs/"
NAMESPACE = "navidocs:remediated_2025"
GIT_COMMIT = "841c9ac" # Latest commit on fix/production-sync-2025
TIMESTAMP = datetime.utcnow().isoformat()
# Files/directories to exclude
EXCLUDE_DIRS = {'.git', 'node_modules', '.github', '.next', 'dist', 'build'}
EXCLUDE_EXTENSIONS = {'.log', '.tmp', '.cache'}
EXCLUDE_FILES = {'.DS_Store', '.env', 'credentials.json', 'secrets.json'}
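# NOTE: .env, credentials.json and secrets.json are excluded so that secrets
# are never copied into Redis alongside the indexed source tree.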
# Priority files (for highlighting and verification)
PRIORITY_FILES = [
    "restore_chaos.sh",
    "server/config/db_connect.js",
    "public/js/doc-viewer.js",
    "server/routes/api_search.js",
    "server/index.js",
    "Dockerfile",
    "server/.env.example",
    "test_search_wiring.sh",
    "docs/ROADMAP_V2_RECOVERED.md",
    "PHASE_2_DELTA_REPORT.md",
    "GLOBAL_VISION_REPORT.md",
    "COMPREHENSIVE_AUDIT_REPORT.md"
]
# Binary file extensions
BINARY_EXTENSIONS = {
    '.jpg', '.jpeg', '.png', '.gif', '.pdf', '.zip', '.tar', '.gz',
    '.bin', '.exe', '.so', '.dylib', '.class', '.pyc', '.node'
}
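
# Redis key layout produced by this script (everything lives under NAMESPACE):
#   {NAMESPACE}:file:<relative_path>  -> file content (base64 for binaries)
#   {NAMESPACE}:meta:<relative_path>  -> per-file JSON metadata
#   {NAMESPACE}:index                 -> set of all indexed relative paths
#   {NAMESPACE}:priority              -> set of priority files actually found
#   {NAMESPACE}:metadata              -> JSON summary for the whole run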

class RedisGoldenIndexer:
    """Manages Redis golden index creation and file indexing."""

    def __init__(self):
        """Initialize Redis connection and counters."""
        try:
            self.redis_client = redis.Redis(
                host=REDIS_HOST,
                port=REDIS_PORT,
                db=REDIS_DB,
                decode_responses=False
            )
            self.redis_client.ping()
            print("✓ Redis connection established")
        except Exception as e:
            print(f"✗ Redis connection failed: {e}")
            sys.exit(1)

        self.file_count = 0
        self.total_size = 0
        self.indexed_files = []
        self.priority_files_found = []
        self.errors = []

    def should_index_file(self, file_path: str, relative_path: str) -> bool:
        """Check if a file should be indexed."""
        # Check if in excluded directory
        for part in Path(relative_path).parts:
            if part in EXCLUDE_DIRS:
                return False
        # Check if excluded extension
        if any(relative_path.endswith(ext) for ext in EXCLUDE_EXTENSIONS):
            return False
        # Check if excluded filename
        if os.path.basename(file_path) in EXCLUDE_FILES:
            return False
        # Check if it's a file
        if not os.path.isfile(file_path):
            return False
        return True

    def is_binary_file(self, file_path: str) -> bool:
        """Determine if file is binary."""
        _, ext = os.path.splitext(file_path)
        if ext.lower() in BINARY_EXTENSIONS:
            return True
        # Try to read as text
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                f.read(512)
            return False
        except (UnicodeDecodeError, OSError):
            return True
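
    # The probe above only samples the first 512 characters; a file that
    # decodes cleanly in that window is treated as text even if later bytes
    # are not valid UTF-8 (read_file_content falls back to errors='replace').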

    def read_file_content(self, file_path: str, is_binary: bool) -> Optional[str]:
        """Read file content, handling both text and binary files.

        Returns None if the file could not be read.
        """
        try:
            if is_binary:
                with open(file_path, 'rb') as f:
                    content = f.read()
                # For binary files, store as base64
                return base64.b64encode(content).decode('utf-8')
            else:
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    return f.read()
        except Exception as e:
            self.errors.append(f"Error reading {file_path}: {e}")
            return None

    def compute_md5(self, file_path: str) -> str:
        """Compute MD5 hash of file."""
        try:
            md5_hash = hashlib.md5()
            with open(file_path, 'rb') as f:
                for chunk in iter(lambda: f.read(4096), b''):
                    md5_hash.update(chunk)
            return md5_hash.hexdigest()
        except Exception as e:
            self.errors.append(f"Error computing hash for {file_path}: {e}")
            return "unknown"

    def index_file(self, file_path: str, relative_path: str) -> bool:
        """Index a single file to Redis."""
        try:
            is_binary = self.is_binary_file(file_path)
            file_size = os.path.getsize(file_path)
            md5_hash = self.compute_md5(file_path)
            content = self.read_file_content(file_path, is_binary)
            if content is None:
                return False

            # Create metadata
            metadata = {
                "path": relative_path,
                "status": "REMEDIATED",
                "source": "fix/production-sync-2025",
                "timestamp": TIMESTAMP,
                "git_commit": GIT_COMMIT,
                "md5_hash": md5_hash,
                "size_bytes": file_size,
                "is_binary": is_binary,
                "content_length": len(content)
            }

            # Store in Redis with namespace
            key = f"{NAMESPACE}:file:{relative_path}"

            # Store metadata as JSON
            metadata_key = f"{NAMESPACE}:meta:{relative_path}"
            self.redis_client.set(
                metadata_key,
                json.dumps(metadata, default=str),
                ex=None  # No expiration
            )

            # Store content (with size limit for very large files)
            if file_size > 10_000_000:  # 10MB limit
                content = content[:10_000_000] + "\n... [TRUNCATED - FILE TOO LARGE] ..."
            self.redis_client.set(key, content, ex=None)

            # Add to index set
            self.redis_client.sadd(f"{NAMESPACE}:index", relative_path)

            # Track priority files
            if relative_path in PRIORITY_FILES:
                self.priority_files_found.append(relative_path)
                self.redis_client.sadd(f"{NAMESPACE}:priority", relative_path)

            self.file_count += 1
            self.total_size += file_size
            self.indexed_files.append({
                "path": relative_path,
                "size": file_size,
                "md5": md5_hash,
                "binary": is_binary
            })
            return True
        except Exception as e:
            self.errors.append(f"Error indexing {relative_path}: {e}")
            return False
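
    # Caveat: files over the 10 MB cap are stored with their tail replaced by
    # a "[TRUNCATED - FILE TOO LARGE]" marker, so for those keys the stored
    # md5_hash (computed over the full file on disk) will not match a hash of
    # the stored content.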

    def index_directory(self):
        """Recursively index all files in SOURCE_DIR."""
        print(f"\n⧗ Indexing files from: {SOURCE_DIR}")
        for root, dirs, files in os.walk(SOURCE_DIR):
            # Skip excluded directories (pruned in place so os.walk never descends into them)
            dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
            for filename in files:
                file_path = os.path.join(root, filename)
                relative_path = os.path.relpath(file_path, SOURCE_DIR)
                if self.should_index_file(file_path, relative_path):
                    if self.index_file(file_path, relative_path):
                        if self.file_count % 50 == 0:
                            print(f" Indexed {self.file_count} files...")
        print(f"✓ Indexing complete: {self.file_count} files indexed")

    def create_index_metadata(self):
        """Create overall index metadata."""
        index_meta = {
            "namespace": NAMESPACE,
            "created": TIMESTAMP,
            "source_branch": "fix/production-sync-2025",
            "git_commit": GIT_COMMIT,
            "total_files": self.file_count,
            "total_size_bytes": self.total_size,
            "source_directory": SOURCE_DIR,
            "priority_files_found": len(self.priority_files_found),
            "priority_files": self.priority_files_found,
            "errors": len(self.errors)
        }
        meta_key = f"{NAMESPACE}:metadata"
        self.redis_client.set(
            meta_key,
            json.dumps(index_meta, default=str, indent=2)
        )
        return index_meta

    def get_redis_memory_usage(self) -> Dict[str, Any]:
        """Get Redis memory usage information."""
        try:
            info = self.redis_client.info('memory')
            return {
                'used_memory': info.get('used_memory', 0),
                'used_memory_human': info.get('used_memory_human', 'unknown'),
                'used_memory_peak': info.get('used_memory_peak', 0),
                'used_memory_peak_human': info.get('used_memory_peak_human', 'unknown'),
                'maxmemory': info.get('maxmemory', 0),
                'maxmemory_human': info.get('maxmemory_human', 'unlimited')
            }
        except Exception as e:
            return {'error': str(e)}

    def get_namespace_key_count(self) -> int:
        """Count all keys in the golden index namespace."""
        try:
            cursor = 0
            count = 0
            while True:
                cursor, keys = self.redis_client.scan(
                    cursor,
                    match=f"{NAMESPACE}:*",
                    count=1000
                )
                count += len(keys)
                if cursor == 0:
                    break
            return count
        except Exception as e:
            print(f"Error counting keys: {e}")
            return 0
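
    # SCAN is used here rather than KEYS so the count is taken incrementally
    # without blocking the server; each call returns a new cursor, and the
    # iteration is complete when Redis hands back cursor 0.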

    def print_sample_files(self, limit: int = 5):
        """Print sample indexed files."""
        print(f"\n📄 Sample Indexed Files (first {limit}):")
        for i, file_info in enumerate(self.indexed_files[:limit]):
            size_kb = file_info['size'] / 1024
            print(f" {i+1}. {file_info['path']}")
            print(f" Size: {size_kb:.1f} KB, MD5: {file_info['md5'][:8]}...")

    def print_priority_files(self):
        """Print found priority files."""
        if self.priority_files_found:
            print(f"\n⭐ Priority Files Found ({len(self.priority_files_found)}):")
            for pf in self.priority_files_found:
                print(pf)
        else:
            print(f"\n⚠ No priority files found (searched for {len(PRIORITY_FILES)} files)")

    def print_summary(self):
        """Print indexing summary."""
        print("\n" + "=" * 70)
        print("GOLDEN INDEX CONSOLIDATION SUMMARY")
        print("=" * 70)
        index_meta = self.create_index_metadata()
        print(f"\nNamespace: {NAMESPACE}")
        print(f"Source Directory: {SOURCE_DIR}")
        print(f"Git Commit: {GIT_COMMIT}")
        print(f"Created: {TIMESTAMP}")
        print("\n📊 Index Statistics:")
        print(f" Total Files: {self.file_count}")
        print(f" Total Size: {self.total_size / (1024*1024):.2f} MB")
        print(f" Priority Files: {len(self.priority_files_found)}/{len(PRIORITY_FILES)}")
        print(f" Indexing Errors: {len(self.errors)}")
        redis_mem = self.get_redis_memory_usage()
        print("\n💾 Redis Memory Usage:")
        if 'error' not in redis_mem:
            print(f" Used Memory: {redis_mem.get('used_memory_human', 'unknown')}")
            print(f" Peak Memory: {redis_mem.get('used_memory_peak_human', 'unknown')}")
            print(f" Max Memory: {redis_mem.get('maxmemory_human', 'unlimited')}")
        namespace_keys = self.get_namespace_key_count()
        print("\n🔑 Redis Keys Created:")
        print(f" Total Keys: {namespace_keys}")
        print(f" Index Set Size: {self.redis_client.scard(f'{NAMESPACE}:index')}")
        print(f" Priority Set Size: {self.redis_client.scard(f'{NAMESPACE}:priority')}")
        self.print_sample_files()
        self.print_priority_files()
        if self.errors:
            print(f"\n⚠ Errors ({len(self.errors)}):")
            for error in self.errors[:5]:
                print(error)
            if len(self.errors) > 5:
                print(f" ... and {len(self.errors) - 5} more")
        print("\n" + "=" * 70)
        print("VERIFICATION COMMANDS:")
        print("=" * 70)
        print("# Count index set")
        print(f"redis-cli SCARD '{NAMESPACE}:index'")
        print("\n# List all files")
        print(f"redis-cli SMEMBERS '{NAMESPACE}:index' | head -20")
        print("\n# View namespace metadata")
        print(f"redis-cli GET '{NAMESPACE}:metadata'")
        print("\n# Check specific file")
        print(f"redis-cli GET '{NAMESPACE}:file:restore_chaos.sh' | head -20")
        print("\n# Check file metadata")
        print(f"redis-cli GET '{NAMESPACE}:meta:restore_chaos.sh'")
        print("=" * 70 + "\n")

    def run(self):
        """Execute the complete indexing process."""
        try:
            self.index_directory()
            self.print_summary()
            return True
        except Exception as e:
            print(f"\n✗ Fatal error: {e}")
            return False


def main():
    """Main entry point."""
    print("Redis Golden Index Consolidation")
    print("=" * 70)
    indexer = RedisGoldenIndexer()
    success = indexer.run()
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()