""" Build script optimized for Hugging Face Spaces deployment Maintains the exact same SOTA RAG architecture """ import os import sys import logging import pickle import json import numpy as np import torch from pathlib import Path # Add parent directory to path sys.path.append('.') from app import ( load_opc_datasets, build_retrieval_system, ARTIFACT_DIR, FAISS_AVAILABLE, MODEL_NAME, EMBED_MODEL, MAX_CORPUS_SIZE ) # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.StreamHandler(sys.stdout), logging.FileHandler('/data/build.log') ] ) logger = logging.getLogger(__name__) def check_artifacts(): """Check if artifacts already exist""" required_files = [ "corpus_data.json", "corpus_embeddings.npy", "answer_embeddings.npy", "bm25.pkl" ] if FAISS_AVAILABLE: required_files.append("faiss_index.bin") all_exist = all(os.path.exists(os.path.join(ARTIFACT_DIR, f)) for f in required_files) return all_exist def build_retrieval_with_progress(): """Build retrieval system with progress tracking""" logger.info("Building SOTA RAG Retrieval System for Coding Assistant") logger.info(f"Architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval") logger.info(f"Embedding Model: {EMBED_MODEL}") logger.info(f"Max Corpus Size: {MAX_CORPUS_SIZE}") # Load datasets logger.info("Loading coding datasets...") ds_map = load_opc_datasets() # Build retrieval system (using the exact same function from app.py) logger.info("Building retrieval system...") retrieval_system = build_retrieval_system(ds_map) logger.info("Retrieval system built successfully!") logger.info(f" - Corpus size: {len(retrieval_system.corpus_texts)}") logger.info(f" - Embedding dimension: {retrieval_system.corpus_embeddings.shape[1]}") logger.info(f" - FAISS index: {'Yes' if retrieval_system.faiss_index else 'No'}") return retrieval_system def prepare_llm_artifacts(): """Prepare LLM artifacts without downloading the full model""" logger.info("🤖 Preparing LLM configuration...") from transformers import AutoTokenizer, GenerationConfig llm_path = os.path.join(ARTIFACT_DIR, "llm_model") os.makedirs(llm_path, exist_ok=True) # Download and save tokenizer logger.info(f"📥 Downloading tokenizer for {MODEL_NAME}...") tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token # Use the exact same chat template from app.py tokenizer.chat_template = ( "{% for message in messages %}" "{{'<|'+message['role']+'|>\\n'+message['content']+'\\n'}}" "{% endfor %}" "{% if add_generation_prompt %}" "<|assistant|>\n" "{% endif %}" ) # Use the exact same generation config from app.py generation_config = GenerationConfig( max_new_tokens=300, temperature=0.7, top_p=0.9, do_sample=True, repetition_penalty=1.15, pad_token_id=tokenizer.pad_token_id ) # Save tokenizer and config tokenizer.save_pretrained(llm_path) generation_config.save_pretrained(llm_path) # Create minimal config file config = { "_name_or_path": MODEL_NAME, "architectures": ["LlamaForCausalLM"], "model_type": "llama", "torch_dtype": "float16", "quantization_config": { "load_in_4bit": True, "bnb_4bit_compute_dtype": "float32", "bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4" } if torch.cuda.is_available() else {} } config_path = os.path.join(llm_path, "config.json") with open(config_path, "w") as f: json.dump(config, f, indent=2) logger.info(f"LLM configuration saved to {llm_path}") logger.info("Note: Full model will be downloaded at runtime 

def verify_artifacts():
    """Verify all artifacts are properly built"""
    logger.info("Verifying artifacts...")

    files_to_check = {
        "corpus_data.json": "Corpus data",
        "corpus_embeddings.npy": "Question embeddings",
        "answer_embeddings.npy": "Answer embeddings",
        "bm25.pkl": "BM25 index",
        "faiss_index.bin": "FAISS index"
    }

    for file, description in files_to_check.items():
        path = os.path.join(ARTIFACT_DIR, file)
        if os.path.exists(path):
            size_mb = os.path.getsize(path) / (1024 * 1024)
            logger.info(f" ✓ {description}: {size_mb:.2f} MB")
        else:
            # A missing FAISS index is expected when FAISS is unavailable
            if file != "faiss_index.bin" or FAISS_AVAILABLE:
                logger.warning(f" ✗ Missing: {description}")


def main():
    """Main build process"""
    logger.info("=" * 60)
    logger.info("🤖 Codey Bryant 3.0 - SOTA RAG Build Script")
    logger.info("=" * 60)

    # Create artifacts directory
    os.makedirs(ARTIFACT_DIR, exist_ok=True)

    # Check if we need to rebuild
    if check_artifacts():
        logger.info("Artifacts already exist. Skipping build.")
        logger.info("Delete artifacts to force rebuild.")
    else:
        logger.info("Building fresh artifacts...")

        # Build retrieval system
        build_retrieval_with_progress()

        # Prepare LLM artifacts
        prepare_llm_artifacts()

        logger.info("Build complete!")

    # Verify artifacts
    verify_artifacts()

    # Show total size
    logger.info("\nArtifact Summary:")
    total_size = 0
    for root, _dirs, files in os.walk(ARTIFACT_DIR):
        for file in files:
            filepath = os.path.join(root, file)
            total_size += os.path.getsize(filepath) / (1024 * 1024)
    logger.info(f" Total size: {total_size:.2f} MB")

    logger.info("=" * 60)
    logger.info("Ready to launch Codey Bryant!")
    logger.info(" Run: python app.py")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()