pasxalisag committed
Commit 8526b04 · verified · 1 Parent(s): 8aeda0c

Upload 2 files

Files changed (2)
  1. app.py +623 -0
  2. build.py +200 -0
app.py ADDED
@@ -0,0 +1,623 @@
"""
Codey Bryant 3.0 — SOTA RAG for Hugging Face Spaces
Maintains EXACT same architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval
"""

import os
import sys
import logging
from dataclasses import dataclass
from typing import Any, List, Dict, Tuple, Optional, Iterator
from functools import lru_cache
from threading import Thread
import warnings

# Configure logging for Hugging Face Spaces
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/data/app.log')
    ]
)
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore")

# Import core dependencies
import numpy as np
import torch
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from sklearn.cluster import MiniBatchKMeans
import spacy
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    TextIteratorStreamer,
    BitsAndBytesConfig,
)
import gradio as gr
import pickle
import json

# Try to import FAISS
try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False
    logger.warning("FAISS not available, using numpy fallback")

# Environment setup for Hugging Face Spaces
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Use persistent storage for Hugging Face Spaces
ARTIFACT_DIR = os.environ.get("ARTIFACT_DIR", "/data/artifacts")
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Paths for artifacts
LLM_ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, "llm_model")
EMBED_ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, "embed_model")
BM25_ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, "bm25.pkl")
CORPUS_DATA_PATH = os.path.join(ARTIFACT_DIR, "corpus_data.json")
CORPUS_EMBED_PATH = os.path.join(ARTIFACT_DIR, "corpus_embeddings.npy")
ANSWER_EMBED_PATH = os.path.join(ARTIFACT_DIR, "answer_embeddings.npy")
FAISS_INDEX_PATH = os.path.join(ARTIFACT_DIR, "faiss_index.bin")

# Device configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
    logger.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    logger.info("Using CPU")

# Model configuration (EXACT SAME AS BEFORE)
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
MAX_CORPUS_SIZE = 600

# ========================
# 1) Dataset & Retrieval (EXACT SAME)
# ========================
def load_opc_datasets() -> Dict[str, Dataset]:
    """Load coding datasets - same function"""
    try:
        logger.info("Loading OPC datasets...")
        ds_instruct = load_dataset("OpenCoder-LLM/opc-sft-stage2", "educational_instruct", split="train")
        ds_evol = load_dataset("OpenCoder-LLM/opc-sft-stage2", "evol_instruct", split="train")
        return {"educational_instruct": ds_instruct, "evol_instruct": ds_evol}
    except Exception as e:
        logger.warning(f"OPC failed ({e}), falling back to python_code_instructions...")
        ds = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
        return {"python_code": ds}

def convo_to_io(example: Dict) -> Tuple[str, str]:
    """Convert conversation to input/output - same function"""
    if "messages" in example:
        msgs = example["messages"]
    elif "conversations" in example:
        msgs = example["conversations"]
    else:
        instr = example.get("instruction") or example.get("prompt") or ""
        inp = example.get("input") or ""
        out = example.get("output") or example.get("response") or ""
        return (instr + "\n" + inp).strip(), out

    user_text, assistant_text = "", ""
    for i, m in enumerate(msgs):
        role = (m.get("role") or m.get("from") or "").lower()
        content = m.get("content") or m.get("value") or ""
        if role in ("user", "human") and not user_text:
            user_text = content
        if role in ("assistant", "gpt") and user_text:
            assistant_text = content
            break
    return user_text.strip(), assistant_text.strip()

@dataclass
class RetrievalSystem:
    """Retrieval system dataclass - same structure"""
    embed_model: SentenceTransformer
    bm25: BM25Okapi
    corpus_texts: List[str]
    corpus_answers: List[str]
    corpus_embeddings: np.ndarray
    answer_embeddings: np.ndarray
    corpus_meta: List[Dict]
    nlp: spacy.language.Language
    faiss_index: Optional[Any] = None

def build_retrieval_system(ds_map: Dict[str, Dataset]) -> RetrievalSystem:
    """Build retrieval system - EXACT SAME IMPLEMENTATION"""
    # Try to load from artifacts first
    required = [EMBED_ARTIFACT_PATH, BM25_ARTIFACT_PATH, CORPUS_DATA_PATH, CORPUS_EMBED_PATH, ANSWER_EMBED_PATH]
    if FAISS_AVAILABLE:
        required.append(FAISS_INDEX_PATH)

    if all(os.path.exists(p) for p in required):
        logger.info("Loading retrieval system from artifacts...")
        embed_model = SentenceTransformer(EMBED_ARTIFACT_PATH, device=str(DEVICE))
        with open(BM25_ARTIFACT_PATH, "rb") as f:
            bm25 = pickle.load(f)
        with open(CORPUS_DATA_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
        corpus_embeddings = np.load(CORPUS_EMBED_PATH)
        answer_embeddings = np.load(ANSWER_EMBED_PATH)
        faiss_index = faiss.read_index(FAISS_INDEX_PATH) if FAISS_AVAILABLE and os.path.exists(FAISS_INDEX_PATH) else None
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
        return RetrievalSystem(
            embed_model=embed_model, bm25=bm25,
            corpus_texts=data["texts"], corpus_answers=data["answers"],
            corpus_embeddings=corpus_embeddings, answer_embeddings=answer_embeddings,
            corpus_meta=data["meta"], nlp=nlp, faiss_index=faiss_index
        )

    # Build from scratch (same implementation)
    logger.info("Building retrieval system with answer-space support...")
    all_questions, all_answers, all_metas = [], [], []
    for name, ds in ds_map.items():
        for ex in ds.select(range(min(len(ds), 1500))):
            q, a = convo_to_io(ex)
            if q and a and 50 < len(a) < 2000:
                all_questions.append(q)
                all_answers.append(a)
                all_metas.append({"intent": name, "answer": a})

    embed_model = SentenceTransformer(EMBED_MODEL, device=str(DEVICE))
    question_embeddings = embed_model.encode(all_questions, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
    answer_embeddings = embed_model.encode(all_answers, batch_size=64, show_progress_bar=True, normalize_embeddings=True)

    # Clustering to reduce size (same)
    if len(all_questions) > MAX_CORPUS_SIZE:
        kmeans = MiniBatchKMeans(n_clusters=MAX_CORPUS_SIZE, random_state=42, batch_size=1000)
        labels = kmeans.fit_predict(answer_embeddings)
        selected = []
        for i in range(MAX_CORPUS_SIZE):
            mask = labels == i
            if mask.any():
                idx = np.where(mask)[0]
                dists = np.linalg.norm(answer_embeddings[idx] - kmeans.cluster_centers_[i], axis=1)
                selected.append(idx[np.argmin(dists)])
        idxs = selected
    else:
        idxs = list(range(len(all_questions)))

    texts = [all_questions[i] for i in idxs]
    answers = [all_answers[i] for i in idxs]
    metas = [all_metas[i] for i in idxs]
    q_embs = question_embeddings[idxs]
    a_embs = answer_embeddings[idxs]

    tokenized = [t.lower().split() for t in texts]
    bm25 = BM25Okapi(tokenized)

    faiss_index = None
    if FAISS_AVAILABLE:
        faiss_index = faiss.IndexFlatIP(a_embs.shape[1])
        faiss_index.add(a_embs.astype('float32'))

    # Save everything
    embed_model.save(EMBED_ARTIFACT_PATH)
    with open(BM25_ARTIFACT_PATH, "wb") as f:
        pickle.dump(bm25, f)
    with open(CORPUS_DATA_PATH, "w", encoding="utf-8") as f:
        json.dump({"texts": texts, "answers": answers, "meta": metas}, f)
    np.save(CORPUS_EMBED_PATH, q_embs)
    np.save(ANSWER_EMBED_PATH, a_embs)
    if faiss_index:
        faiss.write_index(faiss_index, FAISS_INDEX_PATH)

    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    return RetrievalSystem(
        embed_model=embed_model, bm25=bm25, corpus_texts=texts, corpus_answers=answers,
        corpus_embeddings=q_embs, answer_embeddings=a_embs, corpus_meta=metas,
        nlp=nlp, faiss_index=faiss_index
    )
# ========================
# 2) Generative Core (EXACT SAME)
# ========================

@dataclass
class GenerativeCore:
    """Generative core dataclass - same structure"""
    model: AutoModelForCausalLM
    tokenizer: AutoTokenizer
    generation_config: GenerationConfig

def build_generative_core():
    """Build generative core - EXACT SAME IMPLEMENTATION"""
    # Always download fresh from HuggingFace for reliability
    print("Downloading TinyLlama with 4-bit quantization...")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{{'<|'+message['role']+'|>\\n'+message['content']+'</s>\\n'}}"
        "{% endfor %}"
        "{% if add_generation_prompt %}"
        "<|assistant|>\n"
        "{% endif %}"
    )

    quantization_config = None
    if torch.cuda.is_available():
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float32,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto" if torch.cuda.is_available() else None,
        low_cpu_mem_usage=True
    )
    model.eval()

    gen_cfg = GenerationConfig(
        max_new_tokens=300,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.15,
        pad_token_id=tokenizer.pad_token_id
    )

    # Save for future use (optional)
    if not os.path.exists(LLM_ARTIFACT_PATH):
        os.makedirs(LLM_ARTIFACT_PATH, exist_ok=True)
        tokenizer.save_pretrained(LLM_ARTIFACT_PATH)
        gen_cfg.save_pretrained(LLM_ARTIFACT_PATH)

    return GenerativeCore(model, tokenizer, gen_cfg)

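# For reference, the custom chat template above renders a conversation into
# TinyLlama-chat style tags. A sample rendering with illustrative messages
# (not part of the app's runtime flow):
#
#   messages = [
#       {"role": "system", "content": "You are a helpful assistant."},
#       {"role": "user", "content": "Reverse a list in Python."},
#   ]
#   tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#   # -> "<|system|>\nYou are a helpful assistant.</s>\n"
#   #    "<|user|>\nReverse a list in Python.</s>\n"
#   #    "<|assistant|>\n"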
# ========================
# 3) SOTA Enhanced Retrieval (EXACT SAME)
# ========================

class HybridCodeAssistant:
    """Main assistant class - EXACT SAME IMPLEMENTATION"""
    def __init__(self):
        self.retrieval = build_retrieval_system(load_opc_datasets())
        self.generator = build_generative_core()
        logger.info("Codey Bryant 3.0 ready with HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval!")

    def generate_hyde(self, query: str) -> str:
        """Generate HyDE - same implementation"""
        prompt = f"""Write a concise, direct Python code example or explanation that answers this question.
Only output the answer, no extra text.

Question: {query}

Answer:"""
        inputs = self.generator.tokenizer(prompt, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            out = self.generator.model.generate(**inputs, max_new_tokens=128, temperature=0.3, do_sample=True)
        return self.generator.tokenizer.decode(out[0], skip_special_tokens=True).split("Answer:")[-1].strip()

    def rewrite_query(self, query: str) -> str:
        """Rewrite query - same implementation"""
        prompt = f"""Rewrite this vague or casual programming question into a clear, specific one for better code retrieval.

Original: {query}

Improved:"""
        inputs = self.generator.tokenizer(prompt, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            out = self.generator.model.generate(**inputs, max_new_tokens=64, temperature=0.1)
        return self.generator.tokenizer.decode(out[0], skip_special_tokens=True).split("Improved:")[-1].strip()

    def retrieve_enhanced(self, query: str, k: int = 3) -> List[Tuple[str, Dict, float]]:
        """Enhanced retrieval - EXACT SAME IMPLEMENTATION"""
        # Use list of tuples instead of set to avoid hashability issues with dicts
        results = []

        def add_results(q_text: str, weight: float = 1.0):
            try:
                # Determine embedding space (answer for HyDE/long texts, question otherwise)
                use_answer_space = "HyDE" in q_text or len(q_text.split()) > 20
                target_embs = self.retrieval.answer_embeddings if use_answer_space else self.retrieval.corpus_embeddings

                # Encode query
                q_emb = self.retrieval.embed_model.encode(q_text, normalize_embeddings=True)

                if self.retrieval.faiss_index is not None and use_answer_space:
                    # FAISS on answer space
                    query_vec = q_emb.astype('float32').reshape(1, -1)
                    scores_top, indices_top = self.retrieval.faiss_index.search(query_vec, min(k * 3, len(self.retrieval.corpus_texts)))
                    scores = scores_top[0]
                    idxs = indices_top[0]
                else:
                    # Numpy fallback or question space
                    scores = np.dot(target_embs, q_emb)
                    idxs = np.argsort(-scores)[:k*3]

                # Add BM25 if not answer space
                if not use_answer_space:
                    tokenized_query = q_text.lower().split()
                    bm25_scores = self.retrieval.bm25.get_scores(tokenized_query)
                    if bm25_scores.max() > 0:
                        bm25_scores = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min())
                    else:
                        bm25_scores = np.zeros_like(bm25_scores)
                    scores = 0.3 * bm25_scores + 0.7 * scores  # Hybrid

                # Keep scores aligned with the candidate indices: FAISS already
                # returns them aligned, while the numpy path scores the whole corpus
                if self.retrieval.faiss_index is None or not use_answer_space:
                    scores = scores[idxs]

                # Collect candidates (avoid duplicates by checking text)
                seen_texts = set()
                for score, idx in zip(scores, idxs):
                    if score > 0.15 and idx < len(self.retrieval.corpus_texts):
                        text = self.retrieval.corpus_texts[idx]
                        if text not in seen_texts:
                            seen_texts.add(text)
                            results.append((text, self.retrieval.corpus_meta[idx], float(score * weight)))
            except Exception as e:
                logger.error(f"add_results failed for '{q_text}': {e}")

        # 1. Original query
        add_results(query, weight=1.0)

        # 2. Rewritten query
        try:
            rw = self.rewrite_query(query)
            if len(rw) > 8 and rw != query:
                add_results(rw, weight=1.2)
        except Exception as e:
            logger.warning(f"Rewrite failed: {e}")

        # 3. HyDE (strong weight in answer space!)
        try:
            hyde = self.generate_hyde(query)
            if len(hyde) > 20:
                add_results(hyde, weight=1.5)  # Note: No " HyDE" suffix needed now
        except Exception as e:
            logger.warning(f"HyDE failed: {e}")

        # 4. Multi-query variants (lighter weight)
        variants = [
            f"Python code for: {query}",
            f"Fix error: {query}",
            f"Explain in Python: {query}",
            f"Best way to {query} in Python",
        ]
        for v in variants:
            add_results(v, weight=0.8)

        # Rerank by similarity to original (no set needed)
        if not results:
            return []

        q_emb = self.retrieval.embed_model.encode(query, normalize_embeddings=True)
        final = []
        for text, meta, score in results:
            text_emb = self.retrieval.embed_model.encode(text, normalize_embeddings=True)
            sim = float(np.dot(q_emb, text_emb))
            final.append((text, meta, score + 0.3 * sim))

        final.sort(key=lambda x: x[2], reverse=True)
        return final[:k]

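    # Worked example of the scoring above (illustrative numbers, question space):
    #   dense cosine = 0.62, min-max normalised BM25 = 0.40, variant weight = 1.2
    #   hybrid   = 0.3 * 0.40 + 0.7 * 0.62        = 0.554
    #   weighted = 0.554 * 1.2                    = 0.6648
    #   reranked = 0.6648 + 0.3 * sim(query, text), e.g. + 0.3 * 0.70 = 0.8748
    # Candidates whose hybrid score is at or below the 0.15 floor never reach
    # the rerank step.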
    def answer_stream(self, text: str) -> Iterator[str]:
        """Stream answer - same implementation"""
        retrieved = self.retrieve_enhanced(text, k=3)

        context = ""
        if retrieved and retrieved[0][2] > 0.3:
            q, meta, _ = retrieved[0]
            ans = meta["answer"][:200]
            context = f"Reference example:\nQ: {q}\nA: {ans}\n\n"

        messages = [
            {"role": "system", "content": "You are a concise, accurate Python coding assistant. Use the reference if helpful.\n" + context},
            {"role": "user", "content": text}
        ]

        prompt = self.generator.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = self.generator.tokenizer(prompt, return_tensors="pt").to(DEVICE)

        streamer = TextIteratorStreamer(self.generator.tokenizer, skip_prompt=True, skip_special_tokens=True)
        thread = Thread(target=self.generator.model.generate, kwargs=dict(
            **inputs, streamer=streamer, generation_config=self.generator.generation_config
        ))
        thread.start()

        for token in streamer:
            yield token
        thread.join()

# ========================
# 4) Gradio UI (Optimized for Hugging Face)
# ========================

ASSISTANT: Optional[HybridCodeAssistant] = None

def initialize_assistant():
    """Initialize assistant with progress tracking"""
    global ASSISTANT
    if ASSISTANT is None:
        yield "Initializing Codey Bryant 3.0..."
        yield "Loading retrieval system..."
        ASSISTANT = HybridCodeAssistant()
        yield "Codey Bryant 3.0 Ready!"
        yield "SOTA RAG Features: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval"
        yield "Ask coding questions like: 'it's not working', 'help with error', 'make it faster'"
    else:
        yield "Assistant already initialized!"

def chat(message: str, history: list):
    """Chat function with error handling"""
    history = history or []
    # The submit handlers clear the textbox before this runs, so recover the
    # question from the pending chat turn when the message comes in empty
    if not message and history:
        message = history[-1][0]

    if ASSISTANT is None:
        if not history:
            history.append([message, ""])
        history[-1][1] = "Please click 'Initialize Assistant' first!"
        yield history
        return

    # Reuse the pending turn added by the submit handler, or append a new one
    if history and history[-1][1] in (None, ""):
        history[-1][1] = ""
    else:
        history.append([message, ""])
    yield history

    # Stream response
    try:
        response = ""
        for token in ASSISTANT.answer_stream(message):
            response += token
            history[-1][1] = response
            yield history
    except Exception as e:
        logger.error(f"Chat error: {e}")
        history[-1][1] = f"Error: {str(e)}"
        yield history

def create_ui():
    """Create Gradio UI optimized for Hugging Face"""
    with gr.Blocks(
        title="Codey Bryant 3.0 - SOTA RAG Coding Assistant",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container { max-width: 1200px; margin: auto; }
        .chatbot { min-height: 500px; }
        .status-box { padding: 20px; border-radius: 10px; background: #f0f8ff; }
        """
    ) as demo:
        gr.Markdown("""
        # 🤖 Codey Bryant 3.0
        ## **SOTA RAG Coding Assistant**

        ### **Advanced Features:**
        - **HyDE** (Hypothetical Document Embeddings)
        - **Query Rewriting** for vague queries
        - **Multi-Query** retrieval
        - **Answer-Space Retrieval**

        ### **Handles vague questions like:**
        - "it's not working"
        - "help with error"
        - "make it faster"
        - "why error"
        - "how to implement"

        ### **Powered by:**
        - TinyLlama 1.1B (4-bit quantized)
        - Hybrid retrieval (FAISS + BM25)
        - OPC coding datasets
        """)

        with gr.Row():
            with gr.Column(scale=1):
                init_btn = gr.Button(
                    "Initialize Assistant",
                    variant="primary",
                    size="lg"
                )
                clear_btn = gr.Button("Clear Chat", size="lg")

            with gr.Column(scale=4):
                status = gr.Markdown(
                    "### Status: Click 'Initialize Assistant' to start",
                    elem_classes="status-box"
                )

        chatbot = gr.Chatbot(
            label="Chat with Codey",
            height=500,
            show_label=True,
            avatar_images=(None, "🤖"),
            bubble_full_width=False
        )

        with gr.Row():
            msg = gr.Textbox(
                placeholder="Ask anything about Python coding...",
                label="Your Question",
                lines=3,
                scale=5,
                container=False
            )
            submit_btn = gr.Button("Send", variant="secondary", scale=1)

        # Examples
        gr.Examples(
            examples=[
                ["How to read a CSV file in Python?"],
                ["Why am I getting 'list index out of range' error?"],
                ["Make this function faster..."],
                ["Help, my code isn't working!"],
                ["Best way to sort a dictionary by value?"]
            ],
            inputs=msg,
            label="Try these examples:"
        )

        # Event handlers
        init_btn.click(
            initialize_assistant,
            outputs=status
        )

        def submit_message(message, history):
            return "", history + [[message, None]]

        msg.submit(
            submit_message,
            [msg, chatbot],
            [msg, chatbot],
            queue=False
        ).then(
            chat,
            [msg, chatbot],
            chatbot
        )

        submit_btn.click(
            submit_message,
            [msg, chatbot],
            [msg, chatbot],
            queue=False
        ).then(
            chat,
            [msg, chatbot],
            chatbot
        )

        clear_btn.click(lambda: None, None, chatbot, queue=False)

        # Footer
        gr.Markdown("""
        ---
        *Codey Bryant 3.0 uses TinyLlama 1.1B with 4-bit quantization. Responses may take a few seconds.*
        """)

    return demo

# ========================
# 5) Main Entry Point
# ========================

if __name__ == "__main__":
    # Configure for Hugging Face Spaces
    server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
    server_port = int(os.environ.get("GRADIO_SERVER_PORT", 7860))

    # Create and launch the demo
    demo = create_ui()

    logger.info(f"Starting Codey Bryant 3.0 on {server_name}:{server_port}")
    logger.info("SOTA RAG Architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval")

    demo.launch(
        server_name=server_name,
        server_port=server_port,
        share=False,  # Set to True if you want a public link
        debug=False,
        show_error=True
    )
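A minimal way to exercise app.py outside the Gradio UI, e.g. as a local smoke test. This is a sketch, not part of the upload: it assumes the same environment as the Space (datasets, sentence-transformers, transformers, gradio and optionally faiss installed) and that the datasets and TinyLlama weights can be downloaded.

# smoke_test.py — illustrative only
from app import HybridCodeAssistant

assistant = HybridCodeAssistant()  # builds or loads retrieval artifacts, then loads TinyLlama

# Inspect what the enhanced retrieval returns for a deliberately vague query
for text, meta, score in assistant.retrieve_enhanced("it's not working", k=3):
    print(f"{score:.3f}  [{meta['intent']}]  {text[:80]}")

# Stream an answer token by token, exactly as the UI does
for token in assistant.answer_stream("How do I read a CSV file in Python?"):
    print(token, end="", flush=True)
print()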
build.py ADDED
@@ -0,0 +1,200 @@
"""
Build script optimized for Hugging Face Spaces deployment
Maintains the exact same SOTA RAG architecture
"""
import os
import sys
import logging
import pickle
import json
import numpy as np
import torch
from pathlib import Path

# Add parent directory to path
sys.path.append('.')

from app import (
    load_opc_datasets,
    build_retrieval_system,
    ARTIFACT_DIR,
    FAISS_AVAILABLE,
    MODEL_NAME,
    EMBED_MODEL,
    MAX_CORPUS_SIZE
)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/data/build.log')
    ]
)
logger = logging.getLogger(__name__)

def check_artifacts():
    """Check if artifacts already exist"""
    required_files = [
        "corpus_data.json",
        "corpus_embeddings.npy",
        "answer_embeddings.npy",
        "bm25.pkl"
    ]

    if FAISS_AVAILABLE:
        required_files.append("faiss_index.bin")

    all_exist = all(os.path.exists(os.path.join(ARTIFACT_DIR, f)) for f in required_files)
    return all_exist

def build_retrieval_with_progress():
    """Build retrieval system with progress tracking"""
    logger.info("Building SOTA RAG Retrieval System for Coding Assistant")
    logger.info(f"Architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval")
    logger.info(f"Embedding Model: {EMBED_MODEL}")
    logger.info(f"Max Corpus Size: {MAX_CORPUS_SIZE}")

    # Load datasets
    logger.info("Loading coding datasets...")
    ds_map = load_opc_datasets()

    # Build retrieval system (using the exact same function from app.py)
    logger.info("Building retrieval system...")
    retrieval_system = build_retrieval_system(ds_map)

    logger.info("Retrieval system built successfully!")
    logger.info(f" - Corpus size: {len(retrieval_system.corpus_texts)}")
    logger.info(f" - Embedding dimension: {retrieval_system.corpus_embeddings.shape[1]}")
    logger.info(f" - FAISS index: {'Yes' if retrieval_system.faiss_index else 'No'}")

    return retrieval_system

def prepare_llm_artifacts():
    """Prepare LLM artifacts without downloading the full model"""
    logger.info("🤖 Preparing LLM configuration...")

    from transformers import AutoTokenizer, GenerationConfig

    llm_path = os.path.join(ARTIFACT_DIR, "llm_model")
    os.makedirs(llm_path, exist_ok=True)

    # Download and save tokenizer
    logger.info(f"📥 Downloading tokenizer for {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Use the exact same chat template from app.py
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{{'<|'+message['role']+'|>\\n'+message['content']+'</s>\\n'}}"
        "{% endfor %}"
        "{% if add_generation_prompt %}"
        "<|assistant|>\n"
        "{% endif %}"
    )

    # Use the exact same generation config from app.py
    generation_config = GenerationConfig(
        max_new_tokens=300,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.15,
        pad_token_id=tokenizer.pad_token_id
    )

    # Save tokenizer and config
    tokenizer.save_pretrained(llm_path)
    generation_config.save_pretrained(llm_path)

    # Create minimal config file
    config = {
        "_name_or_path": MODEL_NAME,
        "architectures": ["LlamaForCausalLM"],
        "model_type": "llama",
        "torch_dtype": "float16",
        "quantization_config": {
            "load_in_4bit": True,
            "bnb_4bit_compute_dtype": "float32",
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_quant_type": "nf4"
        } if torch.cuda.is_available() else {}
    }

    config_path = os.path.join(llm_path, "config.json")
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)

    logger.info(f"LLM configuration saved to {llm_path}")
    logger.info("Note: Full model will be downloaded at runtime with 4-bit quantization")

def verify_artifacts():
    """Verify all artifacts are properly built"""
    logger.info("Verifying artifacts...")

    files_to_check = {
        "corpus_data.json": "Corpus data",
        "corpus_embeddings.npy": "Question embeddings",
        "answer_embeddings.npy": "Answer embeddings",
        "bm25.pkl": "BM25 index",
        "faiss_index.bin": "FAISS index"
    }

    for file, description in files_to_check.items():
        path = os.path.join(ARTIFACT_DIR, file)
        if os.path.exists(path):
            size_mb = os.path.getsize(path) / (1024 * 1024)
            logger.info(f" ✓ {description}: {size_mb:.2f} MB")
        else:
            if file != "faiss_index.bin" or FAISS_AVAILABLE:
                logger.warning(f" ✗ Missing: {description}")

def main():
    """Main build process"""
    logger.info("=" * 60)
    logger.info("🤖 Codey Bryant 3.0 - SOTA RAG Build Script")
    logger.info("=" * 60)

    # Create artifacts directory
    os.makedirs(ARTIFACT_DIR, exist_ok=True)

    # Check if we need to rebuild
    if check_artifacts():
        logger.info("Artifacts already exist. Skipping build.")
        logger.info("Delete artifacts to force rebuild.")
    else:
        logger.info("Building fresh artifacts...")

        # Build retrieval system
        build_retrieval_with_progress()

        # Prepare LLM artifacts
        prepare_llm_artifacts()

        logger.info("Build complete!")

    # Verify artifacts
    verify_artifacts()

    # Show total size
    logger.info("\nArtifact Summary:")
    total_size = 0
    for root, dirs, files in os.walk(ARTIFACT_DIR):
        for file in files:
            filepath = os.path.join(root, file)
            size_mb = os.path.getsize(filepath) / (1024 * 1024)
            total_size += size_mb

    logger.info(f" Total size: {total_size:.2f} MB")
    logger.info("=" * 60)
    logger.info("Ready to launch Codey Bryant!")
    logger.info(" Run: python app.py")
    logger.info("=" * 60)

if __name__ == "__main__":
    main()
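After build.py has run, the retrieval artifacts can be sanity-checked without loading the LLM. A sketch under the same assumptions (artifacts present under ARTIFACT_DIR, sentence-transformers installed); the query string is arbitrary:

# check_artifacts_query.py — illustrative only
import json, os, pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from app import ARTIFACT_DIR, EMBED_ARTIFACT_PATH

with open(os.path.join(ARTIFACT_DIR, "corpus_data.json"), encoding="utf-8") as f:
    data = json.load(f)
with open(os.path.join(ARTIFACT_DIR, "bm25.pkl"), "rb") as f:
    bm25 = pickle.load(f)
q_embs = np.load(os.path.join(ARTIFACT_DIR, "corpus_embeddings.npy"))

embed_model = SentenceTransformer(EMBED_ARTIFACT_PATH)
query = "sort a dictionary by value"
q_vec = embed_model.encode(query, normalize_embeddings=True)

# Same 0.3 / 0.7 hybrid weighting that retrieve_enhanced uses in question space
bm25_scores = bm25.get_scores(query.lower().split())
if bm25_scores.max() > 0:
    bm25_scores = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min())
scores = 0.3 * bm25_scores + 0.7 * (q_embs @ q_vec)

for idx in np.argsort(-scores)[:3]:
    print(f"{scores[idx]:.3f}  {data['texts'][idx][:80]}")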