pasxalisag committed
Commit 8526b04 · verified · 1 Parent(s): 8aeda0c

Upload 2 files

Files changed (2)
  1. app.py +623 -0
  2. build.py +200 -0
app.py ADDED
@@ -0,0 +1,623 @@
"""
Codey Bryant 3.0 — SOTA RAG for Hugging Face Spaces
Maintains EXACT same architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval
"""

import os
import sys
import logging
from dataclasses import dataclass
from typing import Any, List, Dict, Tuple, Optional, Iterator
from functools import lru_cache
from threading import Thread
import warnings

# Configure logging for Hugging Face Spaces
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/data/app.log')
    ]
)
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore")

# Import core dependencies
import numpy as np
import torch
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from sklearn.cluster import MiniBatchKMeans
import spacy
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    TextIteratorStreamer,
    BitsAndBytesConfig,
)
import gradio as gr
import pickle
import json

# Try to import FAISS
try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False
    logger.warning("FAISS not available, using numpy fallback")

# Environment setup for Hugging Face Spaces
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Use persistent storage for Hugging Face Spaces
ARTIFACT_DIR = os.environ.get("ARTIFACT_DIR", "/data/artifacts")
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Paths for artifacts
LLM_ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, "llm_model")
EMBED_ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, "embed_model")
BM25_ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, "bm25.pkl")
CORPUS_DATA_PATH = os.path.join(ARTIFACT_DIR, "corpus_data.json")
CORPUS_EMBED_PATH = os.path.join(ARTIFACT_DIR, "corpus_embeddings.npy")
ANSWER_EMBED_PATH = os.path.join(ARTIFACT_DIR, "answer_embeddings.npy")
FAISS_INDEX_PATH = os.path.join(ARTIFACT_DIR, "faiss_index.bin")

# Device configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
    logger.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    logger.info("Using CPU")

# Model configuration (EXACT SAME AS BEFORE)
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
MAX_CORPUS_SIZE = 600

# ========================
# 1) Dataset & Retrieval (EXACT SAME)
# ========================
def load_opc_datasets() -> Dict[str, Dataset]:
    """Load coding datasets - same function"""
    try:
        logger.info("Loading OPC datasets...")
        ds_instruct = load_dataset("OpenCoder-LLM/opc-sft-stage2", "educational_instruct", split="train")
        ds_evol = load_dataset("OpenCoder-LLM/opc-sft-stage2", "evol_instruct", split="train")
        return {"educational_instruct": ds_instruct, "evol_instruct": ds_evol}
    except Exception as e:
        logger.warning(f"OPC failed ({e}), falling back to python_code_instructions...")
        ds = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
        return {"python_code": ds}

def convo_to_io(example: Dict) -> Tuple[str, str]:
    """Convert conversation to input/output - same function"""
    if "messages" in example:
        msgs = example["messages"]
    elif "conversations" in example:
        msgs = example["conversations"]
    else:
        instr = example.get("instruction") or example.get("prompt") or ""
        inp = example.get("input") or ""
        out = example.get("output") or example.get("response") or ""
        return (instr + "\n" + inp).strip(), out

    user_text, assistant_text = "", ""
    for i, m in enumerate(msgs):
        role = (m.get("role") or m.get("from") or "").lower()
        content = m.get("content") or m.get("value") or ""
        if role in ("user", "human") and not user_text:
            user_text = content
        if role in ("assistant", "gpt") and user_text:
            assistant_text = content
            break
    return user_text.strip(), assistant_text.strip()

@dataclass
class RetrievalSystem:
    """Retrieval system dataclass - same structure"""
    embed_model: SentenceTransformer
    bm25: BM25Okapi
    corpus_texts: List[str]
    corpus_answers: List[str]
    corpus_embeddings: np.ndarray
    answer_embeddings: np.ndarray
    corpus_meta: List[Dict]
    nlp: spacy.language.Language
    faiss_index: Optional[Any] = None

def build_retrieval_system(ds_map: Dict[str, Dataset]) -> RetrievalSystem:
    """Build retrieval system - EXACT SAME IMPLEMENTATION"""
    # Try to load from artifacts first
    required = [EMBED_ARTIFACT_PATH, BM25_ARTIFACT_PATH, CORPUS_DATA_PATH, CORPUS_EMBED_PATH, ANSWER_EMBED_PATH]
    if FAISS_AVAILABLE:
        required.append(FAISS_INDEX_PATH)

    if all(os.path.exists(p) for p in required):
        logger.info("Loading retrieval system from artifacts...")
        embed_model = SentenceTransformer(EMBED_ARTIFACT_PATH, device=str(DEVICE))
        with open(BM25_ARTIFACT_PATH, "rb") as f:
            bm25 = pickle.load(f)
        with open(CORPUS_DATA_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
        corpus_embeddings = np.load(CORPUS_EMBED_PATH)
        answer_embeddings = np.load(ANSWER_EMBED_PATH)
        faiss_index = faiss.read_index(FAISS_INDEX_PATH) if FAISS_AVAILABLE and os.path.exists(FAISS_INDEX_PATH) else None
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
        return RetrievalSystem(
            embed_model=embed_model, bm25=bm25,
            corpus_texts=data["texts"], corpus_answers=data["answers"],
            corpus_embeddings=corpus_embeddings, answer_embeddings=answer_embeddings,
            corpus_meta=data["meta"], nlp=nlp, faiss_index=faiss_index
        )

    # Build from scratch (same implementation)
    logger.info("Building retrieval system with answer-space support...")
    all_questions, all_answers, all_metas = [], [], []
    for name, ds in ds_map.items():
        for ex in ds.select(range(min(len(ds), 1500))):
            q, a = convo_to_io(ex)
            if q and a and 50 < len(a) < 2000:
                all_questions.append(q)
                all_answers.append(a)
                all_metas.append({"intent": name, "answer": a})

    embed_model = SentenceTransformer(EMBED_MODEL, device=str(DEVICE))
    question_embeddings = embed_model.encode(all_questions, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
    answer_embeddings = embed_model.encode(all_answers, batch_size=64, show_progress_bar=True, normalize_embeddings=True)

    # Clustering to reduce size (same)
    if len(all_questions) > MAX_CORPUS_SIZE:
        kmeans = MiniBatchKMeans(n_clusters=MAX_CORPUS_SIZE, random_state=42, batch_size=1000)
        labels = kmeans.fit_predict(answer_embeddings)
        selected = []
        for i in range(MAX_CORPUS_SIZE):
            mask = labels == i
            if mask.any():
                idx = np.where(mask)[0]
                dists = np.linalg.norm(answer_embeddings[idx] - kmeans.cluster_centers_[i], axis=1)
                selected.append(idx[np.argmin(dists)])
        idxs = selected
    else:
        idxs = list(range(len(all_questions)))

    texts = [all_questions[i] for i in idxs]
    answers = [all_answers[i] for i in idxs]
    metas = [all_metas[i] for i in idxs]
    q_embs = question_embeddings[idxs]
    a_embs = answer_embeddings[idxs]

    tokenized = [t.lower().split() for t in texts]
    bm25 = BM25Okapi(tokenized)

    faiss_index = None
    if FAISS_AVAILABLE:
        faiss_index = faiss.IndexFlatIP(a_embs.shape[1])
        faiss_index.add(a_embs.astype('float32'))

    # Save everything
    embed_model.save(EMBED_ARTIFACT_PATH)
    with open(BM25_ARTIFACT_PATH, "wb") as f:
        pickle.dump(bm25, f)
    with open(CORPUS_DATA_PATH, "w", encoding="utf-8") as f:
        json.dump({"texts": texts, "answers": answers, "meta": metas}, f)
    np.save(CORPUS_EMBED_PATH, q_embs)
    np.save(ANSWER_EMBED_PATH, a_embs)
    if faiss_index:
        faiss.write_index(faiss_index, FAISS_INDEX_PATH)

    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    return RetrievalSystem(
        embed_model=embed_model, bm25=bm25, corpus_texts=texts, corpus_answers=answers,
        corpus_embeddings=q_embs, answer_embeddings=a_embs, corpus_meta=metas,
        nlp=nlp, faiss_index=faiss_index
    )
# ========================
# 2) Generative Core (EXACT SAME)
# ========================

@dataclass
class GenerativeCore:
    """Generative core dataclass - same structure"""
    model: AutoModelForCausalLM
    tokenizer: AutoTokenizer
    generation_config: GenerationConfig

def build_generative_core():
    """Build generative core - EXACT SAME IMPLEMENTATION"""
    # Always download fresh from HuggingFace for reliability
    print("Downloading TinyLlama with 4-bit quantization...")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{{'<|'+message['role']+'|>\\n'+message['content']+'</s>\\n'}}"
        "{% endfor %}"
        "{% if add_generation_prompt %}"
        "<|assistant|>\n"
        "{% endif %}"
    )

    quantization_config = None
    if torch.cuda.is_available():
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float32,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto" if torch.cuda.is_available() else None,
        low_cpu_mem_usage=True
    )
    model.eval()

    gen_cfg = GenerationConfig(
        max_new_tokens=300,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.15,
        pad_token_id=tokenizer.pad_token_id
    )

    # Save for future use (optional)
    if not os.path.exists(LLM_ARTIFACT_PATH):
        os.makedirs(LLM_ARTIFACT_PATH, exist_ok=True)
        tokenizer.save_pretrained(LLM_ARTIFACT_PATH)
        gen_cfg.save_pretrained(LLM_ARTIFACT_PATH)

    return GenerativeCore(model, tokenizer, gen_cfg)

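# For reference, the custom chat template above renders a conversation into
# TinyLlama-chat style tags. A sample rendering with illustrative messages
# (not part of the app's runtime flow):
#
#   messages = [
#       {"role": "system", "content": "You are a helpful assistant."},
#       {"role": "user", "content": "Reverse a list in Python."},
#   ]
#   tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#   # -> "<|system|>\nYou are a helpful assistant.</s>\n"
#   #    "<|user|>\nReverse a list in Python.</s>\n"
#   #    "<|assistant|>\n"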
# ========================
# 3) SOTA Enhanced Retrieval (EXACT SAME)
# ========================

class HybridCodeAssistant:
    """Main assistant class - EXACT SAME IMPLEMENTATION"""
    def __init__(self):
        self.retrieval = build_retrieval_system(load_opc_datasets())
        self.generator = build_generative_core()
        logger.info("Codey Bryant 3.0 ready with HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval!")

    def generate_hyde(self, query: str) -> str:
        """Generate HyDE - same implementation"""
        prompt = f"""Write a concise, direct Python code example or explanation that answers this question.
Only output the answer, no extra text.

Question: {query}

Answer:"""
        inputs = self.generator.tokenizer(prompt, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            out = self.generator.model.generate(**inputs, max_new_tokens=128, temperature=0.3, do_sample=True)
        return self.generator.tokenizer.decode(out[0], skip_special_tokens=True).split("Answer:")[-1].strip()

    def rewrite_query(self, query: str) -> str:
        """Rewrite query - same implementation"""
        prompt = f"""Rewrite this vague or casual programming question into a clear, specific one for better code retrieval.

Original: {query}

Improved:"""
        inputs = self.generator.tokenizer(prompt, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            out = self.generator.model.generate(**inputs, max_new_tokens=64, temperature=0.1)
        return self.generator.tokenizer.decode(out[0], skip_special_tokens=True).split("Improved:")[-1].strip()

    def retrieve_enhanced(self, query: str, k: int = 3) -> List[Tuple[str, Dict, float]]:
        """Enhanced retrieval - EXACT SAME IMPLEMENTATION"""
        # Use list of tuples instead of set to avoid hashability issues with dicts
        results = []

        def add_results(q_text: str, weight: float = 1.0):
            try:
                # Determine embedding space (answer for HyDE/long texts, question otherwise)
                use_answer_space = "HyDE" in q_text or len(q_text.split()) > 20
                target_embs = self.retrieval.answer_embeddings if use_answer_space else self.retrieval.corpus_embeddings

                # Encode query
                q_emb = self.retrieval.embed_model.encode(q_text, normalize_embeddings=True)

                if self.retrieval.faiss_index is not None and use_answer_space:
                    # FAISS on answer space
                    query_vec = q_emb.astype('float32').reshape(1, -1)
                    scores_top, indices_top = self.retrieval.faiss_index.search(query_vec, min(k * 3, len(self.retrieval.corpus_texts)))
                    scores = scores_top[0]
                    idxs = indices_top[0]
                else:
                    # Numpy fallback or question space
                    scores = np.dot(target_embs, q_emb)
                    idxs = np.argsort(-scores)[:k*3]

                # Add BM25 if not answer space
                if not use_answer_space:
                    tokenized_query = q_text.lower().split()
                    bm25_scores = self.retrieval.bm25.get_scores(tokenized_query)
                    if bm25_scores.max() > 0:
                        bm25_scores = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min())
                    else:
                        bm25_scores = np.zeros_like(bm25_scores)
                    scores = 0.3 * bm25_scores + 0.7 * scores  # Hybrid

                # Keep scores aligned with the candidate indices: FAISS already
                # returns them aligned, while the numpy path scores the whole corpus
                if self.retrieval.faiss_index is None or not use_answer_space:
                    scores = scores[idxs]

                # Collect candidates (avoid duplicates by checking text)
                seen_texts = set()
                for score, idx in zip(scores, idxs):
                    if score > 0.15 and idx < len(self.retrieval.corpus_texts):
                        text = self.retrieval.corpus_texts[idx]
                        if text not in seen_texts:
                            seen_texts.add(text)
                            results.append((text, self.retrieval.corpus_meta[idx], float(score * weight)))
            except Exception as e:
                logger.error(f"add_results failed for '{q_text}': {e}")

        # 1. Original query
        add_results(query, weight=1.0)

        # 2. Rewritten query
        try:
            rw = self.rewrite_query(query)
            if len(rw) > 8 and rw != query:
                add_results(rw, weight=1.2)
        except Exception as e:
            logger.warning(f"Rewrite failed: {e}")

        # 3. HyDE (strong weight in answer space!)
        try:
            hyde = self.generate_hyde(query)
            if len(hyde) > 20:
                add_results(hyde, weight=1.5)  # Note: No " HyDE" suffix needed now
        except Exception as e:
            logger.warning(f"HyDE failed: {e}")

        # 4. Multi-query variants (lighter weight)
        variants = [
            f"Python code for: {query}",
            f"Fix error: {query}",
            f"Explain in Python: {query}",
            f"Best way to {query} in Python",
        ]
        for v in variants:
            add_results(v, weight=0.8)

        # Rerank by similarity to original (no set needed)
        if not results:
            return []

        q_emb = self.retrieval.embed_model.encode(query, normalize_embeddings=True)
        final = []
        for text, meta, score in results:
            text_emb = self.retrieval.embed_model.encode(text, normalize_embeddings=True)
            sim = float(np.dot(q_emb, text_emb))
            final.append((text, meta, score + 0.3 * sim))

        final.sort(key=lambda x: x[2], reverse=True)
        return final[:k]

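    # Worked example of the scoring above (illustrative numbers, question space):
    #   dense cosine = 0.62, min-max normalised BM25 = 0.40, variant weight = 1.2
    #   hybrid   = 0.3 * 0.40 + 0.7 * 0.62        = 0.554
    #   weighted = 0.554 * 1.2                    = 0.6648
    #   reranked = 0.6648 + 0.3 * sim(query, text), e.g. + 0.3 * 0.70 = 0.8748
    # Candidates whose hybrid score is at or below the 0.15 floor never reach
    # the rerank step.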
    def answer_stream(self, text: str) -> Iterator[str]:
        """Stream answer - same implementation"""
        retrieved = self.retrieve_enhanced(text, k=3)

        context = ""
        if retrieved and retrieved[0][2] > 0.3:
            q, meta, _ = retrieved[0]
            ans = meta["answer"][:200]
            context = f"Reference example:\nQ: {q}\nA: {ans}\n\n"

        messages = [
            {"role": "system", "content": "You are a concise, accurate Python coding assistant. Use the reference if helpful.\n" + context},
            {"role": "user", "content": text}
        ]

        prompt = self.generator.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = self.generator.tokenizer(prompt, return_tensors="pt").to(DEVICE)

        streamer = TextIteratorStreamer(self.generator.tokenizer, skip_prompt=True, skip_special_tokens=True)
        thread = Thread(target=self.generator.model.generate, kwargs=dict(
            **inputs, streamer=streamer, generation_config=self.generator.generation_config
        ))
        thread.start()

        for token in streamer:
            yield token
        thread.join()

# ========================
# 4) Gradio UI (Optimized for Hugging Face)
# ========================

ASSISTANT: Optional[HybridCodeAssistant] = None

def initialize_assistant():
    """Initialize assistant with progress tracking"""
    global ASSISTANT
    if ASSISTANT is None:
        yield "Initializing Codey Bryant 3.0..."
        yield "Loading retrieval system..."
        ASSISTANT = HybridCodeAssistant()
        yield "Codey Bryant 3.0 Ready!"
        yield "SOTA RAG Features: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval"
        yield "Ask coding questions like: 'it's not working', 'help with error', 'make it faster'"
    else:
        yield "Assistant already initialized!"

def chat(message: str, history: list):
    """Chat function with error handling"""
    history = history or []
    # The submit handlers clear the textbox before this runs, so recover the
    # question from the pending chat turn when the message comes in empty
    if not message and history:
        message = history[-1][0]

    if ASSISTANT is None:
        if not history:
            history.append([message, ""])
        history[-1][1] = "Please click 'Initialize Assistant' first!"
        yield history
        return

    # Reuse the pending turn added by the submit handler, or append a new one
    if history and history[-1][1] in (None, ""):
        history[-1][1] = ""
    else:
        history.append([message, ""])
    yield history

    # Stream response
    try:
        response = ""
        for token in ASSISTANT.answer_stream(message):
            response += token
            history[-1][1] = response
            yield history
    except Exception as e:
        logger.error(f"Chat error: {e}")
        history[-1][1] = f"Error: {str(e)}"
        yield history

def create_ui():
    """Create Gradio UI optimized for Hugging Face"""
    with gr.Blocks(
        title="Codey Bryant 3.0 - SOTA RAG Coding Assistant",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container { max-width: 1200px; margin: auto; }
        .chatbot { min-height: 500px; }
        .status-box { padding: 20px; border-radius: 10px; background: #f0f8ff; }
        """
    ) as demo:
        gr.Markdown("""
        # 🤖 Codey Bryant 3.0
        ## **SOTA RAG Coding Assistant**

        ### **Advanced Features:**
        - **HyDE** (Hypothetical Document Embeddings)
        - **Query Rewriting** for vague queries
        - **Multi-Query** retrieval
        - **Answer-Space Retrieval**

        ### **Handles vague questions like:**
        - "it's not working"
        - "help with error"
        - "make it faster"
        - "why error"
        - "how to implement"

        ### **Powered by:**
        - TinyLlama 1.1B (4-bit quantized)
        - Hybrid retrieval (FAISS + BM25)
        - OPC coding datasets
        """)

        with gr.Row():
            with gr.Column(scale=1):
                init_btn = gr.Button(
                    "Initialize Assistant",
                    variant="primary",
                    size="lg"
                )
                clear_btn = gr.Button("Clear Chat", size="lg")

            with gr.Column(scale=4):
                status = gr.Markdown(
                    "### Status: Click 'Initialize Assistant' to start",
                    elem_classes="status-box"
                )

        chatbot = gr.Chatbot(
            label="Chat with Codey",
            height=500,
            show_label=True,
            avatar_images=(None, "🤖"),
            bubble_full_width=False
        )

        with gr.Row():
            msg = gr.Textbox(
                placeholder="Ask anything about Python coding...",
                label="Your Question",
                lines=3,
                scale=5,
                container=False
            )
            submit_btn = gr.Button("Send", variant="secondary", scale=1)

        # Examples
        gr.Examples(
            examples=[
                ["How to read a CSV file in Python?"],
                ["Why am I getting 'list index out of range' error?"],
                ["Make this function faster..."],
                ["Help, my code isn't working!"],
                ["Best way to sort a dictionary by value?"]
            ],
            inputs=msg,
            label="Try these examples:"
        )

        # Event handlers
        init_btn.click(
            initialize_assistant,
            outputs=status
        )

        def submit_message(message, history):
            return "", history + [[message, None]]

        msg.submit(
            submit_message,
            [msg, chatbot],
            [msg, chatbot],
            queue=False
        ).then(
            chat,
            [msg, chatbot],
            chatbot
        )

        submit_btn.click(
            submit_message,
            [msg, chatbot],
            [msg, chatbot],
            queue=False
        ).then(
            chat,
            [msg, chatbot],
            chatbot
        )

        clear_btn.click(lambda: None, None, chatbot, queue=False)

        # Footer
        gr.Markdown("""
        ---
        *Codey Bryant 3.0 uses TinyLlama 1.1B with 4-bit quantization. Responses may take a few seconds.*
        """)

    return demo

# ========================
# 5) Main Entry Point
# ========================

if __name__ == "__main__":
    # Configure for Hugging Face Spaces
    server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
    server_port = int(os.environ.get("GRADIO_SERVER_PORT", 7860))

    # Create and launch the demo
    demo = create_ui()

    logger.info(f"Starting Codey Bryant 3.0 on {server_name}:{server_port}")
    logger.info("SOTA RAG Architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval")

    demo.launch(
        server_name=server_name,
        server_port=server_port,
        share=False,  # Set to True if you want a public link
        debug=False,
        show_error=True
    )
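A minimal way to exercise app.py outside the Gradio UI, e.g. as a local smoke test. This is a sketch, not part of the upload: it assumes the same environment as the Space (datasets, sentence-transformers, transformers, gradio and optionally faiss installed) and that the datasets and TinyLlama weights can be downloaded.

# smoke_test.py — illustrative only
from app import HybridCodeAssistant

assistant = HybridCodeAssistant()  # builds or loads retrieval artifacts, then loads TinyLlama

# Inspect what the enhanced retrieval returns for a deliberately vague query
for text, meta, score in assistant.retrieve_enhanced("it's not working", k=3):
    print(f"{score:.3f}  [{meta['intent']}]  {text[:80]}")

# Stream an answer token by token, exactly as the UI does
for token in assistant.answer_stream("How do I read a CSV file in Python?"):
    print(token, end="", flush=True)
print()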
build.py ADDED
@@ -0,0 +1,200 @@
"""
Build script optimized for Hugging Face Spaces deployment
Maintains the exact same SOTA RAG architecture
"""
import os
import sys
import logging
import pickle
import json
import numpy as np
import torch
from pathlib import Path

# Add parent directory to path
sys.path.append('.')

from app import (
    load_opc_datasets,
    build_retrieval_system,
    ARTIFACT_DIR,
    FAISS_AVAILABLE,
    MODEL_NAME,
    EMBED_MODEL,
    MAX_CORPUS_SIZE
)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/data/build.log')
    ]
)
logger = logging.getLogger(__name__)

def check_artifacts():
    """Check if artifacts already exist"""
    required_files = [
        "corpus_data.json",
        "corpus_embeddings.npy",
        "answer_embeddings.npy",
        "bm25.pkl"
    ]

    if FAISS_AVAILABLE:
        required_files.append("faiss_index.bin")

    all_exist = all(os.path.exists(os.path.join(ARTIFACT_DIR, f)) for f in required_files)
    return all_exist

def build_retrieval_with_progress():
    """Build retrieval system with progress tracking"""
    logger.info("Building SOTA RAG Retrieval System for Coding Assistant")
    logger.info(f"Architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval")
    logger.info(f"Embedding Model: {EMBED_MODEL}")
    logger.info(f"Max Corpus Size: {MAX_CORPUS_SIZE}")

    # Load datasets
    logger.info("Loading coding datasets...")
    ds_map = load_opc_datasets()

    # Build retrieval system (using the exact same function from app.py)
    logger.info("Building retrieval system...")
    retrieval_system = build_retrieval_system(ds_map)

    logger.info("Retrieval system built successfully!")
    logger.info(f" - Corpus size: {len(retrieval_system.corpus_texts)}")
    logger.info(f" - Embedding dimension: {retrieval_system.corpus_embeddings.shape[1]}")
    logger.info(f" - FAISS index: {'Yes' if retrieval_system.faiss_index else 'No'}")

    return retrieval_system

def prepare_llm_artifacts():
    """Prepare LLM artifacts without downloading the full model"""
    logger.info("🤖 Preparing LLM configuration...")

    from transformers import AutoTokenizer, GenerationConfig

    llm_path = os.path.join(ARTIFACT_DIR, "llm_model")
    os.makedirs(llm_path, exist_ok=True)

    # Download and save tokenizer
    logger.info(f"📥 Downloading tokenizer for {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Use the exact same chat template from app.py
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{{'<|'+message['role']+'|>\\n'+message['content']+'</s>\\n'}}"
        "{% endfor %}"
        "{% if add_generation_prompt %}"
        "<|assistant|>\n"
        "{% endif %}"
    )

    # Use the exact same generation config from app.py
    generation_config = GenerationConfig(
        max_new_tokens=300,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.15,
        pad_token_id=tokenizer.pad_token_id
    )

    # Save tokenizer and config
    tokenizer.save_pretrained(llm_path)
    generation_config.save_pretrained(llm_path)

    # Create minimal config file
    config = {
        "_name_or_path": MODEL_NAME,
        "architectures": ["LlamaForCausalLM"],
        "model_type": "llama",
        "torch_dtype": "float16",
        "quantization_config": {
            "load_in_4bit": True,
            "bnb_4bit_compute_dtype": "float32",
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_quant_type": "nf4"
        } if torch.cuda.is_available() else {}
    }

    config_path = os.path.join(llm_path, "config.json")
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)

    logger.info(f"LLM configuration saved to {llm_path}")
    logger.info("Note: Full model will be downloaded at runtime with 4-bit quantization")

def verify_artifacts():
    """Verify all artifacts are properly built"""
    logger.info("Verifying artifacts...")

    files_to_check = {
        "corpus_data.json": "Corpus data",
        "corpus_embeddings.npy": "Question embeddings",
        "answer_embeddings.npy": "Answer embeddings",
        "bm25.pkl": "BM25 index",
        "faiss_index.bin": "FAISS index"
    }

    for file, description in files_to_check.items():
        path = os.path.join(ARTIFACT_DIR, file)
        if os.path.exists(path):
            size_mb = os.path.getsize(path) / (1024 * 1024)
            logger.info(f" ✓ {description}: {size_mb:.2f} MB")
        else:
            if file != "faiss_index.bin" or FAISS_AVAILABLE:
                logger.warning(f" ✗ Missing: {description}")

def main():
    """Main build process"""
    logger.info("=" * 60)
    logger.info("🤖 Codey Bryant 3.0 - SOTA RAG Build Script")
    logger.info("=" * 60)

    # Create artifacts directory
    os.makedirs(ARTIFACT_DIR, exist_ok=True)

    # Check if we need to rebuild
    if check_artifacts():
        logger.info("Artifacts already exist. Skipping build.")
        logger.info("Delete artifacts to force rebuild.")
    else:
        logger.info("Building fresh artifacts...")

        # Build retrieval system
        build_retrieval_with_progress()

        # Prepare LLM artifacts
        prepare_llm_artifacts()

        logger.info("Build complete!")

    # Verify artifacts
    verify_artifacts()

    # Show total size
    logger.info("\nArtifact Summary:")
    total_size = 0
    for root, dirs, files in os.walk(ARTIFACT_DIR):
        for file in files:
            filepath = os.path.join(root, file)
            size_mb = os.path.getsize(filepath) / (1024 * 1024)
            total_size += size_mb

    logger.info(f" Total size: {total_size:.2f} MB")
    logger.info("=" * 60)
    logger.info("Ready to launch Codey Bryant!")
    logger.info(" Run: python app.py")
    logger.info("=" * 60)

if __name__ == "__main__":
    main()
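After build.py has run, the retrieval artifacts can be sanity-checked without loading the LLM. A sketch under the same assumptions (artifacts present under ARTIFACT_DIR, sentence-transformers installed); the query string is arbitrary:

# check_artifacts_query.py — illustrative only
import json, os, pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from app import ARTIFACT_DIR, EMBED_ARTIFACT_PATH

with open(os.path.join(ARTIFACT_DIR, "corpus_data.json"), encoding="utf-8") as f:
    data = json.load(f)
with open(os.path.join(ARTIFACT_DIR, "bm25.pkl"), "rb") as f:
    bm25 = pickle.load(f)
q_embs = np.load(os.path.join(ARTIFACT_DIR, "corpus_embeddings.npy"))

embed_model = SentenceTransformer(EMBED_ARTIFACT_PATH)
query = "sort a dictionary by value"
q_vec = embed_model.encode(query, normalize_embeddings=True)

# Same 0.3 / 0.7 hybrid weighting that retrieve_enhanced uses in question space
bm25_scores = bm25.get_scores(query.lower().split())
if bm25_scores.max() > 0:
    bm25_scores = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min())
scores = 0.3 * bm25_scores + 0.7 * (q_embs @ q_vec)

for idx in np.argsort(-scores)[:3]:
    print(f"{scores[idx]:.3f}  {data['texts'][idx][:80]}")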