Smllm

Sleeping

App Files Files Community

ghosthets committed on Dec 3, 2025

Commit

02f2b8a

verified ·

1 Parent(s): 4637032

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -8

app.py CHANGED Viewed

@@ -1,16 +1,18 @@
 import flask
 from flask import request, jsonify
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 app = flask.Flask(__name__)
 model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-print("🔄 Loading fast chat model...")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
@@ -26,16 +28,39 @@ def chat():
         if not msg:
             return jsonify({"error": "No message sent"}), 400
-        inputs = tokenizer(msg, return_tensors="pt").to(device)
         output = model.generate(
             **inputs,
-            max_length=200,
             do_sample=True,
-            top_p=0.92,
-            temperature=0.7
         )
-        reply = tokenizer.decode(output[0], skip_special_tokens=True)
         return jsonify({"reply": reply})
     except Exception as e:

 import flask
 from flask import request, jsonify
+# Use AutoModelForCausalLM for Decoder-only models like TinyLlama
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 app = flask.Flask(__name__)
 model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+print("🔄 Loading TinyLlama model...")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Load using AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) # Using bfloat16 for better memory/speed on GPU
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
         if not msg:
             return jsonify({"error": "No message sent"}), 400
+        # --- Key Change 1: Apply Chat Template ---
+        # Format the user message into the model's required chat template
+        chat_history = [{"role": "user", "content": msg}]
+        # add_generation_prompt=True ensures the model knows it needs to respond
+        formatted_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
+        # Tokenize the formatted prompt
+        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
+        # Generation
         output = model.generate(
             **inputs,
+            max_length=256,
             do_sample=True,
+            top_p=0.9,
+            temperature=0.7,
+            eos_token_id=tokenizer.eos_token_id
         )
+        # Decode the output
+        full_reply = tokenizer.decode(output[0], skip_special_tokens=False)
+        # --- Key Change 2: Extract only the generated response ---
+        # The output includes the input prompt, so we extract only the response part.
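+        # Look for a Llama-2-style "[/INST]" marker. Note: TinyLlama-1.1B-Chat-v1.0's own chat template marks the reply with "<|assistant|>", so the fallback branch below is the path normally taken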
+        if "[/INST]" in full_reply:
+             # This structure is often used: <s>[INST] User Prompt [/INST] Assistant Reply
+             reply = full_reply.split("[/INST]")[-1].strip()
+        else:
+             # Fallback: decode only the newly generated tokens
+             reply = tokenizer.decode(output[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()
         return jsonify({"reply": reply})
     except Exception as e: