Update app.py
app.py (CHANGED)
@@ -1,7 +1,7 @@
 import gradio as gr
 import spaces
 import torch
-import gc
+import gc
 from transformers import AutoModelForCausalLM, AutoTokenizer

 # --- MODEL CONFIGURATION ---

@@ -13,7 +13,6 @@ MODELS = {
 }

 # --- GLOBAL VARIABLES ---
-# Berta: on APISMALL we can only have ONE king on the throne at a time.
 current_model = None
 current_tokenizer = None
 current_model_name = None

@@ -21,21 +20,17 @@ current_model_name = None
 def load_model_safely(model_key):
     global current_model, current_tokenizer, current_model_name

-    # If it's already the right model, do nothing (cache hit)
     if current_model_name == model_key and current_model is not None:
         return current_model, current_tokenizer

-    # --- MOMMY'S CLEANUP ---
-    # If another model is occupying space, we evict it.
     if current_model is not None:
-        print(f"🧹 Berta:
+        print(f"🧹 Berta: Clearing VRAM ({current_model_name})...")
         del current_model
         del current_tokenizer
         gc.collect()
-        torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
         current_model = None

-    # --- LOADING ---
     model_id = MODELS[model_key]
     print(f"🐢 Loading {model_id} into VRAM...")

@@ -54,15 +49,32 @@ def load_model_safely(model_key):
         print(f"✅ {model_id} loaded successfully!")

     except Exception as e:
-        print(f"❌ Critical error
+        print(f"❌ Critical error: {e}")
         raise e

     return current_model, current_tokenizer

+# --- CLEANUP HELPER FUNCTION (THE LIFESAVER) ---
+def extract_text_content(content):
+    """Ensure the content is always a string, never a list."""
+    if isinstance(content, str):
+        return content
+    elif isinstance(content, list):
+        # If it is a list, try to extract the text from inside it
+        texts = []
+        for item in content:
+            if isinstance(item, dict) and 'text' in item:
+                texts.append(item['text'])
+            elif isinstance(item, str):
+                texts.append(item)
+        return "\n".join(texts)
+    elif isinstance(content, dict) and 'text' in content:
+        return content['text']
+    return str(content)
+
 # --- GENERATION FUNCTION (ZEROGPU) ---
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=120)
 def generate(message, history, model_selector):
-    # Mapping the names
     if "Math" in model_selector: key = "deepseek_math"
     elif "Qwen 3" in model_selector: key = "qwen3"
     elif "Qwen 2.5" in model_selector: key = "qwen2.5"

@@ -73,30 +85,43 @@ def generate(message, history, model_selector):
     try:
         model, tokenizer = load_model_safely(key)
     except Exception as e:
-        return f"⚠️ Error loading
+        return f"⚠️ Error loading: {str(e)}"

-    # ---
+    # --- SANITIZED MESSAGE CONSTRUCTION ---
     messages = []

+    # Process the history
     for turn in history:
-
+        role = "user"
+        content = ""
+
+        # Old format (list/tuple)
         if isinstance(turn, (list, tuple)) and len(turn) >= 2:
-            messages.append({"role": "user", "content":
+            messages.append({"role": "user", "content": extract_text_content(turn[0])})
             if turn[1]:
-                messages.append({"role": "assistant", "content":
-
+                messages.append({"role": "assistant", "content": extract_text_content(turn[1])})
+
+        # New format (dictionary)
         elif isinstance(turn, dict):
-
-
-
-
+            role = turn.get('role', 'user')
+            raw_content = turn.get('content', '')
+            # HERE IS THE MAGIC: we convert anything into a string
+            clean_content = extract_text_content(raw_content)
+            messages.append({"role": role, "content": clean_content})
+
+    # Process the current message (it can also arrive as a dict/list in newer Gradio)
+    current_content = extract_text_content(message)
+    messages.append({"role": "user", "content": current_content})

     # Apply the chat template
-
-
-
-
-
+    try:
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+    except Exception as e:
+        return f"❌ Template error (Berta is investigating): {e}\nData: {str(messages)}"

     inputs = tokenizer([text], return_tensors="pt").to(model.device)
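
As a quick sanity check on the new helper, here is a minimal usage sketch (not part of the commit) showing how extract_text_content flattens the content shapes Gradio can pass through history: a plain string, a list of content parts, and a dict carrying a 'text' key. The sample inputs are invented for illustration and assume the function defined in the diff above is in scope.

    # Hypothetical inputs; the results follow from the isinstance branches above.
    print(extract_text_content("2 + 2 = ?"))                                # "2 + 2 = ?"
    print(extract_text_content([{"type": "text", "text": "hi"}, "there"]))  # "hi\nthere" (parts joined with newlines)
    print(extract_text_content({"text": "dict content"}))                   # "dict content"
    print(extract_text_content(42))                                         # "42" (fallback via str())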