import gradio as gr
from peft import PeftModel, PeftConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import torch
import os

device="cuda" if torch.cuda.is_available() else "cpu"
# Load the model
MODEL_NAME="abdeljalilELmajjodi/alatlas_instruct_lora"
print(f"bf16 available: {torch.cuda.is_bf16_supported()}")
config = PeftConfig.from_pretrained(MODEL_NAME,token = os.environ['TOKEN'])
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                             device_map="auto",
                                             token = os.environ['TOKEN'],
                                             torch_dtype=torch.bfloat16
                                             )
model = PeftModel.from_pretrained(model,MODEL_NAME,torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

@spaces.GPU
def generate(prompt, temperature=0.7, top_k=50, repetition_penalty=1.2):
    # Wrap the raw prompt in the model's chat template; add_generation_prompt
    # makes the template end with the assistant turn header so the model
    # answers instead of continuing the user turn.
    messages = [{"role": "user", "content": prompt}]
    formatted_prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    ids = tokenizer(formatted_prompt, return_tensors="pt").to(device)
    output_ids = model.generate(
        **ids,
        max_new_tokens=50,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    # Keep only the newly generated tokens, dropping the echoed prompt.
    output_ids = output_ids[0][len(ids.input_ids[0]):]
    output = tokenizer.decode(output_ids, skip_special_tokens=True)
    # Safety net: strip the assistant turn marker if it leaks into the output.
    assistant_marker = "ﭺassistant"
    if output.startswith(assistant_marker):
        output = output[len(assistant_marker):].strip()
    return output
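
# A quick smoke test, kept commented out so the Space only serves the UI at
# startup. This is a sketch under the assumption that the model finished
# loading above; it simply calls generate() with one of the Darija example
# prompts ("Explain artificial intelligence to me, please.").
# print(generate("شرح ليا الذكاء الاصطناعي عفاك", temperature=0.7, top_k=50))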

prompt_input = gr.Textbox(label="Enter your prompt", lines=5, rtl=True)
model_response = gr.Textbox(label="Model Response", lines=5, interactive=False, rtl=True)
temperature = gr.Slider(minimum=0.01, maximum=1.0, value=0.7, label="Temperature")
top_k = gr.Slider(1, 10000, value=10, label="Top-k")
repetition_penalty = gr.Slider(0.1, 100.0, value=1.2, label="Repetition Penalty")

# Example prompts in Moroccan Darija: [prompt, temperature, top_k, repetition_penalty]
examples = [
    # "Please, I want to travel in Morocco this summer but I don't know where to go. Can you help me?"
    ["عافاك بغيت نسافر فالمغرب فالصيف ولكن معرفتش فين نمشي. ممكن تعاوني؟", 0.1, 90, 1.2],
    # "Please, I'd like to know the best places I can visit in Morocco in the summer."
    ["عافاك، بغيت نعرف شنو هي أحسن الأماكن لي نقدر نزورها فالمغرب فالصيف؟", 0.1, 100, 1.2],
    # "Explain artificial intelligence to me, please."
    ["شرح ليا الذكاء الاصطناعي عفاك", 0.1, 1, 1.2],
]

demo = gr.Interface(
    fn=generate,
    inputs=[prompt_input, temperature, top_k, repetition_penalty],
    outputs=model_response,
    flagging_mode="never",
    examples=examples,
    cache_examples=True,
)
demo.launch()
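
# A minimal sketch of calling the deployed Space from another machine with the
# official gradio_client package, kept commented out. The Space id below is
# hypothetical; replace it with the real "<user>/<space-name>" once published.
#
# from gradio_client import Client
#
# client = Client("abdeljalilELmajjodi/alatlas-instruct-demo")  # hypothetical id
# result = client.predict(
#     "شرح ليا الذكاء الاصطناعي عفاك",  # prompt: "Explain AI to me, please."
#     0.7,   # temperature
#     50,    # top_k
#     1.2,   # repetition_penalty
#     api_name="/predict",
# )
# print(result)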