Spaces commit: delete "Unbabel/TowerInstruct-7B-v0.2", "HuggingFaceTB/SmolLM3-3B"; add YanoljaNEXT-Rosetta-4B

app.py CHANGED
@@ -17,26 +17,25 @@ iso1toall = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_isos} # {'ro'
 langs = list(favourite_langs.keys())
 langs.extend(list(all_langs.keys())) # Language options as list, add favourite languages first
 
-models = ["Helsinki-NLP", "QUICKMT", "Argos", "
+models = ["Helsinki-NLP", "QUICKMT", "Argos", "HPLT", "HPLT-OPUS", "Google",
 "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul", "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_nld",
 "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mul",
 "Helsinki-NLP/opus-mt-tc-bible-big-roa-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-roa", "Helsinki-NLP/opus-mt-tc-bible-big-roa-en",
 "facebook/nllb-200-distilled-600M", "facebook/nllb-200-distilled-1.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-3.3B",
 "facebook/mbart-large-50-many-to-many-mmt", "facebook/mbart-large-50-one-to-many-mmt", "facebook/mbart-large-50-many-to-one-mmt",
-"facebook/m2m100_418M", "facebook/m2m100_1.2B", "alirezamsh/small100",
 "facebook/hf-seamless-m4t-medium", "facebook/seamless-m4t-large", "facebook/seamless-m4t-v2-large",
+"facebook/m2m100_418M", "facebook/m2m100_1.2B",
+"alirezamsh/small100", "naist-nlp/mitre_466m", "naist-nlp/mitre_913m",
 "bigscience/mt0-small", "bigscience/mt0-base", "bigscience/mt0-large", "bigscience/mt0-xl",
 "bigscience/bloomz-560m", "bigscience/bloomz-1b1", "bigscience/bloomz-1b7", "bigscience/bloomz-3b",
 "google/madlad400-3b-mt", "jbochi/madlad400-3b-mt",
 "NiuTrans/LMT-60-0.6B", "NiuTrans/LMT-60-1.7B", "NiuTrans/LMT-60-4B",
-"naist-nlp/mitre_466m", "naist-nlp/mitre_913m",
 "Lego-MT/Lego-MT", "BSC-LT/salamandraTA-2b-instruct",
 "winninghealth/WiNGPT-Babel", "winninghealth/WiNGPT-Babel-2", "winninghealth/WiNGPT-Babel-2.1",
-"Unbabel/Tower-Plus-2B", "
-"
+"Unbabel/Tower-Plus-2B", "utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
+"yanolja/YanoljaNEXT-Rosetta-4B-2511", "yanolja/YanoljaNEXT-Rosetta-4B",
 "google-t5/t5-small", "google-t5/t5-base", "google-t5/t5-large",
-"google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl"
-]
+"google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl"]
 DEFAULTS = [langs[0], langs[1], models[0]]
 
 def timer(func):
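A side note on this hunk: DEFAULTS takes the head of the models list, and the commit keeps "Helsinki-NLP" in position 0, so reordering and swapping entries further down leaves the pre-selected model unchanged. A minimal check, assuming the list as defined above:

    # "Helsinki-NLP" is still first, so DEFAULTS resolves to the same model.
    assert models[0] == "Helsinki-NLP"
    DEFAULTS = [langs[0], langs[1], models[0]]  # first two languages + default model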
@@ -114,6 +113,31 @@ class Translators:
         result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
         return result
 
+    def rosetta(self):
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,  # float32 is slow
+            low_cpu_mem_usage=False,  # set True to lower peak RAM while loading
+            device_map="auto")
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        system = f"Translate the user's text to {self.tl}. Provide the final translation in a formal tone immediately without any other text."
+        messages = [
+            {"role": "system", "content": system},
+            {"role": "user", "content": self.input_text},
+        ]
+        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = tokenizer(prompt, return_tensors="pt").to(self.device)
+        input_length = inputs["input_ids"].shape[1]
+        model.eval()
+        with torch.inference_mode():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=self.max_new_tokens,
+            )
+        # Decode only the newly generated tokens, not the echoed prompt.
+        generated_tokens = outputs[0][input_length:]
+        translation = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        return translation
+
     def niutrans(self):
         tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding_side='left')
         model = AutoModelForCausalLM.from_pretrained(self.model_name)
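For orientation, here is a minimal standalone sketch of the flow the new rosetta() method implements; the model ID, target language, sample text, and the 256-token cap are illustrative assumptions, not values taken from this commit:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "yanolja/YanoljaNEXT-Rosetta-4B"  # one of the IDs added to the models list
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, device_map="auto")
    messages = [
        {"role": "system", "content": "Translate the user's text to German. Provide the final translation immediately without any other text."},
        {"role": "user", "content": "The weather is nice today."},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=256)  # assumed cap
    # Keep only the tokens generated after the prompt, as the method above does.
    print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))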
@@ -303,23 +327,6 @@ class Translators:
         translated_text = translator(text, max_length=512)
         return translated_text[0]['translation_text']
 
-    def smollm(self):
-        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-        model = AutoModelForCausalLM.from_pretrained(self.model_name)
-        prompt = f"""Translate the following {self.sl} text to {self.tl}, generating only the translated text and maintaining the original meaning and tone:
-{self.input_text}
-Translation:"""
-        inputs = tokenizer(prompt, return_tensors="pt")
-        outputs = model.generate(
-            inputs.input_ids,
-            max_length=len(inputs.input_ids[0]) + 150,
-            temperature=0.3,
-            do_sample=True
-        )
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        print(response)
-        return response.split("Translation:")[-1].strip()
-
     def flan(self):
         tokenizer = T5Tokenizer.from_pretrained(self.model_name, legacy=False)
         model = T5ForConditionalGeneration.from_pretrained(self.model_name)
@@ -605,6 +612,9 @@ def translate_text(input_text: str, s_language: str, t_language: str, model_name
     elif "small100" in model_name.lower():
         translated_text = Translators(model_name, sl, tl, input_text).smallonehundred()
 
+    elif "rosetta" in model_name.lower():
+        translated_text = Translators(model_name, s_language, t_language, input_text).rosetta()
+
     elif "lego" in model_name.lower():
         translated_text = Translators(model_name, sl, tl, input_text).LegoMT()
 
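The new branch keys on a case-insensitive substring rather than an exact ID, so both Rosetta checkpoints added to the models list route to the same method. A quick sanity check of that assumption:

    # Both new IDs contain "rosetta" once lowercased, so both hit the new branch.
    for model_name in ("yanolja/YanoljaNEXT-Rosetta-4B-2511", "yanolja/YanoljaNEXT-Rosetta-4B"):
        assert "rosetta" in model_name.lower()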
@@ -657,9 +667,6 @@ def translate_text(input_text: str, s_language: str, t_language: str, model_name
     elif 'Unbabel' in model_name:
         translated_text = Translators(model_name, s_language, t_language, input_text).unbabel()
 
-    elif model_name == "HuggingFaceTB/SmolLM3-3B":
-        translated_text = Translators(model_name, s_language, t_language, input_text).smollm()
-
     elif "winninghealth/WiNGPT" in model_name:
         translated_text = Translators(model_name, s_language, t_language, input_text).wingpt()
 