TiberiuCristianLeon committed on
Commit
48ec1f2
·
verified ·
1 Parent(s): ecf7a0d

delete "Unbabel/TowerInstruct-7B-v0.2", "HuggingFaceTB/SmolLM3-3B"; add YanoljaNEXT-Rosetta-4B

Browse files
Files changed (1) hide show
  1. app.py +34 -27
app.py CHANGED
@@ -17,26 +17,25 @@ iso1toall = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_isos} # {'ro'
17
  langs = list(favourite_langs.keys())
18
  langs.extend(list(all_langs.keys())) # Language options as list, add favourite languages first
19
 
20
- models = ["Helsinki-NLP", "QUICKMT", "Argos", "Google", "HPLT", "HPLT-OPUS",
21
  "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul", "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_nld",
22
  "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mul",
23
  "Helsinki-NLP/opus-mt-tc-bible-big-roa-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-roa", "Helsinki-NLP/opus-mt-tc-bible-big-roa-en",
24
  "facebook/nllb-200-distilled-600M", "facebook/nllb-200-distilled-1.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-3.3B",
25
  "facebook/mbart-large-50-many-to-many-mmt", "facebook/mbart-large-50-one-to-many-mmt", "facebook/mbart-large-50-many-to-one-mmt",
26
- "facebook/m2m100_418M", "facebook/m2m100_1.2B", "alirezamsh/small100",
27
  "facebook/hf-seamless-m4t-medium", "facebook/seamless-m4t-large", "facebook/seamless-m4t-v2-large",
 
 
28
  "bigscience/mt0-small", "bigscience/mt0-base", "bigscience/mt0-large", "bigscience/mt0-xl",
29
  "bigscience/bloomz-560m", "bigscience/bloomz-1b1", "bigscience/bloomz-1b7", "bigscience/bloomz-3b",
30
  "google/madlad400-3b-mt", "jbochi/madlad400-3b-mt",
31
  "NiuTrans/LMT-60-0.6B", "NiuTrans/LMT-60-1.7B", "NiuTrans/LMT-60-4B",
32
- "naist-nlp/mitre_466m", "naist-nlp/mitre_913m",
33
  "Lego-MT/Lego-MT", "BSC-LT/salamandraTA-2b-instruct",
34
  "winninghealth/WiNGPT-Babel", "winninghealth/WiNGPT-Babel-2", "winninghealth/WiNGPT-Babel-2.1",
35
- "Unbabel/Tower-Plus-2B", "HuggingFaceTB/SmolLM3-3B", "Unbabel/TowerInstruct-7B-v0.2",
36
- "utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
37
  "google-t5/t5-small", "google-t5/t5-base", "google-t5/t5-large",
38
- "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl"
39
- ]
40
  DEFAULTS = [langs[0], langs[1], models[0]]
41
 
42
  def timer(func):
@@ -114,6 +113,31 @@ class Translators:
114
  result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
115
  return result
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  def niutrans(self):
118
  tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding_side='left')
119
  model = AutoModelForCausalLM.from_pretrained(self.model_name)
@@ -303,23 +327,6 @@ class Translators:
303
  translated_text = translator(text, max_length=512)
304
  return translated_text[0]['translation_text']
305
 
306
- def smollm(self):
307
- tokenizer = AutoTokenizer.from_pretrained(self.model_name)
308
- model = AutoModelForCausalLM.from_pretrained(self.model_name)
309
- prompt = f"""Translate the following {self.sl} text to {self.tl}, generating only the translated text and maintaining the original meaning and tone:
310
- {self.input_text}
311
- Translation:"""
312
- inputs = tokenizer(prompt, return_tensors="pt")
313
- outputs = model.generate(
314
- inputs.input_ids,
315
- max_length=len(inputs.input_ids[0]) + 150,
316
- temperature=0.3,
317
- do_sample=True
318
- )
319
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
320
- print(response)
321
- return response.split("Translation:")[-1].strip()
322
-
323
  def flan(self):
324
  tokenizer = T5Tokenizer.from_pretrained(self.model_name, legacy=False)
325
  model = T5ForConditionalGeneration.from_pretrained(self.model_name)
@@ -605,6 +612,9 @@ def translate_text(input_text: str, s_language: str, t_language: str, model_name
605
  elif "small100" in model_name.lower():
606
  translated_text = Translators(model_name, sl, tl, input_text).smallonehundred()
607
 
 
 
 
608
  elif "lego" in model_name.lower():
609
  translated_text = Translators(model_name, sl, tl, input_text).LegoMT()
610
 
@@ -657,9 +667,6 @@ def translate_text(input_text: str, s_language: str, t_language: str, model_name
657
  elif 'Unbabel' in model_name:
658
  translated_text = Translators(model_name, s_language, t_language, input_text).unbabel()
659
 
660
- elif model_name == "HuggingFaceTB/SmolLM3-3B":
661
- translated_text = Translators(model_name, s_language, t_language, input_text).smollm()
662
-
663
  elif "winninghealth/WiNGPT" in model_name:
664
  translated_text = Translators(model_name, s_language, t_language, input_text).wingpt()
665
 
 
17
  langs = list(favourite_langs.keys())
18
  langs.extend(list(all_langs.keys())) # Language options as list, add favourite languages first
19
 
20
+ models = ["Helsinki-NLP", "QUICKMT", "Argos", "HPLT", "HPLT-OPUS", "Google",
21
  "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul", "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_nld",
22
  "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mul",
23
  "Helsinki-NLP/opus-mt-tc-bible-big-roa-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-roa", "Helsinki-NLP/opus-mt-tc-bible-big-roa-en",
24
  "facebook/nllb-200-distilled-600M", "facebook/nllb-200-distilled-1.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-3.3B",
25
  "facebook/mbart-large-50-many-to-many-mmt", "facebook/mbart-large-50-one-to-many-mmt", "facebook/mbart-large-50-many-to-one-mmt",
 
26
  "facebook/hf-seamless-m4t-medium", "facebook/seamless-m4t-large", "facebook/seamless-m4t-v2-large",
27
+ "facebook/m2m100_418M", "facebook/m2m100_1.2B",
28
+ "alirezamsh/small100", "naist-nlp/mitre_466m", "naist-nlp/mitre_913m",
29
  "bigscience/mt0-small", "bigscience/mt0-base", "bigscience/mt0-large", "bigscience/mt0-xl",
30
  "bigscience/bloomz-560m", "bigscience/bloomz-1b1", "bigscience/bloomz-1b7", "bigscience/bloomz-3b",
31
  "google/madlad400-3b-mt", "jbochi/madlad400-3b-mt",
32
  "NiuTrans/LMT-60-0.6B", "NiuTrans/LMT-60-1.7B", "NiuTrans/LMT-60-4B",
 
33
  "Lego-MT/Lego-MT", "BSC-LT/salamandraTA-2b-instruct",
34
  "winninghealth/WiNGPT-Babel", "winninghealth/WiNGPT-Babel-2", "winninghealth/WiNGPT-Babel-2.1",
35
+ "Unbabel/Tower-Plus-2B", "utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
36
+ "yanolja/YanoljaNEXT-Rosetta-4B-2511", "yanolja/YanoljaNEXT-Rosetta-4B",
37
  "google-t5/t5-small", "google-t5/t5-base", "google-t5/t5-large",
38
+ "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl"]
 
39
  DEFAULTS = [langs[0], langs[1], models[0]]
40
 
41
  def timer(func):
 
113
  result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
114
  return result
115
 
116
+ def rosetta(self):
117
+ model = AutoModelForCausalLM.from_pretrained(
118
+ self.model_name,
119
+ dtype=torch.bfloat16, # float32 slow
120
+ low_cpu_mem_usage=False, # True
121
+ device_map="auto")
122
+ tokenizer = AutoTokenizer.from_pretrained(self.model_name)
123
+ system = f"Translate the user's text to {self.tl}. Provide the final translation in a formal tone immediately without any other text."
124
+ messages = [
125
+ {"role": "system", "content": system},
126
+ {"role": "user", "content": self.input_text},
127
+ ]
128
+ prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
129
+ inputs = tokenizer(prompt, return_tensors="pt").to(self.device)
130
+ input_length = inputs["input_ids"].shape[1]
131
+ model.eval()
132
+ with torch.inference_mode():
133
+ outputs = model.generate(
134
+ **inputs,
135
+ max_new_tokens=self.max_new_tokens,
136
+ )
137
+ generated_tokens = outputs[0][input_length:]
138
+ translation = tokenizer.decode(generated_tokens, skip_special_tokens=True)
139
+ return translation
140
+
141
  def niutrans(self):
142
  tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding_side='left')
143
  model = AutoModelForCausalLM.from_pretrained(self.model_name)
 
327
  translated_text = translator(text, max_length=512)
328
  return translated_text[0]['translation_text']
329
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  def flan(self):
331
  tokenizer = T5Tokenizer.from_pretrained(self.model_name, legacy=False)
332
  model = T5ForConditionalGeneration.from_pretrained(self.model_name)
 
612
  elif "small100" in model_name.lower():
613
  translated_text = Translators(model_name, sl, tl, input_text).smallonehundred()
614
 
615
+ elif "rosetta" in model_name.lower():
616
+ translated_text = Translators(model_name, s_language, t_language, input_text).rosetta()
617
+
618
  elif "lego" in model_name.lower():
619
  translated_text = Translators(model_name, sl, tl, input_text).LegoMT()
620
 
 
667
  elif 'Unbabel' in model_name:
668
  translated_text = Translators(model_name, s_language, t_language, input_text).unbabel()
669
 
 
 
 
670
  elif "winninghealth/WiNGPT" in model_name:
671
  translated_text = Translators(model_name, s_language, t_language, input_text).wingpt()
672