app.py CHANGED
@@ -102,7 +102,7 @@ LANGUAGE_CONFIG = {
     },
     "zh": {
         "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/zh_f2.flac",
-        "text": "上个月,我们达到了一个新的里程碑。 我们的YouTube频道观看次数达到了二十亿次,这绝对令人难以置信。"
+        "text": "上个月,我们达到了一个新的里程碑. 我们的YouTube频道观看次数达到了二十亿次,这绝对令人难以置信。"
     },
 }

requirements.txt CHANGED
@@ -13,7 +13,6 @@ safetensors

 # Optional language-specific dependencies
 # Uncomment the ones you need for specific languages:
-spacy_pkuseg # For Chinese text segmentation
+pkuseg # For Chinese text segmentation (improves mixed text handling)
 pykakasi>=2.2.0 # For Japanese text processing (Kanji to Hiragana)
-russian-text-stresser @ git+https://github.com/Vuizur/add-stress-to-epub
 # dicta-onnx>=0.1.0 # For Hebrew diacritization
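
A quick way to sanity-check the dependency swap is to import the segmenter exactly the way the updated tokenizer.py does. A minimal sketch (the sample sentence is arbitrary, and pkuseg may fetch its default model on first use):

    # Verify the pkuseg package (replacing spacy_pkuseg) imports and segments.
    from pkuseg import pkuseg  # same import path the updated tokenizer.py uses

    seg = pkuseg()  # default mixed-domain model
    print(seg.cut("我们达到了一个新的里程碑"))  # e.g. ['我们', '达到', '了', '一个', '新', '的', '里程碑']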
src/chatterbox/models/t3/modules/t3_config.py CHANGED
@@ -28,7 +28,7 @@ class T3Config:

     @property
     def is_multilingual(self):
-        return self.text_tokens_dict_size == 2454
+        return self.text_tokens_dict_size == 2352

     @classmethod
     def english_only(cls):
@@ -38,4 +38,4 @@ class T3Config:
     @classmethod
     def multilingual(cls):
         """Create configuration for multilingual TTS model."""
-        return cls(text_tokens_dict_size=2454)
+        return cls(text_tokens_dict_size=2352)
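
Since 2352 is now hard-coded in both the property and the factory, it may be worth confirming it against the shipped tokenizer. A hedged sketch (the file name comes from the mtl_tts.py change below; the final assert is the thing being checked, not a known fact):

    # Illustrative sanity check: the dict size baked into T3Config.multilingual()
    # should equal the vocab size of the new mtl_tokenizer.json.
    from tokenizers import Tokenizer
    from chatterbox.models.t3.modules.t3_config import T3Config

    tok = Tokenizer.from_file("mtl_tokenizer.json")  # downloaded alongside the checkpoint
    cfg = T3Config.multilingual()
    assert cfg.is_multilingual
    assert tok.get_vocab_size() == cfg.text_tokens_dict_size  # expected to be 2352 after this change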
src/chatterbox/models/tokenizers/tokenizer.py CHANGED
@@ -1,9 +1,10 @@
 import logging
 import json
+import re

 import torch
 from pathlib import Path
-from unicodedata import category, normalize
+from unicodedata import category
 from tokenizers import Tokenizer
 from huggingface_hub import hf_hub_download

@@ -32,7 +33,7 @@ class EnTokenizer:
         text_tokens = torch.IntTensor(text_tokens).unsqueeze(0)
         return text_tokens

-    def encode(self, txt: str):
+    def encode(self, txt: str, verbose=False):
         """
         clean_text > (append `lang_id`) > replace SPACE > encode text using Tokenizer
         """
@@ -45,7 +46,8 @@ class EnTokenizer:
         if isinstance(seq, torch.Tensor):
             seq = seq.cpu().numpy()

-        txt: str = self.tokenizer.decode(seq, skip_special_tokens=False)
+        txt: str = self.tokenizer.decode(seq,
+                                         skip_special_tokens=False)
         txt = txt.replace(' ', '')
         txt = txt.replace(SPACE, ' ')
         txt = txt.replace(EOT, '')
@@ -59,7 +61,6 @@ REPO_ID = "ResembleAI/chatterbox"
 # Global instances for optional dependencies
 _kakasi = None
 _dicta = None
-_russian_stresser = None


 def is_kanji(c: str) -> bool:
@@ -190,7 +191,7 @@ class ChineseCangjieConverter:
     def _init_segmenter(self):
         """Initialize pkuseg segmenter."""
         try:
-            from spacy_pkuseg import pkuseg
+            from pkuseg import pkuseg
             self.segmenter = pkuseg()
         except ImportError:
             logger.warning("pkuseg not available - Chinese segmentation will be skipped")
@@ -206,6 +207,7 @@ class ChineseCangjieConverter:
         index = str(index) if index > 0 else ""
         return code + str(index)

+

     def __call__(self, text):
         """Convert Chinese characters in text to Cangjie tokens."""
@@ -233,25 +235,6 @@ class ChineseCangjieConverter:
         return "".join(output)


-def add_russian_stress(text: str) -> str:
-    """Russian text normalization: adds stress marks to Russian text."""
-    global _russian_stresser
-
-    try:
-        if _russian_stresser is None:
-            from russian_text_stresser.text_stresser import RussianTextStresser
-            _russian_stresser = RussianTextStresser()
-
-        return _russian_stresser.stress_text(text)
-
-    except ImportError:
-        logger.warning("russian_text_stresser not available - Russian stress labeling skipped")
-        return text
-    except Exception as e:
-        logger.warning(f"Russian stress labeling failed: {e}")
-        return text
-
-
 class MTLTokenizer:
     def __init__(self, vocab_file_path):
         self.tokenizer: Tokenizer = Tokenizer.from_file(vocab_file_path)
@@ -264,26 +247,12 @@ class MTLTokenizer:
         assert SOT in voc
         assert EOT in voc

-    def preprocess_text(self, raw_text: str, language_id: str = None, lowercase: bool = True, nfkd_normalize: bool = True):
-        """
-        Text preprocessor that handles lowercase conversion and NFKD normalization.
-        """
-        preprocessed_text = raw_text
-        if lowercase:
-            preprocessed_text = preprocessed_text.lower()
-        if nfkd_normalize:
-            preprocessed_text = normalize("NFKD", preprocessed_text)
-
-        return preprocessed_text
-
-    def text_to_tokens(self, text: str, language_id: str = None, lowercase: bool = True, nfkd_normalize: bool = True):
-        text_tokens = self.encode(text, language_id=language_id, lowercase=lowercase, nfkd_normalize=nfkd_normalize)
+    def text_to_tokens(self, text: str, language_id: str = None):
+        text_tokens = self.encode(text, language_id=language_id)
         text_tokens = torch.IntTensor(text_tokens).unsqueeze(0)
         return text_tokens

-    def encode(self, txt: str, language_id: str = None, lowercase: bool = True, nfkd_normalize: bool = True):
-        txt = self.preprocess_text(txt, language_id=language_id, lowercase=lowercase, nfkd_normalize=nfkd_normalize)
-
+    def encode(self, txt: str, language_id: str = None):
         # Language-specific text processing
         if language_id == 'zh':
             txt = self.cangjie_converter(txt)
@@ -293,8 +262,6 @@ class MTLTokenizer:
             txt = add_hebrew_diacritics(txt)
         elif language_id == 'ko':
             txt = korean_normalize(txt)
-        elif language_id == 'ru':
-            txt = add_russian_stress(txt)

         # Prepend language token
         if language_id:
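
A minimal usage sketch of the slimmed-down MTLTokenizer API after this change (the vocab path and sample text are illustrative; "zh" routes through the Cangjie converter, which needs pkuseg and the Cangjie5_TC.json mapping shipped with the checkpoint):

    # Sketch: text_to_tokens/encode no longer take lowercase/nfkd_normalize arguments.
    from chatterbox.models.tokenizers.tokenizer import MTLTokenizer

    tok = MTLTokenizer("mtl_tokenizer.json")  # renamed vocab file from mtl_tts.py
    ids = tok.text_to_tokens("上个月我们达到了一个新的里程碑", language_id="zh")
    print(ids.shape)  # torch.IntTensor of shape (1, seq_len)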
src/chatterbox/mtl_tts.py CHANGED
@@ -168,7 +168,7 @@ class ChatterboxMultilingualTTS:
         ve.to(device).eval()

         t3 = T3(T3Config.multilingual())
-        t3_state = load_safetensors(ckpt_dir / "t3_mtl23ls_v2.safetensors")
+        t3_state = load_safetensors(ckpt_dir / "t3_23lang.safetensors")
         if "model" in t3_state.keys():
             t3_state = t3_state["model"][0]
         t3.load_state_dict(t3_state)
@@ -181,7 +181,7 @@ class ChatterboxMultilingualTTS:
         s3gen.to(device).eval()

         tokenizer = MTLTokenizer(
-            str(ckpt_dir / "grapheme_mtl_merged_expanded_v1.json")
+            str(ckpt_dir / "mtl_tokenizer.json")
         )

         conds = None
@@ -197,7 +197,7 @@ class ChatterboxMultilingualTTS:
                 repo_id=REPO_ID,
                 repo_type="model",
                 revision="main",
-                allow_patterns=["ve.pt", "t3_mtl23ls_v2.safetensors", "s3gen.pt", "grapheme_mtl_merged_expanded_v1.json", "conds.pt", "Cangjie5_TC.json"],
+                allow_patterns=["ve.pt", "t3_23lang.safetensors", "s3gen.pt", "mtl_tokenizer.json", "conds.pt", "Cangjie5_TC.json"],
                 token=os.getenv("HF_TOKEN"),
             )
         )
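
For reviewers who want to pre-fetch only the renamed assets, a sketch that mirrors the updated allow_patterns (repo id, revision, and file list are copied from the diff; HF_TOKEN is only needed if the repo is gated for you):

    # Pre-download the renamed checkpoint and tokenizer files.
    import os
    from huggingface_hub import snapshot_download

    ckpt_dir = snapshot_download(
        repo_id="ResembleAI/chatterbox",
        repo_type="model",
        revision="main",
        allow_patterns=["ve.pt", "t3_23lang.safetensors", "s3gen.pt",
                        "mtl_tokenizer.json", "conds.pt", "Cangjie5_TC.json"],
        token=os.getenv("HF_TOKEN"),
    )
    print(ckpt_dir)  # local snapshot directory that ChatterboxMultilingualTTS loads from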