# File: learnable-speech/speech/config_hf.yaml
# Web-page residue, kept as comments so the file parses as YAML:
#   mnhatdaous's picture
#   Add comprehensive training pipeline for Hugging Face deployment
#   248479c
# Hugging Face optimized configuration
# This config is optimized for training on HF Spaces with limited resources
# Random seed shared by every RNG seeded below. Change it here (or through a
# load-time override) instead of editing four separate literals.
seed: 1986
# Each !apply: entry calls the named function with the listed arguments while
# the file is being loaded, so seeding is a side effect of loading this file.
__set_seed1: !apply:random.seed [!ref <seed>]
__set_seed2: !apply:numpy.random.seed [!ref <seed>]
__set_seed3: !apply:torch.manual_seed [!ref <seed>]
__set_seed4: !apply:torch.cuda.manual_seed_all [!ref <seed>]
# fixed params - optimized for HF
# Scalars in this section are shared through !ref by the model and
# data-processing entries further down in this file.
# Audio sample rate; referenced by extract_reference_mel, resample and
# feat_extractor.
sample_rate: 24000
# Passed to Qwen2LM as llm_input_size / llm_output_size.
# NOTE(review): presumably these must match the hidden size of the model
# loaded from qwen_pretrain_path - confirm before training with 512.
llm_input_size: 512 # Reduced from 896
llm_output_size: 512 # Reduced from 896
# Speaker-embedding width; referenced by speaker_encoder_config.output_dim,
# llm and flow.
# NOTE(review): a checkpoint loaded from speaker_encoder_path presumably has
# to have been trained with this same width - confirm.
spk_embed_dim: 128 # Reduced from 192
# Referenced by Qwen2Encoder (pretrain_path) and get_tokenizer (token_path).
# Empty in this file, so it presumably has to be supplied as an override.
qwen_pretrain_path: ''
# Referenced by flow.input_frame_rate.
token_frame_rate: 25
# Referenced by compute_fbank.
token_mel_ratio: 2
# Referenced by flow and by the estimator's static_chunk_size expression.
token_latent_ratio: 3
# Single switch referenced by llm, flow and padding.
use_speaker_encoder: True
# Checkpoint path referenced by flow.speaker_encoder_path.
speaker_encoder_path: '/tmp/checkpoints/llm/best_speaker_encoder.pt'
# stream related params
# chunk_size feeds the flow encoder's static_chunk_size directly and the
# estimator's static_chunk_size after multiplication by token_latent_ratio.
chunk_size: 16 # Reduced from 25
# Referenced by the flow estimator.
# NOTE(review): -1 presumably means "use all left chunks" - confirm.
num_decoding_left_chunks: -1
# Hyperparameters of the speaker encoder. Nothing in this file references
# this mapping through !ref, so it is presumably read directly by the
# training code - confirm against the loader.
speaker_encoder_config:
  # Same value as feat_extractor.num_mels (80).
  mel_dim: 80
  model_dim: 256  # Reduced from 512
  # Tied to the global spk_embed_dim so the encoder output width equals the
  # embedding width handed to llm and flow.
  output_dim: !ref <spk_embed_dim>
  num_blocks: 4  # Reduced from 6
  num_heads: 4  # Reduced from 8
  kernel_size: 1
  dropout: 0.1
  max_conditioning_inputs: 2  # Reduced from 3
# Smaller LLM model for HF
# !new: instantiates Qwen2LM with the nested keys as constructor keyword
# arguments. The nesting matters: without it the inner `llm` key and the
# shared scalars (use_speaker_encoder, spk_embed_dim) collide with top-level
# keys of the same name.
llm: !new:cosyvoice.llm.llm.Qwen2LM
  llm_input_size: !ref <llm_input_size>
  llm_output_size: !ref <llm_output_size>
  # Same value as flow.vocab_size (6561).
  speech_token_size: 6561
  length_normalized_loss: True
  lsm_weight: 0
  mix_ratio: [3, 10]  # Reduced from [5, 15]
  use_speaker_encoder: !ref <use_speaker_encoder>
  spk_embed_dim: !ref <spk_embed_dim>
  # Same value as speaker_encoder_config.max_conditioning_inputs (2).
  max_conditioning_inputs: 2
  # Inner text LLM, built from the checkpoint at qwen_pretrain_path.
  llm: !new:cosyvoice.llm.llm.Qwen2Encoder
    pretrain_path: !ref <qwen_pretrain_path>
  # !name: passes the sampling function itself, with these keyword
  # arguments pre-bound, rather than calling it at load time.
  sampling: !name:cosyvoice.utils.common.ras_sampling
    top_p: 0.8
    top_k: 25
    win_size: 8  # Reduced from 10
    tau_r: 0.1
# Reference-mel extraction stage used in data_pipeline. The arguments must be
# nested under the !name: entry; at top level `sample_rate: !ref <sample_rate>`
# would refer to itself.
extract_reference_mel: !name:cosyvoice.dataset.processor.extract_reference_mel_from_speech
  feat_extractor: !ref <feat_extractor>
  # NOTE(review): min_length / max_length are presumably crop durations in
  # seconds - confirm against the processor implementation.
  min_length: 0.5
  max_length: 3.0  # Reduced from 4.0
  num_crops: 1
  training: True
  sample_rate: !ref <sample_rate>
# Smaller Flow model for HF
# !new: instantiates CausalMaskedDiffWithXvec; `encoder` and `decoder` are
# themselves instantiated objects passed as constructor arguments, and the
# decoder in turn receives `cfm_params` and `estimator`. The nesting below
# restores that hierarchy; flattened, keys such as input_size, output_size,
# in_channels and static_chunk_size would overwrite one another.
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
  input_size: 256  # Reduced from 512
  output_size: 64
  spk_embed_dim: !ref <spk_embed_dim>
  output_type: 'mel'
  # Same value as llm.speech_token_size (6561).
  vocab_size: 6561
  input_frame_rate: !ref <token_frame_rate>
  only_mask_loss: True
  token_latent_ratio: !ref <token_latent_ratio>
  pre_lookahead_len: 2  # Reduced from 3
  use_speaker_encoder: !ref <use_speaker_encoder>
  freeze_speaker_encoder: True
  speaker_encoder_path: !ref <speaker_encoder_path>
  encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
    output_size: 256  # Reduced from 512
    attention_heads: 4  # Reduced from 8
    linear_units: 1024  # Reduced from 2048
    num_blocks: 4  # Reduced from 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    normalize_before: True
    input_layer: 'linear'
    pos_enc_layer_type: 'rel_pos_espnet'
    selfattention_layer_type: 'rel_selfattn'
    # Same value as flow.input_size above (256).
    input_size: 256  # Reduced from 512
    use_cnn_module: False
    macaron_style: False
    static_chunk_size: !ref <chunk_size>
  decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
    # NOTE(review): in_channels (240 here, 320 in the estimator) and
    # spk_emb_dim (80) carry no "Reduced from" note, while output_size and
    # out_channels are 64 - confirm these widths are mutually consistent.
    in_channels: 240
    n_spks: 1
    spk_emb_dim: 80
    cfm_params: !new:omegaconf.DictConfig
      content:
        sigma_min: 1e-06
        solver: 'euler'
        t_scheduler: 'cosine'
        training_cfg_rate: 0.1  # Reduced from 0.2
        inference_cfg_rate: 0.5  # Reduced from 0.7
        reg_loss_type: 'l1'
        # NOTE(review): the four keys below are placed inside
        # cfm_params.content because they directly follow the other CFM
        # settings; confirm that CausalConditionalCFM reads them from
        # cfm_params and not as its own constructor arguments.
        use_immiscible: True
        immiscible_k: 4  # Reduced from 8
        use_contrastive_fm: True
        contrastive_lambda: 0.03  # Reduced from 0.05
    estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
      in_channels: 320
      # Same value as flow.output_size (64).
      out_channels: 64
      channels: [128]  # Reduced from [256]
      dropout: 0.0
      # num_heads * attention_head_dim = 4 * 32 = 128, the single entry of
      # `channels`.
      attention_head_dim: 32  # Reduced from 64
      n_blocks: 3  # Reduced from 4
      num_mid_blocks: 8  # Reduced from 12
      num_heads: 4  # Reduced from 8
      act_fn: 'gelu'
      # Arithmetic on references: 16 * 3 = 48 with the values in this file.
      static_chunk_size: !ref <chunk_size> * <token_latent_ratio>
      num_decoding_left_chunks: !ref <num_decoding_left_chunks>
# Processor functions (callables unchanged; reduced arguments are marked
# inline on the entries that follow)
# Both openers are bound by name with no preset arguments. Only
# individual_file_opener is listed in data_pipeline in this file;
# parquet_opener is defined but not referenced here.
individual_file_opener: !name:cosyvoice.dataset.processor.individual_file_opener
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
# Tokenizer factory with its arguments pre-bound; referenced by `tokenize`.
# token_path follows qwen_pretrain_path, which is empty in this file.
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
  token_path: !ref <qwen_pretrain_path>
  skip_special_tokens: True
# Top-level special-token setting, shared with `tokenize` through !ref.
allowed_special: 'all'
# Tokenization stage of data_pipeline. Its two arguments are nested so they do
# not overwrite the top-level get_tokenizer / allowed_special entries they
# reference.
tokenize: !name:cosyvoice.dataset.processor.tokenize
  get_tokenizer: !ref <get_tokenizer>
  allowed_special: !ref <allowed_special>
# Filtering stage of data_pipeline with its length bounds pre-bound.
# NOTE(review): the units of max_length / min_length are not visible in this
# file - confirm against the processor before tuning them.
filter: !name:cosyvoice.dataset.processor.filter
  max_length: 20480  # Reduced from 40960
  min_length: 100
  token_max_length: 150  # Reduced from 200
  token_min_length: 1
# Resampling stage of data_pipeline; the target rate follows the global
# sample_rate (24000 in this file).
resample: !name:cosyvoice.dataset.processor.resample
  resample_rate: !ref <sample_rate>
# Truncation processor. It is defined here but is not listed in data_pipeline
# in this file.
truncate: !name:cosyvoice.dataset.processor.truncate
  # Kept at a whole number of feat_extractor hops: 12000 = 25 * 480.
  # The previous value, 12240 (half of 24480 = 51 * 480), is 25.5 hops.
  # NOTE(review): assumes truncated audio must align to mel frames - confirm;
  # 12480 (26 hops) is the alternative if rounding up is preferred.
  truncate_length: 12000  # Reduced from 24480
# Mel-spectrogram extractor with its parameters pre-bound; referenced by
# extract_reference_mel and compute_fbank.
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
  n_fft: 1920
  num_mels: 80
  sampling_rate: !ref <sample_rate>
  # 24000 / 480 = 50 frames per second, which equals
  # token_frame_rate * token_mel_ratio (25 * 2).
  hop_size: 480
  win_size: 1920
  fmin: 0
  fmax: 8000
  center: False
# Feature-computation stage of data_pipeline: binds the shared mel extractor
# and the global token_mel_ratio (2 in this file).
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
  feat_extractor: !ref <feat_extractor>
  token_mel_ratio: !ref <token_mel_ratio>
# Shuffling stage of data_pipeline with its buffer size pre-bound.
shuffle: !name:cosyvoice.dataset.processor.shuffle
  shuffle_size: 500  # Reduced from 1000
# Sorting stage of data_pipeline with its buffer size pre-bound
# (250, half of shuffle_size).
sort: !name:cosyvoice.dataset.processor.sort
  sort_size: 250  # Reduced from 500
# Batching stage of data_pipeline: 'dynamic' batch type with a per-batch
# frame budget.
batch: !name:cosyvoice.dataset.processor.batch
  batch_type: 'dynamic'
  max_frames_in_batch: 2500  # Reduced from 5000
# Padding stage, last entry of data_pipeline. The argument must be nested:
# at top level `use_speaker_encoder: !ref <use_speaker_encoder>` would refer
# to itself and clash with the global switch.
padding: !name:cosyvoice.dataset.processor.padding
  use_speaker_encoder: !ref <use_speaker_encoder>
# dataset processor pipeline
# Ordered list of the processor callables defined above; presumably applied
# in this order by the dataset loader - confirm. parquet_opener and truncate
# are defined in this file but intentionally absent from the list.
# Written as an indented block sequence: a flow sequence whose brackets sit
# at column 0 under a block-mapping key is not valid YAML.
data_pipeline:
  - !ref <individual_file_opener>
  - !ref <tokenize>
  - !ref <filter>
  - !ref <resample>
  - !ref <extract_reference_mel>
  - !ref <compute_fbank>
  - !ref <shuffle>
  - !ref <sort>
  - !ref <batch>
  - !ref <padding>
# HF optimized training configuration
# All training options live under the single train_conf mapping; optim_conf
# and scheduler_conf are sub-mappings for the selected optimizer and
# scheduler.
train_conf:
  optim: adamw
  optim_conf:
    lr: 3e-5  # Reduced from 5e-5
  scheduler: constantlr
  scheduler_conf:
    # NOTE(review): whether the 'constantlr' scheduler consumes warmup_steps
    # is not visible in this file - confirm, or the value has no effect.
    warmup_steps: 200  # Reduced from 500
  max_epoch: 50  # Reduced from 2000
  grad_clip: 1
  accum_grad: 2  # Added gradient accumulation
  log_interval: 10  # Increased from 5
  save_per_step: 1000  # Reduced from 2000
  total_iters: 100000  # Reduced from 1000000000