Spaces:
Sleeping
Sleeping
| # Hugging Face optimized configuration | |
| # This config is optimized for training on HF Spaces with limited resources | |
| # Seed the Python, NumPy, and PyTorch (CPU and all CUDA devices) RNGs with the same value for reproducibility | |
| __set_seed1: !apply:random.seed [1986] | |
| __set_seed2: !apply:numpy.random.seed [1986] | |
| __set_seed3: !apply:torch.manual_seed [1986] | |
| __set_seed4: !apply:torch.cuda.manual_seed_all [1986] | |
| # fixed params - optimized for HF | |
| sample_rate: 24000 | |
| llm_input_size: 512 # Reduced from 896 | |
| llm_output_size: 512 # Reduced from 896 | |
| spk_embed_dim: 128 # Reduced from 192 | |
| qwen_pretrain_path: '' | |
| token_frame_rate: 25 | |
| token_mel_ratio: 2 | |
| token_latent_ratio: 3 | |
| use_speaker_encoder: True | |
| speaker_encoder_path: '/tmp/checkpoints/llm/best_speaker_encoder.pt' | |
| # Streaming-related params | |
| chunk_size: 16 # Reduced from 25 | |
| num_decoding_left_chunks: -1 | |
| speaker_encoder_config: | |
| mel_dim: 80 | |
| model_dim: 256 # Reduced from 512 | |
| output_dim: !ref <spk_embed_dim> | |
| num_blocks: 4 # Reduced from 6 | |
| num_heads: 4 # Reduced from 8 | |
| kernel_size: 1 | |
| dropout: 0.1 | |
| max_conditioning_inputs: 2 # Reduced from 3 | |
| # Smaller LLM model for HF | |
| llm: !new:cosyvoice.llm.llm.Qwen2LM | |
| llm_input_size: !ref <llm_input_size> | |
| llm_output_size: !ref <llm_output_size> | |
| speech_token_size: 6561 | |
| length_normalized_loss: True | |
| lsm_weight: 0 | |
| mix_ratio: [3, 10] # Reduced from [5, 15] | |
| use_speaker_encoder: !ref <use_speaker_encoder> | |
| spk_embed_dim: !ref <spk_embed_dim> | |
| max_conditioning_inputs: 2 | |
| llm: !new:cosyvoice.llm.llm.Qwen2Encoder | |
| pretrain_path: !ref <qwen_pretrain_path> | |
| sampling: !name:cosyvoice.utils.common.ras_sampling | |
| top_p: 0.8 | |
| top_k: 25 | |
| win_size: 8 # Reduced from 10 | |
| tau_r: 0.1 | |
| extract_reference_mel: | |
| !name:cosyvoice.dataset.processor.extract_reference_mel_from_speech | |
| feat_extractor: !ref <feat_extractor> | |
| min_length: 0.5 | |
| max_length: 3.0 # Reduced from 4.0 | |
| num_crops: 1 | |
| training: True | |
| sample_rate: !ref <sample_rate> | |
| # Smaller Flow model for HF | |
| flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec | |
| input_size: 256 # Reduced from 512 | |
| output_size: 64 | |
| spk_embed_dim: !ref <spk_embed_dim> | |
| output_type: 'mel' | |
| vocab_size: 6561 | |
| input_frame_rate: !ref <token_frame_rate> | |
| only_mask_loss: True | |
| token_latent_ratio: !ref <token_latent_ratio> | |
| pre_lookahead_len: 2 # Reduced from 3 | |
| use_speaker_encoder: !ref <use_speaker_encoder> | |
| freeze_speaker_encoder: True | |
| speaker_encoder_path: !ref <speaker_encoder_path> | |
| encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder | |
| output_size: 256 # Reduced from 512 | |
| attention_heads: 4 # Reduced from 8 | |
| linear_units: 1024 # Reduced from 2048 | |
| num_blocks: 4 # Reduced from 6 | |
| dropout_rate: 0.1 | |
| positional_dropout_rate: 0.1 | |
| attention_dropout_rate: 0.1 | |
| normalize_before: True | |
| input_layer: 'linear' | |
| pos_enc_layer_type: 'rel_pos_espnet' | |
| selfattention_layer_type: 'rel_selfattn' | |
| input_size: 256 # Reduced from 512 | |
| use_cnn_module: False | |
| macaron_style: False | |
| static_chunk_size: !ref <chunk_size> | |
| decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM | |
| in_channels: 240 | |
| n_spks: 1 | |
| spk_emb_dim: 80 | |
| cfm_params: !new:omegaconf.DictConfig | |
| content: | |
| sigma_min: 1e-06 | |
| solver: 'euler' | |
| t_scheduler: 'cosine' | |
| training_cfg_rate: 0.1 # Reduced from 0.2 | |
| inference_cfg_rate: 0.5 # Reduced from 0.7 | |
| reg_loss_type: 'l1' | |
| use_immiscible: True | |
| immiscible_k: 4 # Reduced from 8 | |
| use_contrastive_fm: True | |
| contrastive_lambda: 0.03 # Reduced from 0.05 | |
| estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder | |
| in_channels: 320 | |
| out_channels: 64 | |
| channels: [128] # Reduced from [256] | |
| dropout: 0.0 | |
| attention_head_dim: 32 # Reduced from 64 | |
| n_blocks: 3 # Reduced from 4 | |
| num_mid_blocks: 8 # Reduced from 12 | |
| num_heads: 4 # Reduced from 8 | |
| act_fn: 'gelu' | |
| static_chunk_size: !ref <chunk_size> * <token_latent_ratio> | |
| num_decoding_left_chunks: !ref <num_decoding_left_chunks> | |
| # Processor functions (same processors; length, buffer, and batch limits reduced where noted) | |
| individual_file_opener: !name:cosyvoice.dataset.processor.individual_file_opener | |
| parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener | |
| get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer | |
| token_path: !ref <qwen_pretrain_path> | |
| skip_special_tokens: True | |
| allowed_special: 'all' | |
| tokenize: !name:cosyvoice.dataset.processor.tokenize | |
| get_tokenizer: !ref <get_tokenizer> | |
| allowed_special: !ref <allowed_special> | |
| filter: !name:cosyvoice.dataset.processor.filter | |
| max_length: 20480 # Reduced from 40960 | |
| min_length: 100 | |
| token_max_length: 150 # Reduced from 200 | |
| token_min_length: 1 | |
| resample: !name:cosyvoice.dataset.processor.resample | |
| resample_rate: !ref <sample_rate> | |
| truncate: !name:cosyvoice.dataset.processor.truncate | |
| truncate_length: 12240 # Reduced from 24480 | |
| feat_extractor: !name:matcha.utils.audio.mel_spectrogram | |
| n_fft: 1920 | |
| num_mels: 80 | |
| sampling_rate: !ref <sample_rate> | |
| hop_size: 480 | |
| win_size: 1920 | |
| fmin: 0 | |
| fmax: 8000 | |
| center: False | |
| compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank | |
| feat_extractor: !ref <feat_extractor> | |
| token_mel_ratio: !ref <token_mel_ratio> | |
| shuffle: !name:cosyvoice.dataset.processor.shuffle | |
| shuffle_size: 500 # Reduced from 1000 | |
| sort: !name:cosyvoice.dataset.processor.sort | |
| sort_size: 250 # Reduced from 500 | |
| batch: !name:cosyvoice.dataset.processor.batch | |
| batch_type: 'dynamic' | |
| max_frames_in_batch: 2500 # Reduced from 5000 | |
| padding: !name:cosyvoice.dataset.processor.padding | |
| use_speaker_encoder: !ref <use_speaker_encoder> | |
| # dataset processor pipeline | |
| data_pipeline: | |
| [ | |
| !ref <individual_file_opener>, | |
| !ref <tokenize>, | |
| !ref <filter>, | |
| !ref <resample>, | |
| !ref <extract_reference_mel>, | |
| !ref <compute_fbank>, | |
| !ref <shuffle>, | |
| !ref <sort>, | |
| !ref <batch>, | |
| !ref <padding>, | |
| ] | |
| # HF optimized training configuration | |
| train_conf: | |
| optim: adamw | |
| optim_conf: | |
| lr: 3e-5 # Reduced from 5e-5 | |
| scheduler: constantlr | |
| scheduler_conf: | |
| warmup_steps: 200 # Reduced from 500 | |
| max_epoch: 50 # Reduced from 2000 | |
| grad_clip: 1 | |
| accum_grad: 2 # Added gradient accumulation | |
| log_interval: 10 # Increased from 5 | |
| save_per_step: 1000 # Reduced from 2000 | |
| total_iters: 100000 # Reduced from 1000000000 | |