Spaces:

mnhatdaous
/

learnable-speech

Sleeping

App Files Files Community

learnable-speech / speech /config_hf.yaml

mnhatdaous

Add comprehensive training pipeline for Hugging Face deployment

248479c 3 months ago

raw

history blame contribute delete

6.27 kB

	# Hugging Face optimized configuration
	# This config is optimized for training on HF Spaces with limited resources

	# set random seed
	# Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices) with
	# the same value, 1986. The `__set_seedN` keys only hold these `!apply:` calls;
	# no other key in the visible config references them.
	__set_seed1: !apply:random.seed [1986]
	__set_seed2: !apply:numpy.random.seed [1986]
	__set_seed3: !apply:torch.manual_seed [1986]
	__set_seed4: !apply:torch.cuda.manual_seed_all [1986]

	# fixed params - optimized for HF
	# These scalars are reused elsewhere in this file through `!ref <name>`.
	# Referenced by extract_reference_mel, resample and feat_extractor.
	sample_rate: 24000
	llm_input_size: 512 # Reduced from 896
	llm_output_size: 512 # Reduced from 896
	# Also used as speaker_encoder_config's output_dim, and by llm and flow.
	spk_embed_dim: 128 # Reduced from 192
	# Empty here; also passed to get_tokenizer as token_path.
	# NOTE(review): presumably overridden at launch time - confirm with the training script.
	qwen_pretrain_path: ''
	# Referenced by flow as input_frame_rate.
	token_frame_rate: 25
	# Referenced only by compute_fbank in the visible config.
	token_mel_ratio: 2
	# Referenced by flow and by the estimator's static_chunk_size expression.
	token_latent_ratio: 3
	# Referenced by llm, flow and padding.
	use_speaker_encoder: True
	# Referenced by flow.
	# NOTE(review): absolute /tmp path - assumes the checkpoint is placed there before training; confirm.
	speaker_encoder_path: '/tmp/checkpoints/llm/best_speaker_encoder.pt'

	# stream related params
	# chunk_size is passed unchanged to the flow encoder's static_chunk_size and,
	# multiplied by token_latent_ratio, to the estimator's static_chunk_size.
	chunk_size: 16 # Reduced from 25
	# Passed through to the estimator's num_decoding_left_chunks.
	# NOTE(review): -1 presumably means "use all left chunks" - confirm in the decoder code.
	num_decoding_left_chunks: -1

	# Hyperparameters for the speaker encoder, grouped under one mapping.
	# The child keys must be nested: left at the parent's level they would turn
	# `speaker_encoder_config` into an empty value and leak generic names such as
	# `dropout` and `num_heads` into the top-level namespace.
	# NOTE(review): no other key in the visible config references this mapping;
	# presumably the training code reads it directly - confirm.
	speaker_encoder_config:
	  mel_dim: 80 # same value as feat_extractor's num_mels
	  model_dim: 256 # Reduced from 512
	  output_dim: !ref <spk_embed_dim>
	  num_blocks: 4 # Reduced from 6
	  num_heads: 4 # Reduced from 8
	  kernel_size: 1
	  dropout: 0.1
	  # Same literal value as llm's max_conditioning_inputs; keep the two in sync.
	  max_conditioning_inputs: 2 # Reduced from 3

	# Smaller LLM model for HF
	# `!new:` builds a Qwen2LM from the keyword arguments nested below. Two of
	# those arguments are themselves tagged nodes: `llm` (a Qwen2Encoder instance)
	# and `sampling` (the ras_sampling function with the listed keyword arguments
	# bound). The inner `llm` key must be nested; at the top level it would be a
	# duplicate of this block's own key.
	llm: !new:cosyvoice.llm.llm.Qwen2LM
	  llm_input_size: !ref <llm_input_size>
	  llm_output_size: !ref <llm_output_size>
	  speech_token_size: 6561 # same value as flow's vocab_size
	  length_normalized_loss: True
	  lsm_weight: 0
	  mix_ratio: [3, 10] # Reduced from [5, 15]
	  use_speaker_encoder: !ref <use_speaker_encoder>
	  spk_embed_dim: !ref <spk_embed_dim>
	  # Same literal value as speaker_encoder_config's max_conditioning_inputs.
	  max_conditioning_inputs: 2
	  llm: !new:cosyvoice.llm.llm.Qwen2Encoder
	    pretrain_path: !ref <qwen_pretrain_path>
	  sampling: !name:cosyvoice.utils.common.ras_sampling
	    top_p: 0.8
	    top_k: 25
	    win_size: 8 # Reduced from 10
	    tau_r: 0.1

	# Processor that extracts a reference mel from speech; used as one stage of
	# data_pipeline. The `!name:` tag is kept on the key's own line and its keyword
	# arguments are nested, so `feat_extractor` and `sample_rate` here are
	# arguments rather than duplicates of the top-level keys of the same name.
	extract_reference_mel: !name:cosyvoice.dataset.processor.extract_reference_mel_from_speech
	  feat_extractor: !ref <feat_extractor>
	  # NOTE(review): min_length/max_length are presumably in seconds - confirm in the processor.
	  min_length: 0.5
	  max_length: 3.0 # Reduced from 4.0
	  num_crops: 1
	  training: True
	  sample_rate: !ref <sample_rate>

	# Smaller Flow model for HF
	# `!new:` builds a CausalMaskedDiffWithXvec from the keyword arguments nested
	# below. `encoder` and `decoder` are nested constructor calls; `decoder` in
	# turn nests `cfm_params` (an omegaconf DictConfig built from `content`) and
	# `estimator`. Nesting matters: several argument names (input_size,
	# output_size, in_channels, static_chunk_size, ...) repeat at different levels.
	flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
	  input_size: 256 # Reduced from 512
	  output_size: 64
	  spk_embed_dim: !ref <spk_embed_dim>
	  output_type: 'mel'
	  vocab_size: 6561 # same value as llm's speech_token_size
	  input_frame_rate: !ref <token_frame_rate>
	  only_mask_loss: True
	  token_latent_ratio: !ref <token_latent_ratio>
	  pre_lookahead_len: 2 # Reduced from 3
	  use_speaker_encoder: !ref <use_speaker_encoder>
	  freeze_speaker_encoder: True
	  speaker_encoder_path: !ref <speaker_encoder_path>
	  encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
	    output_size: 256 # Reduced from 512
	    attention_heads: 4 # Reduced from 8
	    linear_units: 1024 # Reduced from 2048
	    num_blocks: 4 # Reduced from 6
	    dropout_rate: 0.1
	    positional_dropout_rate: 0.1
	    attention_dropout_rate: 0.1
	    normalize_before: True
	    input_layer: 'linear'
	    pos_enc_layer_type: 'rel_pos_espnet'
	    selfattention_layer_type: 'rel_selfattn'
	    input_size: 256 # Reduced from 512
	    use_cnn_module: False
	    macaron_style: False
	    static_chunk_size: !ref <chunk_size>
	  decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
	    # NOTE(review): confirm in_channels (240 here, 320 in the estimator) and
	    # spk_emb_dim (80) are consistent with output_size / out_channels = 64.
	    in_channels: 240
	    n_spks: 1
	    spk_emb_dim: 80
	    cfm_params: !new:omegaconf.DictConfig
	      content:
	        sigma_min: 1e-06
	        solver: 'euler'
	        t_scheduler: 'cosine'
	        training_cfg_rate: 0.1 # Reduced from 0.2
	        inference_cfg_rate: 0.5 # Reduced from 0.7
	        reg_loss_type: 'l1'
	        # NOTE(review): the four keys below are placed inside `content`
	        # because they directly follow the other cfm_params entries - confirm
	        # CausalConditionalCFM reads them from cfm_params and not as
	        # constructor arguments.
	        use_immiscible: True
	        immiscible_k: 4 # Reduced from 8
	        use_contrastive_fm: True
	        contrastive_lambda: 0.03 # Reduced from 0.05
	    estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
	      in_channels: 320
	      out_channels: 64
	      channels: [128] # Reduced from [256]
	      dropout: 0.0
	      attention_head_dim: 32 # Reduced from 64
	      n_blocks: 3 # Reduced from 4
	      num_mid_blocks: 8 # Reduced from 12
	      num_heads: 4 # Reduced from 8
	      act_fn: 'gelu'
	      # Arithmetic inside `!ref`: 16 * 3 = 48 with the values in this file.
	      static_chunk_size: !ref <chunk_size> * <token_latent_ratio>
	      num_decoding_left_chunks: !ref <num_decoding_left_chunks>

	# Processor functions (unchanged)
	# Each `!name:` entry refers to a function; keys nested under it are keyword
	# arguments for that function.
	individual_file_opener: !name:cosyvoice.dataset.processor.individual_file_opener
	# Defined but not used by data_pipeline, which uses individual_file_opener.
	parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
	get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
	  token_path: !ref <qwen_pretrain_path>
	  skip_special_tokens: True
	# Top-level on purpose: `tokenize` pulls it in through `!ref <allowed_special>`.
	allowed_special: 'all'
	tokenize: !name:cosyvoice.dataset.processor.tokenize
	  get_tokenizer: !ref <get_tokenizer>
	  allowed_special: !ref <allowed_special>
	# Length-based sample filter; the four bounds are its keyword arguments.
	# NOTE(review): units of max_length/min_length are not visible here - confirm in the processor.
	filter: !name:cosyvoice.dataset.processor.filter
	  max_length: 20480 # Reduced from 40960
	  min_length: 100
	  token_max_length: 150 # Reduced from 200
	  token_min_length: 1
	resample: !name:cosyvoice.dataset.processor.resample
	  resample_rate: !ref <sample_rate>
	# Defined but not listed in data_pipeline.
	truncate: !name:cosyvoice.dataset.processor.truncate
	  truncate_length: 12240 # Reduced from 24480
	# Mel-spectrogram function shared by extract_reference_mel and compute_fbank.
	# hop_size 480 at sample_rate 24000 gives 50 frames per second, which equals
	# token_frame_rate (25) * token_mel_ratio (2).
	feat_extractor: !name:matcha.utils.audio.mel_spectrogram
	  n_fft: 1920
	  num_mels: 80
	  sampling_rate: !ref <sample_rate>
	  hop_size: 480
	  win_size: 1920
	  fmin: 0
	  fmax: 8000
	  center: False
	compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
	  feat_extractor: !ref <feat_extractor>
	  token_mel_ratio: !ref <token_mel_ratio>
	# Shuffling, sorting, batching and padding stages; each nested key is a
	# keyword argument of the function named on the line above it.
	shuffle: !name:cosyvoice.dataset.processor.shuffle
	  shuffle_size: 500 # Reduced from 1000
	sort: !name:cosyvoice.dataset.processor.sort
	  sort_size: 250 # Reduced from 500
	batch: !name:cosyvoice.dataset.processor.batch
	  batch_type: 'dynamic'
	  max_frames_in_batch: 2500 # Reduced from 5000
	padding: !name:cosyvoice.dataset.processor.padding
	  # Must stay nested: at the top level this would be a self-referencing
	  # duplicate of the `use_speaker_encoder` key defined with the fixed params.
	  use_speaker_encoder: !ref <use_speaker_encoder>

	# dataset processor pipeline
	# Ordered list of the processor functions defined in this file. `truncate`
	# and `parquet_opener` are defined but deliberately absent from this list.
	# The flow sequence is indented so it parses as the value of `data_pipeline`.
	data_pipeline:
	  [
	    !ref <individual_file_opener>,
	    !ref <tokenize>,
	    !ref <filter>,
	    !ref <resample>,
	    !ref <extract_reference_mel>,
	    !ref <compute_fbank>,
	    !ref <shuffle>,
	    !ref <sort>,
	    !ref <batch>,
	    !ref <padding>,
	  ]

	# HF optimized training configuration
	# All training settings live under `train_conf`; `optim_conf` and
	# `scheduler_conf` are nested mappings holding the options for the optimizer
	# and scheduler named on the line above each of them.
	train_conf:
	  optim: adamw
	  optim_conf:
	    lr: 3e-5 # Reduced from 5e-5
	  scheduler: constantlr
	  scheduler_conf:
	    warmup_steps: 200 # Reduced from 500
	  max_epoch: 50 # Reduced from 2000
	  grad_clip: 1
	  accum_grad: 2 # Added gradient accumulation
	  log_interval: 10 # Increased from 5
	  save_per_step: 1000 # Reduced from 2000
	  # NOTE(review): assumed to belong to train_conf because it follows the
	  # other train settings - confirm where the trainer reads total_iters.
	  total_iters: 100000 # Reduced from 1000000000