# Patch summary (preamble text placed before the first "diff --git" header).
#
# Scope: egs/aishell/branchformer
#   1. Adds conf/train_asr_branchformer_bs16000.yaml (new file, 104 added lines per the hunk header).
#      Settings visible in the hunk: encoder "branchformer" (output_size 256, num_blocks 24,
#      merge_method concat), decoder "transformer" (num_blocks 6), frontend "wav_frontend"
#      (fs 16000, n_mels 80), ctc_weight 0.3, optim adam (lr 0.001) with scheduler warmuplr
#      (warmup_steps 35000), specaug enabled, batch_type token with batch_size 16000,
#      max_epoch 180, keep_nbest_models 10.
#   2. Edits run.sh so that asr_config points at the new YAML; the previous assignment
#      (conf/train_asr_branchformer.yaml) is kept as a commented-out line.
#
# Reviewer notes:
#   - NOTE(review): model_dir in run.sh is built from $(basename "${asr_config}" .yaml), so
#     switching asr_config also changes the model directory name.
#   - NOTE(review): the new YAML ends without a trailing newline (see the
#     "\ No newline at end of file" marker) - consider adding one in the source file.
#   - NOTE(review): "normalize: None" is the string "None" under plain YAML rules, not a null;
#     presumably the consuming code maps it to "no normalization" - confirm against the loader.
#   - NOTE(review): in this copy of the patch the line breaks inside the hunks are collapsed,
#     so the YAML nesting depth (e.g. which keys sit under shuffle_conf / batch_conf) cannot be
#     read off reliably here - verify against the original file before applying.
diff --git a/egs/aishell/branchformer/conf/train_asr_branchformer_bs16000.yaml b/egs/aishell/branchformer/conf/train_asr_branchformer_bs16000.yaml new file mode 100644 index 000000000..5f889d045 --- /dev/null +++ b/egs/aishell/branchformer/conf/train_asr_branchformer_bs16000.yaml @@ -0,0 +1,104 @@ +# network architecture +# encoder related +encoder: branchformer +encoder_conf: + output_size: 256 + use_attn: true + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + use_cgmlp: true + cgmlp_linear_units: 2048 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + merge_method: concat + cgmlp_weight: 0.5 # used only if merge_method is "fixed_ave" + attn_branch_drop_rate: 0.0 # used only if merge_method is "learned_ave" + num_blocks: 24 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d + stochastic_depth_rate: 0.0 + +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0. + src_attention_dropout_rate: 0. 
+ +# frontend related +frontend: wav_frontend +frontend_conf: + fs: 16000 + window: hamming + n_mels: 80 + frame_length: 25 + frame_shift: 10 + lfr_m: 1 + lfr_n: 1 + +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + +# optimization related +accum_grad: 1 +grad_clip: 5 +max_epoch: 180 +val_scheduler_criterion: + - valid + - acc +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 10 + +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 0.000001 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 35000 + +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0. + - 0.05 + num_time_mask: 10 + +dataset_conf: + data_names: speech,text + data_types: sound,text + shuffle: True + shuffle_conf: + shuffle_size: 2048 + sort_size: 500 + batch_conf: + batch_type: token + batch_size: 16000 + num_workers: 8 + +log_interval: 50 +normalize: None \ No newline at end of file diff --git a/egs/aishell/branchformer/run.sh b/egs/aishell/branchformer/run.sh index 6bb4a0cc5..86c12b1a2 100755 --- a/egs/aishell/branchformer/run.sh +++ b/egs/aishell/branchformer/run.sh @@ -46,7 +46,8 @@ train_set=train valid_set=dev test_sets="dev test" -asr_config=conf/train_asr_branchformer.yaml +#asr_config=conf/train_asr_branchformer.yaml +asr_config=conf/train_asr_branchformer_bs16000.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer.yaml