update repo

2025-09-15 14:48:36 +08:00 · 2023-05-11 15:05:57 +08:00 · 2023-05-11 15:05:57 +08:00 · 280593676b
commit 280593676b
parent cf3644f96c
3 changed files with 21 additions and 169 deletions
--- a/egs/librispeech/conformer/conf/train_asr_conformer_uttnorm.yaml
+++ b/egs/librispeech/conformer/conf/train_asr_conformer_uttnorm.yaml
@@ -1,80 +0,0 @@
-encoder: conformer
-encoder_conf:
-    output_size: 512
-    attention_heads: 8
-    linear_units: 2048
-    num_blocks: 12
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.1
-    input_layer: conv2d
-    normalize_before: true
-    macaron_style: true
-    rel_pos_type: latest
-    pos_enc_layer_type: rel_pos
-    selfattention_layer_type: rel_selfattn
-    activation_type: swish
-    use_cnn_module: true
-    cnn_module_kernel: 31
-
-decoder: transformer
-decoder_conf:
-    attention_heads: 8
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.1
-    src_attention_dropout_rate: 0.1
-
-model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1
-    length_normalized_loss: false
-
-accum_grad: 2
-max_epoch: 50
-patience: none
-init: none
-best_model_criterion:
--   - valid
-    - acc
-    - max
-keep_nbest_models: 10
-
-optim: adam
-optim_conf:
-    lr: 0.0025
-    weight_decay: 0.000001
-scheduler: warmuplr
-scheduler_conf:
-    warmup_steps: 40000
-
-specaug: specaug
-specaug_conf:
-    apply_time_warp: true
-    time_warp_window: 5
-    time_warp_mode: bicubic
-    apply_freq_mask: true
-    freq_mask_width_range:
-    - 0
-    - 27
-    num_freq_mask: 2
-    apply_time_mask: true
-    time_mask_width_ratio_range:
-    - 0.
-    - 0.05
-    num_time_mask: 10
-
-dataset_conf:
-    shuffle: True
-    shuffle_conf:
-        shuffle_size: 1024
-        sort_size: 500
-    batch_conf:
-        batch_type: token
-        batch_size: 10000
-    num_workers: 8
-
-log_interval: 50
-normalize: utterance_mvn
--- a/egs/librispeech_100h/conformer/conf/train_asr_conformer.yaml
+++ b/egs/librispeech_100h/conformer/conf/train_asr_conformer.yaml
@@ -1,8 +1,8 @@
 encoder: conformer
 encoder_conf:
-    output_size: 512
-    attention_heads: 8
-    linear_units: 2048
+    output_size: 256
+    attention_heads: 4
+    linear_units: 1024
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
@@ -19,7 +19,7 @@ encoder_conf:

 decoder: transformer
 decoder_conf:
-    attention_heads: 8
+    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
@@ -27,13 +27,25 @@ decoder_conf:
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

+# frontend related
+frontend: wav_frontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+
+# hybrid CTC/attention
 model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false

-accum_grad: 2
-max_epoch: 50
+accum_grad: 1
+max_epoch: 210
 patience: none
 init: none
 best_model_criterion:
@@ -44,11 +56,11 @@ keep_nbest_models: 10

 optim: adam
 optim_conf:
-    lr: 0.0025
+    lr: 0.002
    weight_decay: 0.000001
 scheduler: warmuplr
 scheduler_conf:
-    warmup_steps: 40000
+    warmup_steps: 15000

 specaug: specaug
 specaug_conf:
@@ -64,7 +76,7 @@ specaug_conf:
    time_mask_width_ratio_range:
    - 0.
    - 0.05
-    num_time_mask: 10
+    num_time_mask: 5

 dataset_conf:
    shuffle: True
--- a/egs/librispeech_100h/conformer/conf/train_asr_conformer_uttnorm.yaml
+++ b/egs/librispeech_100h/conformer/conf/train_asr_conformer_uttnorm.yaml
@@ -1,80 +0,0 @@
-encoder: conformer
-encoder_conf:
-    output_size: 512
-    attention_heads: 8
-    linear_units: 2048
-    num_blocks: 12
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.1
-    input_layer: conv2d
-    normalize_before: true
-    macaron_style: true
-    rel_pos_type: latest
-    pos_enc_layer_type: rel_pos
-    selfattention_layer_type: rel_selfattn
-    activation_type: swish
-    use_cnn_module: true
-    cnn_module_kernel: 31
-
-decoder: transformer
-decoder_conf:
-    attention_heads: 8
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.1
-    src_attention_dropout_rate: 0.1
-
-model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1
-    length_normalized_loss: false
-
-accum_grad: 2
-max_epoch: 50
-patience: none
-init: none
-best_model_criterion:
--   - valid
-    - acc
-    - max
-keep_nbest_models: 10
-
-optim: adam
-optim_conf:
-    lr: 0.0025
-    weight_decay: 0.000001
-scheduler: warmuplr
-scheduler_conf:
-    warmup_steps: 40000
-
-specaug: specaug
-specaug_conf:
-    apply_time_warp: true
-    time_warp_window: 5
-    time_warp_mode: bicubic
-    apply_freq_mask: true
-    freq_mask_width_range:
-    - 0
-    - 27
-    num_freq_mask: 2
-    apply_time_mask: true
-    time_mask_width_ratio_range:
-    - 0.
-    - 0.05
-    num_time_mask: 10
-
-dataset_conf:
-    shuffle: True
-    shuffle_conf:
-        shuffle_size: 1024
-        sort_size: 500
-    batch_conf:
-        batch_type: token
-        batch_size: 10000
-    num_workers: 8
-
-log_interval: 50
-normalize: utterance_mvn