From eb43576ed00902a5c0d5c05f5b50f9eebda3a0e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=98=89=E6=B8=8A?= <wangjiaming.wjm@alibaba-inc.com>
Date: Mon, 15 May 2023 13:54:46 +0800
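Subject: [PATCH] Update aishell paraformer/paraformerbert configs and run.sh

- paraformer: switch predictor from cif_predictor_v2 to cif_predictor.
- paraformerbert: add a wav_frontend section, replace the top-level
  batch_type/batch_bins/num_workers settings with a dataset_conf block,
  set accum_grad to 1 and max_epoch to 150, add tail_threshold to
  predictor_conf, and drop allow_variable_data_keys.
- paraformerbert/run.sh: comment out the embedding-extraction step in
  stage 3.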

---
 ..._paraformer_conformer_12e_6d_2048_256.yaml |  2 +-
 ...aformerbert_conformer_12e_6d_2048_256.yaml | 37 ++++++++++++-------
 egs/aishell/paraformerbert/run.sh             | 12 +++---
 3 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/egs/aishell/paraformer/conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml b/egs/aishell/paraformer/conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml
index 6073f1fd5..bac8d0497 100644
--- a/egs/aishell/paraformer/conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml
+++ b/egs/aishell/paraformer/conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml
@@ -84,7 +84,7 @@ specaug_conf:
     - 40
     num_time_mask: 2
 
-predictor: cif_predictor_v2
+predictor: cif_predictor
 predictor_conf:
     idim: 256
     threshold: 1.0
diff --git a/egs/aishell/paraformerbert/conf/train_asr_paraformerbert_conformer_12e_6d_2048_256.yaml b/egs/aishell/paraformerbert/conf/train_asr_paraformerbert_conformer_12e_6d_2048_256.yaml
index f51a2ea3f..8f3f06720 100644
--- a/egs/aishell/paraformerbert/conf/train_asr_paraformerbert_conformer_12e_6d_2048_256.yaml
+++ b/egs/aishell/paraformerbert/conf/train_asr_paraformerbert_conformer_12e_6d_2048_256.yaml
@@ -29,6 +29,17 @@ decoder_conf:
     self_attention_dropout_rate: 0.0
     src_attention_dropout_rate: 0.0
 
+# frontend related
+frontend: wav_frontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+
 # hybrid CTC/attention
 model: paraformer_bert
 model_conf:
@@ -41,19 +52,10 @@ model_conf:
     embed_dims: 768
     embeds_loss_weight: 2.0
 
-
-
-# minibatch related
-#batch_type: length
-#batch_bins: 40000
-batch_type: numel
-batch_bins: 2000000
-num_workers: 16
-
 # optimization related
-accum_grad: 4
+accum_grad: 1
 grad_clip: 5
-max_epoch: 50
+max_epoch: 150
 val_scheduler_criterion:
     - valid
     - acc
@@ -92,8 +94,17 @@ predictor_conf:
     threshold: 1.0
     l_order: 1
     r_order: 1
+    tail_threshold: 0.45
 
+dataset_conf:
+    shuffle: True
+    shuffle_conf:
+        shuffle_size: 2048
+        sort_size: 500
+    batch_conf:
+        batch_type: token
+        batch_size: 25000
+    num_workers: 8
 
 log_interval: 50
-normalize: None
-allow_variable_data_keys: true
\ No newline at end of file
+normalize: None
\ No newline at end of file
diff --git a/egs/aishell/paraformerbert/run.sh b/egs/aishell/paraformerbert/run.sh
index e0245f35a..b46d4e47b 100755
--- a/egs/aishell/paraformerbert/run.sh
+++ b/egs/aishell/paraformerbert/run.sh
@@ -111,12 +111,12 @@ fi
 world_size=$gpu_num  # run on one machine
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     echo "stage 3: Training"
-    if ! "${skip_extract_embed}"; then
-        echo "extract embeddings..."
-        local/extract_embeds.sh \
-            --bert_model_name ${bert_model_name} \
-            --raw_dataset_path ${feats_dir}
-    fi
+#    if ! "${skip_extract_embed}"; then
+#        echo "extract embeddings..."
+#        local/extract_embeds.sh \
+#            --bert_model_name ${bert_model_name} \
+#            --raw_dataset_path ${feats_dir}
+#    fi
     mkdir -p ${exp_dir}/exp/${model_dir}
     mkdir -p ${exp_dir}/exp/${model_dir}/log
     INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init