update saasr egs and m2met docs

This commit is contained in:
yhliang 2023-05-11 15:54:02 +08:00
parent d788b6d5a6
commit 062eb8ff5a
27 changed files with 70 additions and 148 deletions

View File

@ -31,4 +31,4 @@ For more details you can see [here](https://github.com/alibaba-damo-academy/FunA
## Baseline results
The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker labels during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on the Eval and Test sets are also provided to show the impact of speaker profile accuracy.
![baseline result](images/baseline_result.png)
![baseline_result](images/baseline_result.png)
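As a hedged illustration of the clustering step described above (not the baseline's exact implementation; the embedding dimension, cluster count, and data here are toy assumptions), a speaker profile can be built by spectrally clustering segment-level speaker embeddings and averaging within each cluster:

```python
import numpy as np
from sklearn.cluster import SpectralClustering

# Toy stand-in for segment-level speaker embeddings (8-dim, two speakers);
# real systems extract higher-dimensional embeddings per speech segment.
rng = np.random.default_rng(0)
embeddings = np.vstack(
    [rng.normal(loc=m, scale=0.05, size=(10, 8)) for m in (0.0, 1.0)]
)

# Cluster segments into putative speakers, then average each cluster's
# embeddings to obtain one profile vector per estimated speaker.
labels = SpectralClustering(n_clusters=2, random_state=0).fit_predict(embeddings)
profiles = np.stack([embeddings[labels == k].mean(axis=0) for k in range(2)])
print(profiles.shape)  # (2, 8)
```

The resulting `profiles` matrix plays the role of the speaker profile fed to the SA-ASR model when oracle speaker labels are unavailable.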

View File

@ -10,14 +10,14 @@ Building on the success of the previous M2MeT challenge, we are excited to propo
## Timeline(AOE Time)
- $ April~29, 2023: $ Challenge and registration open.
- $ May~8, 2023: $ Baseline release.
- $ May~15, 2023: $ Registration deadline, the due date for participants to join the Challenge.
- $ June~9, 2023: $ Test data release and leaderboard open.
- $ June~13, 2023: $ Final submission deadline and leaderboard close.
- $ June~19, 2023: $ Evaluation result and ranking release.
- $ May~11, 2023: $ Baseline release.
- $ May~22, 2023: $ Registration deadline, the due date for participants to join the Challenge.
- $ June~16, 2023: $ Test data release and leaderboard open.
- $ June~20, 2023: $ Final submission deadline and leaderboard close.
- $ June~26, 2023: $ Evaluation result and ranking release.
- $ July~3, 2023: $ Deadline for paper submission.
- $ July~10, 2023: $ Deadline for final paper submission.
- $ December~12\ to\ 16, 2023: $ ASRU Workshop and challenge Session
- $ December~12\ to\ 16, 2023: $ ASRU Workshop and Challenge Session.
## Guidelines

View File

@ -157,7 +157,7 @@ Before running <code class="docutils literal notranslate"><span class="pre">run.
<section id="baseline-results">
<h2>Baseline results<a class="headerlink" href="#baseline-results" title="Permalink to this heading"></a></h2>
<p>The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker labels during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on the Eval and Test sets are also provided to show the impact of speaker profile accuracy.</p>
<p><img alt="baseline result" src="_images/baseline_result.png" /></p>
<p><img alt="baseline_result" src="_images/baseline_result.png" /></p>
</section>
</section>

View File

@ -136,14 +136,14 @@
<h2>Timeline(AOE Time)<a class="headerlink" href="#timeline-aoe-time" title="Permalink to this heading"></a></h2>
<ul class="simple">
<li><p><span class="math notranslate nohighlight">\( April~29, 2023: \)</span> Challenge and registration open.</p></li>
<li><p><span class="math notranslate nohighlight">\( May~8, 2023: \)</span> Baseline release.</p></li>
<li><p><span class="math notranslate nohighlight">\( May~15, 2023: \)</span> Registration deadline, the due date for participants to join the Challenge.</p></li>
<li><p><span class="math notranslate nohighlight">\( June~9, 2023: \)</span> Test data release and leaderboard open.</p></li>
<li><p><span class="math notranslate nohighlight">\( June~13, 2023: \)</span> Final submission deadline and leaderboard close.</p></li>
<li><p><span class="math notranslate nohighlight">\( June~19, 2023: \)</span> Evaluation result and ranking release.</p></li>
<li><p><span class="math notranslate nohighlight">\( May~11, 2023: \)</span> Baseline release.</p></li>
<li><p><span class="math notranslate nohighlight">\( May~22, 2023: \)</span> Registration deadline, the due date for participants to join the Challenge.</p></li>
<li><p><span class="math notranslate nohighlight">\( June~16, 2023: \)</span> Test data release and leaderboard open.</p></li>
<li><p><span class="math notranslate nohighlight">\( June~20, 2023: \)</span> Final submission deadline and leaderboard close.</p></li>
<li><p><span class="math notranslate nohighlight">\( June~26, 2023: \)</span> Evaluation result and ranking release.</p></li>
<li><p><span class="math notranslate nohighlight">\( July~3, 2023: \)</span> Deadline for paper submission.</p></li>
<li><p><span class="math notranslate nohighlight">\( July~10, 2023: \)</span> Deadline for final paper submission.</p></li>
<li><p><span class="math notranslate nohighlight">\( December~12\ to\ 16, 2023: \)</span> ASRU Workshop and challenge Session</p></li>
<li><p><span class="math notranslate nohighlight">\( December~12\ to\ 16, 2023: \)</span> ASRU Workshop and Challenge Session.</p></li>
</ul>
</section>
<section id="guidelines">

Binary file not shown (image, 144 KiB → 119 KiB)

Binary file not shown (image, 183 KiB → 152 KiB)

File diff suppressed because one or more lines are too long

Binary file not shown (image, 144 KiB → 119 KiB)

Binary file not shown (image, 183 KiB → 152 KiB)

Binary file not shown (image, 144 KiB → 119 KiB)

Binary file not shown (image, 183 KiB → 152 KiB)

View File

@ -29,4 +29,5 @@ data/Test_2023_Ali_far
For more details about the baseline system, see [here](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs/alimeeting/sa-asr/README.md).
## Baseline results
The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker labels during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on the Eval and Test sets are also provided to show the impact of speaker profile accuracy.
![baseline result](images/baseline_result.png)
![baseline_result](images/baseline_result.png)

View File

@ -157,8 +157,8 @@
</section>
<section id="id4">
<h2>Baseline results<a class="headerlink" href="#id4" title="Permalink to this heading"></a></h2>
<p>The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker labels during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on the Eval and Test sets are also provided to show the impact of speaker profile accuracy.
<img alt="baseline result" src="_images/baseline_result.png" /></p>
<p>The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker labels during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on the Eval and Test sets are also provided to show the impact of speaker profile accuracy.</p>
<p><img alt="baseline_result" src="_images/baseline_result.png" /></p>
</section>
</section>

Binary file not shown (image, 144 KiB → 119 KiB)

Binary file not shown (image, 183 KiB → 152 KiB)

View File

@ -19,7 +19,7 @@ stage 6: Generate speaker profiles (Stage 6 takes a lot of time).
stage 7 - 9: Language model training (Optional).
stage 10 - 11: ASR training (SA-ASR requires loading the pre-trained ASR model).
stage 12: SA-ASR training.
stage 13 - 18: Inference and evaluation.
stage 13 - 16: Inference and evaluation.
```
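The evaluation stages above score hypothesis text against reference text at the character level. A minimal sketch of a character error rate (CER) computation follows; the recipe's actual `utils/compute_wer.py` may apply additional text normalization:

```python
def edit_distance(ref: str, hyp: str) -> int:
    # Levenshtein distance over characters via dynamic programming.
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        cur = [i]
        for j, h in enumerate(hyp, 1):
            cur.append(min(prev[j] + 1,               # deletion
                           cur[j - 1] + 1,            # insertion
                           prev[j - 1] + (r != h)))   # substitution
        prev = cur
    return prev[-1]

def cer(ref: str, hyp: str) -> float:
    # Character error rate: edit operations normalized by reference length.
    return edit_distance(ref, hyp) / max(len(ref), 1)

print(cer("abcd", "abxd"))  # 0.25
```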
Before running `run_m2met_2023_infer.sh`, you need to place the new test set `Test_2023_Ali_far` (to be released after the challenge starts) in the `./dataset` directory; it contains only the raw audio. Then put the given `wav.scp`, `wav_raw.scp`, `segments`, `utt2spk` and `spk2utt` in the `./data/Test_2023_Ali_far` directory.
```shell
@ -37,6 +37,10 @@ stage 2: Generate speaker profiles for inference.
stage 3: Inference.
stage 4: Generation of SA-ASR results required for final submission.
```
The baseline model is available on [ModelScope](https://www.modelscope.cn/models/damo/speech_saasr_asr-zh-cn-16k-alimeeting/summary).
After generating the stats of the AliMeeting corpus (stage 10 in `run.sh`), you can set `infer_with_pretrained_model=true` in `run.sh` to run inference with our official baseline model released on ModelScope without training.
# Format of Final Submission
Finally, you need to submit a file called `text_spk_merge` with the following format:
```shell

View File

@ -107,8 +107,8 @@ inference_asr_model=valid.acc.ave.pb # ASR model path for decoding.
# inference_asr_model=valid.acc.best.pth
# inference_asr_model=valid.loss.ave.pth
inference_sa_asr_model=valid.acc_spk.ave.pb
download_model= # Download a model from Model Zoo and use it for decoding.
infer_with_pretrained_model=false # Use pretrained model for decoding
download_sa_asr_model= # Download the SA-ASR model from ModelScope and use it for decoding.
# [Task dependent] Set the datadir name created by local/data.sh
train_set= # Name of training set.
valid_set= # Name of validation set used for monitoring/tuning network training.
@ -203,7 +203,8 @@ Options:
# Note that it will overwrite args in inference config.
--inference_lm # Language model path for decoding (default="${inference_lm}").
--inference_asr_model # ASR model path for decoding (default="${inference_asr_model}").
--download_model # Download a model from Model Zoo and use it for decoding (default="${download_model}").
--infer_with_pretrained_model # Use pretrained model for decoding (default="${infer_with_pretrained_model}").
--download_sa_asr_model # Download the SA-ASR model from ModelScope and use it for decoding (default="${download_sa_asr_model}").
# [Task dependent] Set the datadir name created by local/data.sh
--train_set # Name of training set (required).
@ -304,6 +305,9 @@ else
lm_token_type="${token_type}"
fi
if ${infer_with_pretrained_model}; then
skip_train=true
fi
# Set tag for naming of model directory
if [ -z "${asr_tag}" ]; then
@ -1220,119 +1224,20 @@ else
log "Skip the training stages"
fi
if ${infer_with_pretrained_model}; then
log "Use ${download_sa_asr_model} for decoding and evaluation"
sa_asr_exp="${expdir}/${download_sa_asr_model}"
mkdir -p "${sa_asr_exp}"
python local/download_pretrained_model_from_modelscope.py $download_sa_asr_model ${expdir}
inference_sa_asr_model="model.pb"
inference_config=${sa_asr_exp}/decoding.yaml
fi
if ! "${skip_eval}"; then
if [ ${stage} -le 13 ] && [ ${stop_stage} -ge 13 ]; then
log "Stage 13: Decoding multi-talker ASR: training_dir=${asr_exp}"
if ${gpu_inference}; then
_cmd="${cuda_cmd}"
inference_nj=$[${ngpu}*${njob_infer}]
_ngpu=1
else
_cmd="${decode_cmd}"
inference_nj=$inference_nj
_ngpu=0
fi
_opts=
if [ -n "${inference_config}" ]; then
_opts+="--config ${inference_config} "
fi
if "${use_lm}"; then
if "${use_word_lm}"; then
_opts+="--word_lm_train_config ${lm_exp}/config.yaml "
_opts+="--word_lm_file ${lm_exp}/${inference_lm} "
else
_opts+="--lm_train_config ${lm_exp}/config.yaml "
_opts+="--lm_file ${lm_exp}/${inference_lm} "
fi
fi
# 2. Generate run.sh
log "Generate '${asr_exp}/${inference_tag}/run.sh'. You can resume the process from stage 13 using this script"
mkdir -p "${asr_exp}/${inference_tag}"; echo "${run_args} --stage 13 \"\$@\"; exit \$?" > "${asr_exp}/${inference_tag}/run.sh"; chmod +x "${asr_exp}/${inference_tag}/run.sh"
for dset in ${test_sets}; do
_data="${data_feats}/${dset}"
_dir="${asr_exp}/${inference_tag}/${dset}"
_logdir="${_dir}/logdir"
mkdir -p "${_logdir}"
_feats_type="$(<${_data}/feats_type)"
if [ "${_feats_type}" = raw ]; then
_scp=wav.scp
if [[ "${audio_format}" == *ark* ]]; then
_type=kaldi_ark
else
_type=sound
fi
else
_scp=feats.scp
_type=kaldi_ark
fi
# 1. Split the key file
key_file=${_data}/${_scp}
split_scps=""
_nj=$(min "${inference_nj}" "$(<${key_file} wc -l)")
echo $_nj
for n in $(seq "${_nj}"); do
split_scps+=" ${_logdir}/keys.${n}.scp"
done
# shellcheck disable=SC2086
utils/split_scp.pl "${key_file}" ${split_scps}
# 2. Submit decoding jobs
log "Decoding started... log: '${_logdir}/asr_inference.*.log'"
${_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
python -m funasr.bin.asr_inference_launch \
--batch_size 1 \
--mc True \
--nbest 1 \
--ngpu "${_ngpu}" \
--njob ${njob_infer} \
--gpuid_list ${device} \
--data_path_and_name_and_type "${_data}/${_scp},speech,${_type}" \
--key_file "${_logdir}"/keys.JOB.scp \
--asr_train_config "${asr_exp}"/config.yaml \
--asr_model_file "${asr_exp}"/"${inference_asr_model}" \
--output_dir "${_logdir}"/output.JOB \
--mode asr \
${_opts}
# 3. Concatenates the output files from each jobs
for f in token token_int score text; do
for i in $(seq "${_nj}"); do
cat "${_logdir}/output.${i}/1best_recog/${f}"
done | LC_ALL=C sort -k1 >"${_dir}/${f}"
done
done
fi
if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
log "Stage 14: Scoring multi-talker ASR"
for dset in ${test_sets}; do
_data="${data_feats}/${dset}"
_dir="${asr_exp}/${inference_tag}/${dset}"
python utils/proce_text.py ${_data}/text ${_data}/text.proc
python utils/proce_text.py ${_dir}/text ${_dir}/text.proc
python utils/compute_wer.py ${_data}/text.proc ${_dir}/text.proc ${_dir}/text.cer
tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
cat ${_dir}/text.cer.txt
done
fi
if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ]; then
log "Stage 15: Decoding SA-ASR (oracle profile): training_dir=${sa_asr_exp}"
log "Stage 13: Decoding SA-ASR (oracle profile): training_dir=${sa_asr_exp}"
if ${gpu_inference}; then
_cmd="${cuda_cmd}"
@ -1423,8 +1328,8 @@ if ! "${skip_eval}"; then
done
fi
if [ ${stage} -le 16 ] && [ ${stop_stage} -ge 16 ]; then
log "Stage 16: Scoring SA-ASR (oracle profile)"
if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
log "Stage 14: Scoring SA-ASR (oracle profile)"
for dset in ${test_sets}; do
_data="${data_feats}/${dset}"
@ -1448,8 +1353,8 @@ if ! "${skip_eval}"; then
fi
if [ ${stage} -le 17 ] && [ ${stop_stage} -ge 17 ]; then
log "Stage 17: Decoding SA-ASR (cluster profile): training_dir=${sa_asr_exp}"
if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ]; then
log "Stage 15: Decoding SA-ASR (cluster profile): training_dir=${sa_asr_exp}"
if ${gpu_inference}; then
_cmd="${cuda_cmd}"
@ -1539,8 +1444,8 @@ if ! "${skip_eval}"; then
done
fi
if [ ${stage} -le 18 ] && [ ${stop_stage} -ge 18 ]; then
log "Stage 18: Scoring SA-ASR (cluster profile)"
if [ ${stage} -le 16 ] && [ ${stop_stage} -ge 16 ]; then
log "Stage 16: Scoring SA-ASR (cluster profile)"
for dset in ${test_sets}; do
_data="${data_feats}/${dset}"

View File

@ -0,0 +1,7 @@
import sys

from modelscope.hub.snapshot_download import snapshot_download

# Usage: python download_pretrained_model_from_modelscope.py <model_tag> <local_model_dir>
if __name__ == "__main__":
    model_tag = sys.argv[1]  # e.g. damo/speech_saasr_asr-zh-cn-16k-alimeeting
    local_model_dir = sys.argv[2]  # local cache directory for the snapshot
    model_dir = snapshot_download(model_tag, cache_dir=local_model_dir, revision='1.0.0')

View File

@ -8,8 +8,8 @@ set -o pipefail
ngpu=4
device="0,1,2,3"
stage=1
stop_stage=18
stage=12
stop_stage=13
train_set=Train_Ali_far
@ -18,6 +18,8 @@ test_sets="Test_Ali_far"
asr_config=conf/train_asr_conformer.yaml
sa_asr_config=conf/train_sa_asr_conformer.yaml
inference_config=conf/decode_asr_rnn.yaml
infer_with_pretrained_model=true
download_sa_asr_model="damo/speech_saasr_asr-zh-cn-16k-alimeeting"
lm_config=conf/train_lm_transformer.yaml
use_lm=false
@ -29,6 +31,8 @@ use_wordlm=false
--stop_stage ${stop_stage} \
--gpu_inference true \
--njob_infer 4 \
--infer_with_pretrained_model ${infer_with_pretrained_model} \
--download_sa_asr_model $download_sa_asr_model \
--asr_exp exp/asr_train_multispeaker_conformer_raw_zh_char_data_alimeeting \
--sa_asr_exp exp/sa_asr_train_conformer_raw_zh_char_data_alimeeting \
--asr_stats_dir exp/asr_stats_multispeaker_conformer_raw_zh_char_data_alimeeting \