update saasr egs and m2met docs
@ -31,4 +31,4 @@ For more details you can see [here](https://github.com/alibaba-damo-academy/FunA
|
||||
## Baseline results
|
||||
The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker label during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on Eval and Test Set are also provided to show the impact of speaker profile accuracy.
|
||||
|
||||

|
||||

|
||||
@ -10,14 +10,14 @@ Building on the success of the previous M2MeT challenge, we are excited to propo
|
||||
|
||||
## Timeline(AOE Time)
|
||||
- $ April~29, 2023: $ Challenge and registration open.
|
||||
- $ May~8, 2023: $ Baseline release.
|
||||
- $ May~15, 2023: $ Registration deadline, the due date for participants to join the Challenge.
|
||||
- $ June~9, 2023: $ Test data release and leaderboard open.
|
||||
- $ June~13, 2023: $ Final submission deadline and leaderboar close.
|
||||
- $ June~19, 2023: $ Evaluation result and ranking release.
|
||||
- $ May~11, 2023: $ Baseline release.
|
||||
- $ May~22, 2023: $ Registration deadline, the due date for participants to join the Challenge.
|
||||
- $ June~16, 2023: $ Test data release and leaderboard open.
|
||||
- $ June~20, 2023: $ Final submission deadline and leaderboard close.
|
||||
- $ June~26, 2023: $ Evaluation result and ranking release.
|
||||
- $ July~3, 2023: $ Deadline for paper submission.
|
||||
- $ July~10, 2023: $ Deadline for final paper submission.
|
||||
- $ December~12\ to\ 16, 2023: $ ASRU Workshop and challenge Session
|
||||
- $ December~12\ to\ 16, 2023: $ ASRU Workshop and Challenge Session.
|
||||
|
||||
## Guidelines
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ Before running <code class="docutils literal notranslate"><span class="pre">run.
|
||||
<section id="baseline-results">
|
||||
<h2>Baseline results<a class="headerlink" href="#baseline-results" title="Permalink to this heading">¶</a></h2>
|
||||
<p>The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker label during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on Eval and Test Set are also provided to show the impact of speaker profile accuracy.</p>
|
||||
<p><img alt="baseline result" src="_images/baseline_result.png" /></p>
|
||||
<p><img alt="baseline_result" src="_images/baseline_result.png" /></p>
|
||||
</section>
|
||||
</section>
|
||||
|
||||
|
||||
@ -136,14 +136,14 @@
|
||||
<h2>Timeline(AOE Time)<a class="headerlink" href="#timeline-aoe-time" title="Permalink to this heading">¶</a></h2>
|
||||
<ul class="simple">
|
||||
<li><p><span class="math notranslate nohighlight">\( April~29, 2023: \)</span> Challenge and registration open.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( May~8, 2023: \)</span> Baseline release.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( May~15, 2023: \)</span> Registration deadline, the due date for participants to join the Challenge.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( June~9, 2023: \)</span> Test data release and leaderboard open.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( June~13, 2023: \)</span> Final submission deadline and leaderboar close.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( June~19, 2023: \)</span> Evaluation result and ranking release.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( May~11, 2023: \)</span> Baseline release.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( May~22, 2023: \)</span> Registration deadline, the due date for participants to join the Challenge.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( June~16, 2023: \)</span> Test data release and leaderboard open.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( June~20, 2023: \)</span> Final submission deadline and leaderboard close.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( June~26, 2023: \)</span> Evaluation result and ranking release.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( July~3, 2023: \)</span> Deadline for paper submission.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( July~10, 2023: \)</span> Deadline for final paper submission.</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( December~12\ to\ 16, 2023: \)</span> ASRU Workshop and challenge Session</p></li>
|
||||
<li><p><span class="math notranslate nohighlight">\( December~12\ to\ 16, 2023: \)</span> ASRU Workshop and Challenge Session.</p></li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="guidelines">
|
||||
|
||||
|
Before Width: | Height: | Size: 144 KiB After Width: | Height: | Size: 119 KiB |
|
Before Width: | Height: | Size: 183 KiB After Width: | Height: | Size: 152 KiB |
@ -31,4 +31,4 @@ For more details you can see [here](https://github.com/alibaba-damo-academy/FunA
|
||||
## Baseline results
|
||||
The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker label during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on Eval and Test Set are also provided to show the impact of speaker profile accuracy.
|
||||
|
||||

|
||||

|
||||
@ -10,14 +10,14 @@ Building on the success of the previous M2MeT challenge, we are excited to propo
|
||||
|
||||
## Timeline(AOE Time)
|
||||
- $ April~29, 2023: $ Challenge and registration open.
|
||||
- $ May~8, 2023: $ Baseline release.
|
||||
- $ May~15, 2023: $ Registration deadline, the due date for participants to join the Challenge.
|
||||
- $ June~9, 2023: $ Test data release and leaderboard open.
|
||||
- $ June~13, 2023: $ Final submission deadline and leaderboar close.
|
||||
- $ June~19, 2023: $ Evaluation result and ranking release.
|
||||
- $ May~11, 2023: $ Baseline release.
|
||||
- $ May~22, 2023: $ Registration deadline, the due date for participants to join the Challenge.
|
||||
- $ June~16, 2023: $ Test data release and leaderboard open.
|
||||
- $ June~20, 2023: $ Final submission deadline and leaderboard close.
|
||||
- $ June~26, 2023: $ Evaluation result and ranking release.
|
||||
- $ July~3, 2023: $ Deadline for paper submission.
|
||||
- $ July~10, 2023: $ Deadline for final paper submission.
|
||||
- $ December~12\ to\ 16, 2023: $ ASRU Workshop and challenge Session
|
||||
- $ December~12\ to\ 16, 2023: $ ASRU Workshop and Challenge Session.
|
||||
|
||||
## Guidelines
|
||||
|
||||
|
||||
|
Before Width: | Height: | Size: 144 KiB After Width: | Height: | Size: 119 KiB |
|
Before Width: | Height: | Size: 183 KiB After Width: | Height: | Size: 152 KiB |
|
Before Width: | Height: | Size: 144 KiB After Width: | Height: | Size: 119 KiB |
|
Before Width: | Height: | Size: 183 KiB After Width: | Height: | Size: 152 KiB |
@ -29,4 +29,5 @@ data/Test_2023_Ali_far
|
||||
更多基线系统详情见[此处](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs/alimeeting/sa-asr/README.md)
|
||||
## 基线结果
|
||||
基线系统的结果如表3所示。在训练期间,说话人档案采用了真实说话人嵌入。然而由于在评估过程中缺乏真实说话人标签,因此使用了由额外的谱聚类提供的说话人特征。同时我们还提供了在评估和测试集上使用真实说话人档案的结果,以显示说话人档案准确性的影响。
|
||||

|
||||
|
||||

|
||||
@ -157,8 +157,8 @@
|
||||
</section>
|
||||
<section id="id4">
|
||||
<h2>基线结果<a class="headerlink" href="#id4" title="此标题的永久链接">¶</a></h2>
|
||||
<p>基线系统的结果如表3所示。在训练期间,说话人档案采用了真实说话人嵌入。然而由于在评估过程中缺乏真实说话人标签,因此使用了由额外的谱聚类提供的说话人特征。同时我们还提供了在评估和测试集上使用真实说话人档案的结果,以显示说话人档案准确性的影响。
|
||||
<img alt="baseline result" src="_images/baseline_result.png" /></p>
|
||||
<p>基线系统的结果如表3所示。在训练期间,说话人档案采用了真实说话人嵌入。然而由于在评估过程中缺乏真实说话人标签,因此使用了由额外的谱聚类提供的说话人特征。同时我们还提供了在评估和测试集上使用真实说话人档案的结果,以显示说话人档案准确性的影响。</p>
|
||||
<p><img alt="baseline_result" src="_images/baseline_result.png" /></p>
|
||||
</section>
|
||||
</section>
|
||||
|
||||
|
||||
|
Before Width: | Height: | Size: 144 KiB After Width: | Height: | Size: 119 KiB |
|
Before Width: | Height: | Size: 183 KiB After Width: | Height: | Size: 152 KiB |
@ -29,4 +29,5 @@ data/Test_2023_Ali_far
|
||||
更多基线系统详情见[此处](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs/alimeeting/sa-asr/README.md)
|
||||
## 基线结果
|
||||
基线系统的结果如表3所示。在训练期间,说话人档案采用了真实说话人嵌入。然而由于在评估过程中缺乏真实说话人标签,因此使用了由额外的谱聚类提供的说话人特征。同时我们还提供了在评估和测试集上使用真实说话人档案的结果,以显示说话人档案准确性的影响。
|
||||

|
||||
|
||||

|
||||
@ -19,7 +19,7 @@ stage 6: Generate speaker profiles (Stage 6 takes a lot of time).
|
||||
stage 7 - 9: Language model training (Optional).
|
||||
stage 10 - 11: ASR training (SA-ASR requires loading the pre-trained ASR model).
|
||||
stage 12: SA-ASR training.
|
||||
stage 13 - 18: Inference and evaluation.
|
||||
stage 13 - 16: Inference and evaluation.
|
||||
```
|
||||
Before running `run_m2met_2023_infer.sh`, you need to place the new test set `Test_2023_Ali_far` (to be released after the challenge starts) in the `./dataset` directory, which contains only raw audios. Then put the given `wav.scp`, `wav_raw.scp`, `segments`, `utt2spk` and `spk2utt` in the `./data/Test_2023_Ali_far` directory.
|
||||
```shell
|
||||
@ -37,6 +37,10 @@ stage 2: Generate speaker profiles for inference.
|
||||
stage 3: Inference.
|
||||
stage 4: Generation of SA-ASR results required for final submission.
|
||||
```
|
||||
|
||||
The baseline model is available on [ModelScope](https://www.modelscope.cn/models/damo/speech_saasr_asr-zh-cn-16k-alimeeting/summary).
|
||||
After generating stats of the AliMeeting corpus (stage 10 in `run.sh`), you can set `infer_with_pretrained_model=true` in `run.sh` to infer with our official baseline model released on ModelScope without training.
|
||||
|
||||
# Format of Final Submission
|
||||
Finally, you need to submit a file called `text_spk_merge` with the following format:
|
||||
```shell
|
||||
|
||||
@ -107,8 +107,8 @@ inference_asr_model=valid.acc.ave.pb # ASR model path for decoding.
|
||||
# inference_asr_model=valid.acc.best.pth
|
||||
# inference_asr_model=valid.loss.ave.pth
|
||||
inference_sa_asr_model=valid.acc_spk.ave.pb
|
||||
download_model= # Download a model from Model Zoo and use it for decoding.
|
||||
|
||||
infer_with_pretrained_model=false # Use pretrained model for decoding
|
||||
download_sa_asr_model= # Download the SA-ASR model from ModelScope and use it for decoding.
|
||||
# [Task dependent] Set the datadir name created by local/data.sh
|
||||
train_set= # Name of training set.
|
||||
valid_set= # Name of validation set used for monitoring/tuning network training.
|
||||
@ -203,7 +203,8 @@ Options:
|
||||
# Note that it will overwrite args in inference config.
|
||||
--inference_lm # Language modle path for decoding (default="${inference_lm}").
|
||||
--inference_asr_model # ASR model path for decoding (default="${inference_asr_model}").
|
||||
--download_model # Download a model from Model Zoo and use it for decoding (default="${download_model}").
|
||||
--infer_with_pretrained_model # Use pretrained model for decoding (default="${infer_with_pretrained_model}").
|
||||
--download_sa_asr_model= # Download the SA-ASR model from ModelScope and use it for decoding(default="${download_sa_asr_model}").
|
||||
|
||||
# [Task dependent] Set the datadir name created by local/data.sh
|
||||
--train_set # Name of training set (required).
|
||||
@ -304,6 +305,9 @@ else
|
||||
lm_token_type="${token_type}"
|
||||
fi
|
||||
|
||||
if ${infer_with_pretrained_model}; then
|
||||
skip_train=true
|
||||
fi
|
||||
|
||||
# Set tag for naming of model directory
|
||||
if [ -z "${asr_tag}" ]; then
|
||||
@ -1220,119 +1224,20 @@ else
|
||||
log "Skip the training stages"
|
||||
fi
|
||||
|
||||
if ${infer_with_pretrained_model}; then
|
||||
log "Use ${download_sa_asr_model} for decoding and evaluation"
|
||||
|
||||
sa_asr_exp="${expdir}/${download_sa_asr_model}"
|
||||
mkdir -p "${sa_asr_exp}"
|
||||
|
||||
python local/download_pretrained_model_from_modelscope.py $download_sa_asr_model ${expdir}
|
||||
inference_sa_asr_model="model.pb"
|
||||
inference_config=${sa_asr_exp}/decoding.yaml
|
||||
fi
|
||||
|
||||
if ! "${skip_eval}"; then
|
||||
if [ ${stage} -le 13 ] && [ ${stop_stage} -ge 13 ]; then
|
||||
log "Stage 13: Decoding multi-talker ASR: training_dir=${asr_exp}"
|
||||
|
||||
if ${gpu_inference}; then
|
||||
_cmd="${cuda_cmd}"
|
||||
inference_nj=$[${ngpu}*${njob_infer}]
|
||||
_ngpu=1
|
||||
|
||||
else
|
||||
_cmd="${decode_cmd}"
|
||||
inference_nj=$inference_nj
|
||||
_ngpu=0
|
||||
fi
|
||||
|
||||
_opts=
|
||||
if [ -n "${inference_config}" ]; then
|
||||
_opts+="--config ${inference_config} "
|
||||
fi
|
||||
if "${use_lm}"; then
|
||||
if "${use_word_lm}"; then
|
||||
_opts+="--word_lm_train_config ${lm_exp}/config.yaml "
|
||||
_opts+="--word_lm_file ${lm_exp}/${inference_lm} "
|
||||
else
|
||||
_opts+="--lm_train_config ${lm_exp}/config.yaml "
|
||||
_opts+="--lm_file ${lm_exp}/${inference_lm} "
|
||||
fi
|
||||
fi
|
||||
|
||||
# 2. Generate run.sh
|
||||
log "Generate '${asr_exp}/${inference_tag}/run.sh'. You can resume the process from stage 13 using this script"
|
||||
mkdir -p "${asr_exp}/${inference_tag}"; echo "${run_args} --stage 13 \"\$@\"; exit \$?" > "${asr_exp}/${inference_tag}/run.sh"; chmod +x "${asr_exp}/${inference_tag}/run.sh"
|
||||
|
||||
for dset in ${test_sets}; do
|
||||
_data="${data_feats}/${dset}"
|
||||
_dir="${asr_exp}/${inference_tag}/${dset}"
|
||||
_logdir="${_dir}/logdir"
|
||||
mkdir -p "${_logdir}"
|
||||
|
||||
_feats_type="$(<${_data}/feats_type)"
|
||||
if [ "${_feats_type}" = raw ]; then
|
||||
_scp=wav.scp
|
||||
if [[ "${audio_format}" == *ark* ]]; then
|
||||
_type=kaldi_ark
|
||||
else
|
||||
_type=sound
|
||||
fi
|
||||
else
|
||||
_scp=feats.scp
|
||||
_type=kaldi_ark
|
||||
fi
|
||||
|
||||
# 1. Split the key file
|
||||
key_file=${_data}/${_scp}
|
||||
split_scps=""
|
||||
_nj=$(min "${inference_nj}" "$(<${key_file} wc -l)")
|
||||
echo $_nj
|
||||
for n in $(seq "${_nj}"); do
|
||||
split_scps+=" ${_logdir}/keys.${n}.scp"
|
||||
done
|
||||
# shellcheck disable=SC2086
|
||||
utils/split_scp.pl "${key_file}" ${split_scps}
|
||||
|
||||
# 2. Submit decoding jobs
|
||||
log "Decoding started... log: '${_logdir}/asr_inference.*.log'"
|
||||
|
||||
${_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
|
||||
python -m funasr.bin.asr_inference_launch \
|
||||
--batch_size 1 \
|
||||
--mc True \
|
||||
--nbest 1 \
|
||||
--ngpu "${_ngpu}" \
|
||||
--njob ${njob_infer} \
|
||||
--gpuid_list ${device} \
|
||||
--data_path_and_name_and_type "${_data}/${_scp},speech,${_type}" \
|
||||
--key_file "${_logdir}"/keys.JOB.scp \
|
||||
--asr_train_config "${asr_exp}"/config.yaml \
|
||||
--asr_model_file "${asr_exp}"/"${inference_asr_model}" \
|
||||
--output_dir "${_logdir}"/output.JOB \
|
||||
--mode asr \
|
||||
${_opts}
|
||||
|
||||
# 3. Concatenates the output files from each jobs
|
||||
for f in token token_int score text; do
|
||||
for i in $(seq "${_nj}"); do
|
||||
cat "${_logdir}/output.${i}/1best_recog/${f}"
|
||||
done | LC_ALL=C sort -k1 >"${_dir}/${f}"
|
||||
done
|
||||
done
|
||||
fi
|
||||
|
||||
|
||||
if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
|
||||
log "Stage 14: Scoring multi-talker ASR"
|
||||
|
||||
for dset in ${test_sets}; do
|
||||
_data="${data_feats}/${dset}"
|
||||
_dir="${asr_exp}/${inference_tag}/${dset}"
|
||||
|
||||
python utils/proce_text.py ${_data}/text ${_data}/text.proc
|
||||
python utils/proce_text.py ${_dir}/text ${_dir}/text.proc
|
||||
|
||||
python utils/compute_wer.py ${_data}/text.proc ${_dir}/text.proc ${_dir}/text.cer
|
||||
tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
|
||||
cat ${_dir}/text.cer.txt
|
||||
|
||||
done
|
||||
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ]; then
|
||||
log "Stage 15: Decoding SA-ASR (oracle profile): training_dir=${sa_asr_exp}"
|
||||
log "Stage 13: Decoding SA-ASR (oracle profile): training_dir=${sa_asr_exp}"
|
||||
|
||||
if ${gpu_inference}; then
|
||||
_cmd="${cuda_cmd}"
|
||||
@ -1423,8 +1328,8 @@ if ! "${skip_eval}"; then
|
||||
done
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 16 ] && [ ${stop_stage} -ge 16 ]; then
|
||||
log "Stage 16: Scoring SA-ASR (oracle profile)"
|
||||
if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
|
||||
log "Stage 14: Scoring SA-ASR (oracle profile)"
|
||||
|
||||
for dset in ${test_sets}; do
|
||||
_data="${data_feats}/${dset}"
|
||||
@ -1448,8 +1353,8 @@ if ! "${skip_eval}"; then
|
||||
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 17 ] && [ ${stop_stage} -ge 17 ]; then
|
||||
log "Stage 17: Decoding SA-ASR (cluster profile): training_dir=${sa_asr_exp}"
|
||||
if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ]; then
|
||||
log "Stage 15: Decoding SA-ASR (cluster profile): training_dir=${sa_asr_exp}"
|
||||
|
||||
if ${gpu_inference}; then
|
||||
_cmd="${cuda_cmd}"
|
||||
@ -1539,8 +1444,8 @@ if ! "${skip_eval}"; then
|
||||
done
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 18 ] && [ ${stop_stage} -ge 18 ]; then
|
||||
log "Stage 18: Scoring SA-ASR (cluster profile)"
|
||||
if [ ${stage} -le 16 ] && [ ${stop_stage} -ge 16 ]; then
|
||||
log "Stage 16: Scoring SA-ASR (cluster profile)"
|
||||
|
||||
for dset in ${test_sets}; do
|
||||
_data="${data_feats}/${dset}"
|
||||
|
||||
@ -0,0 +1,7 @@
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
import sys
|
||||
|
||||
if __name__ == "__main__":
|
||||
model_tag = sys.argv[1]
|
||||
local_model_dir = sys.argv[2]
|
||||
model_dir = snapshot_download(model_tag, cache_dir=local_model_dir, revision='1.0.0')
|
||||
@ -8,8 +8,8 @@ set -o pipefail
|
||||
ngpu=4
|
||||
device="0,1,2,3"
|
||||
|
||||
stage=1
|
||||
stop_stage=18
|
||||
stage=12
|
||||
stop_stage=13
|
||||
|
||||
|
||||
train_set=Train_Ali_far
|
||||
@ -18,6 +18,8 @@ test_sets="Test_Ali_far"
|
||||
asr_config=conf/train_asr_conformer.yaml
|
||||
sa_asr_config=conf/train_sa_asr_conformer.yaml
|
||||
inference_config=conf/decode_asr_rnn.yaml
|
||||
infer_with_pretrained_model=true
|
||||
download_sa_asr_model="damo/speech_saasr_asr-zh-cn-16k-alimeeting"
|
||||
|
||||
lm_config=conf/train_lm_transformer.yaml
|
||||
use_lm=false
|
||||
@ -29,6 +31,8 @@ use_wordlm=false
|
||||
--stop_stage ${stop_stage} \
|
||||
--gpu_inference true \
|
||||
--njob_infer 4 \
|
||||
--infer_with_pretrained_model ${infer_with_pretrained_model} \
|
||||
--download_sa_asr_model $download_sa_asr_model \
|
||||
--asr_exp exp/asr_train_multispeaker_conformer_raw_zh_char_data_alimeeting \
|
||||
--sa_asr_exp exp/sa_asr_train_conformer_raw_zh_char_data_alimeeting \
|
||||
--asr_stats_dir exp/asr_stats_multispeaker_conformer_raw_zh_char_data_alimeeting \
|
||||
|
||||