From a8701ad5df6255502c431fb749b4661ff4853e57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BF=97=E6=B5=A9?=
Date: Wed, 2 Aug 2023 15:26:01 +0800
Subject: [PATCH] TOLD/SOND: add utt2num_frame script

---
Review notes (text between the "---" line and the first "diff --git" line
is not part of the commit):
- The line structure of this patch was restored from a whitespace-collapsed
  copy. Indentation of context lines and the split point of the
  build_labels(...) call in the last hunk are reconstructed; if a hunk is
  rejected, retry with whitespace-tolerant matching
  (e.g. "git apply --ignore-whitespace") or fix that context by hand.
- calc_num_frames.py was revised: usage check, no os.makedirs("") on a bare
  output file name, output file managed by "with", and frame counts taken
  from the audio header (sf.info) instead of decoding every waveform.
  The blob id on its "index" line predates this revision.

 egs/callhome/diarization/sond/finetune.sh     |  7 ++++++-
 .../sond/script/calc_num_frames.py            | 26 ++++++++++++++++++++++++++
 .../script/calc_real_meeting_frame_labels.py  |  6 +++---
 3 files changed, 35 insertions(+), 4 deletions(-)
 create mode 100644 egs/callhome/diarization/sond/script/calc_num_frames.py

diff --git a/egs/callhome/diarization/sond/finetune.sh b/egs/callhome/diarization/sond/finetune.sh
index 5a4842ddb..91e6798b3 100644
--- a/egs/callhome/diarization/sond/finetune.sh
+++ b/egs/callhome/diarization/sond/finetune.sh
@@ -180,9 +180,11 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     git lfs install
     git clone https://www.modelscope.cn/damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch.git
     mv speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch ${expdir}/
+    echo "Done."
   fi
 
   for dset in callhome1/nonoverlap_0s callhome2/nonoverlap_0s; do
+    echo "Start to extract speaker embeddings for ${dset}"
     key_file=${datadir}/${dset}/wav.scp
     num_scp_file="$(<${key_file} wc -l)"
     _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
@@ -207,6 +209,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
         --sv_model_file ${sv_exp_dir}/sv.pth \
         --output_dir "${_logdir}"/output.JOB
     cat ${_logdir}/output.*/xvector.scp | sort > ${datadir}/${dset}/utt2xvec
+
+    python script/calc_num_frames.py ${key_file} ${datadir}/${dset}/utt2num_frames
+    echo "Done."
   done
 fi
 
@@ -219,7 +224,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     python -Wignore script/calc_real_meeting_frame_labels.py \
         ${datadir}/${dset} ${dumpdir}/${dset}/labels \
         --n_spk 8 --frame_shift 0.01 --nj 16 --sr 8000
-    find `pwd`/${dumpdir}/${dset}/labels -iname "*.lbl.mat" | awk -F'[/.]' '{print $(NF-2),$0}' | sort > ${datadir}/${dset}/labels.scp
+    find `pwd`/${dumpdir}/${dset}/labels/ -iname "*.lbl.mat" | awk -F'[/.]' '{print $(NF-2),$0}' | sort > ${datadir}/${dset}/labels.scp
   done
 fi
 
diff --git a/egs/callhome/diarization/sond/script/calc_num_frames.py b/egs/callhome/diarization/sond/script/calc_num_frames.py
new file mode 100644
index 000000000..b55ff7561
--- /dev/null
+++ b/egs/callhome/diarization/sond/script/calc_num_frames.py
@@ -0,0 +1,26 @@
+"""Write one "<uttid> <num_frames>" line per entry of a wav.scp file."""
+import os
+import sys
+import soundfile as sf
+from funasr.utils.misc import load_scp_as_list
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 3:
+        sys.exit(f"Usage: {sys.argv[0]} <wav.scp> <out_file>")
+    wav_scp = sys.argv[1]
+    out_file = sys.argv[2]
+    frame_shift = 0.01  # frame hop in seconds
+
+    # dirname is "" for a bare file name, and os.makedirs("") raises.
+    out_dir = os.path.dirname(out_file)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+
+    # "with" closes the output even when an audio file cannot be read.
+    with open(out_file, "wt") as fout:
+        for uttid, wav_path in load_scp_as_list(wav_scp):
+            # The header already holds the sample count; skip decoding audio.
+            info = sf.info(wav_path)
+            num_frame = info.frames // int(info.samplerate * frame_shift)
+            fout.write(f"{uttid} {num_frame}\n")
diff --git a/egs/callhome/diarization/sond/script/calc_real_meeting_frame_labels.py b/egs/callhome/diarization/sond/script/calc_real_meeting_frame_labels.py
index 58232d4c6..f295c0c7e 100644
--- a/egs/callhome/diarization/sond/script/calc_real_meeting_frame_labels.py
+++ b/egs/callhome/diarization/sond/script/calc_real_meeting_frame_labels.py
@@ -1,6 +1,6 @@
 import numpy as np
-from opennmt.utils.job_runner import MultiProcessRunnerV3
-from opennmt.utils.misc import load_scp_as_list, load_scp_as_dict
+from funasr.utils.job_runner import MultiProcessRunnerV3
+from funasr.utils.misc import load_scp_as_list, load_scp_as_dict
 import os
 import librosa
 import scipy.io as sio
@@ -90,7 +90,7 @@ def process(task_args):
     for mid, wav_path, rttms in task_list:
         meeting_labels, spk_list = build_labels(wav_path, rttms, args.n_spk, args.remove_sil,
                                                 args.sr, args.frame_shift)
-        save_path = os.path.join(args.out_dir, "{}.lbl".format(mid))
+        save_path = os.path.join(args.out_dir, "{}.lbl.mat".format(mid))
         sio.savemat(save_path, {"labels": meeting_labels.astype(bool), "spk_list": spk_list})
         # print mid
     return None