diff --git a/egs/callhome/eend_ola/local/make_mixture.py b/egs/callhome/eend_ola/local/make_mixture.py index 82d03cd60..6b159034d 100755 --- a/egs/callhome/eend_ola/local/make_mixture.py +++ b/egs/callhome/eend_ola/local/make_mixture.py @@ -13,7 +13,7 @@ import argparse import os -from eend import kaldi_data +from funasr.modules.eend_ola.utils import kaldi_data import numpy as np import math import soundfile as sf diff --git a/egs/callhome/eend_ola/local/run_prepare_shared_eda.sh b/egs/callhome/eend_ola/local/run_prepare_shared_eda.sh index a256edafc..5431ba1de 100755 --- a/egs/callhome/eend_ola/local/run_prepare_shared_eda.sh +++ b/egs/callhome/eend_ola/local/run_prepare_shared_eda.sh @@ -9,7 +9,7 @@ # - data/simu_${simu_outputs} # simulation mixtures generated with various options -stage=0 +stage=1 # Modify corpus directories # - callhome_dir @@ -156,8 +156,8 @@ simudir=data/simu if [ $stage -le 1 ]; then echo "simulation of mixture" mkdir -p $simudir/.work - local/random_mixture_cmd=random_mixture.py - local/make_mixture_cmd=make_mixture.py + random_mixture_cmd=local/random_mixture.py + make_mixture_cmd=local/make_mixture.py for ((i=0; i<${#simu_opts_sil_scale_array[@]}; ++i)); do simu_opts_num_speaker=${simu_opts_num_speaker_array[i]} diff --git a/egs/callhome/eend_ola/run.sh b/egs/callhome/eend_ola/run.sh index 286fc29aa..b4f273945 100644 --- a/egs/callhome/eend_ola/run.sh +++ b/egs/callhome/eend_ola/run.sh @@ -31,7 +31,7 @@ stage=-1 stop_stage=-1 # exp tag -tag="exp_fix" +tag="exp1" . local/parse_options.sh || exit 1; diff --git a/egs/callhome/eend_ola/run_test.sh b/egs/callhome/eend_ola/run_test.sh new file mode 100644 index 000000000..d00444665 --- /dev/null +++ b/egs/callhome/eend_ola/run_test.sh @@ -0,0 +1,257 @@ +#!/usr/bin/env bash + +. ./path.sh || exit 1; + +# machines configuration +CUDA_VISIBLE_DEVICES="7" +gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +count=1 + +# general configuration +simu_feats_dir="/nfs/wangjiaming.wjm/EEND_ARK_DATA/dump/simu_data/data" +simu_feats_dir_chunk2000="/nfs/wangjiaming.wjm/EEND_ARK_DATA/dump/simu_data_chunk2000/data" +callhome_feats_dir_chunk2000="/nfs/wangjiaming.wjm/EEND_ARK_DATA/dump/callhome_chunk2000/data" +simu_train_dataset=train +simu_valid_dataset=dev +callhome_train_dataset=callhome1_allspk +callhome_valid_dataset=callhome2_allspk +callhome2_wav_scp_file=wav.scp + +# model average +simu_average_2spkr_start=91 +simu_average_2spkr_end=100 +simu_average_allspkr_start=16 +simu_average_allspkr_end=25 +callhome_average_start=91 +callhome_average_end=100 + +exp_dir="." +input_size=345 +stage=5 +stop_stage=5 + +# exp tag +tag="exp1" + +. local/parse_options.sh || exit 1; + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +simu_2spkr_diar_config=conf/train_diar_eend_ola_simu_2spkr.yaml +simu_allspkr_diar_config=conf/train_diar_eend_ola_simu_allspkr.yaml +simu_allspkr_chunk2000_diar_config=conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml +callhome_diar_config=conf/train_diar_eend_ola_callhome_chunk2000.yaml +simu_2spkr_model_dir="baseline_$(basename "${simu_2spkr_diar_config}" .yaml)_${tag}" +simu_allspkr_model_dir="baseline_$(basename "${simu_allspkr_diar_config}" .yaml)_${tag}" +simu_allspkr_chunk2000_model_dir="baseline_$(basename "${simu_allspkr_chunk2000_diar_config}" .yaml)_${tag}" +callhome_model_dir="baseline_$(basename "${callhome_diar_config}" .yaml)_${tag}" + +# simulate mixture data for training and inference +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + echo "stage -1: Simulate mixture data for training and inference" + echo "The detail can be found in https://github.com/hitachi-speech/EEND" + echo "Before running this step, you should download and compile kaldi and set KALDI_ROOT in this script and path.sh" + echo "This stage may take a long time, please waiting..." + KALDI_ROOT= + ln -s $KALDI_ROOT/egs/wsj/s5/steps steps + ln -s $KALDI_ROOT/egs/wsj/s5/utils utils + local/run_prepare_shared_eda.sh +fi + +## Prepare data for training and inference +#if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then +# echo "stage 0: Prepare data for training and inference" +# echo "The detail can be found in https://github.com/hitachi-speech/EEND" +# . ./local/ +#fi +# + +# Training on simulated two-speaker data +world_size=$gpu_num +simu_2spkr_ave_id=avg${simu_average_2spkr_start}-${simu_average_2spkr_end} +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "stage 1: Training on simulated two-speaker data" + mkdir -p ${exp_dir}/exp/${simu_2spkr_model_dir} + mkdir -p ${exp_dir}/exp/${simu_2spkr_model_dir}/log + INIT_FILE=${exp_dir}/exp/${simu_2spkr_model_dir}/ddp_init + if [ -f $INIT_FILE ];then + rm -f $INIT_FILE + fi + init_method=file://$(readlink -f $INIT_FILE) + echo "$0: init method is $init_method" + for ((i = 0; i < $gpu_num; ++i)); do + { + rank=$i + local_rank=$i + gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) + train.py \ + --task_name diar \ + --gpu_id $gpu_id \ + --use_preprocessor false \ + --input_size $input_size \ + --data_dir ${simu_feats_dir} \ + --train_set ${simu_train_dataset} \ + --valid_set ${simu_valid_dataset} \ + --data_file_names "feats_2spkr.scp" \ + --resume true \ + --output_dir ${exp_dir}/exp/${simu_2spkr_model_dir} \ + --config $simu_2spkr_diar_config \ + --ngpu $gpu_num \ + --num_worker_count $count \ + --dist_init_method $init_method \ + --dist_world_size $world_size \ + --dist_rank $rank \ + --local_rank $local_rank 1> ${exp_dir}/exp/${simu_2spkr_model_dir}/log/train.log.$i 2>&1 + } & + done + wait + echo "averaging model parameters into ${exp_dir}/exp/$simu_2spkr_model_dir/$simu_2spkr_ave_id.pb" + models=`eval echo ${exp_dir}/exp/${simu_2spkr_model_dir}/{$simu_average_2spkr_start..$simu_average_2spkr_end}epoch.pb` + python local/model_averaging.py ${exp_dir}/exp/${simu_2spkr_model_dir}/$simu_2spkr_ave_id.pb $models +fi + +# Training on simulated all-speaker data +world_size=$gpu_num +simu_allspkr_ave_id=avg${simu_average_allspkr_start}-${simu_average_allspkr_end} +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + echo "stage 2: Training on simulated all-speaker data" + mkdir -p ${exp_dir}/exp/${simu_allspkr_model_dir} + mkdir -p ${exp_dir}/exp/${simu_allspkr_model_dir}/log + INIT_FILE=${exp_dir}/exp/${simu_allspkr_model_dir}/ddp_init + if [ -f $INIT_FILE ];then + rm -f $INIT_FILE + fi + init_method=file://$(readlink -f $INIT_FILE) + echo "$0: init method is $init_method" + for ((i = 0; i < $gpu_num; ++i)); do + { + rank=$i + local_rank=$i + gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) + train.py \ + --task_name diar \ + --gpu_id $gpu_id \ + --use_preprocessor false \ + --input_size $input_size \ + --data_dir ${simu_feats_dir} \ + --train_set ${simu_train_dataset} \ + --valid_set ${simu_valid_dataset} \ + --data_file_names "feats.scp" \ + --resume true \ + --init_param ${exp_dir}/exp/${simu_2spkr_model_dir}/$simu_2spkr_ave_id.pb \ + --output_dir ${exp_dir}/exp/${simu_allspkr_model_dir} \ + --config $simu_allspkr_diar_config \ + --ngpu $gpu_num \ + --num_worker_count $count \ + --dist_init_method $init_method \ + --dist_world_size $world_size \ + --dist_rank $rank \ + --local_rank $local_rank 1> ${exp_dir}/exp/${simu_allspkr_model_dir}/log/train.log.$i 2>&1 + } & + done + wait + echo "averaging model parameters into ${exp_dir}/exp/$simu_allspkr_model_dir/$simu_allspkr_ave_id.pb" + models=`eval echo ${exp_dir}/exp/${simu_allspkr_model_dir}/{$simu_average_allspkr_start..$simu_average_allspkr_end}epoch.pb` + python local/model_averaging.py ${exp_dir}/exp/${simu_allspkr_model_dir}/$simu_allspkr_ave_id.pb $models +fi + +# Training on simulated all-speaker data with chunk_size=2000 +world_size=$gpu_num +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "stage 3: Training on simulated all-speaker data with chunk_size=2000" + mkdir -p ${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir} + mkdir -p ${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir}/log + INIT_FILE=${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir}/ddp_init + if [ -f $INIT_FILE ];then + rm -f $INIT_FILE + fi + init_method=file://$(readlink -f $INIT_FILE) + echo "$0: init method is $init_method" + for ((i = 0; i < $gpu_num; ++i)); do + { + rank=$i + local_rank=$i + gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) + train.py \ + --task_name diar \ + --gpu_id $gpu_id \ + --use_preprocessor false \ + --input_size $input_size \ + --data_dir ${simu_feats_dir_chunk2000} \ + --train_set ${simu_train_dataset} \ + --valid_set ${simu_valid_dataset} \ + --data_file_names "feats.scp" \ + --resume true \ + --init_param ${exp_dir}/exp/${simu_allspkr_model_dir}/$simu_allspkr_ave_id.pb \ + --output_dir ${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir} \ + --config $simu_allspkr_chunk2000_diar_config \ + --ngpu $gpu_num \ + --num_worker_count $count \ + --dist_init_method $init_method \ + --dist_world_size $world_size \ + --dist_rank $rank \ + --local_rank $local_rank 1> ${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir}/log/train.log.$i 2>&1 + } & + done + wait +fi + +# Training on callhome all-speaker data with chunk_size=2000 +world_size=$gpu_num +callhome_ave_id=avg${callhome_average_start}-${callhome_average_end} +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "stage 4: Training on callhome all-speaker data with chunk_size=2000" + mkdir -p ${exp_dir}/exp/${callhome_model_dir} + mkdir -p ${exp_dir}/exp/${callhome_model_dir}/log + INIT_FILE=${exp_dir}/exp/${callhome_model_dir}/ddp_init + if [ -f $INIT_FILE ];then + rm -f $INIT_FILE + fi + init_method=file://$(readlink -f $INIT_FILE) + echo "$0: init method is $init_method" + for ((i = 0; i < $gpu_num; ++i)); do + { + rank=$i + local_rank=$i + gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) + train.py \ + --task_name diar \ + --gpu_id $gpu_id \ + --use_preprocessor false \ + --input_size $input_size \ + --data_dir ${callhome_feats_dir_chunk2000} \ + --train_set ${callhome_train_dataset} \ + --valid_set ${callhome_valid_dataset} \ + --data_file_names "feats.scp" \ + --resume true \ + --init_param ${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir}/1epoch.pb \ + --output_dir ${exp_dir}/exp/${callhome_model_dir} \ + --config $callhome_diar_config \ + --ngpu $gpu_num \ + --num_worker_count $count \ + --dist_init_method $init_method \ + --dist_world_size $world_size \ + --dist_rank $rank \ + --local_rank $local_rank 1> ${exp_dir}/exp/${callhome_model_dir}/log/train.log.$i 2>&1 + } & + done + wait + echo "averaging model parameters into ${exp_dir}/exp/$callhome_model_dir/$callhome_ave_id.pb" + models=`eval echo ${exp_dir}/exp/${callhome_model_dir}/{$callhome_average_start..$callhome_average_end}epoch.pb` + python local/model_averaging.py ${exp_dir}/exp/${callhome_model_dir}/$callhome_ave_id.pb $models +fi + +# inference +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + echo "Inference" + mkdir -p ${exp_dir}/exp/${callhome_model_dir}/inference/log + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python local/infer.py \ + --config_file ${exp_dir}/exp/${callhome_model_dir}/config.yaml \ + --model_file ${exp_dir}/exp/${callhome_model_dir}/$callhome_ave_id.pb \ + --output_rttm_file ${exp_dir}/exp/${callhome_model_dir}/inference/rttm \ + --wav_scp_file ${callhome_feats_dir_chunk2000}/${callhome_valid_dataset}/${callhome2_wav_scp_file} 1> ${exp_dir}/exp/${callhome_model_dir}/inference/log/infer.log 2>&1 +fi \ No newline at end of file diff --git a/funasr/modules/eend_ola/utils/kaldi_data.py b/funasr/modules/eend_ola/utils/kaldi_data.py new file mode 100644 index 000000000..42f6d5ebc --- /dev/null +++ b/funasr/modules/eend_ola/utils/kaldi_data.py @@ -0,0 +1,162 @@ +# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita) +# Licensed under the MIT license. +# +# This library provides utilities for kaldi-style data directory. + + +from __future__ import print_function +import os +import sys +import numpy as np +import subprocess +import soundfile as sf +import io +from functools import lru_cache + + +def load_segments(segments_file): + """ load segments file as array """ + if not os.path.exists(segments_file): + return None + return np.loadtxt( + segments_file, + dtype=[('utt', 'object'), + ('rec', 'object'), + ('st', 'f'), + ('et', 'f')], + ndmin=1) + + +def load_segments_hash(segments_file): + ret = {} + if not os.path.exists(segments_file): + return None + for line in open(segments_file): + utt, rec, st, et = line.strip().split() + ret[utt] = (rec, float(st), float(et)) + return ret + + +def load_segments_rechash(segments_file): + ret = {} + if not os.path.exists(segments_file): + return None + for line in open(segments_file): + utt, rec, st, et = line.strip().split() + if rec not in ret: + ret[rec] = [] + ret[rec].append({'utt':utt, 'st':float(st), 'et':float(et)}) + return ret + + +def load_wav_scp(wav_scp_file): + """ return dictionary { rec: wav_rxfilename } """ + lines = [line.strip().split(None, 1) for line in open(wav_scp_file)] + return {x[0]: x[1] for x in lines} + + +@lru_cache(maxsize=1) +def load_wav(wav_rxfilename, start=0, end=None): + """ This function reads audio file and return data in numpy.float32 array. + "lru_cache" holds recently loaded audio so that can be called + many times on the same audio file. + OPTIMIZE: controls lru_cache size for random access, + considering memory size + """ + if wav_rxfilename.endswith('|'): + # input piped command + p = subprocess.Popen(wav_rxfilename[:-1], shell=True, + stdout=subprocess.PIPE) + data, samplerate = sf.read(io.BytesIO(p.stdout.read()), + dtype='float32') + # cannot seek + data = data[start:end] + elif wav_rxfilename == '-': + # stdin + data, samplerate = sf.read(sys.stdin, dtype='float32') + # cannot seek + data = data[start:end] + else: + # normal wav file + data, samplerate = sf.read(wav_rxfilename, start=start, stop=end) + return data, samplerate + + +def load_utt2spk(utt2spk_file): + """ returns dictionary { uttid: spkid } """ + lines = [line.strip().split(None, 1) for line in open(utt2spk_file)] + return {x[0]: x[1] for x in lines} + + +def load_spk2utt(spk2utt_file): + """ returns dictionary { spkid: list of uttids } """ + if not os.path.exists(spk2utt_file): + return None + lines = [line.strip().split() for line in open(spk2utt_file)] + return {x[0]: x[1:] for x in lines} + + +def load_reco2dur(reco2dur_file): + """ returns dictionary { recid: duration } """ + if not os.path.exists(reco2dur_file): + return None + lines = [line.strip().split(None, 1) for line in open(reco2dur_file)] + return {x[0]: float(x[1]) for x in lines} + + +def process_wav(wav_rxfilename, process): + """ This function returns preprocessed wav_rxfilename + Args: + wav_rxfilename: input + process: command which can be connected via pipe, + use stdin and stdout + Returns: + wav_rxfilename: output piped command + """ + if wav_rxfilename.endswith('|'): + # input piped command + return wav_rxfilename + process + "|" + else: + # stdin "-" or normal file + return "cat {} | {} |".format(wav_rxfilename, process) + + +def extract_segments(wavs, segments=None): + """ This function returns generator of segmented audio as + (utterance id, numpy.float32 array) + TODO?: sampling rate is not converted. + """ + if segments is not None: + # segments should be sorted by rec-id + for seg in segments: + wav = wavs[seg['rec']] + data, samplerate = load_wav(wav) + st_sample = np.rint(seg['st'] * samplerate).astype(int) + et_sample = np.rint(seg['et'] * samplerate).astype(int) + yield seg['utt'], data[st_sample:et_sample] + else: + # segments file not found, + # wav.scp is used as segmented audio list + for rec in wavs: + data, samplerate = load_wav(wavs[rec]) + yield rec, data + + +class KaldiData: + def __init__(self, data_dir): + self.data_dir = data_dir + self.segments = load_segments_rechash( + os.path.join(self.data_dir, 'segments')) + self.utt2spk = load_utt2spk( + os.path.join(self.data_dir, 'utt2spk')) + self.wavs = load_wav_scp( + os.path.join(self.data_dir, 'wav.scp')) + self.reco2dur = load_reco2dur( + os.path.join(self.data_dir, 'reco2dur')) + self.spk2utt = load_spk2utt( + os.path.join(self.data_dir, 'spk2utt')) + + def load_wav(self, recid, start=0, end=None): + data, rate = load_wav( + self.wavs[recid], start, end) + return data, rate