diff --git a/egs/callhome/eend_ola/conf/train_diar_eend_ola_callhome_chunk2000.yaml b/egs/callhome/eend_ola/conf/train_diar_eend_ola_callhome_chunk2000.yaml new file mode 100644 index 000000000..71ea9f0e9 --- /dev/null +++ b/egs/callhome/eend_ola/conf/train_diar_eend_ola_callhome_chunk2000.yaml @@ -0,0 +1,45 @@ +# network architecture +# encoder related +encoder: eend_ola_transformer +encoder_conf: + idim: 345 + n_layers: 4 + n_units: 256 + +# encoder-decoder attractor related +encoder_decoder_attractor: eda +encoder_decoder_attractor_conf: + n_units: 256 + +# model related +model: eend_ola_similar_eend +model_conf: + attractor_loss_weight: 0.01 + max_n_speaker: 8 + +# optimization related +accum_grad: 1 +grad_clip: 5 +max_epoch: 100 +val_scheduler_criterion: + - valid + - loss +best_model_criterion: +- - valid + - loss + - min +keep_nbest_models: 100 + +optim: adam +optim_conf: + lr: 0.00001 + +dataset_conf: + data_names: speech_speaker_labels + data_types: kaldi_ark + batch_conf: + batch_type: unsorted + batch_size: 8 + num_workers: 8 + +log_interval: 50 \ No newline at end of file diff --git a/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_2spkr.yaml b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_2spkr.yaml new file mode 100644 index 000000000..baf43424f --- /dev/null +++ b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_2spkr.yaml @@ -0,0 +1,52 @@ +# network architecture +# encoder related +encoder: eend_ola_transformer +encoder_conf: + idim: 345 + n_layers: 4 + n_units: 256 + +# encoder-decoder attractor related +encoder_decoder_attractor: eda +encoder_decoder_attractor_conf: + n_units: 256 + +# model related +model: eend_ola_similar_eend +model_conf: + max_n_speaker: 8 + +# optimization related +accum_grad: 1 +grad_clip: 5 +max_epoch: 100 +val_scheduler_criterion: + - valid + - loss +best_model_criterion: +- - valid + - loss + - min +keep_nbest_models: 100 + +optim: adam +optim_conf: + lr: 1.0 + betas: + - 0.9 + - 0.98 + eps: 1.0e-9 
+scheduler: noamlr +scheduler_conf: + model_size: 256 + warmup_steps: 100000 + +dataset_conf: + data_names: speech_speaker_labels + data_types: kaldi_ark + batch_conf: + batch_type: unsorted + batch_size: 64 + num_workers: 8 + +log_interval: 50 \ No newline at end of file diff --git a/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr.yaml b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr.yaml new file mode 100644 index 000000000..83a6eeeb9 --- /dev/null +++ b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr.yaml @@ -0,0 +1,52 @@ +# network architecture +# encoder related +encoder: eend_ola_transformer +encoder_conf: + idim: 345 + n_layers: 4 + n_units: 256 + +# encoder-decoder attractor related +encoder_decoder_attractor: eda +encoder_decoder_attractor_conf: + n_units: 256 + +# model related +model: eend_ola_similar_eend +model_conf: + max_n_speaker: 8 + +# optimization related +accum_grad: 1 +grad_clip: 5 +max_epoch: 25 +val_scheduler_criterion: + - valid + - loss +best_model_criterion: +- - valid + - loss + - min +keep_nbest_models: 100 + +optim: adam +optim_conf: + lr: 1.0 + betas: + - 0.9 + - 0.98 + eps: 1.0e-9 +scheduler: noamlr +scheduler_conf: + model_size: 256 + warmup_steps: 100000 + +dataset_conf: + data_names: speech_speaker_labels + data_types: kaldi_ark + batch_conf: + batch_type: unsorted + batch_size: 64 + num_workers: 8 + +log_interval: 50 \ No newline at end of file diff --git a/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml new file mode 100644 index 000000000..f47850417 --- /dev/null +++ b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml @@ -0,0 +1,44 @@ +# network architecture +# encoder related +encoder: eend_ola_transformer +encoder_conf: + idim: 345 + n_layers: 4 + n_units: 256 + +# encoder-decoder attractor related +encoder_decoder_attractor: eda +encoder_decoder_attractor_conf: + 
def average_model(input_files, output_file):
    """Average the parameters of several checkpoints and save the result.

    Each file in ``input_files`` is loaded onto the CPU with ``torch.load``
    and must hold a mapping from parameter name to tensor. The element-wise
    mean over all checkpoints is written to ``output_file`` with
    ``torch.save``.

    Floating-point (and complex) tensors are averaged with true division.
    Integer tensors (e.g. step counters) keep their dtype and are averaged
    with floor division, because an in-place true division on them raises.

    Args:
        input_files: Paths of the checkpoints to average; at least one.
        output_file: Path the averaged mapping is saved to.

    Raises:
        ValueError: If ``input_files`` is empty, or if a checkpoint does not
            contain exactly the same keys as the first checkpoint.
    """
    input_files = list(input_files)
    if not input_files:
        raise ValueError("average_model needs at least one input checkpoint")

    num_models = len(input_files)
    output_model = {}
    for index, ckpt_path in enumerate(input_files):
        # NOTE: torch.load unpickles the file; only pass trusted checkpoints.
        model_params = torch.load(ckpt_path, map_location="cpu")
        if index == 0:
            # Clone so the running sum never aliases the loaded tensors.
            output_model = {
                key: value.clone() for key, value in model_params.items()
            }
            continue
        if model_params.keys() != output_model.keys():
            # A key missing from one checkpoint would be summed fewer times
            # but still divided by num_models, giving a silently wrong mean.
            raise ValueError(
                f"checkpoint {ckpt_path} does not have the same parameter "
                f"names as {input_files[0]}"
            )
        for key, value in model_params.items():
            output_model[key] += value

    for key, total in output_model.items():
        if total.is_floating_point() or total.is_complex():
            output_model[key] = total / num_models
        else:
            output_model[key] = torch.div(
                total, num_models, rounding_mode="floor"
            )
    torch.save(output_model, output_file)


if __name__ == '__main__':
    # Usage: model_averaging.py OUTPUT_FILE INPUT_FILE [INPUT_FILE ...]
    parser = argparse.ArgumentParser()
    parser.add_argument("output_file")
    parser.add_argument("input_files", nargs='+')
    args = parser.parse_args()

    average_model(args.input_files, args.output_file)
#!/usr/bin/env bash
# Multi-stage training recipe for the EEND-OLA diarization model:
#   stage 0: data preparation (placeholder, only prints a message)
#   stage 1: training on simulated two-speaker data, then checkpoint averaging
#   stage 2: training on simulated all-speaker data, then checkpoint averaging
#   stage 3: training on simulated all-speaker data with chunk_size=2000
#   stage 4: training on callhome all-speaker data with chunk_size=2000,
#            then checkpoint averaging
# Each stage is initialised from the model produced by the previous one.

. ./path.sh || exit 1;

# machines configuration
CUDA_VISIBLE_DEVICES="7"
# Defined here so that gpu_num exists before the options are parsed; the value
# is re-derived from CUDA_VISIBLE_DEVICES after parsing (see below).
# NOTE(review): assumes utils/parse_options.sh only accepts options whose
# variable is already defined at that point -- confirm.
gpu_num=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
count=1

# general configuration
simu_feats_dir="/nfs/wangjiaming.wjm/EEND_ARK_DATA/dump/simu_data/data"
simu_feats_dir_chunk2000="/nfs/wangjiaming.wjm/EEND_ARK_DATA/dump/simu_data_chunk2000/data"
callhome_feats_dir_chunk2000="/nfs/wangjiaming.wjm/EEND_ARK_DATA/dump/callhome_chunk2000/data"
simu_train_dataset=train
simu_valid_dataset=dev
callhome_train_dataset=callhome1_allspk
callhome_valid_dataset=callhome2_allspk
callhome2_wav_scp_file=wav.scp

# model average: first/last epoch of the checkpoints averaged after each stage
simu_average_2spkr_start=91
simu_average_2spkr_end=100
simu_average_allspkr_start=16
simu_average_allspkr_end=25
callhome_average_start=91
callhome_average_end=100

exp_dir="."
input_size=345
stage=1
stop_stage=4

# exp tag
tag="exp_fix"

. utils/parse_options.sh || exit 1;

# Exit on: -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'.
# (Command tracing, -x, is intentionally not enabled.)
set -e
set -u
set -o pipefail

# CUDA_VISIBLE_DEVICES may have been overridden while parsing the options, so
# the number of GPUs is derived again from its final value.
gpu_num=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
if [ "${gpu_num}" -lt 1 ]; then
  echo "$0: CUDA_VISIBLE_DEVICES must list at least one GPU id" >&2
  exit 1
fi
world_size=${gpu_num}

simu_2spkr_diar_config=conf/train_diar_eend_ola_simu_2spkr.yaml
simu_allspkr_diar_config=conf/train_diar_eend_ola_simu_allspkr.yaml
simu_allspkr_chunk2000_diar_config=conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml
callhome_diar_config=conf/train_diar_eend_ola_callhome_chunk2000.yaml
simu_2spkr_model_dir="baseline_$(basename "${simu_2spkr_diar_config}" .yaml)_${tag}"
simu_allspkr_model_dir="baseline_$(basename "${simu_allspkr_diar_config}" .yaml)_${tag}"
simu_allspkr_chunk2000_model_dir="baseline_$(basename "${simu_allspkr_chunk2000_diar_config}" .yaml)_${tag}"
callhome_model_dir="baseline_$(basename "${callhome_diar_config}" .yaml)_${tag}"

simu_2spkr_ave_id=avg${simu_average_2spkr_start}-${simu_average_2spkr_end}
simu_allspkr_ave_id=avg${simu_average_allspkr_start}-${simu_average_allspkr_end}
callhome_ave_id=avg${callhome_average_start}-${callhome_average_end}

# Launch one train.py process per GPU listed in CUDA_VISIBLE_DEVICES and wait
# for all of them; exits the script if any of the processes fails.
# Usage: run_training <model_dir> <data_dir> <train_set> <valid_set> \
#                     <data_file_names> <config> [<init_param>]
run_training() {
  local model_dir="$1" data_dir="$2" train_set="$3" valid_set="$4"
  local data_file_names="$5" config="$6" init_param="${7:-}"
  local output_dir="${exp_dir}/exp/${model_dir}"
  local init_file="${output_dir}/ddp_init"
  local init_method rank gpu_id pid failed=0
  local pids=()
  local init_args=()

  mkdir -p "${output_dir}/log"
  # Remove the init file of a previous run before it is reused.
  rm -f "${init_file}"
  init_method=file://$(readlink -f "${init_file}")
  echo "$0: init method is $init_method"

  if [ -n "${init_param}" ]; then
    init_args=(--init_param "${init_param}")
  fi

  for ((rank = 0; rank < gpu_num; ++rank)); do
    # The rank-th (0-based) entry of the comma-separated GPU list.
    gpu_id=$(echo "${CUDA_VISIBLE_DEVICES}" | cut -d',' -f"$((rank + 1))")
    # "${arr[@]+...}" keeps an empty array from tripping 'set -u' on old bash.
    train.py \
      --task_name diar \
      --gpu_id "${gpu_id}" \
      --use_preprocessor false \
      --input_size "${input_size}" \
      --data_dir "${data_dir}" \
      --train_set "${train_set}" \
      --valid_set "${valid_set}" \
      --data_file_names "${data_file_names}" \
      --resume true \
      ${init_args[@]+"${init_args[@]}"} \
      --output_dir "${output_dir}" \
      --config "${config}" \
      --ngpu "${gpu_num}" \
      --num_worker_count "${count}" \
      --dist_init_method "${init_method}" \
      --dist_world_size "${world_size}" \
      --dist_rank "${rank}" \
      --local_rank "${rank}" \
      > "${output_dir}/log/train.log.${rank}" 2>&1 &
    pids+=("$!")
  done

  # A bare 'wait' always returns 0; waiting on each PID exposes failures.
  for pid in "${pids[@]}"; do
    wait "${pid}" || failed=1
  done
  if [ "${failed}" -ne 0 ]; then
    echo "$0: training failed, see ${output_dir}/log/train.log.*" >&2
    exit 1
  fi
}

# Average the checkpoints <start>epoch.pb .. <end>epoch.pb of a model directory
# into <ave_id>.pb inside the same directory.
# Usage: average_checkpoints <model_dir> <start_epoch> <end_epoch> <ave_id>
average_checkpoints() {
  local model_dir="$1" start="$2" end="$3" ave_id="$4"
  local output_dir="${exp_dir}/exp/${model_dir}"
  local epoch
  local models=()

  echo "averaging model parameters into ${output_dir}/${ave_id}.pb"
  for ((epoch = start; epoch <= end; ++epoch)); do
    models+=("${output_dir}/${epoch}epoch.pb")
  done
  python local/model_averaging.py "${output_dir}/${ave_id}.pb" "${models[@]}"
}

# Prepare data for training and inference
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  echo "stage 0: Prepare data for training and inference"
fi

# Training on simulated two-speaker data
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "stage 1: Training on simulated two-speaker data"
  run_training "${simu_2spkr_model_dir}" "${simu_feats_dir}" \
    "${simu_train_dataset}" "${simu_valid_dataset}" \
    "feats_2spkr.scp" "${simu_2spkr_diar_config}"
  average_checkpoints "${simu_2spkr_model_dir}" \
    "${simu_average_2spkr_start}" "${simu_average_2spkr_end}" \
    "${simu_2spkr_ave_id}"
fi

# Training on simulated all-speaker data
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  echo "stage 2: Training on simulated all-speaker data"
  run_training "${simu_allspkr_model_dir}" "${simu_feats_dir}" \
    "${simu_train_dataset}" "${simu_valid_dataset}" \
    "feats.scp" "${simu_allspkr_diar_config}" \
    "${exp_dir}/exp/${simu_2spkr_model_dir}/${simu_2spkr_ave_id}.pb"
  average_checkpoints "${simu_allspkr_model_dir}" \
    "${simu_average_allspkr_start}" "${simu_average_allspkr_end}" \
    "${simu_allspkr_ave_id}"
fi

# Training on simulated all-speaker data with chunk_size=2000
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "stage 3: Training on simulated all-speaker data with chunk_size=2000"
  run_training "${simu_allspkr_chunk2000_model_dir}" "${simu_feats_dir_chunk2000}" \
    "${simu_train_dataset}" "${simu_valid_dataset}" \
    "feats.scp" "${simu_allspkr_chunk2000_diar_config}" \
    "${exp_dir}/exp/${simu_allspkr_model_dir}/${simu_allspkr_ave_id}.pb"
fi

# Training on callhome all-speaker data with chunk_size=2000
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  echo "stage 4: Training on callhome all-speaker data with chunk_size=2000"
  # Stage 3 is not averaged; its first-epoch checkpoint is the starting point.
  run_training "${callhome_model_dir}" "${callhome_feats_dir_chunk2000}" \
    "${callhome_train_dataset}" "${callhome_valid_dataset}" \
    "feats.scp" "${callhome_diar_config}" \
    "${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir}/1epoch.pb"
  average_checkpoints "${callhome_model_dir}" \
    "${callhome_average_start}" "${callhome_average_end}" \
    "${callhome_ave_id}"
fi

## inference
#if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
#  echo "Inference"
#  mkdir -p ${exp_dir}/exp/${callhome_model_dir}/inference/log
#  CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python local/infer.py \
#    --config_file ${exp_dir}/exp/${callhome_model_dir}/config.yaml \
#    --model_file ${exp_dir}/exp/${callhome_model_dir}/$callhome_ave_id.pb \
#    --output_rttm_file ${exp_dir}/exp/${callhome_model_dir}/inference/rttm \
#    --wav_scp_file ${callhome_feats_dir_chunk2000}/${callhome_valid_dataset}/${callhome2_wav_scp_file} 1> ${exp_dir}/exp/${callhome_model_dir}/inference/log/infer.log 2>&1
#fi