From 6494a503f4ce11634cfd42d562011541b1e4ebf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=98=89=E6=B8=8A?= Date: Tue, 18 Jul 2023 14:55:24 +0800 Subject: [PATCH] update --- egs/callhome/eend_ola/local/make_callhome.sh | 73 ++++++ egs/callhome/eend_ola/local/make_mixture.py | 120 +++++++++ egs/callhome/eend_ola/local/make_musan.py | 123 +++++++++ egs/callhome/eend_ola/local/make_musan.sh | 37 +++ egs/callhome/eend_ola/local/make_sre.pl | 63 +++++ egs/callhome/eend_ola/local/make_sre.sh | 48 ++++ .../eend_ola/local/make_swbd2_phase1.pl | 106 ++++++++ .../eend_ola/local/make_swbd2_phase2.pl | 107 ++++++++ .../eend_ola/local/make_swbd2_phase3.pl | 102 ++++++++ .../eend_ola/local/make_swbd_cellular1.pl | 83 +++++++ .../eend_ola/local/make_swbd_cellular2.pl | 83 +++++++ egs/callhome/eend_ola/local/random_mixture.py | 145 +++++++++++ egs/callhome/eend_ola/local/run_blstm.sh | 9 + .../eend_ola/local/run_prepare_shared_eda.sh | 235 ++++++++++++++++++ egs/callhome/eend_ola/path.sh | 7 + egs/callhome/eend_ola/run.sh | 25 +- egs/callhome/{diarization => }/sond/sond.yaml | 0 .../{diarization => }/sond/sond_fbank.yaml | 0 .../{diarization => }/sond/unit_test.py | 0 19 files changed, 1361 insertions(+), 5 deletions(-) create mode 100644 egs/callhome/eend_ola/local/make_callhome.sh create mode 100644 egs/callhome/eend_ola/local/make_mixture.py create mode 100644 egs/callhome/eend_ola/local/make_musan.py create mode 100644 egs/callhome/eend_ola/local/make_musan.sh create mode 100644 egs/callhome/eend_ola/local/make_sre.pl create mode 100644 egs/callhome/eend_ola/local/make_sre.sh create mode 100644 egs/callhome/eend_ola/local/make_swbd2_phase1.pl create mode 100644 egs/callhome/eend_ola/local/make_swbd2_phase2.pl create mode 100644 egs/callhome/eend_ola/local/make_swbd2_phase3.pl create mode 100644 egs/callhome/eend_ola/local/make_swbd_cellular1.pl create mode 100644 egs/callhome/eend_ola/local/make_swbd_cellular2.pl create mode 100644 egs/callhome/eend_ola/local/random_mixture.py 
create mode 100644 egs/callhome/eend_ola/local/run_blstm.sh create mode 100644 egs/callhome/eend_ola/local/run_prepare_shared_eda.sh rename egs/callhome/{diarization => }/sond/sond.yaml (100%) rename egs/callhome/{diarization => }/sond/sond_fbank.yaml (100%) rename egs/callhome/{diarization => }/sond/unit_test.py (100%) diff --git a/egs/callhome/eend_ola/local/make_callhome.sh b/egs/callhome/eend_ola/local/make_callhome.sh new file mode 100644 index 000000000..caa8f679f --- /dev/null +++ b/egs/callhome/eend_ola/local/make_callhome.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Copyright 2017 David Snyder +# Apache 2.0. +# +# This script prepares the Callhome portion of the NIST SRE 2000 +# corpus (LDC2001S97). It is the evaluation dataset used in the +# callhome_diarization recipe. + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /mnt/data/LDC2001S97 data/" + exit 1; +fi + +src_dir=$1 +data_dir=$2 + +tmp_dir=$data_dir/callhome/.tmp/ +mkdir -p $tmp_dir + +# Download some metadata that wasn't provided in the LDC release +if [ ! 
-d "$tmp_dir/sre2000-key" ]; then + wget --no-check-certificate -P $tmp_dir/ \ + http://www.openslr.org/resources/10/sre2000-key.tar.gz + tar -xvf $tmp_dir/sre2000-key.tar.gz -C $tmp_dir/ +fi + +# The list of 500 recordings +awk '{print $1}' $tmp_dir/sre2000-key/reco2num > $tmp_dir/reco.list + +# Create wav.scp file +count=0 +missing=0 +while read reco; do + path=$(find $src_dir -name "$reco.sph") + if [ -z "${path// }" ]; then + >&2 echo "$0: Missing Sphere file for $reco" + missing=$((missing+1)) + else + echo "$reco sph2pipe -f wav -p $path |" + fi + count=$((count+1)) +done < $tmp_dir/reco.list > $data_dir/callhome/wav.scp + +if [ $missing -gt 0 ]; then + echo "$0: Missing $missing out of $count recordings" +fi + +cp $tmp_dir/sre2000-key/segments $data_dir/callhome/ +awk '{print $1, $2}' $data_dir/callhome/segments > $data_dir/callhome/utt2spk +utils/utt2spk_to_spk2utt.pl $data_dir/callhome/utt2spk > $data_dir/callhome/spk2utt +cp $tmp_dir/sre2000-key/reco2num $data_dir/callhome/reco2num_spk +cp $tmp_dir/sre2000-key/fullref.rttm $data_dir/callhome/ + +utils/validate_data_dir.sh --no-text --no-feats $data_dir/callhome +utils/fix_data_dir.sh $data_dir/callhome + +utils/copy_data_dir.sh $data_dir/callhome $data_dir/callhome1 +utils/copy_data_dir.sh $data_dir/callhome $data_dir/callhome2 + +utils/shuffle_list.pl $data_dir/callhome/wav.scp | head -n 250 \ + | utils/filter_scp.pl - $data_dir/callhome/wav.scp \ + > $data_dir/callhome1/wav.scp +utils/fix_data_dir.sh $data_dir/callhome1 +utils/filter_scp.pl --exclude $data_dir/callhome1/wav.scp \ + $data_dir/callhome/wav.scp > $data_dir/callhome2/wav.scp +utils/fix_data_dir.sh $data_dir/callhome2 +utils/filter_scp.pl $data_dir/callhome1/wav.scp $data_dir/callhome/reco2num_spk \ + > $data_dir/callhome1/reco2num_spk +utils/filter_scp.pl $data_dir/callhome2/wav.scp $data_dir/callhome/reco2num_spk \ + > $data_dir/callhome2/reco2num_spk + +rm -rf $tmp_dir 2> /dev/null diff --git a/egs/callhome/eend_ola/local/make_mixture.py 
b/egs/callhome/eend_ola/local/make_mixture.py new file mode 100644 index 000000000..82d03cd60 --- /dev/null +++ b/egs/callhome/eend_ola/local/make_mixture.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 + +# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita) +# Licensed under the MIT license. +# +# This script generates simulated multi-talker mixtures for diarization +# +# common/make_mixture.py \ +# mixture.scp \ +# data/mixture \ +# wav/mixture + + +import argparse +import os +from eend import kaldi_data +import numpy as np +import math +import soundfile as sf +import json + +parser = argparse.ArgumentParser() +parser.add_argument('script', + help='list of json') +parser.add_argument('out_data_dir', + help='output data dir of mixture') +parser.add_argument('out_wav_dir', + help='output mixture wav files are stored here') +parser.add_argument('--rate', type=int, default=16000, + help='sampling rate') +args = parser.parse_args() + +# open output data files +segments_f = open(args.out_data_dir + '/segments', 'w') +utt2spk_f = open(args.out_data_dir + '/utt2spk', 'w') +wav_scp_f = open(args.out_data_dir + '/wav.scp', 'w') + +# "-R" forces the default random seed for reproducibility +resample_cmd = "sox -R -t wav - -t wav - rate {}".format(args.rate) + +for line in open(args.script): + recid, jsonstr = line.strip().split(None, 1) + indata = json.loads(jsonstr) + wavfn = indata['recid'] + # recid now include out_wav_dir + recid = os.path.join(args.out_wav_dir, wavfn).replace('/','_') + noise = indata['noise'] + noise_snr = indata['snr'] + mixture = [] + for speaker in indata['speakers']: + spkid = speaker['spkid'] + utts = speaker['utts'] + intervals = speaker['intervals'] + rir = speaker['rir'] + data = [] + pos = 0 + for interval, utt in zip(intervals, utts): + # append silence interval data + silence = np.zeros(int(interval * args.rate)) + data.append(silence) + # utterance is reverberated using room impulse response + preprocess = "wav-reverberate --print-args=false 
" \ + " --impulse-response={} - -".format(rir) + if isinstance(utt, list): + rec, st, et = utt + st = np.rint(st * args.rate).astype(int) + et = np.rint(et * args.rate).astype(int) + else: + rec = utt + st = 0 + et = None + if rir is not None: + wav_rxfilename = kaldi_data.process_wav(rec, preprocess) + else: + wav_rxfilename = rec + wav_rxfilename = kaldi_data.process_wav( + wav_rxfilename, resample_cmd) + speech, _ = kaldi_data.load_wav(wav_rxfilename, st, et) + data.append(speech) + # calculate start/end position in samples + startpos = pos + len(silence) + endpos = startpos + len(speech) + # write segments and utt2spk + uttid = '{}_{}_{:07d}_{:07d}'.format( + spkid, recid, int(startpos / args.rate * 100), + int(endpos / args.rate * 100)) + print(uttid, recid, + startpos / args.rate, endpos / args.rate, file=segments_f) + print(uttid, spkid, file=utt2spk_f) + # update position for next utterance + pos = endpos + data = np.concatenate(data) + mixture.append(data) + + # fitting to the maximum-length speaker data, then mix all speakers + maxlen = max(len(x) for x in mixture) + mixture = [np.pad(x, (0, maxlen - len(x)), 'constant') for x in mixture] + mixture = np.sum(mixture, axis=0) + # noise is repeated or cutted for fitting to the mixture data length + noise_resampled = kaldi_data.process_wav(noise, resample_cmd) + noise_data, _ = kaldi_data.load_wav(noise_resampled) + if maxlen > len(noise_data): + noise_data = np.pad(noise_data, (0, maxlen - len(noise_data)), 'wrap') + else: + noise_data = noise_data[:maxlen] + # noise power is scaled according to selected SNR, then mixed + signal_power = np.sum(mixture**2) / len(mixture) + noise_power = np.sum(noise_data**2) / len(noise_data) + scale = math.sqrt( + math.pow(10, - noise_snr / 10) * signal_power / noise_power) + mixture += noise_data * scale + # output the wav file and write wav.scp + outfname = '{}.wav'.format(wavfn) + outpath = os.path.join(args.out_wav_dir, outfname) + sf.write(outpath, mixture, args.rate) + 
print(recid, os.path.abspath(outpath), file=wav_scp_f) + +wav_scp_f.close() +segments_f.close() +utt2spk_f.close() diff --git a/egs/callhome/eend_ola/local/make_musan.py b/egs/callhome/eend_ola/local/make_musan.py new file mode 100644 index 000000000..833da0619 --- /dev/null +++ b/egs/callhome/eend_ola/local/make_musan.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +# Copyright 2015 David Snyder +# 2018 Ewald Enzinger +# Apache 2.0. +# +# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). +# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz. +# +# This file is meant to be invoked by make_musan.sh. + +import os, sys + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, 'r').readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + +def prepare_music(root_dir, use_vocals): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + num_good_files += 1 + else: + print("Missing file {}".format(utt)) + num_bad_files += 1 + print("In music directory, processed {} files: {} had 
missing wav data".format(num_good_files, num_bad_files)) + return utt2spk_str, utt2wav_str + +def prepare_speech(root_dir): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + num_good_files += 1 + else: + print("Missing file {}".format(utt)) + num_bad_files += 1 + print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) + return utt2spk_str, utt2wav_str + +def prepare_noise(root_dir): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + num_good_files += 1 + else: + print("Missing file {}".format(utt)) + num_bad_files += 1 + print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) + return utt2spk_str, utt2wav_str + +def main(): + in_dir = sys.argv[1] + out_dir = sys.argv[2] + use_vocals = sys.argv[3] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) + 
utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') + utt2spk_fi.write(utt2spk) + + +if __name__=="__main__": + main() diff --git a/egs/callhome/eend_ola/local/make_musan.sh b/egs/callhome/eend_ola/local/make_musan.sh new file mode 100644 index 000000000..694940ad7 --- /dev/null +++ b/egs/callhome/eend_ola/local/make_musan.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +in_dir=$1 +data_dir=$2 +use_vocals='Y' + +mkdir -p local/musan.tmp + +echo "Preparing ${data_dir}/musan..." +mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} + +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf local/musan.tmp + diff --git a/egs/callhome/eend_ola/local/make_sre.pl b/egs/callhome/eend_ola/local/make_sre.pl new file mode 100644 index 000000000..b86fa7ee7 --- /dev/null +++ b/egs/callhome/eend_ola/local/make_sre.pl @@ -0,0 +1,63 @@ +#!/usr/bin/perl +# +# Copyright 
2015 David Snyder +# Apache 2.0. +# Usage: make_sre.pl + +if (@ARGV != 4) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora5/LDC/LDC2006S44 sre2004 sre_ref data/sre2004\n"; + exit(1); +} + +($db_base, $sre_name, $sre_ref_filename, $out_dir) = @ARGV; +%utt2sph = (); +%spk2gender = (); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} +open(WAVLIST, "<", "$tmp_dir/sph.list") or die "cannot open wav list"; + +while() { + chomp; + $sph = $_; + @A1 = split("/",$sph); + @A2 = split("[./]",$A1[$#A1]); + $uttId=$A2[0]; + $utt2sph{$uttId} = $sph; +} + +open(GNDR,">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; +open(SPKR,">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV,">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(SRE_REF, "<", $sre_ref_filename) or die "Cannot open SRE reference."; +while () { + chomp; + ($speaker, $gender, $other_sre_name, $utt_id, $channel) = split(" ", $_); + $channel_num = "1"; + if ($channel eq "A") { + $channel_num = "1"; + } else { + $channel_num = "2"; + } + if (($other_sre_name eq $sre_name) and (exists $utt2sph{$utt_id})) { + $full_utt_id = "$speaker-$gender-$sre_name-$utt_id-$channel"; + $spk2gender{"$speaker-$gender"} = $gender; + print WAV "$full_utt_id"," sph2pipe -f wav -p -c $channel_num $utt2sph{$utt_id} |\n"; + print SPKR "$full_utt_id $speaker-$gender","\n"; + } +} +foreach $speaker (keys %spk2gender) { + print GNDR "$speaker $spk2gender{$speaker}\n"; +} + +close(GNDR) || die; +close(SPKR) || die; +close(WAV) || die; +close(SRE_REF) || die; diff --git a/egs/callhome/eend_ola/local/make_sre.sh b/egs/callhome/eend_ola/local/make_sre.sh new file mode 100644 index 000000000..bef4e06e6 --- /dev/null +++ 
b/egs/callhome/eend_ola/local/make_sre.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Apache 2.0. +# +# See README.txt for more info on data required. + +set -e + +data_root=$1 +data_dir=$2 + +wget -P data/local/ http://www.openslr.org/resources/15/speaker_list.tgz +tar -C data/local/ -xvf data/local/speaker_list.tgz +sre_ref=data/local/speaker_list + +local/make_sre.pl $data_root/LDC2006S44/ \ + sre2004 $sre_ref $data_dir/sre2004 + +local/make_sre.pl $data_root/LDC2011S01 \ + sre2005 $sre_ref $data_dir/sre2005_train + +local/make_sre.pl $data_root/LDC2011S04 \ + sre2005 $sre_ref $data_dir/sre2005_test + +local/make_sre.pl $data_root/LDC2011S09 \ + sre2006 $sre_ref $data_dir/sre2006_train + +local/make_sre.pl $data_root/LDC2011S10 \ + sre2006 $sre_ref $data_dir/sre2006_test_1 + +local/make_sre.pl $data_root/LDC2012S01 \ + sre2006 $sre_ref $data_dir/sre2006_test_2 + +local/make_sre.pl $data_root/LDC2011S05 \ + sre2008 $sre_ref $data_dir/sre2008_train + +local/make_sre.pl $data_root/LDC2011S08 \ + sre2008 $sre_ref $data_dir/sre2008_test + +utils/combine_data.sh $data_dir/sre \ + $data_dir/sre2004 $data_dir/sre2005_train \ + $data_dir/sre2005_test $data_dir/sre2006_train \ + $data_dir/sre2006_test_1 $data_dir/sre2006_test_2 \ + $data_dir/sre2008_train $data_dir/sre2008_test + +utils/validate_data_dir.sh --no-text --no-feats $data_dir/sre +utils/fix_data_dir.sh $data_dir/sre +rm data/local/speaker_list.* diff --git a/egs/callhome/eend_ola/local/make_swbd2_phase1.pl b/egs/callhome/eend_ola/local/make_swbd2_phase1.pl new file mode 100644 index 000000000..71b26b55d --- /dev/null +++ b/egs/callhome/eend_ola/local/make_swbd2_phase1.pl @@ -0,0 +1,106 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2017 David Snyder +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora3/LDC/LDC98S75 data/swbd2_phase1_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/doc/callstat.tbl") || die "Could not open $db_base/doc/callstat.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +%wavs = (); +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId = $t1[0]; + $wavs{$uttId} = $sph; +} + +while () { + $line = $_ ; + @A = split(",", $line); + @A1 = split("[./]",$A[0]); + $wav = $A1[0]; + if (/$wav/i ~~ @badAudio) { + # do nothing + print "Bad Audio = $wav"; + } else { + $spkr1= "sw_" . $A[2]; + $spkr2= "sw_" . $A[3]; + $gender1 = $A[5]; + $gender2 = $A[6]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wavs{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . 
$wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wavs{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/callhome/eend_ola/local/make_swbd2_phase2.pl b/egs/callhome/eend_ola/local/make_swbd2_phase2.pl new file mode 100644 index 000000000..337ab9d97 --- /dev/null +++ b/egs/callhome/eend_ola/local/make_swbd2_phase2.pl @@ -0,0 +1,107 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2013 Daniel Povey +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora5/LDC/LDC99S79 data/swbd2_phase2_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/DISC1/doc/callstat.tbl") || die "Could not open $db_base/DISC1/doc/callstat.tbl"; +open(CI, "<$db_base/DISC1/doc/callinfo.tbl") || die "Could not open $db_base/DISC1/doc/callinfo.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId=$t1[0]; + $wav{$uttId} = $sph; +} + +while () { + $line = $_ ; + $ci = ; + $ci = ; + @ci = split(",",$ci); + $wav = $ci[0]; + @A = split(",", $line); + if (/$wav/i ~~ @badAudio) { + # do nothing + } else { + $spkr1= "sw_" . $A[2]; + $spkr2= "sw_" . $A[3]; + $gender1 = $A[4]; + $gender2 = $A[5]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wav{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wav{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . 
$wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wav{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wav{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/callhome/eend_ola/local/make_swbd2_phase3.pl b/egs/callhome/eend_ola/local/make_swbd2_phase3.pl new file mode 100644 index 000000000..f27853415 --- /dev/null +++ b/egs/callhome/eend_ola/local/make_swbd2_phase3.pl @@ -0,0 +1,102 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2013 Daniel Povey +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora5/LDC/LDC2002S06 data/swbd2_phase3_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/DISC1/docs/callstat.tbl") || die "Could not open $db_base/DISC1/docs/callstat.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId=$t1[0]; + $wav{$uttId} = $sph; +} + +while () { + $line = $_ ; + @A = split(",", $line); + $wav = "sw_" . $A[0] ; + if (/$wav/i ~~ @badAudio) { + # do nothing + } else { + $spkr1= "sw_" . $A[3]; + $spkr2= "sw_" . $A[4]; + $gender1 = $A[5]; + $gender2 = $A[6]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wav{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wav{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . 
$wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wav{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wav{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/callhome/eend_ola/local/make_swbd_cellular1.pl b/egs/callhome/eend_ola/local/make_swbd_cellular1.pl new file mode 100644 index 000000000..e30c710e6 --- /dev/null +++ b/egs/callhome/eend_ola/local/make_swbd_cellular1.pl @@ -0,0 +1,83 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2013 Daniel Povey +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora5/LDC/LDC2001S13 data/swbd_cellular1_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/doc/swb_callstats.tbl") || die "Could not open $db_base/doc/swb_callstats.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("40019", "45024", "40022"); + +while () { + $line = $_ ; + @A = split(",", $line); + if (/$A[0]/i ~~ @badAudio) { + # do nothing + } else { + $wav = "sw_" . $A[0]; + $spkr1= "sw_" . $A[1]; + $spkr2= "sw_" . 
$A[2]; + $gender1 = $A[3]; + $gender2 = $A[4]; + if ($A[3] eq "M") { + $gender1 = "m"; + } elsif ($A[3] eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($A[4] eq "M") { + $gender2 = "m"; + } elsif ($A[4] eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$db_base/$wav.sph") { + $uttId = $spkr1 . "-swbdc_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $db_base/$wav.sph |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "-swbdc_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $db_base/$wav.sph |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $db_base/$wav.sph\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/callhome/eend_ola/local/make_swbd_cellular2.pl b/egs/callhome/eend_ola/local/make_swbd_cellular2.pl new file mode 100644 index 000000000..4de954c19 --- /dev/null +++ b/egs/callhome/eend_ola/local/make_swbd_cellular2.pl @@ -0,0 +1,83 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2013 Daniel Povey +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora5/LDC/LDC2004S07 data/swbd_cellular2_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/docs/swb_callstats.tbl") || die "Could not open $db_base/docs/swb_callstats.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio=("45024", "40022"); + +while () { + $line = $_ ; + @A = split(",", $line); + if (/$A[0]/i ~~ @badAudio) { + # do nothing + } else { + $wav = "sw_" . $A[0]; + $spkr1= "sw_" . $A[1]; + $spkr2= "sw_" . $A[2]; + $gender1 = $A[3]; + $gender2 = $A[4]; + if ($A[3] eq "M") { + $gender1 = "m"; + } elsif ($A[3] eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($A[4] eq "M") { + $gender2 = "m"; + } elsif ($A[4] eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$db_base/data/$wav.sph") { + $uttId = $spkr1 . "-swbdc_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $db_base/data/$wav.sph |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "-swbdc_" . 
$wav ."_2";
+      if (!$spk2gender{$spkr2}) {
+        $spk2gender{$spkr2} = $gender2;
+        print GNDR "$spkr2"," $gender2\n";
+      }
+      print WAV "$uttId"," sph2pipe -f wav -p -c 2 $db_base/data/$wav.sph |\n";
+      print SPKR "$uttId"," $spkr2","\n";
+    } else {
+      print STDERR "Missing $db_base/data/$wav.sph\n";
+    }
+  }
+}
+
+close(WAV) || die;
+close(SPKR) || die;
+close(GNDR) || die;
+if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+if (system("utils/fix_data_dir.sh $out_dir") != 0) {
+  die "Error fixing data dir $out_dir";
+}
+if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/callhome/eend_ola/local/random_mixture.py b/egs/callhome/eend_ola/local/random_mixture.py
new file mode 100644
index 000000000..0032ef926
--- /dev/null
+++ b/egs/callhome/eend_ola/local/random_mixture.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+
+# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
+# Licensed under the MIT license.
+
+"""
+This script generates random multi-talker mixtures for diarization.
+It writes scp-like output to stdout: lines of "[recid] [json]".
+    recid: recording id of mixture
+        serial numbers like mix_0000001, mix_0000002, ...
+    json: mixture configuration formatted in "one-line"
+The json format is as follows:
+{
+ 'speakers':[                 # list of speakers
+    {
+     'spkid': 'Name',         # speaker id
+     'rir': '/rirdir/rir.wav', # wav_rxfilename of room impulse response (null if no reverb)
+     'utts': [                # list of wav_rxfilenames of utterances
+        '/wavdir/utt1.wav',
+        '/wavdir/utt2.wav',...],
+     'intervals': [1.2, 3.4, ...] # list of silence durations before utterances
+    }, ... ],
+ 'noise': '/noisedir/noise.wav', # wav_rxfilename of background noise
+ 'snr': 15.0,                 # SNR for mixing background noise
+ 'recid': 'mix_0000001'       # recording id of the mixture
+}
+
+Usage:
+    local/random_mixture.py \
+        --n_mixtures=10000 \          # number of mixtures
+        data/voxceleb1_train \        # kaldi-style data dir of utterances
+        data/musan_noise_bg \         # background noises
+        data/simu_rirs \              # room impulse responses
+        > mixture.scp                 # output scp-like file
+
+The actual data dir and wav files are generated using make_mixture.py:
+    local/make_mixture.py \
+        mixture.scp \                 # scp-like file for mixture
+        data/mixture \                # output data dir
+        wav/mixture                   # output wav dir
+"""
+
+import argparse
+import os
+from eend import kaldi_data
+import random
+import numpy as np
+import json
+import itertools
+
+parser = argparse.ArgumentParser()
+parser.add_argument('data_dir',
+                    help='data dir of single-speaker recordings')
+parser.add_argument('noise_dir',
+                    help='data dir of background noise recordings')
+parser.add_argument('rir_dir',
+                    help='data dir of room impulse responses')
+parser.add_argument('--n_mixtures', type=int, default=10,
+                    help='number of mixture recordings')
+parser.add_argument('--n_speakers', type=int, default=4,
+                    help='number of speakers in a mixture')
+parser.add_argument('--min_utts', type=int, default=10,
+                    help='minimum number of utterances per speaker')
+parser.add_argument('--max_utts', type=int, default=20,
+                    help='maximum number of utterances per speaker')
+parser.add_argument('--sil_scale', type=float, default=10.0,
+                    help='average silence time')
+parser.add_argument('--noise_snrs', default="10:15:20",
+                    help='colon-delimited SNRs for background noises')
+parser.add_argument('--random_seed', type=int, default=777,
+                    help='random seed')
+parser.add_argument('--speech_rvb_probability', type=float, default=1,
+                    help='reverb probability')
+args = parser.parse_args()
+
+random.seed(args.random_seed)
+np.random.seed(args.random_seed)
+
+# load list of wav files from kaldi-style data dirs
+wavs = kaldi_data.load_wav_scp(
+    os.path.join(args.data_dir, 'wav.scp'))
+noises = kaldi_data.load_wav_scp(
+    os.path.join(args.noise_dir, 'wav.scp'))
+rirs = kaldi_data.load_wav_scp(
+    os.path.join(args.rir_dir, 'wav.scp'))
+
+# spk2utt maps each speaker to the utterance list that is sampled from below
+spk2utt = kaldi_data.load_spk2utt(
+    os.path.join(args.data_dir, 'spk2utt'))
+
+segments = kaldi_data.load_segments_hash(
+    os.path.join(args.data_dir, 'segments'))
+
+# choice lists for random sampling
+all_speakers = list(spk2utt.keys())
+all_noises = list(noises.keys())
+all_rirs = list(rirs.keys())
+noise_snrs = [float(x) for x in args.noise_snrs.split(':')]
+
+mixtures = []  # NOTE(review): never read below; each mixture is printed as it is built
+for it in range(args.n_mixtures):
+    # recording ids are mix_0000001, mix_0000002, ...
+    recid = 'mix_{:07d}'.format(it + 1)
+    # randomly select speakers, a background noise and a SNR
+    speakers = random.sample(all_speakers, args.n_speakers)
+    noise = random.choice(all_noises)
+    noise_snr = random.choice(noise_snrs)
+    mixture = {'speakers': []}
+    for speaker in speakers:
+        # randomly select the number of utterances
+        n_utts = np.random.randint(args.min_utts, args.max_utts + 1)
+        # take n_utts consecutive utterances, wrapping around the speaker's list
+        cycle_utts = itertools.cycle(spk2utt[speaker])
+        # random start utterance
+        roll = np.random.randint(0, len(spk2utt[speaker]))
+        for i in range(roll):
+            next(cycle_utts)
+        utts = [next(cycle_utts) for i in range(n_utts)]
+        # randomly select wait time before appending utterance
+        intervals = np.random.exponential(args.sil_scale, size=n_utts)
+        # randomly select a room impulse response
+        if random.random() < args.speech_rvb_probability:
+            rir = rirs[random.choice(all_rirs)]
+        else:
+            rir = None
+        if segments is not None:
+            utts = [segments[utt] for utt in utts]
+            utts = [(wavs[rec], st, et) for (rec, st, et) in utts]
+            mixture['speakers'].append({
+                'spkid': speaker,
+                'rir': rir,
+                'utts': utts,
+                'intervals': intervals.tolist()
+                })
+        else:
+            mixture['speakers'].append({
+                'spkid': speaker,
+                'rir': rir,
+                'utts': [wavs[utt] for utt in utts],
+                'intervals': intervals.tolist()
+                })
+    mixture['noise'] = noises[noise]
+    mixture['snr'] = noise_snr
+    mixture['recid'] = recid
+    print(recid, json.dumps(mixture))
diff --git a/egs/callhome/eend_ola/local/run_blstm.sh b/egs/callhome/eend_ola/local/run_blstm.sh
new file mode 100644
index 000000000..71270a4a2
--- /dev/null
+++ b/egs/callhome/eend_ola/local/run_blstm.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
+# Licensed under the MIT license.
+#
+# BLSTM-based model experiment; extra arguments are forwarded to run.sh unchanged
+./run.sh --train-config conf/blstm/train.yaml --average-start 20 --average-end 20 \
+    --adapt-config conf/blstm/adapt.yaml --adapt-average-start 10 --adapt-average-end 10 \
+    --infer-config conf/blstm/infer.yaml "$@"
diff --git a/egs/callhome/eend_ola/local/run_prepare_shared_eda.sh b/egs/callhome/eend_ola/local/run_prepare_shared_eda.sh
new file mode 100644
index 000000000..f48adc54f
--- /dev/null
+++ b/egs/callhome/eend_ola/local/run_prepare_shared_eda.sh
@@ -0,0 +1,235 @@
+#!/bin/bash
+
+# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita, Shota Horiguchi)
+# Licensed under the MIT license.
+#
+# This script prepares kaldi-style data sets shared with different experiments
+#   - data/xxxx
+#     callhome, sre, swb2, and swb_cellular datasets
+#   - data/simu/data/${simuid}
+#     simulation mixtures generated with various options
+
+stage=0
+
+# Modify corpus directories
+#  - callhome_dir
+#    CALLHOME (LDC2001S97)
+#  - swb2_phase1_train
+#    Switchboard-2 Phase 1 (LDC98S75)
+#  - data_root
+#    LDC99S79, LDC2002S06, LDC2001S13, LDC2004S07,
+#    LDC2006S44, LDC2011S01, LDC2011S04, LDC2011S09,
+#    LDC2011S10, LDC2012S01, LDC2011S05, LDC2011S08
+#  - musan_root
+#    MUSAN corpus (https://www.openslr.org/17/)
+callhome_dir=/export/corpora/NIST/LDC2001S97
+swb2_phase1_train=/export/corpora/LDC/LDC98S75
+data_root=/export/corpora5/LDC
+musan_root=/export/corpora/JHU/musan
+# Modify simulated data storage area.
+# This script distributes simulated data under these directories
+simu_actual_dirs=(
+/export/c05/$USER/diarization-data
+/export/c08/$USER/diarization-data
+/export/c09/$USER/diarization-data
+)
+
+# data preparation options
+max_jobs_run=4
+sad_num_jobs=30
+sad_opts="--extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3"
+sad_graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0"
+sad_priors_opts="--sil-scale=0.1"
+
+# simulation options
+simu_opts_overlap=yes
+simu_opts_num_speaker_array=(1 2 3 4)
+simu_opts_sil_scale_array=(2 2 5 9)
+simu_opts_rvb_prob=0.5
+simu_opts_num_train=100000
+simu_opts_min_utts=10
+simu_opts_max_utts=20
+
+simu_cmd="run.pl"
+train_cmd="run.pl"
+random_mixture_cmd="run.pl"
+make_mixture_cmd="run.pl"
+
+. parse_options.sh || exit
+
+if [ $stage -le 0 ]; then
+    echo "prepare kaldi-style datasets"
+    # Prepare CALLHOME dataset. This will be used for evaluation (callhome2) and adaptation (callhome1).
+    if ! validate_data_dir.sh --no-text --no-feats data/callhome1_spkall \
+        || ! validate_data_dir.sh --no-text --no-feats data/callhome2_spkall; then
+        # imported from https://github.com/kaldi-asr/kaldi/blob/master/egs/callhome_diarization/v1
+        local/make_callhome.sh $callhome_dir data
+        # Generate two-speaker subsets
+        for dset in callhome1 callhome2; do
+            # Extract two-speaker recordings in wav.scp
+            copy_data_dir.sh data/${dset} data/${dset}_spkall
+            # Regenerate segments file from fullref.rttm
+            #  $2: recid, $4: start_time, $5: duration, $8: speakerid
+            awk '{printf "%s_%s_%07d_%07d %s %.2f %.2f\n", \
+                $2, $8, $4*100, ($4+$5)*100, $2, $4, $4+$5}' \
+                data/callhome/fullref.rttm | sort > data/${dset}_spkall/segments
+            utils/fix_data_dir.sh data/${dset}_spkall
+            # Speaker ID is '[recid]_[speakerid]'
+            awk '{split($1,A,"_"); printf "%s %s_%s\n", $1, A[1], A[2]}' \
+                data/${dset}_spkall/segments > data/${dset}_spkall/utt2spk
+            utils/fix_data_dir.sh data/${dset}_spkall
+            # Generate rttm files for scoring
+            steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
+                data/${dset}_spkall/utt2spk data/${dset}_spkall/segments \
+                data/${dset}_spkall/rttm
+            utils/data/get_reco2dur.sh data/${dset}_spkall
+        done
+    fi
+    # Prepare a collection of NIST SRE and SWB data. This will be used for training.
+    if ! validate_data_dir.sh --no-text --no-feats data/swb_sre_comb; then
+        local/make_sre.sh $data_root data
+        # Prepare SWB for x-vector DNN training.
+        local/make_swbd2_phase1.pl $swb2_phase1_train \
+            data/swbd2_phase1_train
+        local/make_swbd2_phase2.pl $data_root/LDC99S79 \
+            data/swbd2_phase2_train
+        local/make_swbd2_phase3.pl $data_root/LDC2002S06 \
+            data/swbd2_phase3_train
+        local/make_swbd_cellular1.pl $data_root/LDC2001S13 \
+            data/swbd_cellular1_train
+        local/make_swbd_cellular2.pl $data_root/LDC2004S07 \
+            data/swbd_cellular2_train
+        # Combine swb and sre data
+        utils/combine_data.sh data/swb_sre_comb \
+            data/swbd_cellular1_train data/swbd_cellular2_train \
+            data/swbd2_phase1_train \
+            data/swbd2_phase2_train data/swbd2_phase3_train data/sre
+    fi
+    # MUSAN data: background noises (utterance ids taken from noise/free-sound/ANNOTATIONS)
+    if ! validate_data_dir.sh --no-text --no-feats data/musan_noise_bg; then
+        local/make_musan.sh $musan_root data
+        utils/copy_data_dir.sh data/musan_noise data/musan_noise_bg
+        awk '{if(NR>1) print $1,$1}'  $musan_root/noise/free-sound/ANNOTATIONS > data/musan_noise_bg/utt2spk
+        utils/fix_data_dir.sh data/musan_noise_bg
+    fi
+    # simu rirs 8k
+    if ! validate_data_dir.sh --no-text --no-feats data/simu_rirs_8k; then
+        mkdir -p data/simu_rirs_8k
+        if [ ! -e sim_rir_8k.zip ]; then
+            wget --no-check-certificate http://www.openslr.org/resources/26/sim_rir_8k.zip
+        fi
+        unzip sim_rir_8k.zip -d data/sim_rir_8k
+        find $PWD/data/sim_rir_8k -iname "*.wav" \
+            | awk '{n=split($1,A,/[\/\.]/); print A[n-3]"_"A[n-1], $1}' \
+            | sort > data/simu_rirs_8k/wav.scp
+        awk '{print $1, $1}' data/simu_rirs_8k/wav.scp > data/simu_rirs_8k/utt2spk
+        utils/fix_data_dir.sh data/simu_rirs_8k
+    fi
+    # Automatic segmentation using pretrained SAD model
+    #     it will take one day using 30 CPU jobs:
+    #     make_mfcc: 1 hour, compute_output: 18 hours, decode: 0.5 hours
+    sad_nnet_dir=exp/segmentation_1a/tdnn_stats_asr_sad_1a
+    sad_work_dir=exp/segmentation_1a/tdnn_stats_asr_sad_1a
+    if ! validate_data_dir.sh --no-text $sad_work_dir/swb_sre_comb_seg; then
+        if [ ! -d exp/segmentation_1a ]; then
+            wget http://kaldi-asr.org/models/4/0004_tdnn_stats_asr_sad_1a.tar.gz
+            tar zxf 0004_tdnn_stats_asr_sad_1a.tar.gz
+        fi
+        steps/segmentation/detect_speech_activity.sh \
+            --nj $sad_num_jobs \
+            --graph-opts "$sad_graph_opts" \
+            --transform-probs-opts "$sad_priors_opts" $sad_opts \
+            data/swb_sre_comb $sad_nnet_dir mfcc_hires $sad_work_dir \
+            $sad_work_dir/swb_sre_comb || exit 1
+    fi
+    # Extract >1.5 sec segments and split into train/valid sets
+    if ! validate_data_dir.sh --no-text --no-feats data/swb_sre_cv; then
+        copy_data_dir.sh data/swb_sre_comb data/swb_sre_comb_seg
+        awk '$4-$3>1.5{print;}' $sad_work_dir/swb_sre_comb_seg/segments > data/swb_sre_comb_seg/segments
+        cp $sad_work_dir/swb_sre_comb_seg/{utt2spk,spk2utt} data/swb_sre_comb_seg
+        fix_data_dir.sh data/swb_sre_comb_seg
+        utils/subset_data_dir_tr_cv.sh data/swb_sre_comb_seg data/swb_sre_tr data/swb_sre_cv
+    fi
+fi
+
+simudir=data/simu
+if [ $stage -le 1 ]; then
+    echo "simulation of mixture"
+    mkdir -p $simudir/.work
+    random_mixture_cmd="python3 local/random_mixture.py"  # simulation scripts live in local/
+    make_mixture_cmd="python3 local/make_mixture.py"      # run via python3: added with mode 100644
+
+    for ((i=0; i<${#simu_opts_sil_scale_array[@]}; ++i)); do
+        simu_opts_num_speaker=${simu_opts_num_speaker_array[i]}
+        simu_opts_sil_scale=${simu_opts_sil_scale_array[i]}
+        for dset in swb_sre_tr swb_sre_cv; do
+            if [ "$dset" == "swb_sre_tr" ]; then
+                n_mixtures=${simu_opts_num_train}
+            else
+                n_mixtures=500
+            fi
+            simuid=${dset}_ns${simu_opts_num_speaker}_beta${simu_opts_sil_scale}_${n_mixtures}
+            # skip the simulation if its data dir already validates
+            if ! validate_data_dir.sh --no-text --no-feats $simudir/data/$simuid; then
+                # random mixture generation
+                $train_cmd $simudir/.work/random_mixture_$simuid.log \
+                    $random_mixture_cmd --n_speakers $simu_opts_num_speaker --n_mixtures $n_mixtures \
+                    --speech_rvb_probability $simu_opts_rvb_prob \
+                    --sil_scale $simu_opts_sil_scale \
+                    data/$dset data/musan_noise_bg data/simu_rirs_8k \
+                    \> $simudir/.work/mixture_$simuid.scp
+                nj=64
+                mkdir -p $simudir/wav/$simuid
+                # distribute simulated data round-robin over $simu_actual_dirs
+                split_scps=
+                for n in $(seq $nj); do
+                    split_scps="$split_scps $simudir/.work/mixture_$simuid.$n.scp"
+                    mkdir -p $simudir/.work/data_$simuid.$n
+                    actual=${simu_actual_dirs[($n-1)%${#simu_actual_dirs[@]}]}/$simudir/wav/$simuid/$n
+                    mkdir -p $actual
+                    ln -nfs $actual $simudir/wav/$simuid/$n
+                done
+                utils/split_scp.pl $simudir/.work/mixture_$simuid.scp $split_scps || exit 1
+
+                $simu_cmd --max-jobs-run 64 JOB=1:$nj $simudir/.work/make_mixture_$simuid.JOB.log \
+                    $make_mixture_cmd --rate=8000 \
+                    $simudir/.work/mixture_$simuid.JOB.scp \
+                    $simudir/.work/data_$simuid.JOB $simudir/wav/$simuid/JOB
+                utils/combine_data.sh $simudir/data/$simuid $simudir/.work/data_$simuid.*
+                steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
+                    $simudir/data/$simuid/utt2spk $simudir/data/$simuid/segments \
+                    $simudir/data/$simuid/rttm
+                utils/data/get_reco2dur.sh $simudir/data/$simuid
+            fi
+            simuid_concat=${dset}_ns"$(IFS="n"; echo "${simu_opts_num_speaker_array[*]}")"_beta"$(IFS="n"; echo "${simu_opts_sil_scale_array[*]}")"_${n_mixtures}
+            mkdir -p $simudir/data/$simuid_concat
+            for f in `ls -F $simudir/data/$simuid | grep -v "/"`; do
+                cat $simudir/data/$simuid/$f >> $simudir/data/$simuid_concat/$f
+            done
+        done
+    done
+fi
+
+if [ $stage -le 3 ]; then
+    # compose eval/callhome2_spkall
+    eval_set=data/eval/callhome2_spkall
+    if ! validate_data_dir.sh --no-text --no-feats $eval_set; then
+        utils/copy_data_dir.sh data/callhome2_spkall $eval_set
+        cp data/callhome2_spkall/rttm $eval_set/rttm
+        awk -v dstdir=wav/eval/callhome2_spkall '{print $1, dstdir"/"$1".wav"}' data/callhome2_spkall/wav.scp > $eval_set/wav.scp
+        mkdir -p wav/eval/callhome2_spkall
+        wav-copy scp:data/callhome2_spkall/wav.scp scp:$eval_set/wav.scp
+        utils/data/get_reco2dur.sh $eval_set
+    fi
+
+    # compose eval/callhome1_spkall
+    adapt_set=data/eval/callhome1_spkall
+    if ! validate_data_dir.sh --no-text --no-feats $adapt_set; then
+        utils/copy_data_dir.sh data/callhome1_spkall $adapt_set
+        cp data/callhome1_spkall/rttm $adapt_set/rttm
+        awk -v dstdir=wav/eval/callhome1_spkall '{print $1, dstdir"/"$1".wav"}' data/callhome1_spkall/wav.scp > $adapt_set/wav.scp
+        mkdir -p wav/eval/callhome1_spkall
+        wav-copy scp:data/callhome1_spkall/wav.scp scp:$adapt_set/wav.scp
+        utils/data/get_reco2dur.sh $adapt_set
+    fi
+fi
diff --git a/egs/callhome/eend_ola/path.sh b/egs/callhome/eend_ola/path.sh
index ea3c0be2f..e1906b741 100755
--- a/egs/callhome/eend_ola/path.sh
+++ b/egs/callhome/eend_ola/path.sh
@@ -1,5 +1,12 @@
 export FUNASR_DIR=$PWD/../../..
 
+# kaldi-related
+export KALDI_ROOT=
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+
 # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=../../../:$PYTHONPATH
diff --git a/egs/callhome/eend_ola/run.sh b/egs/callhome/eend_ola/run.sh
index 893613752..f5afd73ea 100644
--- a/egs/callhome/eend_ola/run.sh
+++ b/egs/callhome/eend_ola/run.sh
@@ -27,8 +27,8 @@ callhome_average_end=100
 exp_dir="."
 input_size=345
 
-stage=1
-stop_stage=4
+stage=-1
+stop_stage=-1
 
 # exp tag
 tag="exp_fix"
@@ -50,11 +50,26 @@ simu_allspkr_model_dir="baseline_$(basename "${simu_allspkr_diar_config}" .yaml)
 simu_allspkr_chunk2000_model_dir="baseline_$(basename "${simu_allspkr_chunk2000_diar_config}" .yaml)_${tag}"
 callhome_model_dir="baseline_$(basename "${callhome_diar_config}" .yaml)_${tag}"
 
-# Prepare data for training and inference
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    echo "stage 0: Prepare data for training and inference"
+# simulate mixture data for training and inference
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Simulate mixture data for training and inference"
+    echo "The detail can be found in https://github.com/hitachi-speech/EEND"
+    echo "Before running this step, you should download and compile kaldi and set KALDI_ROOT in this script and path.sh"
+    echo "This stage may take a long time, please waiting..."
+    KALDI_ROOT=${KALDI_ROOT:-}  # keeps a value that is already set; otherwise fill in the Kaldi path here
+    ln -sfn $KALDI_ROOT/egs/wsj/s5/steps steps  # -fn: replace an existing link instead of nesting one inside it
+    ln -sfn $KALDI_ROOT/egs/wsj/s5/utils utils
+    . local/run_prepare_shared_eda.sh
 fi
 
+## Prepare data for training and inference
+#if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+#    echo "stage 0: Prepare data for training and inference"
+#    echo "The detail can be found in https://github.com/hitachi-speech/EEND"
+#    . ./local/
+#fi
+#
+
 # Training on simulated two-speaker data
 world_size=$gpu_num
 simu_2spkr_ave_id=avg${simu_average_2spkr_start}-${simu_average_2spkr_end}
diff --git a/egs/callhome/diarization/sond/sond.yaml b/egs/callhome/sond/sond.yaml
similarity index 100%
rename from egs/callhome/diarization/sond/sond.yaml
rename to egs/callhome/sond/sond.yaml
diff --git a/egs/callhome/diarization/sond/sond_fbank.yaml b/egs/callhome/sond/sond_fbank.yaml
similarity index 100%
rename from egs/callhome/diarization/sond/sond_fbank.yaml
rename to egs/callhome/sond/sond_fbank.yaml
diff --git a/egs/callhome/diarization/sond/unit_test.py b/egs/callhome/sond/unit_test.py
similarity index 100%
rename from egs/callhome/diarization/sond/unit_test.py
rename to egs/callhome/sond/unit_test.py