FunASR/egs/callhome/eend_ola/local/random_mixture.py
2023-07-19 10:35:12 +08:00

146 lines
5.6 KiB
Python
Executable File

#!/usr/bin/env python3
# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
# Licensed under the MIT license.
"""
This script generates random multi-talker mixtures for diarization.
It generates a scp-like outputs: lines of "[recid] [json]".
recid: recording id of mixture
serial numbers like mix_0000001, mix_0000002, ...
json: mixture configuration formatted in "one-line"
The json format is as following:
{
'speakers':[ # list of speakers
{
'spkid': 'Name', # speaker id
'rir': '/rirdir/rir.wav', # wav_rxfilename of room impulse response
'utts': [ # list of wav_rxfilenames of utterances
'/wavdir/utt1.wav',
'/wavdir/utt2.wav',...],
'intervals': [1.2, 3.4, ...] # list of silence durations before utterances
}, ... ],
'noise': '/noisedir/noise.wav' # wav_rxfilename of background noise
'snr': 15.0, # SNR for mixing background noise
'recid': 'mix_000001' # recording id of the mixture
}
Usage:
common/random_mixture.py \
--n_mixtures=10000 \ # number of mixtures
data/voxceleb1_train \ # kaldi-style data dir of utterances
data/musan_noise_bg \ # background noises
data/simu_rirs \ # room impulse responses
> mixture.scp # output scp-like file
The actual data dir and wav files are generated using make_mixture.py:
common/make_mixture.py \
mixture.scp \ # scp-like file for mixture
data/mixture \ # output data dir
wav/mixture # output wav dir
"""
import argparse
import os
from funasr.modules.eend_ola.utils import kaldi_data
import random
import numpy as np
import json
import itertools
parser = argparse.ArgumentParser()
parser.add_argument('data_dir',
help='data dir of single-speaker recordings')
parser.add_argument('noise_dir',
help='data dir of background noise recordings')
parser.add_argument('rir_dir',
help='data dir of room impulse responses')
parser.add_argument('--n_mixtures', type=int, default=10,
help='number of mixture recordings')
parser.add_argument('--n_speakers', type=int, default=4,
help='number of speakers in a mixture')
parser.add_argument('--min_utts', type=int, default=10,
help='minimum number of uttenraces per speaker')
parser.add_argument('--max_utts', type=int, default=20,
help='maximum number of utterances per speaker')
parser.add_argument('--sil_scale', type=float, default=10.0,
help='average silence time')
parser.add_argument('--noise_snrs', default="10:15:20",
help='colon-delimited SNRs for background noises')
parser.add_argument('--random_seed', type=int, default=777,
help='random seed')
parser.add_argument('--speech_rvb_probability', type=float, default=1,
help='reverb probability')
args = parser.parse_args()
random.seed(args.random_seed)
np.random.seed(args.random_seed)
# load list of wav files from kaldi-style data dirs
wavs = kaldi_data.load_wav_scp(
os.path.join(args.data_dir, 'wav.scp'))
noises = kaldi_data.load_wav_scp(
os.path.join(args.noise_dir, 'wav.scp'))
rirs = kaldi_data.load_wav_scp(
os.path.join(args.rir_dir, 'wav.scp'))
# spk2utt is used for counting number of utterances per speaker
spk2utt = kaldi_data.load_spk2utt(
os.path.join(args.data_dir, 'spk2utt'))
segments = kaldi_data.load_segments_hash(
os.path.join(args.data_dir, 'segments'))
# choice lists for random sampling
all_speakers = list(spk2utt.keys())
all_noises = list(noises.keys())
all_rirs = list(rirs.keys())
noise_snrs = [float(x) for x in args.noise_snrs.split(':')]
mixtures = []
for it in range(args.n_mixtures):
# recording ids are mix_0000001, mix_0000002, ...
recid = 'mix_{:07d}'.format(it + 1)
# randomly select speakers, a background noise and a SNR
speakers = random.sample(all_speakers, args.n_speakers)
noise = random.choice(all_noises)
noise_snr = random.choice(noise_snrs)
mixture = {'speakers': []}
for speaker in speakers:
# randomly select the number of utterances
n_utts = np.random.randint(args.min_utts, args.max_utts + 1)
# utts = spk2utt[speaker][:n_utts]
cycle_utts = itertools.cycle(spk2utt[speaker])
# random start utterance
roll = np.random.randint(0, len(spk2utt[speaker]))
for i in range(roll):
next(cycle_utts)
utts = [next(cycle_utts) for i in range(n_utts)]
# randomly select wait time before appending utterance
intervals = np.random.exponential(args.sil_scale, size=n_utts)
# randomly select a room impulse response
if random.random() < args.speech_rvb_probability:
rir = rirs[random.choice(all_rirs)]
else:
rir = None
if segments is not None:
utts = [segments[utt] for utt in utts]
utts = [(wavs[rec], st, et) for (rec, st, et) in utts]
mixture['speakers'].append({
'spkid': speaker,
'rir': rir,
'utts': utts,
'intervals': intervals.tolist()
})
else:
mixture['speakers'].append({
'spkid': speaker,
'rir': rir,
'utts': [wavs[utt] for utt in utts],
'intervals': intervals.tolist()
})
mixture['noise'] = noises[noise]
mixture['snr'] = noise_snr
mixture['recid'] = recid
print(recid, json.dumps(mixture))