# Mirror of https://github.com/modelscope/FunASR (synced 2025-09-15)
#!/usr/bin/env python3

# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
# Licensed under the MIT license.
"""
|
|
This script generates random multi-talker mixtures for diarization.
|
|
It generates a scp-like outputs: lines of "[recid] [json]".
|
|
recid: recording id of mixture
|
|
serial numbers like mix_0000001, mix_0000002, ...
|
|
json: mixture configuration formatted in "one-line"
|
|
The json format is as following:
|
|
{
|
|
'speakers':[ # list of speakers
|
|
{
|
|
'spkid': 'Name', # speaker id
|
|
'rir': '/rirdir/rir.wav', # wav_rxfilename of room impulse response
|
|
'utts': [ # list of wav_rxfilenames of utterances
|
|
'/wavdir/utt1.wav',
|
|
'/wavdir/utt2.wav',...],
|
|
'intervals': [1.2, 3.4, ...] # list of silence durations before utterances
|
|
}, ... ],
|
|
'noise': '/noisedir/noise.wav' # wav_rxfilename of background noise
|
|
'snr': 15.0, # SNR for mixing background noise
|
|
'recid': 'mix_000001' # recording id of the mixture
|
|
}
|
|
|
|
Usage:
|
|
common/random_mixture.py \
|
|
--n_mixtures=10000 \ # number of mixtures
|
|
data/voxceleb1_train \ # kaldi-style data dir of utterances
|
|
data/musan_noise_bg \ # background noises
|
|
data/simu_rirs \ # room impulse responses
|
|
> mixture.scp # output scp-like file
|
|
|
|
The actual data dir and wav files are generated using make_mixture.py:
|
|
common/make_mixture.py \
|
|
mixture.scp \ # scp-like file for mixture
|
|
data/mixture \ # output data dir
|
|
wav/mixture # output wav dir
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
from funasr.modules.eend_ola.utils import kaldi_data
|
|
import random
|
|
import numpy as np
|
|
import json
|
|
import itertools
|
|
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument('data_dir',
|
|
help='data dir of single-speaker recordings')
|
|
parser.add_argument('noise_dir',
|
|
help='data dir of background noise recordings')
|
|
parser.add_argument('rir_dir',
|
|
help='data dir of room impulse responses')
|
|
parser.add_argument('--n_mixtures', type=int, default=10,
|
|
help='number of mixture recordings')
|
|
parser.add_argument('--n_speakers', type=int, default=4,
|
|
help='number of speakers in a mixture')
|
|
parser.add_argument('--min_utts', type=int, default=10,
|
|
help='minimum number of uttenraces per speaker')
|
|
parser.add_argument('--max_utts', type=int, default=20,
|
|
help='maximum number of utterances per speaker')
|
|
parser.add_argument('--sil_scale', type=float, default=10.0,
|
|
help='average silence time')
|
|
parser.add_argument('--noise_snrs', default="10:15:20",
|
|
help='colon-delimited SNRs for background noises')
|
|
parser.add_argument('--random_seed', type=int, default=777,
|
|
help='random seed')
|
|
parser.add_argument('--speech_rvb_probability', type=float, default=1,
|
|
help='reverb probability')
|
|
args = parser.parse_args()
|
|
|
|
random.seed(args.random_seed)
|
|
np.random.seed(args.random_seed)
|
|
|
|
# load list of wav files from kaldi-style data dirs
|
|
wavs = kaldi_data.load_wav_scp(
|
|
os.path.join(args.data_dir, 'wav.scp'))
|
|
noises = kaldi_data.load_wav_scp(
|
|
os.path.join(args.noise_dir, 'wav.scp'))
|
|
rirs = kaldi_data.load_wav_scp(
|
|
os.path.join(args.rir_dir, 'wav.scp'))
|
|
|
|
# spk2utt is used for counting number of utterances per speaker
|
|
spk2utt = kaldi_data.load_spk2utt(
|
|
os.path.join(args.data_dir, 'spk2utt'))
|
|
|
|
segments = kaldi_data.load_segments_hash(
|
|
os.path.join(args.data_dir, 'segments'))
|
|
|
|
# choice lists for random sampling
|
|
all_speakers = list(spk2utt.keys())
|
|
all_noises = list(noises.keys())
|
|
all_rirs = list(rirs.keys())
|
|
noise_snrs = [float(x) for x in args.noise_snrs.split(':')]
|
|
|
|
mixtures = []
|
|
for it in range(args.n_mixtures):
|
|
# recording ids are mix_0000001, mix_0000002, ...
|
|
recid = 'mix_{:07d}'.format(it + 1)
|
|
# randomly select speakers, a background noise and a SNR
|
|
speakers = random.sample(all_speakers, args.n_speakers)
|
|
noise = random.choice(all_noises)
|
|
noise_snr = random.choice(noise_snrs)
|
|
mixture = {'speakers': []}
|
|
for speaker in speakers:
|
|
# randomly select the number of utterances
|
|
n_utts = np.random.randint(args.min_utts, args.max_utts + 1)
|
|
# utts = spk2utt[speaker][:n_utts]
|
|
cycle_utts = itertools.cycle(spk2utt[speaker])
|
|
# random start utterance
|
|
roll = np.random.randint(0, len(spk2utt[speaker]))
|
|
for i in range(roll):
|
|
next(cycle_utts)
|
|
utts = [next(cycle_utts) for i in range(n_utts)]
|
|
# randomly select wait time before appending utterance
|
|
intervals = np.random.exponential(args.sil_scale, size=n_utts)
|
|
# randomly select a room impulse response
|
|
if random.random() < args.speech_rvb_probability:
|
|
rir = rirs[random.choice(all_rirs)]
|
|
else:
|
|
rir = None
|
|
if segments is not None:
|
|
utts = [segments[utt] for utt in utts]
|
|
utts = [(wavs[rec], st, et) for (rec, st, et) in utts]
|
|
mixture['speakers'].append({
|
|
'spkid': speaker,
|
|
'rir': rir,
|
|
'utts': utts,
|
|
'intervals': intervals.tolist()
|
|
})
|
|
else:
|
|
mixture['speakers'].append({
|
|
'spkid': speaker,
|
|
'rir': rir,
|
|
'utts': [wavs[utt] for utt in utts],
|
|
'intervals': intervals.tolist()
|
|
})
|
|
mixture['noise'] = noises[noise]
|
|
mixture['snr'] = noise_snr
|
|
mixture['recid'] = recid
|
|
print(recid, json.dumps(mixture))
|