FunASR/egs/alimeeting/sa_asr/local/alimeeting_data_prep.sh
yhliang e8528b8f62
Dev lyh (#645)
* update

* update

* fix bug

* fix bug
2023-06-16 20:16:47 +08:00

161 lines
5.7 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Strict mode: abort on the first failing command (-e), on expansion of an
# unset variable (-u), and when any stage of a pipeline fails (pipefail).
# Command tracing (-x) is deliberately not enabled here.
set -euo pipefail
#######################################
# Print a timestamped message tagged with the caller's location.
# Arguments: the message words; they are joined with spaces and backslash
#            escapes in them are interpreted (echo -e).
# Outputs:   "<ISO-8601 time> (<caller file>:<line>:<caller function>) <msg>"
#            on stdout.
#######################################
log() {
  local caller_file timestamp
  # Basename of the file that invoked log, not of the file defining it.
  caller_file=${BASH_SOURCE[1]##*/}
  timestamp=$(date '+%Y-%m-%dT%H:%M:%S')
  echo -e "${timestamp} (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
# Usage text. The variable must be spelled `help_message`: that is the name
# read by the argument-count check below, and with `set -u` a misspelled
# name aborts with "unbound variable" instead of printing the usage.
# NOTE(review): parse_options.sh presumably reads the same variable for
# --help — confirm.
help_message=$(cat << EOF
Usage: $0
Options:
--no_overlap (bool): Whether to ignore the overlapping utterance in the training set.
--tgt (string): Which set to process, Train or Eval.
EOF
)

# Reset bash's built-in timer so the "elapsed" log lines count from here.
SECONDS=0

# Defaults; they are assigned before parse_options.sh is sourced.
# NOTE(review): presumably parse_options.sh lets matching --<name> options
# override them — confirm.
tgt=Train #Train or Eval
min_wav_duration=0.1
max_wav_duration=20

log "$0 $*"
# Prints the default value: option parsing has not happened yet.
echo "${tgt}"

. ./utils/parse_options.sh
. ./path.sh

# Root of the corpus, expected next to the working directory.
AliMeeting="${PWD}/dataset"

# Too many positional arguments: show the usage and stop.
if [ $# -gt 2 ]; then
  log "${help_message}"
  exit 2
fi
# The corpus root must already exist; it is only read, never created here.
if [ ! -d "${AliMeeting}" ]; then
  log "Error: ${AliMeeting} does not exist or is not a directory."
  exit 2
fi

# To absolute path. Quoted so a path with spaces survives, and `&&` so a
# failed cd aborts the script instead of silently yielding the current
# directory.
AliMeeting=$(cd "${AliMeeting}" && pwd)
echo "${AliMeeting}"

# Raw corpus trees (inputs); each is searched for audio_dir/ and
# textgrid_dir/ by the processing stages.
far_raw_dir=${AliMeeting}/${tgt}_Ali_far/
near_raw_dir=${AliMeeting}/${tgt}_Ali_near/

# Intermediate working directories under data/local.
far_dir=data/local/${tgt}_Ali_far
near_dir=data/local/${tgt}_Ali_near
far_single_speaker_dir=data/local/${tgt}_Ali_far_correct_single_speaker

# First and last stage to run (inclusive).
# NOTE(review): these are assigned after parse_options.sh has been sourced,
# so presumably they cannot be overridden from the command line — confirm
# that this is intended.
stage=1
stop_stage=4

mkdir -p "${far_single_speaker_dir}" "${far_dir}" "${near_dir}" data/org
#######################################
# Build the file lists for one raw corpus tree.
# Arguments: $1 - raw dir containing audio_dir/ and textgrid_dir/
#            $2 - existing output dir
# Outputs:   writes wavlist, uttid, textgrid.flist and wav.scp into $2
#######################################
build_file_lists() {
  local raw_dir=$1
  local out_dir=$2

  find -L "${raw_dir}/audio_dir" -iname "*.wav" | sort > "${out_dir}/wavlist"
  # Each id is the basename of its wavlist line up to the first dot. The ids
  # are NOT re-sorted: wav.scp pairs uttid and wavlist line by line, and an
  # independent sort can order ids differently from full paths
  # (e.g. "a-b.wav" / "a.wav"), attaching an id to the wrong file.
  awk -F '/' '{print $NF}' "${out_dir}/wavlist" \
    | awk -F '.' '{print $1}' > "${out_dir}/uttid"
  # Sorted so the list is deterministic; find's own output order is
  # unspecified.
  find -L "${raw_dir}/textgrid_dir" -iname "*.TextGrid" \
    | sort > "${out_dir}/textgrid.flist"
  paste -d " " "${out_dir}/uttid" "${out_dir}/wavlist" > "${out_dir}/wav.scp"
}

#######################################
# Filter stdin to stdout: drop a trailing space and every fullwidth
# exclamation / question mark on each line.
# NOTE(review): the two punctuation patterns were empty (`s///g`) in the
# version under review, which sed rejects ("no previous regular
# expression"). They are restored here as fullwidth "！" and "？" — confirm
# against the upstream recipe.
#######################################
strip_text_noise() {
  sed -e 's/ $//g' -e 's/！//g' -e 's/？//g'
}

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  log "stage 1:process alimeeting near dir"
  build_file_lists "${near_raw_dir}" "${near_dir}"
  n1_wav=$(wc -l < "${near_dir}/wavlist")
  n2_text=$(wc -l < "${near_dir}/textgrid.flist")
  log "near file found ${n1_wav} wav and ${n2_text} text."

  # Expected to produce text_all, utt2spk_all and segments_all in near_dir,
  # which the filters below consume.
  python local/alimeeting_process_textgrid.py --path "${near_dir}" --no-overlap False

  local/text_normalize.pl < "${near_dir}/text_all" \
    | local/text_format.pl | sort -u > "${near_dir}/text"
  # Keep only the utterances that survived text normalisation.
  utils/filter_scp.pl -f 1 "${near_dir}/text" "${near_dir}/utt2spk_all" \
    | sort -u > "${near_dir}/utt2spk"
  local/utt2spk_to_spk2utt.pl "${near_dir}/utt2spk" > "${near_dir}/spk2utt"
  utils/filter_scp.pl -f 1 "${near_dir}/text" "${near_dir}/segments_all" \
    | sort -u > "${near_dir}/segments"

  # Clean via a temporary file: text cannot be read and rewritten in one
  # redirection.
  strip_text_noise < "${near_dir}/text" > "${near_dir}/tmp1"
  mv "${near_dir}/tmp1" "${near_dir}/text"
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  log "stage 2:process alimeeting far dir"
  build_file_lists "${far_raw_dir}" "${far_dir}"
  n1_wav=$(wc -l < "${far_dir}/wavlist")
  n2_text=$(wc -l < "${far_dir}/textgrid.flist")
  log "far file found ${n1_wav} wav and ${n2_text} text."

  # Expected to produce text_all, utt2spk_all and segments_all in far_dir
  # (utt2spk_all_fifo from the same directory is read by stage 3).
  python local/alimeeting_process_overlap_force.py --path "${far_dir}" \
    --no-overlap false --mars True \
    --overlap_length 0.8 --max_length 7

  local/text_normalize.pl < "${far_dir}/text_all" \
    | local/text_format.pl | sort -u > "${far_dir}/text"
  utils/filter_scp.pl -f 1 "${far_dir}/text" "${far_dir}/utt2spk_all" \
    | sort -u > "${far_dir}/utt2spk"
  #sed -e 's/ [a-z,A-Z,_,0-9,-]\+SPK/ SPK/' $far_dir/utt2spk_old >$far_dir/utt2spk
  local/utt2spk_to_spk2utt.pl "${far_dir}/utt2spk" > "${far_dir}/spk2utt"
  utils/filter_scp.pl -f 1 "${far_dir}/text" "${far_dir}/segments_all" \
    | sort -u > "${far_dir}/segments"

  # Replace every "SRC" with a literal "$" first, then apply the same
  # clean-up as the near-field text.
  sed -e 's/SRC/$/g' "${far_dir}/text" \
    | strip_text_noise > "${far_dir}/tmp1"
  mv "${far_dir}/tmp1" "${far_dir}/text"
fi
#######################################
# Rewrite a "<uttid> <word> <word> ..." file in place so that every space
# after the first field is removed ("utt1 a b c" -> "utt1 abc").
# Arguments: $1 - path of the text file to rewrite
# Outputs:   overwrites $1; uses and then removes "$1.org" as scratch copy
#######################################
remove_text_spaces() {
  local text_file=$1
  local scratch="${text_file}.org"

  cp "${text_file}" "${scratch}"
  paste -d " " \
    <(cut -d " " -f 1 "${scratch}") \
    <(cut -d " " -f 2- "${scratch}" | tr -d " ") \
    > "${text_file}"
  rm "${scratch}"
}

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  log "stage 3: final data process"
  local/fix_data_dir.sh "${near_dir}"
  local/fix_data_dir.sh "${far_dir}"
  local/copy_data_dir.sh "${near_dir}" "data/org/${tgt}_Ali_near"
  local/copy_data_dir.sh "${far_dir}" "data/org/${tgt}_Ali_far"

  # Sort the fifo mapping and replace every "src" with a literal "$" in a
  # single pass. The sed script is single-quoted so the shell never
  # interprets the "$", and no in-place editing (GNU-only `sed -i`) is
  # needed.
  sort "${far_dir}/utt2spk_all_fifo" \
    | sed -e 's/src/$/g' > "data/org/${tgt}_Ali_far/utt2spk_all_fifo"

  # remove space in text
  for x in "${tgt}_Ali_near" "${tgt}_Ali_far"; do
    remove_text_spaces "data/org/${x}/text"
  done

  log "Successfully finished. [elapsed=${SECONDS}s]"
fi

if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
  log "stage 4: process alimeeting far dir (single speaker by oracle time stamp)"
  # Start from a copy of the far-field working dir (the glob stays
  # unquoted on purpose so it expands to the directory's entries).
  cp -r "${far_dir}"/* "${far_single_speaker_dir}"

  # Turn textgrid.flist into "<uttid> <textgrid path>" lines.
  # NOTE(review): this pairs the two files line by line, so it presumably
  # relies on uttid and the TextGrid list being in the same order — confirm.
  mv "${far_single_speaker_dir}/textgrid.flist" "${far_single_speaker_dir}/textgrid_oldpath"
  paste -d " " "${far_single_speaker_dir}/uttid" "${far_single_speaker_dir}/textgrid_oldpath" \
    > "${far_single_speaker_dir}/textgrid.flist"

  python local/process_textgrid_to_single_speaker_wav.py --path "${far_single_speaker_dir}"

  # NOTE(review): text is seeded with a copy of utt2spk here; presumably a
  # placeholder transcript for the single-speaker set — confirm.
  cp "${far_single_speaker_dir}/utt2spk" "${far_single_speaker_dir}/text"
  local/utt2spk_to_spk2utt.pl "${far_single_speaker_dir}/utt2spk" \
    > "${far_single_speaker_dir}/spk2utt"
  local/fix_data_dir.sh "${far_single_speaker_dir}"
  local/copy_data_dir.sh "${far_single_speaker_dir}" "data/org/${tgt}_Ali_far_single_speaker"

  # remove space in text
  remove_text_spaces "data/org/${tgt}_Ali_far_single_speaker/text"

  # All intermediate files live under data/local; only data/org is kept.
  rm -rf data/local
  log "Successfully finished. [elapsed=${SECONDS}s]"
fi