Mirror of https://github.com/modelscope/FunASR (synced 2025-09-15 14:48:36 +08:00)
Bash, executable file, 54 lines, 1.5 KiB
#!/usr/bin/env bash

# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0

# Transform raw AISHELL-2 data to kaldi format.
#
# Usage: prepare_data.sh <corpus-data-dir> <tmp-dir> <output-dir>
#   <corpus-data-dir>  directory that must contain wav.scp and trans.txt
#   <tmp-dir>          directory for intermediate lists (created if missing)
#   <output-dir>       directory receiving the final wav.scp and text
#                      (created if missing)
#
# Depends on ./path.sh and utils/filter_scp.pl, both resolved relative to the
# current working directory.
# Exit status: 0 on success; 1 on wrong usage, missing input, or a failed step.

. ./path.sh || exit 1;

# Let a pipeline report failure when any stage fails, so the "|| exit 1"
# checks below also cover utils/filter_scp.pl and sort, not only the last
# command of each pipeline.
set -o pipefail

if [ $# -ne 3 ]; then
  echo "Usage: $0 <corpus-data-dir> <tmp-dir> <output-dir>" >&2
  echo " $0 /export/AISHELL-2/iOS/train data/local/train data/train" >&2
  exit 1
fi

corpus=$1
tmp=$2
dir=$3

echo "prepare_data.sh: Preparing data in $corpus"

mkdir -p -- "$tmp" "$dir" || exit 1

# corpus check
if [ ! -d "$corpus" ] || [ ! -f "$corpus/wav.scp" ] || [ ! -f "$corpus/trans.txt" ]; then
  echo "Error: $0 requires wav.scp and trans.txt under $corpus directory." >&2
  exit 1
fi

# validate utt-key list, IC0803W0380 is a bad utterance
# A single awk drops every key containing the bad utterance id; unlike
# "grep -v" it does not return a non-zero status when no key is left.
awk '$1 !~ /IC0803W0380/ {print $1}' "$corpus/wav.scp" > "$tmp/wav_utt.list" || exit 1
awk '{print $1}' "$corpus/trans.txt" > "$tmp/trans_utt.list" || exit 1
# Filter the transcript keys by the wav key list to get the usable utterances.
utils/filter_scp.pl -f 1 "$tmp/wav_utt.list" "$tmp/trans_utt.list" > "$tmp/utt.list" || exit 1

# wav.scp
# Prefix the second tab-separated field of every entry with the corpus dir.
awk -F'\t' -v path_prefix="$corpus" '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' "$corpus/wav.scp" > "$tmp/tmp_wav.scp" || exit 1
utils/filter_scp.pl -f 1 "$tmp/utt.list" "$tmp/tmp_wav.scp" | sort -k 1 | uniq > "$tmp/wav.scp" || exit 1

# text
utils/filter_scp.pl -f 1 "$tmp/utt.list" "$corpus/trans.txt" | sort -k 1 | uniq > "$tmp/text" || exit 1

# copy prepared resources from tmp_dir to target dir
for f in wav.scp text; do
  cp -- "$tmp/$f" "$dir/$f" || exit 1
done

echo "local/prepare_data.sh succeeded"
exit 0