#!/usr/bin/env bash

# Set bash to 'strict' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in a pipeline'
# (add -x to also print each command, useful for debugging)
set -e
set -u
set -o pipefail

. ./path.sh
workspace=$(pwd)
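
# NOTE: path.sh is assumed to set up the Python environment and to export
# FUNASR_DIR (the root of the FunASR checkout); the scp2jsonl data preparation
# script and split_scp.pl below are resolved through it.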

CUDA_VISIBLE_DEVICES="0,1"

stage=2
stop_stage=3
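
# Stages: 1 = generate jsonl data lists, 2 = finetune training, 3 = inference
# and scoring. With stage=2 and stop_stage=3, data preparation is skipped and
# train.jsonl/val.jsonl must already exist.
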
inference_device="cpu" #"cpu"
|
|
inference_device="cuda" #"cpu"
|
|
inference_checkpoint="model.pt.avg10"
|
|
inference_scp="wav.scp"
|
|
inference_batch_size=32
|
|
nj=32
|
|
test_sets="test"
|
|
|
|

# model_name from model_hub, or model_dir in local path

## option 1: download the model automatically (currently unsupported)
model_name_or_model_dir="iic/speech_sanm_kws_phone-xiaoyun-commands-offline"

## option 2: download the model via git
local_path_root=${workspace}/modelscope_models
model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
if [ ! -d "${model_name_or_model_dir}" ]; then
  mkdir -p ${model_name_or_model_dir}
  git clone https://www.modelscope.cn/iic/speech_sanm_kws_phone-xiaoyun-commands-offline.git ${model_name_or_model_dir}
fi

config=sanm_6e_320_256_fdim40_t2602.yaml
token_list=${model_name_or_model_dir}/tokens_2602.txt
lexicon_list=${model_name_or_model_dir}/lexicon.txt
cmvn_file=${model_name_or_model_dir}/am.mvn.dim40_l3r3
init_param="${model_name_or_model_dir}/basetrain_sanm_6e_320_256_fdim40_t2602_offline.pt"
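# init_param above is the base-trained checkpoint whose weights initialize the
# stage-2 finetuning run.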

# data prepare
# data dir; stage 1 writes train.jsonl and val.jsonl here
data_dir=../../data

train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "stage 1: Generate audio jsonl list"
  # generate train.jsonl and val.jsonl from wav.scp and text files
  python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
    ++scp_file_list='['''${data_dir}/train_wav.scp''', '''${data_dir}/train_text.txt''']' \
    ++data_type_list='["source", "target"]' \
    ++jsonl_file_out="${train_data}"

  python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
    ++scp_file_list='['''${data_dir}/val_wav.scp''', '''${data_dir}/val_text.txt''']' \
    ++data_type_list='["source", "target"]' \
    ++jsonl_file_out="${val_data}"
fi
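
# For reference, the inputs are Kaldi-style lists; illustrative content:
#   train_wav.scp:  utt_id_1 /path/to/utt_id_1.wav
#   train_text.txt: utt_id_1 小云小云
# scp2jsonl merges them into one jsonl record per utterance, roughly:
#   {"key": "utt_id_1", "source": "/path/to/utt_id_1.wav", "target": "小云小云"}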

# exp output dir
output_dir="${workspace}/exp/finetune_outputs"

# Training Stage
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  echo "stage 2: KWS Training"

  mkdir -p ${output_dir}
  current_time=$(date "+%Y-%m-%d_%H-%M")
  log_file="${output_dir}/train.log.txt.${current_time}"
  echo "log_file: ${log_file}"
  echo "finetune uses basetrain model: ${init_param}"

  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
  gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
  torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
    ../../../funasr/bin/train.py \
    --config-path "${workspace}/conf" \
    --config-name "${config}" \
    ++init_param="${init_param}" \
    ++disable_update=true \
    ++train_data_set_list="${train_data}" \
    ++valid_data_set_list="${val_data}" \
    ++tokenizer_conf.token_list="${token_list}" \
    ++tokenizer_conf.seg_dict="${lexicon_list}" \
    ++frontend_conf.cmvn_file="${cmvn_file}" \
    ++output_dir="${output_dir}" &> ${log_file}
fi
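
# After training, ${output_dir} holds config.yaml plus the saved checkpoints;
# the inference stage below reads its config and the averaged checkpoint
# (${inference_checkpoint}) from this directory.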

# Testing Stage
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "stage 3: Inference"
  keywords=(小云小云)
  keywords_string=$(IFS=,; echo "${keywords[*]}")
  echo "keywords: $keywords_string"

  if [ ${inference_device} == "cuda" ]; then
    nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
  else
    inference_batch_size=1
    CUDA_VISIBLE_DEVICES=""
    for JOB in $(seq ${nj}); do
      CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
    done
  fi
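  # On GPU, one decoding job runs per visible device; on CPU, device id -1 is
  # assigned to each of the ${nj} jobs and batch_size is forced to 1.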

  for dset in ${test_sets}; do
    inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}"
    _logdir="${inference_dir}/logdir"
    echo "inference_dir: ${inference_dir}"

    mkdir -p "${_logdir}"
    test_data_dir="${data_dir}/${dset}"
    key_file=${test_data_dir}/${inference_scp}

    split_scps=
    for JOB in $(seq "${nj}"); do
      split_scps+=" ${_logdir}/keys.${JOB}.scp"
    done
    $FUNASR_DIR/examples/aishell/paraformer/utils/split_scp.pl "${key_file}" ${split_scps}
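    # split_scp.pl shards the wav.scp key file into ${nj} disjoint pieces so
    # that each parallel job decodes its own subset.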

    gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
    for JOB in $(seq ${nj}); do
      {
        id=$((JOB-1))
        gpuid=${gpuid_list_array[$id]}

        echo "${output_dir}"

        export CUDA_VISIBLE_DEVICES=${gpuid}
        python ../../../funasr/bin/inference.py \
          --config-path="${output_dir}" \
          --config-name="config.yaml" \
          ++init_param="${output_dir}/${inference_checkpoint}" \
          ++tokenizer_conf.token_list="${token_list}" \
          ++tokenizer_conf.seg_dict="${lexicon_list}" \
          ++frontend_conf.cmvn_file="${cmvn_file}" \
          ++keywords="\"${keywords_string}\"" \
          ++input="${_logdir}/keys.${JOB}.scp" \
          ++output_dir="${inference_dir}/${JOB}" \
          ++device="${inference_device}" \
          ++ncpu=1 \
          ++disable_log=true \
          ++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
      } &
    done
    wait
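
    # All shard jobs were launched in the background with '&'; wait blocks
    # until every shard has finished, then their outputs are merged below.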

    for f in detect score; do
      # check shard 1 (not the stale ${JOB} left over from the loop above)
      if [ -f "${inference_dir}/1/${f}" ]; then
        for JOB in $(seq "${nj}"); do
          cat "${inference_dir}/${JOB}/${f}"
        done | sort -k1 > "${inference_dir}/${f}"
      fi
    done

    python $FUNASR_DIR/funasr/utils/compute_det_ctc.py \
      --keywords ${keywords_string} \
      --test_data ${test_data_dir}/wav.scp \
      --trans_data ${test_data_dir}/text \
      --score_file ${inference_dir}/detect \
      --stats_dir ${inference_dir}
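    # compute_det_ctc scores the merged detections against the reference
    # transcripts and writes DET statistics (roughly, the recall /
    # false-alarm tradeoff per keyword) into ${inference_dir}.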
  done
fi