FunASR/examples/industrial_data_pretraining/sanm_kws/finetune.sh
zhifu gao 2196844d1d
Dev kws (#2105)
* multi tokenizer

* support fsmn_kws, fsmn_kws_mt, sanm_kws, sanm_kws_streaming training

* kws

---------

Co-authored-by: pengteng.spt <pengteng.spt@alibaba-inc.com>
2024-09-25 15:10:50 +08:00

173 lines
5.4 KiB
Bash
Executable File

#!/usr/bin/env bash
# Fine-tune and evaluate the SANM keyword-spotting (KWS) model.
#
# Stages (selected with `stage` / `stop_stage` below):
#   1: build train/val jsonl lists from wav.scp and text files
#   2: fine-tune from the pretrained base model with torchrun
#   3: run inference on the test sets and compute detection statistics
#
# Strict mode: abort on a failing command (-e), on use of an unset
# variable (-u), and when any command of a pipeline fails (pipefail).
# Command tracing (-x) is NOT enabled.
set -euo pipefail

# NOTE(review): path.sh is presumably where FUNASR_DIR comes from; later
# stages read it, but this script never assigns it itself.
. ./path.sh

workspace=$(pwd)

# GPUs used for training (stage 2) and, when inference_device is "cuda",
# for inference (stage 3 starts one job per listed device).
CUDA_VISIBLE_DEVICES="0,1"

# Stages stage..stop_stage (inclusive) are executed.
stage=2
stop_stage=3

# Inference settings (stage 3).
inference_device="cuda" # "cuda" or "cpu"
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=32 # stage 3 forces this to 1 on CPU
nj=32                   # number of CPU jobs; stage 3 replaces it with the GPU count on cuda
test_sets="test"

# model_name from model_hub, or model_dir in local path
## option 1, download model automatically, unsupported currently
model_name_or_model_dir="iic/speech_sanm_kws_phone-xiaoyun-commands-offline"

## option 2, download model by git into ${workspace}/modelscope_models
local_path_root=${workspace}/modelscope_models
model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
if [[ ! -d "${model_name_or_model_dir}" ]]; then
  # Create only the parent and let git create the model directory itself.
  # Pre-creating the target would leave an empty directory behind when the
  # clone fails, and the -d check above would then skip the download on
  # every later run.
  mkdir -p "$(dirname "${model_name_or_model_dir}")"
  git clone https://www.modelscope.cn/iic/speech_sanm_kws_phone-xiaoyun-commands-offline.git "${model_name_or_model_dir}"
fi

# Files shipped with the pretrained model, plus the training config name.
config=sanm_6e_320_256_fdim40_t2602.yaml
token_list=${model_name_or_model_dir}/tokens_2602.txt
lexicon_list=${model_name_or_model_dir}/lexicon.txt
cmvn_file=${model_name_or_model_dir}/am.mvn.dim40_l3r3
init_param="${model_name_or_model_dir}/basetrain_sanm_6e_320_256_fdim40_t2602_offline.pt"

# data prepare
# data dir, which contains train.jsonl and val.jsonl; stage 1 generates them
# from {train,val}_wav.scp and {train,val}_text.txt in the same directory.
data_dir=../../data
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# Stage 1: generate train.jsonl and val.jsonl.
# Each call hands scp2jsonl.py a wav.scp (data type "source") and a text file
# (data type "target") and names the jsonl file to write.
if ((stage <= 1 && stop_stage >= 1)); then
  echo "stage 1: Generate audio json list"

  # The list arguments are passed as single quoted words so that the shell
  # performs no word splitting or globbing on the expanded paths.
  python "${FUNASR_DIR}/funasr/datasets/audio_datasets/scp2jsonl.py" \
    "++scp_file_list=[${data_dir}/train_wav.scp, ${data_dir}/train_text.txt]" \
    '++data_type_list=["source", "target"]' \
    "++jsonl_file_out=${train_data}"

  python "${FUNASR_DIR}/funasr/datasets/audio_datasets/scp2jsonl.py" \
    "++scp_file_list=[${data_dir}/val_wav.scp, ${data_dir}/val_text.txt]" \
    '++data_type_list=["source", "target"]' \
    "++jsonl_file_out=${val_data}"
fi
# Experiment output directory. Stage 2 writes the training output and logs
# here; stage 3 reads config.yaml and the checkpoint from it.
output_dir="${workspace}/exp/finetune_outputs"

# Stage 2: fine-tune from the pretrained base model, one process per GPU.
if ((stage <= 2 && stop_stage >= 2)); then
  echo "stage 2: KWS Training"
  mkdir -p "${output_dir}"

  # Timestamped log name so repeated runs do not overwrite earlier logs
  # (minute resolution).
  current_time=$(date "+%Y-%m-%d_%H-%M")
  log_file="${output_dir}/train.log.txt.${current_time}"
  echo "log_file: ${log_file}"
  echo "finetune use basetrain model: ${init_param}"

  export CUDA_VISIBLE_DEVICES
  # Number of comma-separated device ids = number of processes to launch.
  gpu_num=$(awk -F "," '{print NF}' <<<"${CUDA_VISIBLE_DEVICES}")

  # stdout and stderr of the whole training run go to the log file.
  torchrun --nnodes 1 --nproc_per_node "${gpu_num}" \
    ../../../funasr/bin/train.py \
    --config-path "${workspace}/conf" \
    --config-name "${config}" \
    ++init_param="${init_param}" \
    ++disable_update=true \
    ++train_data_set_list="${train_data}" \
    ++valid_data_set_list="${val_data}" \
    ++tokenizer_conf.token_list="${token_list}" \
    ++tokenizer_conf.seg_dict="${lexicon_list}" \
    ++frontend_conf.cmvn_file="${cmvn_file}" \
    ++output_dir="${output_dir}" &>"${log_file}"
fi
# Stage 3: run inference in parallel on every test set, merge the per-job
# results and compute detection statistics.
if ((stage <= 3 && stop_stage >= 3)); then
  echo "stage 3: Inference"
  keywords=(小云小云)
  # Join the keyword array with commas.
  keywords_string=$(
    IFS=,
    echo "${keywords[*]}"
  )
  echo "keywords: $keywords_string"

  if [[ "${inference_device}" == "cuda" ]]; then
    # One inference job per listed GPU.
    nj=$(awk -F "," '{print NF}' <<<"${CUDA_VISIBLE_DEVICES}")
  else
    # CPU: keep nj jobs, force batch size 1, and give every job the device
    # id "-1" (presumably so that no GPU is visible to it).
    inference_batch_size=1
    CUDA_VISIBLE_DEVICES=""
    for ((JOB = 1; JOB <= nj; JOB++)); do
      CUDA_VISIBLE_DEVICES+="-1,"
    done
  fi

  # test_sets is a space-separated list, so it is left unquoted on purpose.
  for dset in ${test_sets}; do
    inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}"
    _logdir="${inference_dir}/logdir"
    echo "inference_dir: ${inference_dir}"
    mkdir -p "${_logdir}"

    test_data_dir="${data_dir}/${dset}"
    key_file=${test_data_dir}/${inference_scp}

    # split_scp.pl receives the key file followed by one output path per job.
    split_scps=()
    for ((JOB = 1; JOB <= nj; JOB++)); do
      split_scps+=("${_logdir}/keys.${JOB}.scp")
    done
    "${FUNASR_DIR}/examples/aishell/paraformer/utils/split_scp.pl" "${key_file}" "${split_scps[@]}"

    # Job i uses the i-th comma-separated entry of CUDA_VISIBLE_DEVICES.
    IFS=',' read -r -a gpuid_list_array <<<"${CUDA_VISIBLE_DEVICES}"

    pids=()
    for ((JOB = 1; JOB <= nj; JOB++)); do
      (
        gpuid=${gpuid_list_array[JOB - 1]}
        echo "${output_dir}"
        # Exported inside the subshell only; the parent keeps the full list.
        export CUDA_VISIBLE_DEVICES=${gpuid}
        python ../../../funasr/bin/inference.py \
          --config-path="${output_dir}" \
          --config-name="config.yaml" \
          ++init_param="${output_dir}/${inference_checkpoint}" \
          ++tokenizer_conf.token_list="${token_list}" \
          ++tokenizer_conf.seg_dict="${lexicon_list}" \
          ++frontend_conf.cmvn_file="${cmvn_file}" \
          "++keywords=\"${keywords_string}\"" \
          ++input="${_logdir}/keys.${JOB}.scp" \
          ++output_dir="${inference_dir}/${JOB}" \
          ++device="${inference_device}" \
          ++ncpu=1 \
          ++disable_log=true \
          ++batch_size="${inference_batch_size}" &>"${_logdir}/log.${JOB}.txt"
      ) &
      pids+=("$!")
    done

    # A bare `wait` returns 0 even when jobs fail, so wait on each job and
    # stop before merging/scoring partial results.
    failed_jobs=0
    for ((JOB = 1; JOB <= nj; JOB++)); do
      if ! wait "${pids[JOB - 1]}"; then
        echo "inference job ${JOB} failed, see ${_logdir}/log.${JOB}.txt" >&2
        failed_jobs=$((failed_jobs + 1))
      fi
    done
    if ((failed_jobs > 0)); then
      echo "${failed_jobs} of ${nj} inference jobs failed for ${dset}" >&2
      exit 1
    fi

    # Merge per-job result files, sorted by the first field. A result type is
    # merged only when the last job (nj) produced it; the check names nj
    # explicitly instead of depending on a leftover loop variable.
    for f in detect score; do
      if [[ -f "${inference_dir}/${nj}/${f}" ]]; then
        for ((JOB = 1; JOB <= nj; JOB++)); do
          cat "${inference_dir}/${JOB}/${f}"
        done | sort -k1 >"${inference_dir}/${f}"
      fi
    done

    # NOTE(review): this script path is relative to the current directory,
    # unlike the other tools above (../../../funasr or FUNASR_DIR), and the
    # test data is always wav.scp regardless of inference_scp -- confirm.
    python funasr/utils/compute_det_ctc.py \
      --keywords "${keywords_string}" \
      --test_data "${test_data_dir}/wav.scp" \
      --trans_data "${test_data_dir}/text" \
      --score_file "${inference_dir}/detect" \
      --stats_dir "${inference_dir}"
  done
fi