Merge branch 'main' into dev_cmz2

zhifu gao 2023-04-07 15:54:09 +08:00 committed by GitHub
commit 2e769fb36c
51 changed files with 887 additions and 132 deletions

View File

@ -0,0 +1,53 @@
import argparse
import json

import numpy as np


def get_parser():
    parser = argparse.ArgumentParser(
        description="cmvn converter",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--cmvn-json",
        "-c",
        default=False,
        required=True,
        type=str,
        help="cmvn json file",
    )
    parser.add_argument(
        "--am-mvn",
        "-a",
        default=False,
        required=True,
        type=str,
        help="am mvn file",
    )
    return parser


def main():
    parser = get_parser()
    args = parser.parse_args()
    with open(args.cmvn_json, "r") as fin:
        cmvn_dict = json.load(fin)
    mean_stats = np.array(cmvn_dict["mean_stats"])
    var_stats = np.array(cmvn_dict["var_stats"])
    total_frame = np.array(cmvn_dict["total_frames"])
    mean = -1.0 * mean_stats / total_frame
    var = 1.0 / np.sqrt(var_stats / total_frame - mean * mean)
    dims = mean.shape[0]
    with open(args.am_mvn, 'w') as fout:
        fout.write("<Nnet>" + "\n" + "<Splice> " + str(dims) + " " + str(dims) + '\n' + "[ 0 ]" + "\n" + "<AddShift> " + str(dims) + " " + str(dims) + "\n")
        mean_str = str(list(mean)).replace(',', '').replace('[', '[ ').replace(']', ' ]')
        fout.write("<LearnRateCoef> 0 " + mean_str + '\n')
        fout.write("<Rescale> " + str(dims) + " " + str(dims) + '\n')
        var_str = str(list(var)).replace(',', '').replace('[', '[ ').replace(']', ' ]')
        fout.write("<LearnRateCoef> 0 " + var_str + '\n')
        fout.write("</Nnet>" + '\n')


if __name__ == '__main__':
    main()
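
As a sanity check on the arithmetic above: the `<AddShift>` row holds the negated mean and the `<Rescale>` row the inverse standard deviation, so applying shift-then-rescale yields zero-mean, unit-variance features. A minimal sketch with made-up statistics (not part of the converter itself):

```python
import numpy as np

# hypothetical 2-dim accumulated statistics over 4 frames
mean_stats = np.array([8.0, 12.0])   # sum of features
var_stats = np.array([20.0, 40.0])   # sum of squared features
total_frames = 4.0

shift = -1.0 * mean_stats / total_frames                                             # <AddShift> row
scale = 1.0 / np.sqrt(var_stats / total_frames - (mean_stats / total_frames) ** 2)   # <Rescale> row

feats = np.array([[1.0, 2.0], [3.0, 4.0], [1.0, 2.0], [3.0, 4.0]])  # toy frames
normalized = (feats + shift) * scale
print(normalized.mean(axis=0))  # ~[0, 0]
print(normalized.std(axis=0))   # ~[1, 1]
```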

View File

@ -45,8 +45,8 @@ def compute_wer(ref_file,
if out_item['wrong'] > 0:
rst['wrong_sentences'] += 1
cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')
if rst['Wrd'] > 0:
rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)

View File

@ -0,0 +1,6 @@
beam_size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc_weight: 0.5
lm_weight: 0.7
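
For orientation, in ESPnet-style joint CTC/attention decoding these weights are conventionally combined per hypothesis roughly as below; this is a hedged sketch of the scoring, not code from this repository:

```python
def joint_score(att_score, ctc_score, lm_score, length,
                ctc_weight=0.5, lm_weight=0.7, penalty=0.0):
    # weighted sum of attention-decoder, CTC and LM log-probabilities,
    # plus a per-token length bonus (the "penalty" field above)
    return ((1.0 - ctc_weight) * att_score
            + ctc_weight * ctc_score
            + lm_weight * lm_score
            + penalty * length)
```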

View File

@ -0,0 +1,80 @@
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    rel_pos_type: latest
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    activation_type: swish
    use_cnn_module: true
    cnn_module_kernel: 31

decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false

accum_grad: 2
max_epoch: 50
patience: none
init: none
best_model_criterion:
- - valid
  - acc
  - max
keep_nbest_models: 10

optim: adam
optim_conf:
    lr: 0.0025
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 40000

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 10

dataset_conf:
    shuffle: True
    shuffle_conf:
        shuffle_size: 1024
        sort_size: 500
    batch_conf:
        batch_type: token
        batch_size: 10000
    num_workers: 8
log_interval: 50

normalize: None

View File

@ -0,0 +1,80 @@
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    rel_pos_type: latest
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    activation_type: swish
    use_cnn_module: true
    cnn_module_kernel: 31

decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false

accum_grad: 2
max_epoch: 50
patience: none
init: none
best_model_criterion:
- - valid
  - acc
  - max
keep_nbest_models: 10

optim: adam
optim_conf:
    lr: 0.0025
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 40000

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 10

dataset_conf:
    shuffle: True
    shuffle_conf:
        shuffle_size: 1024
        sort_size: 500
    batch_conf:
        batch_type: token
        batch_size: 10000
    num_workers: 8
log_interval: 50

normalize: utterance_mvn

View File

@ -0,0 +1,58 @@
#!/usr/bin/env bash
# Copyright 2014 Vassil Panayotov
# 2014 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <src-dir> <dst-dir>"
echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
exit 1
fi
src=$1
dst=$2
# all utterances are FLAC compressed
if ! which flac >&/dev/null; then
echo "Please install 'flac' on ALL worker nodes!"
exit 1
fi
spk_file=$src/../SPEAKERS.TXT
mkdir -p $dst || exit 1
[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1
wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
trans=$dst/text; [[ -f "$trans" ]] && rm $trans
for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
reader=$(basename $reader_dir)
if ! [ $reader -eq $reader ]; then # not integer.
echo "$0: unexpected subdirectory name $reader"
exit 1
fi
for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
chapter=$(basename $chapter_dir)
if ! [ "$chapter" -eq "$chapter" ]; then
echo "$0: unexpected chapter-subdirectory name $chapter"
exit 1
fi
find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac \n", $0, dir, $0}' >>$wav_scp|| exit 1
chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
[ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1
cat $chapter_trans >>$trans
done
done
echo "$0: successfully prepared data in $dst"
exit 0
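
For reference, the awk command above emits one `<utterance-id> <flac-path>` pair per line into wav.scp, and the chapter transcripts are concatenated into text, so the prepared files look roughly like this (paths follow the usage example at the top of the script; the transcript line is illustrative):

```
# wav.scp
1272-128104-0000 /export/a15/vpanayotov/data/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac

# text
1272-128104-0000 MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES ...
```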

View File

@ -0,0 +1,5 @@
export FUNASR_DIR=$PWD/../../..
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH

egs/librispeech/conformer/run.sh (executable file, 262 lines)
View File

@ -0,0 +1,262 @@
#!/usr/bin/env bash
. ./path.sh || exit 1;
# machines configuration
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
gpu_num=8
count=1
gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding
# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
njob=5
train_cmd=utils/run.pl
infer_cmd=utils/run.pl
# general configuration
feats_dir="../DATA" # feature output directory
exp_dir="."
lang=en
dumpdir=dump/fbank
feats_type=fbank
token_type=bpe
dataset_type=large
scp=feats.scp
type=kaldi_ark
stage=3
stop_stage=4
# feature configuration
feats_dim=80
sample_frequency=16000
nj=100
speed_perturb="0.9,1.0,1.1"
# data
data_librispeech=
# bpe model
nbpe=5000
bpemode=unigram
# exp tag
tag=""
. utils/parse_options.sh || exit 1;
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
set -u
set -o pipefail
train_set=train_960
valid_set=dev
test_sets="test_clean test_other dev_clean dev_other"
asr_config=conf/train_asr_conformer.yaml
#asr_config=conf/train_asr_conformer_uttnorm.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
#inference_config=conf/decode_asr_transformer_beam60_ctc0.3.yaml
inference_asr_model=valid.acc.ave_10best.pth
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
if ${gpu_inference}; then
inference_nj=$[${ngpu}*${njob}]
_ngpu=1
else
inference_nj=$njob
_ngpu=0
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Data preparation"
# Data preparation
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
local/data_prep_librispeech.sh ${data_librispeech}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_}
done
fi
feat_train_dir=${feats_dir}/${dumpdir}/$train_set; mkdir -p ${feat_train_dir}
feat_dev_clean_dir=${feats_dir}/${dumpdir}/dev_clean; mkdir -p ${feat_dev_clean_dir}
feat_dev_other_dir=${feats_dir}/${dumpdir}/dev_other; mkdir -p ${feat_dev_other_dir}
feat_test_clean_dir=${feats_dir}/${dumpdir}/test_clean; mkdir -p ${feat_test_clean_dir}
feat_test_other_dir=${feats_dir}/${dumpdir}/test_other; mkdir -p ${feat_test_other_dir}
feat_dev_dir=${feats_dir}/${dumpdir}/$valid_set; mkdir -p ${feat_dev_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Feature Generation"
# compute fbank features
fbankdir=${feats_dir}/fbank
for x in dev_clean dev_other test_clean test_other; do
utils/compute_fbank.sh --cmd "$train_cmd" --nj 1 --max_lengths 3000 --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \
${feats_dir}/data/${x} ${exp_dir}/exp/make_fbank/${x} ${fbankdir}/${x}
utils/fix_data_feat.sh ${fbankdir}/${x}
done
mkdir ${feats_dir}/data/$train_set
train_sets="train_clean_100 train_clean_360 train_other_500"
for file in wav.scp text; do
( for f in $train_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$train_set/$file || exit 1;
done
utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --max_lengths 3000 --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} --speed_perturb ${speed_perturb} \
${feats_dir}/data/$train_set ${exp_dir}/exp/make_fbank/$train_set ${fbankdir}/$train_set
utils/fix_data_feat.sh ${fbankdir}/$train_set
# compute global cmvn
utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} \
${fbankdir}/$train_set ${exp_dir}/exp/make_fbank/$train_set
# apply cmvn
utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \
${fbankdir}/$train_set ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/$train_set ${feat_train_dir}
utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
${fbankdir}/dev_clean ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/dev_clean ${feat_dev_clean_dir}
utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1\
${fbankdir}/dev_other ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/dev_other ${feat_dev_other_dir}
utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
${fbankdir}/test_clean ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/test_clean ${feat_test_clean_dir}
utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
${fbankdir}/test_other ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/test_other ${feat_test_other_dir}
cp ${fbankdir}/$train_set/text ${fbankdir}/$train_set/speech_shape ${fbankdir}/$train_set/text_shape ${feat_train_dir}
cp ${fbankdir}/dev_clean/text ${fbankdir}/dev_clean/speech_shape ${fbankdir}/dev_clean/text_shape ${feat_dev_clean_dir}
cp ${fbankdir}/dev_other/text ${fbankdir}/dev_other/speech_shape ${fbankdir}/dev_other/text_shape ${feat_dev_other_dir}
cp ${fbankdir}/test_clean/text ${fbankdir}/test_clean/speech_shape ${fbankdir}/test_clean/text_shape ${feat_test_clean_dir}
cp ${fbankdir}/test_other/text ${fbankdir}/test_other/speech_shape ${fbankdir}/test_other/text_shape ${feat_test_other_dir}
dev_sets="dev_clean dev_other"
for file in feats.scp text speech_shape text_shape; do
( for f in $dev_sets; do cat $feats_dir/${dumpdir}/$f/$file; done ) | sort -k1 > $feat_dev_dir/$file || exit 1;
done
#generate ark list
utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_train_dir} ${fbankdir}/${train_set} ${feat_train_dir}
utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_dev_dir} ${fbankdir}/${valid_set} ${feat_dev_dir}
fi
dict=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p ${feats_dir}/data/lang_char/
echo "<blank>" > ${dict}
echo "<s>" >> ${dict}
echo "</s>" >> ${dict}
cut -f 2- -d" " ${feats_dir}/data/${train_set}/text > ${feats_dir}/data/lang_char/input.txt
spm_train --input=${feats_dir}/data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
spm_encode --model=${bpemodel}.model --output_format=piece < ${feats_dir}/data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0}' >> ${dict}
echo "<unk>" >> ${dict}
wc -l ${dict}
vocab_size=$(cat ${dict} | wc -l)
awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char
awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char
mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/$train_set
mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/$valid_set
cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/$train_set
cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/$valid_set
fi
# Training Stage
world_size=$gpu_num # run on one machine
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Training"
mkdir -p ${exp_dir}/exp/${model_dir}
mkdir -p ${exp_dir}/exp/${model_dir}/log
INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
if [ -f $INIT_FILE ];then
rm -f $INIT_FILE
fi
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
for ((i = 0; i < $gpu_num; ++i)); do
{
rank=$i
local_rank=$i
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
asr_train.py \
--gpu_id $gpu_id \
--use_preprocessor true \
--split_with_space false \
--bpemodel ${bpemodel}.model \
--token_type $token_type \
--dataset_type $dataset_type \
--token_list $dict \
--train_data_file $feats_dir/$dumpdir/${train_set}/ark_txt.scp \
--valid_data_file $feats_dir/$dumpdir/${valid_set}/ark_txt.scp \
--resume true \
--output_dir ${exp_dir}/exp/${model_dir} \
--config $asr_config \
--input_size $feats_dim \
--ngpu $gpu_num \
--num_worker_count $count \
--multiprocessing_distributed true \
--dist_init_method $init_method \
--dist_world_size $world_size \
--dist_rank $rank \
--local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
} &
done
wait
fi
# Testing Stage
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "stage 4: Inference"
for dset in ${test_sets}; do
asr_exp=${exp_dir}/exp/${model_dir}
inference_tag="$(basename "${inference_config}" .yaml)"
_dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
_logdir="${_dir}/logdir"
if [ -d ${_dir} ]; then
echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
exit 0
fi
mkdir -p "${_logdir}"
_data="${feats_dir}/${dumpdir}/${dset}"
key_file=${_data}/${scp}
num_scp_file="$(<${key_file} wc -l)"
_nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
split_scps=
for n in $(seq "${_nj}"); do
split_scps+=" ${_logdir}/keys.${n}.scp"
done
# shellcheck disable=SC2086
utils/split_scp.pl "${key_file}" ${split_scps}
_opts=
if [ -n "${inference_config}" ]; then
_opts+="--config ${inference_config} "
fi
${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
python -m funasr.bin.asr_inference_launch \
--batch_size 1 \
--ngpu "${_ngpu}" \
--njob ${njob} \
--gpuid_list ${gpuid_list} \
--data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
--key_file "${_logdir}"/keys.JOB.scp \
--asr_train_config "${asr_exp}"/config.yaml \
--asr_model_file "${asr_exp}"/"${inference_asr_model}" \
--output_dir "${_logdir}"/output.JOB \
--mode asr \
${_opts}
for f in token token_int score text; do
if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
for i in $(seq "${_nj}"); do
cat "${_logdir}/output.${i}/1best_recog/${f}"
done | sort -k1 >"${_dir}/${f}"
fi
done
python utils/compute_wer.py ${_data}/text ${_dir}/text ${_dir}/text.cer
tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
cat ${_dir}/text.cer.txt
done
fi

View File

@ -0,0 +1 @@
../../aishell/transformer/utils

View File

@ -74,7 +74,7 @@ def modelscope_infer(params):
# If text exists, compute CER
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(best_recog_path, "token")
text_proc_file = os.path.join(best_recog_path, "text")
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))

View File

@ -38,7 +38,7 @@ def modelscope_infer_after_finetune(params):
# compute CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

View File

@ -74,7 +74,7 @@ def modelscope_infer(params):
# If text exists, compute CER
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(best_recog_path, "token")
text_proc_file = os.path.join(best_recog_path, "text")
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))

View File

@ -38,7 +38,7 @@ def modelscope_infer_after_finetune(params):
# compute CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

View File

@ -17,7 +17,7 @@ def modelscope_infer(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
parser.add_argument('--audio_in', type=str, default="./data/test")
parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
parser.add_argument('--output_dir', type=str, default="./results/")
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--gpuid', type=str, default="0")

View File

@ -63,8 +63,8 @@ fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
echo "Computing WER ..."
python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
tail -n 3 ${output_dir}/1best_recog/text.cer
fi

View File

@ -34,7 +34,7 @@ def modelscope_infer_after_finetune(params):
# compute CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

View File

@ -17,7 +17,7 @@ def modelscope_infer(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1")
parser.add_argument('--audio_in', type=str, default="./data/test")
parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
parser.add_argument('--output_dir', type=str, default="./results/")
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--gpuid', type=str, default="0")

View File

@ -63,8 +63,8 @@ fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
echo "Computing WER ..."
python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
tail -n 3 ${output_dir}/1best_recog/text.cer
fi

View File

@ -34,7 +34,7 @@ def modelscope_infer_after_finetune(params):
# compute CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

View File

@ -75,7 +75,7 @@ def modelscope_infer(params):
# If text exists, compute CER
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(best_recog_path, "token")
text_proc_file = os.path.join(best_recog_path, "text")
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))

View File

@ -39,7 +39,7 @@ def modelscope_infer_after_finetune(params):
# compute CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

View File

@ -75,7 +75,7 @@ def modelscope_infer(params):
# If text exists, compute CER
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(best_recog_path, "token")
text_proc_file = os.path.join(best_recog_path, "text")
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))

View File

@ -39,7 +39,7 @@ def modelscope_infer_after_finetune(params):
# compute CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

View File

@ -797,7 +797,7 @@ def inference_modelscope(
finish_count += 1
# asr_utils.print_progress(finish_count / file_count)
if writer is not None:
ibest_writer["text"][key] = text_postprocessed
ibest_writer["text"][key] = " ".join(word_lists)
logging.info("decoding, utt: {}, predictions: {}".format(key, text))
rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))

View File

@ -42,6 +42,7 @@ from funasr.utils import asr_utils, wav_utils, postprocess_utils
from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
np.set_printoptions(threshold=np.inf)
class Speech2Text:
"""Speech2Text class
@ -203,7 +204,6 @@ class Speech2Text:
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
if self.frontend is not None:
feats, feats_len = self.frontend.forward(speech, speech_lengths)
feats = to_device(feats, device=self.device)
@ -213,13 +213,16 @@ class Speech2Text:
feats = speech
feats_len = speech_lengths
lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
feats_len = cache["encoder"]["stride"] + cache["encoder"]["pad_left"] + cache["encoder"]["pad_right"]
feats = feats[:,cache["encoder"]["start_idx"]:cache["encoder"]["start_idx"]+feats_len,:]
feats_len = torch.tensor([feats_len])
batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}
# a. To device
batch = to_device(batch, device=self.device)
# b. Forward Encoder
enc, enc_len = self.asr_model.encode_chunk(**batch)
enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache)
if isinstance(enc, tuple):
enc = enc[0]
# assert len(enc) == 1, len(enc)
@ -578,7 +581,22 @@ def inference_modelscope(
speech2text = Speech2TextExport(**speech2text_kwargs)
else:
speech2text = Speech2Text(**speech2text_kwargs)
def _load_bytes(input):
middle_data = np.frombuffer(input, dtype=np.int16)
middle_data = np.asarray(middle_data)
if middle_data.dtype.kind not in 'iu':
raise TypeError("'middle_data' must be an array of integers")
dtype = np.dtype('float32')
if dtype.kind != 'f':
raise TypeError("'dtype' must be a floating point type")
i = np.iinfo(middle_data.dtype)
abs_max = 2 ** (i.bits - 1)
offset = i.min + abs_max
array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
return array
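
_load_bytes above maps raw little-endian int16 PCM to float32 in roughly [-1, 1): for int16, abs_max is 32768 and offset is 0, so the conversion is effectively sample / 32768. A quick check of that scaling (independent of this function):

```python
import numpy as np

pcm = np.array([0, 16384, -32768, 32767], dtype=np.int16).tobytes()
samples = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
print(samples)  # [ 0.       0.5     -1.       0.99997]
```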
def _forward(
data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
@ -589,10 +607,12 @@ def inference_modelscope(
):
# 3. Build data-iterator
if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes":
raw_inputs = _load_bytes(data_path_and_name_and_type[0])
raw_inputs = torch.tensor(raw_inputs)
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, np.ndarray):
raw_inputs = torch.tensor(raw_inputs)
is_final = False
if param_dict is not None and "cache" in param_dict:
cache = param_dict["cache"]
@ -605,62 +625,87 @@ def inference_modelscope(
asr_result = ""
wait = True
if len(cache) == 0:
cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None, "is_final": is_final, "left": 0, "right": 0}
cache_de = {"decode_fsmn": None}
cache["decoder"] = cache_de
cache["first_chunk"] = True
cache["speech"] = []
cache["chunk_index"] = 0
cache["speech_chunk"] = []
cache["accum_speech"] = 0
if raw_inputs is not None:
if len(cache["speech"]) == 0:
cache["speech"] = raw_inputs
else:
cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0)
if len(cache["speech_chunk"]) == 0:
cache["speech_chunk"] = raw_inputs
else:
cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0)
while len(cache["speech_chunk"]) >= 960:
cache["accum_speech"] += len(raw_inputs)
while cache["accum_speech"] >= 960:
if cache["first_chunk"]:
if len(cache["speech_chunk"]) >= 14400:
speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0)
speech_length = torch.tensor([14400])
if cache["accum_speech"] >= 14400:
speech = torch.unsqueeze(cache["speech"], axis=0)
speech_length = torch.tensor([len(cache["speech"])])
cache["encoder"]["pad_left"] = 5
cache["encoder"]["pad_right"] = 5
cache["encoder"]["stride"] = 10
cache["encoder"]["left"] = 5
cache["encoder"]["right"] = 0
results = speech2text(cache, speech, speech_length)
cache["speech_chunk"]= cache["speech_chunk"][4800:]
cache["accum_speech"] -= 4800
cache["first_chunk"] = False
cache["encoder"]["start_idx"] = -5
cache["encoder"]["is_final"] = False
wait = False
else:
if is_final:
cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
cache["encoder"]["stride"] = len(cache["speech"]) // 960
cache["encoder"]["pad_left"] = 0
cache["encoder"]["pad_right"] = 0
speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
speech_length = torch.tensor([len(cache["speech_chunk"])])
speech = torch.unsqueeze(cache["speech"], axis=0)
speech_length = torch.tensor([len(cache["speech"])])
results = speech2text(cache, speech, speech_length)
cache["speech_chunk"] = []
cache["accum_speech"] = 0
wait = False
else:
break
else:
if len(cache["speech_chunk"]) >= 19200:
if cache["accum_speech"] >= 19200:
cache["encoder"]["start_idx"] += 10
cache["encoder"]["stride"] = 10
cache["encoder"]["pad_left"] = 5
speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0)
speech_length = torch.tensor([19200])
cache["encoder"]["pad_right"] = 5
cache["encoder"]["left"] = 0
cache["encoder"]["right"] = 0
speech = torch.unsqueeze(cache["speech"], axis=0)
speech_length = torch.tensor([len(cache["speech"])])
results = speech2text(cache, speech, speech_length)
cache["speech_chunk"] = cache["speech_chunk"][9600:]
cache["accum_speech"] -= 9600
wait = False
else:
if is_final:
cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
cache["encoder"]["pad_right"] = 0
speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
speech_length = torch.tensor([len(cache["speech_chunk"])])
results = speech2text(cache, speech, speech_length)
cache["speech_chunk"] = []
wait = False
cache["encoder"]["is_final"] = True
if cache["accum_speech"] >= 14400:
cache["encoder"]["start_idx"] += 10
cache["encoder"]["stride"] = 10
cache["encoder"]["pad_left"] = 5
cache["encoder"]["pad_right"] = 5
cache["encoder"]["left"] = 0
cache["encoder"]["right"] = cache["accum_speech"] // 960 - 15
speech = torch.unsqueeze(cache["speech"], axis=0)
speech_length = torch.tensor([len(cache["speech"])])
results = speech2text(cache, speech, speech_length)
cache["accum_speech"] -= 9600
wait = False
else:
cache["encoder"]["start_idx"] += 10
cache["encoder"]["stride"] = cache["accum_speech"] // 960 - 5
cache["encoder"]["pad_left"] = 5
cache["encoder"]["pad_right"] = 0
cache["encoder"]["left"] = 0
cache["encoder"]["right"] = 0
speech = torch.unsqueeze(cache["speech"], axis=0)
speech_length = torch.tensor([len(cache["speech"])])
results = speech2text(cache, speech, speech_length)
cache["accum_speech"] = 0
wait = False
else:
break
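
The chunk loop above advances in hops of 960 samples (60 ms at 16 kHz): the first chunk waits for 14400 samples and then consumes 4800, later chunks wait for 19200 and consume 9600, and the remainder is flushed once is_final is set. A stripped-down sketch of just this buffering policy, with the model call replaced by a print (names and thresholds mirror the code above; everything else is illustrative):

```python
def feed(cache, raw_samples, is_final=False):
    # simplified restatement of the buffering in _forward above; no model is called
    if not cache:
        cache.update({"speech": [], "accum": 0, "first": True})
    cache["speech"].extend(raw_samples)
    cache["accum"] += len(raw_samples)

    while cache["accum"] >= 960:
        if cache["first"]:
            if cache["accum"] >= 14400:            # 0.9 s warm-up chunk
                print("decode first chunk over", len(cache["speech"]), "samples")
                cache["accum"] -= 4800             # 0.3 s consumed
                cache["first"] = False
            elif is_final:
                print("short utterance: flush", cache["accum"], "samples")
                cache["accum"] = 0
            else:
                break                              # wait for more audio
        else:
            if cache["accum"] >= 19200:            # 1.2 s chunk with lookahead
                print("decode chunk, stride 10 frames")
                cache["accum"] -= 9600             # 0.6 s consumed
            elif is_final:
                print("flush final", cache["accum"], "samples")
                cache["accum"] = 0
            else:
                break
```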

View File

@ -338,7 +338,7 @@ def inference_modelscope(
ibest_writer["token"][key] = " ".join(token)
ibest_writer["token_int"][key] = " ".join(map(str, token_int))
ibest_writer["vad"][key] = "{}".format(vadsegments)
ibest_writer["text"][key] = text_postprocessed
ibest_writer["text"][key] = " ".join(word_lists)
ibest_writer["text_with_punc"][key] = text_postprocessed_punc
if time_stamp_postprocessed is not None:
ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)

View File

@ -670,7 +670,7 @@ def inference_modelscope(
ibest_writer["token"][key] = " ".join(token)
ibest_writer["token_int"][key] = " ".join(map(str, token_int))
ibest_writer["vad"][key] = "{}".format(vadsegments)
ibest_writer["text"][key] = text_postprocessed
ibest_writer["text"][key] = " ".join(word_lists)
ibest_writer["text_with_punc"][key] = text_postprocessed_punc
if time_stamp_postprocessed is not None:
ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)

View File

@ -738,13 +738,13 @@ def inference_modelscope(
ibest_writer["rtf"][key] = rtf_cur
if text is not None:
text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
item = {'key': key, 'value': text_postprocessed}
asr_result_list.append(item)
finish_count += 1
# asr_utils.print_progress(finish_count / file_count)
if writer is not None:
ibest_writer["text"][key] = text_postprocessed
ibest_writer["text"][key] = " ".join(word_lists)
logging.info("decoding, utt: {}, predictions: {}".format(key, text))
rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))

View File

@ -504,13 +504,13 @@ def inference_modelscope(
ibest_writer["score"][key] = str(hyp.score)
if text is not None:
text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
item = {'key': key, 'value': text_postprocessed}
asr_result_list.append(item)
finish_count += 1
asr_utils.print_progress(finish_count / file_count)
if writer is not None:
ibest_writer["text"][key] = text_postprocessed
ibest_writer["text"][key] = " ".join(word_lists)
return asr_result_list
return _forward

View File

@ -507,13 +507,13 @@ def inference_modelscope(
ibest_writer["score"][key] = str(hyp.score)
if text is not None:
text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
item = {'key': key, 'value': text_postprocessed}
asr_result_list.append(item)
finish_count += 1
asr_utils.print_progress(finish_count / file_count)
if writer is not None:
ibest_writer["text"][key] = text_postprocessed
ibest_writer["text"][key] = " ".join(word_lists)
return asr_result_list
return _forward

View File

@ -37,7 +37,7 @@ def tokenize(data,
vad = -2
if bpe_tokenizer is not None:
text = bpe_tokenizer.text2tokens(text)
text = bpe_tokenizer.text2tokens("".join(text))
if seg_dict is not None:
assert isinstance(seg_dict, dict)

View File

@ -19,6 +19,7 @@ class ModelExport:
self,
cache_dir: Union[Path, str] = None,
onnx: bool = True,
device: str = "cpu",
quant: bool = True,
fallback_num: int = 0,
audio_in: str = None,
@ -36,6 +37,7 @@ class ModelExport:
)
print("output dir: {}".format(self.cache_dir))
self.onnx = onnx
self.device = device
self.quant = quant
self.fallback_num = fallback_num
self.frontend = None
@ -112,6 +114,10 @@ class ModelExport:
else:
dummy_input = model.get_dummy_inputs()
if self.device == 'cuda':
model = model.cuda()
dummy_input = tuple([i.cuda() for i in dummy_input])
# model_script = torch.jit.script(model)
model_script = torch.jit.trace(model, dummy_input)
model_script.save(os.path.join(path, f'{model.model_name}.torchscripts'))
@ -260,6 +266,7 @@ if __name__ == '__main__':
parser.add_argument('--model-name', type=str, required=True)
parser.add_argument('--export-dir', type=str, required=True)
parser.add_argument('--type', type=str, default='onnx', help='["onnx", "torch"]')
parser.add_argument('--device', type=str, default='cpu', help='["cpu", "cuda"]')
parser.add_argument('--quantize', type=str2bool, default=False, help='export quantized model')
parser.add_argument('--fallback-num', type=int, default=0, help='amp fallback number')
parser.add_argument('--audio_in', type=str, default=None, help='["wav", "wav.scp"]')
@ -269,6 +276,7 @@ if __name__ == '__main__':
export_model = ModelExport(
cache_dir=args.export_dir,
onnx=args.type == 'onnx',
device=args.device,
quant=args.quantize,
fallback_num=args.fallback_num,
audio_in=args.audio_in,

View File

@ -75,8 +75,8 @@ def preprocess_for_attn(x, mask, cache, pad_fn):
return x, cache
torch_version = float(".".join(torch.__version__.split(".")[:2]))
if torch_version >= 1.8:
torch_version = tuple([int(i) for i in torch.__version__.split(".")[:2]])
if torch_version >= (1, 8):
import torch.fx
torch.fx.wrap('preprocess_for_attn')
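
The switch from a float comparison to a tuple comparison matters once the minor version reaches two digits: "1.10" parsed as a float is 1.1, which is smaller than 1.8, while the tuple (1, 10) compares correctly. A quick illustration:

```python
version = "1.10.2"  # hypothetical torch.__version__

as_float = float(".".join(version.split(".")[:2]))
as_tuple = tuple(int(i) for i in version.split(".")[:2])

print(as_float >= 1.8)     # False: 1.1 < 1.8, so torch.fx would wrongly be skipped
print(as_tuple >= (1, 8))  # True: (1, 10) >= (1, 8)
```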

View File

@ -74,7 +74,7 @@ class ContextualDecoderLayer(nn.Module):
return x, tgt_mask, x_self_attn, x_src_attn
class ContexutalBiasDecoder(nn.Module):
class ContextualBiasDecoder(nn.Module):
def __init__(
self,
size,
@ -83,7 +83,7 @@ class ContexutalBiasDecoder(nn.Module):
normalize_before=True,
):
"""Construct an DecoderLayer object."""
super(ContexutalBiasDecoder, self).__init__()
super(ContextualBiasDecoder, self).__init__()
self.size = size
self.src_attn = src_attn
if src_attn is not None:
@ -186,7 +186,7 @@ class ContextualParaformerDecoder(ParaformerSANMDecoder):
),
)
self.dropout = nn.Dropout(dropout_rate)
self.bias_decoder = ContexutalBiasDecoder(
self.bias_decoder = ContextualBiasDecoder(
size=attention_dim,
src_attn=MultiHeadedAttentionCrossAtt(
attention_heads, attention_dim, src_attention_dropout_rate

View File

@ -104,7 +104,6 @@ class DecoderLayerSANM(nn.Module):
x = residual + self.dropout(self.src_attn(x, memory, memory_mask))
return x, tgt_mask, memory, memory_mask, cache
def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
@ -400,7 +399,7 @@ class FsmnDecoderSCAMAOpt(BaseTransformerDecoder):
for i in range(self.att_layer_num):
decoder = self.decoders[i]
c = cache[i]
x, tgt_mask, memory, memory_mask, c_ret = decoder(
x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
x, tgt_mask, memory, memory_mask, cache=c
)
new_cache.append(c_ret)
@ -410,13 +409,13 @@ class FsmnDecoderSCAMAOpt(BaseTransformerDecoder):
j = i + self.att_layer_num
decoder = self.decoders2[i]
c = cache[j]
x, tgt_mask, memory, memory_mask, c_ret = decoder(
x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
x, tgt_mask, memory, memory_mask, cache=c
)
new_cache.append(c_ret)
for decoder in self.decoders3:
x, tgt_mask, memory, memory_mask, _ = decoder(
x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
x, tgt_mask, memory, None, cache=None
)
@ -1077,7 +1076,7 @@ class ParaformerSANMDecoder(BaseTransformerDecoder):
for i in range(self.att_layer_num):
decoder = self.decoders[i]
c = cache[i]
x, tgt_mask, memory, memory_mask, c_ret = decoder(
x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
x, tgt_mask, memory, None, cache=c
)
new_cache.append(c_ret)
@ -1087,14 +1086,14 @@ class ParaformerSANMDecoder(BaseTransformerDecoder):
j = i + self.att_layer_num
decoder = self.decoders2[i]
c = cache[j]
x, tgt_mask, memory, memory_mask, c_ret = decoder(
x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
x, tgt_mask, memory, None, cache=c
)
new_cache.append(c_ret)
for decoder in self.decoders3:
x, tgt_mask, memory, memory_mask, _ = decoder(
x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
x, tgt_mask, memory, None, cache=None
)

View File

@ -370,19 +370,10 @@ class Paraformer(AbsESPnetModel):
encoder_out, encoder_out_lens
)
assert encoder_out.size(0) == speech.size(0), (
encoder_out.size(),
speech.size(0),
)
assert encoder_out.size(1) <= encoder_out_lens.max(), (
encoder_out.size(),
encoder_out_lens.max(),
)
if intermediate_outs is not None:
return (encoder_out, intermediate_outs), encoder_out_lens
return encoder_out, encoder_out_lens
return encoder_out, torch.tensor([encoder_out.size(1)])
def calc_predictor(self, encoder_out, encoder_out_lens):
@ -1034,16 +1025,76 @@ class BiCifParaformer(Paraformer):
# 1. Encoder
encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
intermediate_outs = None
if isinstance(encoder_out, tuple):
intermediate_outs = encoder_out[1]
encoder_out = encoder_out[0]
loss_att, acc_att, cer_att, wer_att = None, None, None, None
loss_ctc, cer_ctc = None, None
loss_pre = None
stats = dict()
# 1. CTC branch
if self.ctc_weight != 0.0:
loss_ctc, cer_ctc = self._calc_ctc_loss(
encoder_out, encoder_out_lens, text, text_lengths
)
# Collect CTC branch stats
stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
stats["cer_ctc"] = cer_ctc
# Intermediate CTC (optional)
loss_interctc = 0.0
if self.interctc_weight != 0.0 and intermediate_outs is not None:
for layer_idx, intermediate_out in intermediate_outs:
# we assume intermediate_out has the same length & padding
# as those of encoder_out
loss_ic, cer_ic = self._calc_ctc_loss(
intermediate_out, encoder_out_lens, text, text_lengths
)
loss_interctc = loss_interctc + loss_ic
# Collect Intermediate CTC stats
stats["loss_interctc_layer{}".format(layer_idx)] = (
loss_ic.detach() if loss_ic is not None else None
)
stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic
loss_interctc = loss_interctc / len(intermediate_outs)
# calculate whole encoder loss
loss_ctc = (
1 - self.interctc_weight
) * loss_ctc + self.interctc_weight * loss_interctc
# 2b. Attention decoder branch
if self.ctc_weight != 1.0:
loss_att, acc_att, cer_att, wer_att, loss_pre = self._calc_att_loss(
encoder_out, encoder_out_lens, text, text_lengths
)
loss_pre2 = self._calc_pre2_loss(
encoder_out, encoder_out_lens, text, text_lengths
)
loss = loss_pre2
# 3. CTC-Att loss definition
if self.ctc_weight == 0.0:
loss = loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5
elif self.ctc_weight == 1.0:
loss = loss_ctc
else:
loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5
# Collect Attn branch stats
stats["loss_att"] = loss_att.detach() if loss_att is not None else None
stats["acc"] = acc_att
stats["cer"] = cer_att
stats["wer"] = wer_att
stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
stats["loss_pre2"] = loss_pre2.detach().cpu()
stats["loss"] = torch.clone(loss.detach())
# force_gatherable: to-device and to-tensor if scalar for DataParallel
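
With ctc_weight=0.3 (as in the configs above) and a hypothetical predictor_weight=1.0, the combined loss in this hunk works out as in the sketch below; the individual loss values are made up and only illustrate the weighting:

```python
# hypothetical per-batch loss values
loss_ctc, loss_att, loss_pre, loss_pre2 = 2.0, 1.0, 0.4, 0.2
ctc_weight, predictor_weight = 0.3, 1.0  # predictor_weight assumed

loss = (ctc_weight * loss_ctc
        + (1 - ctc_weight) * loss_att
        + loss_pre * predictor_weight
        + loss_pre2 * predictor_weight * 0.5)
print(loss)  # 0.6 + 0.7 + 0.4 + 0.1 = 1.8
```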
@ -1094,6 +1145,7 @@ class ContextualParaformer(Paraformer):
inner_dim: int = 256,
bias_encoder_type: str = 'lstm',
label_bracket: bool = False,
use_decoder_embedding: bool = False,
):
assert check_argument_types()
assert 0.0 <= ctc_weight <= 1.0, ctc_weight
@ -1147,6 +1199,7 @@ class ContextualParaformer(Paraformer):
self.hotword_buffer = None
self.length_record = []
self.current_buffer_length = 0
self.use_decoder_embedding = use_decoder_embedding
def forward(
self,
@ -1288,7 +1341,10 @@ class ContextualParaformer(Paraformer):
hw_list.append(hw_tokens)
# padding
hw_list_pad = pad_list(hw_list, 0)
hw_embed = self.decoder.embed(hw_list_pad)
if self.use_decoder_embedding:
hw_embed = self.decoder.embed(hw_list_pad)
else:
hw_embed = self.bias_embed(hw_list_pad)
hw_embed, (_, _) = self.bias_encoder(hw_embed)
_ind = np.arange(0, len(hw_list)).tolist()
# update self.hotword_buffer, throw a part if oversize
@ -1404,13 +1460,19 @@ class ContextualParaformer(Paraformer):
# default hotword list
hw_list = [torch.Tensor([self.sos]).long().to(encoder_out.device)] # empty hotword list
hw_list_pad = pad_list(hw_list, 0)
hw_embed = self.bias_embed(hw_list_pad)
if self.use_decoder_embedding:
hw_embed = self.decoder.embed(hw_list_pad)
else:
hw_embed = self.bias_embed(hw_list_pad)
_, (h_n, _) = self.bias_encoder(hw_embed)
contextual_info = h_n.squeeze(0).repeat(encoder_out.shape[0], 1, 1)
else:
hw_lengths = [len(i) for i in hw_list]
hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device)
hw_embed = self.bias_embed(hw_list_pad)
if self.use_decoder_embedding:
hw_embed = self.decoder.embed(hw_list_pad)
else:
hw_embed = self.bias_embed(hw_list_pad)
hw_embed = torch.nn.utils.rnn.pack_padded_sequence(hw_embed, hw_lengths, batch_first=True,
enforce_sorted=False)
_, (h_n, _) = self.bias_encoder(hw_embed)

View File

@ -200,6 +200,7 @@ class CifPredictorV2(nn.Module):
return acoustic_embeds, token_num, alphas, cif_peak
def forward_chunk(self, hidden, cache=None):
b, t, d = hidden.size()
h = hidden
context = h.transpose(1, 2)
queries = self.pad(context)
@ -220,6 +221,8 @@ class CifPredictorV2(nn.Module):
alphas = alphas * mask_chunk_predictor
if cache is not None:
if cache["is_final"]:
alphas[:, cache["stride"] + cache["pad_left"] - 1] += 0.45
if cache["cif_hidden"] is not None:
hidden = torch.cat((cache["cif_hidden"], hidden), 1)
if cache["cif_alphas"] is not None:
@ -241,7 +244,6 @@ class CifPredictorV2(nn.Module):
mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0
mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0
if mask_chunk_peak_predictor is not None:
cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1)

View File

@ -8,7 +8,7 @@
import math
import torch
import torch.nn.functional as F
def _pre_hook(
state_dict,
@ -409,9 +409,18 @@ class SinusoidalPositionEncoder(torch.nn.Module):
def forward_chunk(self, x, cache=None):
start_idx = 0
pad_left = 0
pad_right = 0
batch_size, timesteps, input_dim = x.size()
if cache is not None:
start_idx = cache["start_idx"]
pad_left = cache["left"]
pad_right = cache["right"]
positions = torch.arange(1, timesteps+start_idx+1)[None, :]
position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
return x + position_encoding[:, start_idx: start_idx + timesteps]
outputs = x + position_encoding[:, start_idx: start_idx + timesteps]
outputs = outputs.transpose(1,2)
outputs = F.pad(outputs, (pad_left, pad_right))
outputs = outputs.transpose(1,2)
return outputs
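
The added F.pad call pads along the time axis: the tensor is transposed to (batch, dim, time) so that F.pad's last-dimension padding inserts `left`/`right` zero frames at the chunk edges, then transposed back. A small shape check:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 10, 512)          # (batch, time, dim), e.g. a 10-frame chunk
pad_left, pad_right = 5, 0           # values taken from cache["left"] / cache["right"]

y = x.transpose(1, 2)                # (batch, dim, time)
y = F.pad(y, (pad_left, pad_right))  # zero frames added on the time axis
y = y.transpose(1, 2)
print(y.shape)                       # torch.Size([1, 15, 512])
```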

View File

@ -53,6 +53,68 @@ cd ../python/grpc
python grpc_main_client_mic.py --host $server_ip --port 10108
```
The `grpc_main_client_mic.py` follows the [original design](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc#workflow-in-desgin) by sending audio_data in chunks. If you want to send audio_data in one request, here is an example:
```python
# go to ../python/grpc to find this package
import paraformer_pb2

# standard-library / third-party imports used by this example
import asyncio
import json
import queue
import time

import grpc
import soundfile as sf


class RecognizeStub:
    def __init__(self, channel):
        self.Recognize = channel.stream_stream(
            '/paraformer.ASR/Recognize',
            request_serializer=paraformer_pb2.Request.SerializeToString,
            response_deserializer=paraformer_pb2.Response.FromString,
        )


async def send(channel, data, speaking, isEnd):
    stub = RecognizeStub(channel)
    req = paraformer_pb2.Request()
    if data:
        req.audio_data = data
    req.user = 'zz'
    req.language = 'zh-CN'
    req.speaking = speaking
    req.isEnd = isEnd
    q = queue.SimpleQueue()
    q.put(req)
    return stub.Recognize(iter(q.get, None))


# send the audio data once
async def grpc_rec(data, grpc_uri):
    with grpc.insecure_channel(grpc_uri) as channel:
        b = time.time()
        response = await send(channel, data, False, False)
        resp = response.next()
        text = ''
        if 'decoding' == resp.action:
            resp = response.next()
            if 'finish' == resp.action:
                text = json.loads(resp.sentence)['text']
        response = await send(channel, None, False, True)
        return {
            'text': text,
            'time': time.time() - b,
        }


async def test():
    # fc = FunAsrGrpcClient('127.0.0.1', 9900)
    # t = await fc.rec(wav.tobytes())
    # print(t)
    wav, _ = sf.read('z-10s.wav', dtype='int16')
    uri = '127.0.0.1:9900'
    res = await grpc_rec(wav.tobytes(), uri)
    print(res)


if __name__ == '__main__':
    asyncio.run(test())
```
## Acknowledge
1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
2. We acknowledge [DeepScience](https://www.deepscience.cn) for contributing the grpc service.

View File

@ -88,7 +88,7 @@ grpc::Status ASRServicer::Recognize(
res.set_language(req.language());
stream->Write(res);
} else if (!req.speaking()) {
if (client_buffers.count(req.user()) == 0) {
if (client_buffers.count(req.user()) == 0 && req.audio_data().size() == 0) {
Response res;
res.set_sentence(
R"({"success": true, "detail": "waiting_for_voice"})"
@ -99,14 +99,18 @@ grpc::Status ASRServicer::Recognize(
stream->Write(res);
}else {
auto begin_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
if (req.audio_data().size() > 0) {
auto& buf = client_buffers[req.user()];
buf.insert(buf.end(), req.audio_data().begin(), req.audio_data().end());
}
std::string tmp_data = this->client_buffers[req.user()];
this->clear_states(req.user());
Response res;
res.set_sentence(
R"({"success": true, "detail": "decoding data: " + std::to_string(tmp_data.length()) + " bytes"})"
);
int data_len_int = tmp_data.length();
int data_len_int = tmp_data.length();
std::string data_len = std::to_string(data_len_int);
std::stringstream ss;
ss << R"({"success": true, "detail": "decoding data: )" << data_len << R"( bytes")" << R"("})";
@ -129,18 +133,18 @@ grpc::Status ASRServicer::Recognize(
res.set_user(req.user());
res.set_action("finish");
res.set_language(req.language());
stream->Write(res);
}
else {
RPASR_RESULT Result= RapidAsrRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL);
RPASR_RESULT Result= RapidAsrRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL);
std::string asr_result = ((RPASR_RECOG_RESULT*)Result)->msg;
auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
std::string delay_str = std::to_string(end_time - begin_time);
std::cout << "user: " << req.user() << " , delay(ms): " << delay_str << ", text: " << asr_result << std::endl;
Response res;
std::stringstream ss;
@ -150,8 +154,8 @@ grpc::Status ASRServicer::Recognize(
res.set_user(req.user());
res.set_action("finish");
res.set_language(req.language());
stream->Write(res);
}
}
@ -165,7 +169,7 @@ grpc::Status ASRServicer::Recognize(
res.set_language(req.language());
stream->Write(res);
}
}
}
return Status::OK;
}

View File

@ -109,7 +109,7 @@ class ASRServicer(paraformer_pb2_grpc.ASRServicer):
else:
asr_result = ""
elif self.backend == "onnxruntime":
from rapid_paraformer.utils.frontend import load_bytes
from funasr_onnx.utils.frontend import load_bytes
array = load_bytes(tmp_data)
asr_result = self.inference_16k_pipeline(array)[0]
end_time = int(round(time.time() * 1000))

View File

@ -31,7 +31,7 @@
```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/funasr_torch
cd funasr/runtime/python/libtorch
python setup.py build
python setup.py install
```

View File

@ -1,10 +1,15 @@
from funasr_torch import Paraformer
model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1)
wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1) # cpu
# model = Paraformer(model_dir, batch_size=1, device_id=0) # gpu
# when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get a figure of the alignment in addition to timestamps
# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")
wav_path = "YourPath/xx.wav"
result = model(wav_path)
print(result)
print(result)

View File

@ -46,6 +46,7 @@ class Paraformer():
)
self.ort_infer = torch.jit.load(model_file)
self.batch_size = batch_size
self.device_id = device_id
self.plot_timestamp_to = plot_timestamp_to
self.pred_bias = pred_bias
@ -58,8 +59,13 @@ class Paraformer():
end_idx = min(waveform_nums, beg_idx + self.batch_size)
feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
try:
outputs = self.ort_infer(feats, feats_len)
am_scores, valid_token_lens = outputs[0], outputs[1]
with torch.no_grad():
if int(self.device_id) == -1:
outputs = self.ort_infer(feats, feats_len)
am_scores, valid_token_lens = outputs[0], outputs[1]
else:
outputs = self.ort_infer(feats.cuda(), feats_len.cuda())
am_scores, valid_token_lens = outputs[0].cpu(), outputs[1].cpu()
if len(outputs) == 4:
# for BiCifParaformer Inference
us_alphas, us_peaks = outputs[2], outputs[3]

View File

@ -32,7 +32,7 @@ or install from source code
```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/funasr_onnx
cd funasr/runtime/python/onnxruntime
python setup.py build
python setup.py install
```

View File

@ -1,13 +1,15 @@
from funasr_onnx import Paraformer
model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch"
# if you use paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch, you should set pred_bias=0
# plot_timestamp_to works only when using speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0)
model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
wav_path = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/example/asr_example.wav"
model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0) # cpu
# model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0, device_id=0) # gpu
# when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get a figure of the alignment in addition to timestamps
# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")
wav_path = "YourPath/xx.wav"
result = model(wav_path)
print(result)
print(result)

View File

@ -464,6 +464,12 @@ class AbsTask(ABC):
default=sys.maxsize,
help="The maximum number update step to train",
)
parser.add_argument(
"--batch_interval",
type=int,
default=10000,
help="The batch interval for saving model.",
)
group.add_argument(
"--patience",
type=int_or_none,
@ -1355,15 +1361,15 @@ class AbsTask(ABC):
from funasr.datasets.large_datasets.build_dataloader import ArkDataLoader
train_iter_factory = ArkDataLoader(args.train_data_file, args.token_list, args.dataset_conf,
frontend_conf=args.frontend_conf if hasattr(args, "frontend_conf") else None,
seg_dict_file=args.seg_dict_file if hasattr(args,
"seg_dict_file") else None,
seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
punc_dict_file=args.punc_list if hasattr(args, "punc_list") else None,
bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None,
mode="train")
valid_iter_factory = ArkDataLoader(args.valid_data_file, args.token_list, args.dataset_conf,
frontend_conf=args.frontend_conf if hasattr(args, "frontend_conf") else None,
seg_dict_file=args.seg_dict_file if hasattr(args,
"seg_dict_file") else None,
seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
punc_dict_file=args.punc_list if hasattr(args, "punc_list") else None,
bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None,
mode="eval")
elif args.dataset_type == "small":
train_iter_factory = cls.build_iter_factory(
@ -1576,13 +1582,18 @@ class AbsTask(ABC):
) -> AbsIterFactory:
assert check_argument_types()
if args.frontend_conf is not None and "fs" in args.frontend_conf:
dest_sample_rate = args.frontend_conf["fs"]
else:
dest_sample_rate = 16000
dataset = ESPnetDataset(
iter_options.data_path_and_name_and_type,
float_dtype=args.train_dtype,
preprocess=iter_options.preprocess_fn,
max_cache_size=iter_options.max_cache_size,
max_cache_fd=iter_options.max_cache_fd,
dest_sample_rate=args.frontend_conf["fs"],
dest_sample_rate=dest_sample_rate,
)
cls.check_task_requirements(
dataset, args.allow_variable_data_keys, train=iter_options.train

View File

@ -412,12 +412,6 @@ class ASRTask(AbsTask):
default="13_15",
help="The range of noise decibel level.",
)
parser.add_argument(
"--batch_interval",
type=int,
default=10000,
help="The batch interval for saving model.",
)
for class_choices in cls.class_choices_list:
# Append --<name> and --<name>_conf.

View File

@ -579,9 +579,10 @@ class Trainer:
reporter.measure_iter_time(iterator, "iter_time"), 1
):
assert isinstance(batch, dict), type(batch)
if rank == 0 and hasattr(model.module, "num_updates"):
num_batch_updates = model.module.get_num_updates()
if rank == 0:
if hasattr(model, "num_updates") or (hasattr(model, "module") and hasattr(model.module, "num_updates")):
num_batch_updates = model.get_num_updates() if hasattr(model,"num_updates") else model.module.get_num_updates()
if (num_batch_updates%batch_interval == 0) and (options.oss_bucket is not None) and options.use_pai:
buffer = BytesIO()
torch.save(model.state_dict(), buffer)
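
The extra hasattr checks are there because, under DistributedDataParallel, the user's model is wrapped and custom attributes such as num_updates live on model.module. A hypothetical helper expressing the same lookup (not part of the diff):

```python
def get_num_updates(model):
    # unwrap a DistributedDataParallel-style wrapper before reading the counter
    target = model.module if hasattr(model, "module") else model
    if hasattr(target, "num_updates"):
        return target.get_num_updates()
    return None
```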

View File

@ -45,8 +45,8 @@ def compute_wer(ref_file,
if out_item['wrong'] > 0:
rst['wrong_sentences'] += 1
cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')
if rst['Wrd'] > 0:
rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)

View File

@ -1 +1 @@
0.3.2
0.3.3