From 72c8f70ef470a878901dedb575081660ac474133 Mon Sep 17 00:00:00 2001
From: onlybetheone <iriszhangchong@gmail.com>
Date: Fri, 10 Mar 2023 18:08:18 +0800
Subject: [PATCH 01/37] add egs_modelscope/uniasr/ he my ur examples

---
 .../finetune.py                               | 35 +++++++++++++++++++
 .../infer.py                                  | 13 +++++++
 .../finetune.py                               | 35 +++++++++++++++++++
 .../infer.py                                  | 13 +++++++
 .../finetune.py                               | 35 +++++++++++++++++++
 .../infer.py                                  | 13 +++++++
 6 files changed, 144 insertions(+)
 create mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py
 create mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
 create mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py
 create mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
 create mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py
 create mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py

diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py
new file mode 100644
index 000000000..56fb58302
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params["output_dir"]):
+        os.makedirs(params["output_dir"], exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params["data_dir"])
+    kwargs = dict(
+        model=params["model"],
+        model_revision=params["model_revision"],
+        data_dir=ds_dict,
+        dataset_type=params["dataset_type"],
+        work_dir=params["output_dir"],
+        batch_bins=params["batch_bins"],
+        max_epoch=params["max_epoch"],
+        lr=params["lr"])
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = {}
+    params["output_dir"] = "./checkpoint"
+    params["data_dir"] = "./data"
+    params["batch_bins"] = 2000
+    params["dataset_type"] = "small"
+    params["max_epoch"] = 50
+    params["lr"] = 0.00005
+    params["model"] = "damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch"
+    params["model_revision"] = None
+    modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
new file mode 100644
index 000000000..c54ab8c83
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_he.wav"
+    output_dir = "./results"
+    inference_pipline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch",
+        output_dir=output_dir,
+    )
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py
new file mode 100644
index 000000000..8bbce606c
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params["output_dir"]):
+        os.makedirs(params["output_dir"], exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params["data_dir"])
+    kwargs = dict(
+        model=params["model"],
+        model_revision=params["model_revision"],
+        data_dir=ds_dict,
+        dataset_type=params["dataset_type"],
+        work_dir=params["output_dir"],
+        batch_bins=params["batch_bins"],
+        max_epoch=params["max_epoch"],
+        lr=params["lr"])
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = {}
+    params["output_dir"] = "./checkpoint"
+    params["data_dir"] = "./data"
+    params["batch_bins"] = 2000
+    params["dataset_type"] = "small"
+    params["max_epoch"] = 50
+    params["lr"] = 0.00005
+    params["model"] = "damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch"
+    params["model_revision"] = None
+    modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
new file mode 100644
index 000000000..cfd869f04
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_my.wav"
+    output_dir = "./results"
+    inference_pipline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch",
+        output_dir=output_dir,
+    )
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py
new file mode 100644
index 000000000..5e313e533
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params["output_dir"]):
+        os.makedirs(params["output_dir"], exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params["data_dir"])
+    kwargs = dict(
+        model=params["model"],
+        model_revision=params["model_revision"],
+        data_dir=ds_dict,
+        dataset_type=params["dataset_type"],
+        work_dir=params["output_dir"],
+        batch_bins=params["batch_bins"],
+        max_epoch=params["max_epoch"],
+        lr=params["lr"])
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = {}
+    params["output_dir"] = "./checkpoint"
+    params["data_dir"] = "./data"
+    params["batch_bins"] = 2000
+    params["dataset_type"] = "small"
+    params["max_epoch"] = 50
+    params["lr"] = 0.00005
+    params["model"] = "damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch"
+    params["model_revision"] = None
+    modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
new file mode 100644
index 000000000..e8c5524f0
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ur.wav"
+    output_dir = "./results"
+    inference_pipline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch",
+        output_dir=output_dir,
+    )
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    print(rec_result)

From 71d466e7451435eefb604f22576aba04bd39e285 Mon Sep 17 00:00:00 2001
From: "shixian.shi" <shixian.shi@alibaba-inc.com>
Date: Mon, 13 Mar 2023 19:47:42 +0800
Subject: [PATCH 02/37] update AverageShiftCalculator in utils

---
 funasr/utils/timestamp_tools.py | 139 ++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)

diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index f5a238ea9..73f0c7afa 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -1,6 +1,10 @@
+from scipy.fftpack import shift
 import torch
 import copy
+import codecs
 import logging
+import edit_distance
+import argparse
 import numpy as np
 from typing import Any, List, Tuple, Union
 
@@ -121,4 +125,139 @@ def time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocess
     return res
 
 
+class AverageShiftCalculator():
+    def __init__(self):
+        logging.warning("Calculating average shift.")
+    def __call__(self, file1, file2):
+        uttid_list1, ts_dict1 = self.read_timestamps(file1)
+        uttid_list2, ts_dict2 = self.read_timestamps(file2)
+        uttid_intersection = self._intersection(uttid_list1, uttid_list2)
+        res = self.as_cal(uttid_intersection, ts_dict1, ts_dict2)
+        logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8]))
+        logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift_uttid))
+
+    def _intersection(list1, list2):
+        set1 = set(list1)
+        set2 = set(list2)
+        if set1 == set2:
+            logging.warning("Uttid same checked.")
+            return set1
+        itsc = list(set1 & set2)
+        logging.warning("Uttid differs: file1 {}, file2 {}, lines same {}.".format(len(list1), len(list2), len(itsc)))
+        return itsc
+
+    def read_timestamps(self, file):
+        # read timestamps file in standard format
+        uttid_list = []
+        ts_dict = {}
+        with codecs.open(file, 'r') as fin:
+            for line in fin.readlines():
+                text = ''
+                ts_list = []
+                line = line.rstrip()
+                uttid = line.split()[0]
+                uttid_list.append(uttid)
+                body = " ".join(line.split()[1:])
+                for pd in body.split(';'):
+                    if not len(pd): continue
+                    # pdb.set_trace() 
+                    char, start, end = pd.lstrip(" ").split(' ')
+                    text += char + ','
+                    ts_list.append((float(start), float(end)))
+                # ts_lists.append(ts_list)
+                ts_dict[uttid] = (text[:-1], ts_list)
+        logging.warning("File {} read done.".format(file))
+        return uttid_list, ts_dict
+
+    def _shift(self, filtered_timestamp_list1, filtered_timestamp_list2):
+        for fts1, fts2 in zip(filtered_timestamp_list1, filtered_timestamp_list2):
+            shift_time = abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1])
+        num_tokens = len(filtered_timestamp_list1)
+        return shift_time, num_tokens
+
+    def as_cal(self, uttid_list, ts_dict1, ts_dict2):
+        # calculate average shift between timestamp1 and timestamp2
+        # when characters differ, use edit distance alignment
+        # and calculate the error between the same characters
+        self._accumlated_shift = 0
+        self._accumlated_tokens = 0
+        self.max_shift = 0
+        self.max_shift_uttid = None
+        for uttid in uttid_list:
+            (t1, ts1) = ts_dict1[uttid]
+            (t2, ts2) = ts_dict2[uttid]
+            _align, _align2, _align3 = [], [], []
+            fts1, fts2 = [], []
+            _t1, _t2 = [], []
+            sm = edit_distance.SequenceMatcher(t1.split(','), t2.split(','))
+            s = sm.get_opcodes()
+            for j in range(len(s)):
+                if s[j][0] == "replace" or s[j][0] == "insert":
+                    _align.append(0)
+                if s[j][0] == "replace" or s[j][0] == "delete":
+                    _align3.append(0)
+                elif s[j][0] == "equal":
+                    _align.append(1)
+                    _align3.append(1)
+                else:
+                    continue
+            # use s to index t2
+            for a, ts , t in zip(_align, ts2, t2.split(',')):
+                if a: 
+                    fts2.append(ts)
+                    _t2.append(t)
+            sm2 = edit_distance.SequenceMatcher(t2.split(','), t1.split(','))
+            s = sm2.get_opcodes()
+            for j in range(len(s)):
+                if s[j][0] == "replace" or s[j][0] == "insert":
+                    _align2.append(0)
+                elif s[j][0] == "equal":
+                    _align2.append(1)
+                else:
+                    continue
+            # use s2 tp index t1
+            for a, ts, t in zip(_align3, ts1, t1.split(',')):
+                if a: 
+                    fts1.append(ts)
+                    _t1.append(t)
+            if len(fts1) == len(fts2):
+                shift_time, num_tokens = self._shift(fts1, fts2)
+                self._accumlated_shift += shift_time
+                self._accumlated_tokens += num_tokens
+                if shift_time/num_tokens > self.max_shift:
+                    self.max_shift = shift_time/num_tokens
+                    self.max_shift_uttid = uttid
+            else:
+                logging.warning("length mismatch")
+        return self._accumlated_shift / self._accumlated_tokens
+
+
+SUPPORTED_MODES = ['cal_aas']
+
+
+def main(args):
+    if args.mode == 'cal_aas':
+        asc = AverageShiftCalculator()
+        asc(args.input, args.input2)
+    else:
+        logging.error("Mode {} not in SUPPORTED_MODES: {}.".format(args.mode, SUPPORTED_MODES))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='timestamp tools')
+    parser.add_argument('--mode', 
+                        default=None, 
+                        type=str, 
+                        choices=SUPPORTED_MODES, 
+                        help='timestamp related toolbox')
+    parser.add_argument('--input', default=None, type=str, help='input file path')
+    parser.add_argument('--output', default=None, type=str, help='output file name')
+    parser.add_argument('--input2', default=None, type=str, help='input2 file path')
+    parser.add_argument('--kaldi-ts-type', 
+                        default='v2', 
+                        type=str, 
+                        choices=['v0', 'v1', 'v2'], 
+                        help='kaldi timestamp to write')
+    args = parser.parse_args()
+    main(args)
 

From 4b16316d4917f1c8da434218949343ccf1a817c9 Mon Sep 17 00:00:00 2001
From: "shixian.shi" <shixian.shi@alibaba-inc.com>
Date: Mon, 13 Mar 2023 19:53:33 +0800
Subject: [PATCH 03/37] bug fix

---
 funasr/utils/timestamp_tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index 73f0c7afa..27095a65a 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -136,7 +136,7 @@ class AverageShiftCalculator():
         logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8]))
         logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift_uttid))
 
-    def _intersection(list1, list2):
+    def _intersection(self, list1, list2):
         set1 = set(list1)
         set2 = set(list2)
         if set1 == set2:

From 9c21bbb96b95980ac059df991e442c437a69c828 Mon Sep 17 00:00:00 2001
From: "shixian.shi" <shixian.shi@alibaba-inc.com>
Date: Mon, 13 Mar 2023 19:55:47 +0800
Subject: [PATCH 04/37] bug fix

---
 funasr/utils/timestamp_tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index 27095a65a..7ba3e0b42 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -134,7 +134,7 @@ class AverageShiftCalculator():
         uttid_intersection = self._intersection(uttid_list1, uttid_list2)
         res = self.as_cal(uttid_intersection, ts_dict1, ts_dict2)
         logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8]))
-        logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift_uttid))
+        logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift, self.max_shift_uttid))
 
     def _intersection(self, list1, list2):
         set1 = set(list1)

From 0b06794fde09bedfb75ee85504148cf3a4707e21 Mon Sep 17 00:00:00 2001
From: "shixian.shi" <shixian.shi@alibaba-inc.com>
Date: Mon, 13 Mar 2023 20:14:41 +0800
Subject: [PATCH 05/37] bug fix

---
 funasr/utils/timestamp_tools.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index 7ba3e0b42..2bccd50e6 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -170,8 +170,9 @@ class AverageShiftCalculator():
         return uttid_list, ts_dict
 
     def _shift(self, filtered_timestamp_list1, filtered_timestamp_list2):
+        shift_time = 0
         for fts1, fts2 in zip(filtered_timestamp_list1, filtered_timestamp_list2):
-            shift_time = abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1])
+            shift_time += abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1])
         num_tokens = len(filtered_timestamp_list1)
         return shift_time, num_tokens
 

From 4d2bf9fe3cc385b441e94c3000b34a44cac8a8db Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Tue, 14 Mar 2023 17:13:25 +0800
Subject: [PATCH 06/37] update

---
 funasr/models/frontend/wav_frontend.py | 51 ++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py
index 445efca24..c4b79104b 100644
--- a/funasr/models/frontend/wav_frontend.py
+++ b/funasr/models/frontend/wav_frontend.py
@@ -7,6 +7,7 @@ import numpy as np
 import torch
 import torchaudio.compliance.kaldi as kaldi
 from funasr.models.frontend.abs_frontend import AbsFrontend
+import funasr.models.frontend.eend_ola_feature as eend_ola_feature
 from typeguard import check_argument_types
 from torch.nn.utils.rnn import pad_sequence
 
@@ -444,3 +445,53 @@ class WavFrontendOnline(AbsFrontend):
         self.reserve_waveforms = None
         self.input_cache = None
         self.lfr_splice_cache = []
+
+
+class WavFrontendMel23(AbsFrontend):
+    """Conventional frontend structure for ASR.
+    """
+
+    def __init__(
+            self,
+            fs: int = 16000,
+            frame_length: int = 25,
+            frame_shift: int = 10,
+            lfr_m: int = 1,
+            lfr_n: int = 1,
+    ):
+        assert check_argument_types()
+        super().__init__()
+        self.fs = fs
+        self.frame_length = frame_length
+        self.frame_shift = frame_shift
+        self.lfr_m = lfr_m
+        self.lfr_n = lfr_n
+
+    def output_size(self) -> int:
+        return self.n_mels * self.lfr_m
+
+    def forward(
+            self,
+            input: torch.Tensor,
+            input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        batch_size = input.size(0)
+        feats = []
+        feats_lens = []
+        for i in range(batch_size):
+            waveform_length = input_lengths[i]
+            waveform = input[i][:waveform_length]
+            waveform = waveform.unsqueeze(0).numpy()
+            mat = eend_ola_feature.stft(waveform, self.frame_length, self.frame_shift)
+            mat = eend_ola_feature.transform(mat)
+            mat = mat.splice(mat, context_size=self.lfr_m)
+            mat = mat[::self.lfr_n]
+            mat = torch.from_numpy(mat)
+            feat_length = mat.size(0)
+            feats.append(mat)
+            feats_lens.append(feat_length)
+
+        feats_lens = torch.as_tensor(feats_lens)
+        feats_pad = pad_sequence(feats,
+                                 batch_first=True,
+                                 padding_value=0.0)
+        return feats_pad, feats_lens
\ No newline at end of file

From 46d9cc0b374470ca03339d63a38d213eb4fd889e Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Tue, 14 Mar 2023 22:54:24 +0800
Subject: [PATCH 07/37] update

---
 .../infer.py                                              | 8 ++++++++
 funasr/bin/diar_inference_launch.py                       | 3 +++
 2 files changed, 11 insertions(+)
 create mode 100644 egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py

diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
new file mode 100644
index 000000000..fa4e8bf04
--- /dev/null
+++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
@@ -0,0 +1,8 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_diar_pipline = pipeline(
+    task=Tasks.speaker_diarization,
+    model='damo/speech_diarization_eend-ola-en-us-callhome-8k',
+)
+results = inference_diar_pipline(audio_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav")
\ No newline at end of file
diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py
index 7738f4f4f..70bb947b4 100755
--- a/funasr/bin/diar_inference_launch.py
+++ b/funasr/bin/diar_inference_launch.py
@@ -142,6 +142,9 @@ def inference_launch(mode, **kwargs):
         else:
             kwargs["param_dict"] = param_dict
         return inference_modelscope(mode=mode, **kwargs)
+    elif mode == "eend-ola":
+        from funasr.bin.eend_ola_inference import inference_modelscope
+        return inference_modelscope(mode=mode, **kwargs)
     else:
         logging.info("Unknown decoding mode: {}".format(mode))
         return None

From 1e4eba6a72ea97d9a9e733df3e3b1eb86e4fd44d Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Tue, 14 Mar 2023 23:22:31 +0800
Subject: [PATCH 08/37] update

---
 .../speech_diarization_eend-ola-en-us-callhome-8k/infer.py       | 1 +
 1 file changed, 1 insertion(+)

diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
index fa4e8bf04..75f9c7346 100644
--- a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
+++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
@@ -4,5 +4,6 @@ from modelscope.utils.constant import Tasks
 inference_diar_pipline = pipeline(
     task=Tasks.speaker_diarization,
     model='damo/speech_diarization_eend-ola-en-us-callhome-8k',
+    model_revision="v1.0.0",
 )
 results = inference_diar_pipline(audio_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav")
\ No newline at end of file

From f59a72d24e917fb2e9560fa646ae80285dba6c95 Mon Sep 17 00:00:00 2001
From: "shixian.shi" <shixian.shi@alibaba-inc.com>
Date: Wed, 15 Mar 2023 10:21:32 +0800
Subject: [PATCH 09/37] release timestamp related tools

---
 funasr/utils/timestamp_tools.py | 50 +++++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 2 deletions(-)

diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index 2bccd50e6..09c3becfc 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -1,3 +1,4 @@
+from pydoc import TextRepr
 from scipy.fftpack import shift
 import torch
 import copy
@@ -5,6 +6,7 @@ import codecs
 import logging
 import edit_distance
 import argparse
+import pdb
 import numpy as np
 from typing import Any, List, Tuple, Union
 
@@ -13,7 +15,8 @@ def ts_prediction_lfr6_standard(us_alphas,
                        us_peaks, 
                        char_list, 
                        vad_offset=0.0, 
-                       force_time_shift=-1.5
+                       force_time_shift=-1.5,
+                       sil_in_str=True
                        ):
     if not len(char_list):
         return []
@@ -66,6 +69,8 @@ def ts_prediction_lfr6_standard(us_alphas,
             timestamp_list[i][1] = timestamp_list[i][1] + vad_offset / 1000.0
     res_txt = ""
     for char, timestamp in zip(new_char_list, timestamp_list):
+        #if char != '<sil>':
+        if not sil_in_str and char == '<sil>': continue
         res_txt += "{} {} {};".format(char, str(timestamp[0]+0.0005)[:5], str(timestamp[1]+0.0005)[:5])
     res = []
     for char, timestamp in zip(new_char_list, timestamp_list):
@@ -233,13 +238,54 @@ class AverageShiftCalculator():
         return self._accumlated_shift / self._accumlated_tokens
 
 
-SUPPORTED_MODES = ['cal_aas']
+def convert_external_alphas(alphas_file, text_file, output_file):
+    from funasr.models.predictor.cif import cif_wo_hidden
+    with open(alphas_file, 'r') as f1, open(text_file, 'r') as f2, open(output_file, 'w') as f3:
+        for line1, line2 in zip(f1.readlines(), f2.readlines()):
+            line1 = line1.rstrip()
+            line2 = line2.rstrip()
+            assert line1.split()[0] == line2.split()[0]
+            uttid = line1.split()[0]
+            alphas = [float(i) for i in line1.split()[1:]]
+            new_alphas = np.array(remove_chunk_padding(alphas))
+            new_alphas[-1] += 1e-4
+            text = line2.split()[1:]
+            if len(text) + 1 != int(new_alphas.sum()):
+                # force resize
+                new_alphas *= (len(text) + 1) / int(new_alphas.sum())
+            peaks = cif_wo_hidden(torch.Tensor(new_alphas).unsqueeze(0), 1.0-1e-4)
+            if " " in text:
+                text = text.split()
+            else:
+                text = [i for i in text]
+            res_str, _ = ts_prediction_lfr6_standard(new_alphas, peaks[0], text, 
+                                                     force_time_shift=-7.0, 
+                                                     sil_in_str=False)
+            f3.write("{} {}\n".format(uttid, res_str))
+
+
+def remove_chunk_padding(alphas):
+    # remove the padding part in alphas if using chunk paraformer for GPU
+    START_ZERO = 45
+    MID_ZERO = 75
+    REAL_FRAMES = 360  # for chunk based encoder 10-120-10 and fsmn padding 5
+    alphas = alphas[START_ZERO:]  # remove the padding at beginning
+    new_alphas = []
+    while True:
+        new_alphas = new_alphas + alphas[:REAL_FRAMES]
+        alphas = alphas[REAL_FRAMES+MID_ZERO:]
+        if len(alphas) < REAL_FRAMES: break
+    return new_alphas
+
+SUPPORTED_MODES = ['cal_aas', 'read_ext_alphas']
 
 
 def main(args):
     if args.mode == 'cal_aas':
         asc = AverageShiftCalculator()
         asc(args.input, args.input2)
+    elif args.mode == 'read_ext_alphas':
+        convert_external_alphas(args.input, args.input2, args.output)
     else:
         logging.error("Mode {} not in SUPPORTED_MODES: {}.".format(args.mode, SUPPORTED_MODES))
 

From f63a72c52eada7c25fde2538f290ef1420c193fb Mon Sep 17 00:00:00 2001
From: "shixian.shi" <shixian.shi@alibaba-inc.com>
Date: Wed, 15 Mar 2023 10:22:30 +0800
Subject: [PATCH 10/37] update tools

---
 funasr/utils/timestamp_tools.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index 09c3becfc..423110cdc 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -1,5 +1,3 @@
-from pydoc import TextRepr
-from scipy.fftpack import shift
 import torch
 import copy
 import codecs

From 4d60eb6ada430098654cf58faf5b0758388e366a Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 11:12:51 +0800
Subject: [PATCH 11/37] update

---
 funasr/bin/eend_ola_inference.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index d65895f30..1b3622005 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -179,7 +179,6 @@ def inference_modelscope(
         diar_model_file=diar_model_file,
         device=device,
         dtype=dtype,
-        streaming=streaming,
     )
     logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
     speech2diar = Speech2Diarization.from_pretrained(

From 6165c139182c31252e9d69e95837546637f9e2da Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 11:15:00 +0800
Subject: [PATCH 12/37] update

---
 funasr/models/frontend/wav_frontend.py | 38 ++++++++++++++++----------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py
index c4b79104b..f61d7dd17 100644
--- a/funasr/models/frontend/wav_frontend.py
+++ b/funasr/models/frontend/wav_frontend.py
@@ -1,15 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Part of the implementation is borrowed from espnet/espnet.
-from abc import ABC
 from typing import Tuple
 
 import numpy as np
 import torch
 import torchaudio.compliance.kaldi as kaldi
-from funasr.models.frontend.abs_frontend import AbsFrontend
-import funasr.models.frontend.eend_ola_feature as eend_ola_feature
-from typeguard import check_argument_types
 from torch.nn.utils.rnn import pad_sequence
+from typeguard import check_argument_types
+
+import funasr.models.frontend.eend_ola_feature as eend_ola_feature
+from funasr.models.frontend.abs_frontend import AbsFrontend
 
 
 def load_cmvn(cmvn_file):
@@ -276,7 +276,8 @@ class WavFrontendOnline(AbsFrontend):
     # inputs tensor has catted the cache tensor
     # def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, inputs_lfr_cache: torch.Tensor = None,
     #               is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
-    def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
+    def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[
+        torch.Tensor, torch.Tensor, int]:
         """
         Apply lfr with data
         """
@@ -377,7 +378,8 @@ class WavFrontendOnline(AbsFrontend):
             if self.lfr_m != 1 or self.lfr_n != 1:
                 # update self.lfr_splice_cache in self.apply_lfr
                 # mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, self.lfr_splice_cache[i],
-                mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, is_final)
+                mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n,
+                                                                                     is_final)
             if self.cmvn_file is not None:
                 mat = self.apply_cmvn(mat, self.cmvn)
             feat_length = mat.size(0)
@@ -399,9 +401,10 @@ class WavFrontendOnline(AbsFrontend):
         assert batch_size == 1, 'we support to extract feature online only when the batch size is equal to 1 now'
         waveforms, feats, feats_lengths = self.forward_fbank(input, input_lengths)  # input shape: B T D
         if feats.shape[0]:
-            #if self.reserve_waveforms is None and self.lfr_m > 1:
+            # if self.reserve_waveforms is None and self.lfr_m > 1:
             #    self.reserve_waveforms = waveforms[:, :(self.lfr_m - 1) // 2 * self.frame_shift_sample_length]
-            self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat((self.reserve_waveforms, waveforms), dim=1)
+            self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat(
+                (self.reserve_waveforms, waveforms), dim=1)
             if not self.lfr_splice_cache:  # 初始化splice_cache
                 for i in range(batch_size):
                     self.lfr_splice_cache.append(feats[i][0, :].unsqueeze(dim=0).repeat((self.lfr_m - 1) // 2, 1))
@@ -410,7 +413,8 @@ class WavFrontendOnline(AbsFrontend):
                 lfr_splice_cache_tensor = torch.stack(self.lfr_splice_cache)  # B T D
                 feats = torch.cat((lfr_splice_cache_tensor, feats), dim=1)
                 feats_lengths += lfr_splice_cache_tensor[0].shape[0]
-                frame_from_waveforms = int((self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
+                frame_from_waveforms = int(
+                    (self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
                 minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
                 feats, feats_lengths, lfr_splice_frame_idxs = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
                 if self.lfr_m == 1:
@@ -419,19 +423,22 @@ class WavFrontendOnline(AbsFrontend):
                     reserve_frame_idx = lfr_splice_frame_idxs[0] - minus_frame
                     # print('reserve_frame_idx:  ' + str(reserve_frame_idx))
                     # print('frame_frame:  ' + str(frame_from_waveforms))
-                    self.reserve_waveforms = self.waveforms[:, reserve_frame_idx * self.frame_shift_sample_length:frame_from_waveforms * self.frame_shift_sample_length]
-                    sample_length = (frame_from_waveforms - 1) * self.frame_shift_sample_length + self.frame_sample_length
+                    self.reserve_waveforms = self.waveforms[:,
+                                             reserve_frame_idx * self.frame_shift_sample_length:frame_from_waveforms * self.frame_shift_sample_length]
+                    sample_length = (
+                                                frame_from_waveforms - 1) * self.frame_shift_sample_length + self.frame_sample_length
                     self.waveforms = self.waveforms[:, :sample_length]
             else:
                 # update self.reserve_waveforms and self.lfr_splice_cache
-                self.reserve_waveforms = self.waveforms[:, :-(self.frame_sample_length - self.frame_shift_sample_length)]
+                self.reserve_waveforms = self.waveforms[:,
+                                         :-(self.frame_sample_length - self.frame_shift_sample_length)]
                 for i in range(batch_size):
                     self.lfr_splice_cache[i] = torch.cat((self.lfr_splice_cache[i], feats[i]), dim=0)
                 return torch.empty(0), feats_lengths
         else:
             if is_final:
                 self.waveforms = waveforms if self.reserve_waveforms is None else self.reserve_waveforms
-                feats = torch.stack(self.lfr_splice_cache) 
+                feats = torch.stack(self.lfr_splice_cache)
                 feats_lengths = torch.zeros(batch_size, dtype=torch.int) + feats.shape[1]
                 feats, feats_lengths, _ = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
         if is_final:
@@ -466,9 +473,10 @@ class WavFrontendMel23(AbsFrontend):
         self.frame_shift = frame_shift
         self.lfr_m = lfr_m
         self.lfr_n = lfr_n
+        self.n_mels = 23
 
     def output_size(self) -> int:
-        return self.n_mels * self.lfr_m
+        return self.n_mels * (2 * self.lfr_m + 1)
 
     def forward(
             self,
@@ -494,4 +502,4 @@ class WavFrontendMel23(AbsFrontend):
         feats_pad = pad_sequence(feats,
                                  batch_first=True,
                                  padding_value=0.0)
-        return feats_pad, feats_lens
\ No newline at end of file
+        return feats_pad, feats_lens

From ab6d93b4eb3605738ca4af440dd1b296458fe485 Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 11:19:07 +0800
Subject: [PATCH 13/37] update

---
 funasr/tasks/diar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/funasr/tasks/diar.py b/funasr/tasks/diar.py
index ae7ee9b40..67d0c59c5 100644
--- a/funasr/tasks/diar.py
+++ b/funasr/tasks/diar.py
@@ -823,7 +823,7 @@ class EENDOLADiarTask(AbsTask):
 
         # 2. Encoder
         encoder_class = encoder_choices.get_class(args.encoder)
-        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
+        encoder = encoder_class(**args.encoder_conf)
 
         # 3. EncoderDecoderAttractor
         encoder_decoder_attractor_class = encoder_decoder_attractor_choices.get_class(args.encoder_decoder_attractor)

From 6fe0d840f7908dd1ab74de839987819234890725 Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 11:23:29 +0800
Subject: [PATCH 14/37] update

---
 funasr/models/e2e_diar_eend_ola.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py
index f589269c5..6835a6409 100644
--- a/funasr/models/e2e_diar_eend_ola.py
+++ b/funasr/models/e2e_diar_eend_ola.py
@@ -240,3 +240,6 @@ class DiarEENDOLAModel(AbsESPnetModel):
             torch.float32)
         decisions = decisions[:, :n_speaker]
         return decisions
+
+    def collect_feats(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]:
+        pass
\ No newline at end of file

From 26b81480a88cc2868639c5160989394199acdcdd Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 11:35:18 +0800
Subject: [PATCH 15/37] update

---
 funasr/models/e2e_diar_eend_ola.py   | 16 ++++++++--------
 tests/test_asr_inference_pipeline.py |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py
index 6835a6409..f3e34bc0b 100644
--- a/funasr/models/e2e_diar_eend_ola.py
+++ b/funasr/models/e2e_diar_eend_ola.py
@@ -52,15 +52,15 @@ class DiarEENDOLAModel(AbsESPnetModel):
 
         super().__init__()
         self.frontend = frontend
-        self.encoder = encoder
-        self.encoder_decoder_attractor = encoder_decoder_attractor
+        self.enc = encoder
+        self.eda = encoder_decoder_attractor
         self.attractor_loss_weight = attractor_loss_weight
         self.max_n_speaker = max_n_speaker
         if mapping_dict is None:
             mapping_dict = generate_mapping_dict(max_speaker_num=self.max_n_speaker)
             self.mapping_dict = mapping_dict
         # PostNet
-        self.PostNet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
+        self.postnet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
         self.output_layer = nn.Linear(n_units, mapping_dict['oov'] + 1)
 
     def forward_encoder(self, xs, ilens):
@@ -68,7 +68,7 @@ class DiarEENDOLAModel(AbsESPnetModel):
         pad_shape = xs.shape
         xs_mask = [torch.ones(ilen).to(xs.device) for ilen in ilens]
         xs_mask = torch.nn.utils.rnn.pad_sequence(xs_mask, batch_first=True, padding_value=0).unsqueeze(-2)
-        emb = self.encoder(xs, xs_mask)
+        emb = self.enc(xs, xs_mask)
         emb = torch.split(emb.view(pad_shape[0], pad_shape[1], -1), 1, dim=0)
         emb = [e[0][:ilen] for e, ilen in zip(emb, ilens)]
         return emb
@@ -77,7 +77,7 @@ class DiarEENDOLAModel(AbsESPnetModel):
         maxlen = torch.max(ilens).to(torch.int).item()
         logits = nn.utils.rnn.pad_sequence(logits, batch_first=True, padding_value=-1)
         logits = nn.utils.rnn.pack_padded_sequence(logits, ilens, batch_first=True, enforce_sorted=False)
-        outputs, (_, _) = self.PostNet(logits)
+        outputs, (_, _) = self.postnet(logits)
         outputs = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=-1, total_length=maxlen)[0]
         outputs = [output[:ilens[i].to(torch.int).item()] for i, output in enumerate(outputs)]
         outputs = [self.output_layer(output) for output in outputs]
@@ -112,7 +112,7 @@ class DiarEENDOLAModel(AbsESPnetModel):
         text = text[:, : text_lengths.max()]
 
         # 1. Encoder
-        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+        encoder_out, encoder_out_lens = self.enc(speech, speech_lengths)
         intermediate_outs = None
         if isinstance(encoder_out, tuple):
             intermediate_outs = encoder_out[1]
@@ -198,10 +198,10 @@ class DiarEENDOLAModel(AbsESPnetModel):
             orders = [np.arange(e.shape[0]) for e in emb]
             for order in orders:
                 np.random.shuffle(order)
-            attractors, probs = self.encoder_decoder_attractor.estimate(
+            attractors, probs = self.eda.estimate(
                 [e[torch.from_numpy(order).to(torch.long).to(speech[0].device)] for e, order in zip(emb, orders)])
         else:
-            attractors, probs = self.encoder_decoder_attractor.estimate(emb)
+            attractors, probs = self.eda.estimate(emb)
         attractors_active = []
         for p, att, e in zip(probs, attractors, emb):
             if n_speakers and n_speakers >= 0:
diff --git a/tests/test_asr_inference_pipeline.py b/tests/test_asr_inference_pipeline.py
index 70dbe8952..32b8af5ec 100644
--- a/tests/test_asr_inference_pipeline.py
+++ b/tests/test_asr_inference_pipeline.py
@@ -451,7 +451,7 @@ class TestUniasrInferencePipelines(unittest.TestCase):
 
     def test_uniasr_2pass_zhcn_16k_common_vocab8358_offline(self):
         inference_pipeline = pipeline(
-            task=Tasks.,
+            task=Tasks.auto_speech_recognition,
             model='damo/speech_UniASauto_speech_recognitionR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav',

From 36e9d36997a7ed21080997d99c13ffbc5bdda279 Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 12:15:49 +0800
Subject: [PATCH 16/37] update

---
 .../infer.py                                  |  2 +-
 funasr/tasks/diar.py                          | 62 +++++++++----------
 2 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
index 75f9c7346..dfcb8e649 100644
--- a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
+++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
@@ -6,4 +6,4 @@ inference_diar_pipline = pipeline(
     model='damo/speech_diarization_eend-ola-en-us-callhome-8k',
     model_revision="v1.0.0",
 )
-results = inference_diar_pipline(audio_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav")
\ No newline at end of file
+results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav"])
\ No newline at end of file
diff --git a/funasr/tasks/diar.py b/funasr/tasks/diar.py
index 67d0c59c5..6204cb7d2 100644
--- a/funasr/tasks/diar.py
+++ b/funasr/tasks/diar.py
@@ -750,37 +750,37 @@ class EENDOLADiarTask(AbsTask):
             cls, args: argparse.Namespace, train: bool
     ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
         assert check_argument_types()
-        if args.use_preprocessor:
-            retval = CommonPreprocessor(
-                train=train,
-                token_type=args.token_type,
-                token_list=args.token_list,
-                bpemodel=None,
-                non_linguistic_symbols=None,
-                text_cleaner=None,
-                g2p_type=None,
-                split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
-                seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
-                # NOTE(kamo): Check attribute existence for backward compatibility
-                rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
-                rir_apply_prob=args.rir_apply_prob
-                if hasattr(args, "rir_apply_prob")
-                else 1.0,
-                noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
-                noise_apply_prob=args.noise_apply_prob
-                if hasattr(args, "noise_apply_prob")
-                else 1.0,
-                noise_db_range=args.noise_db_range
-                if hasattr(args, "noise_db_range")
-                else "13_15",
-                speech_volume_normalize=args.speech_volume_normalize
-                if hasattr(args, "rir_scp")
-                else None,
-            )
-        else:
-            retval = None
-        assert check_return_type(retval)
-        return retval
+        # if args.use_preprocessor:
+        #     retval = CommonPreprocessor(
+        #         train=train,
+        #         token_type=args.token_type,
+        #         token_list=args.token_list,
+        #         bpemodel=None,
+        #         non_linguistic_symbols=None,
+        #         text_cleaner=None,
+        #         g2p_type=None,
+        #         split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
+        #         seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
+        #         # NOTE(kamo): Check attribute existence for backward compatibility
+        #         rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
+        #         rir_apply_prob=args.rir_apply_prob
+        #         if hasattr(args, "rir_apply_prob")
+        #         else 1.0,
+        #         noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
+        #         noise_apply_prob=args.noise_apply_prob
+        #         if hasattr(args, "noise_apply_prob")
+        #         else 1.0,
+        #         noise_db_range=args.noise_db_range
+        #         if hasattr(args, "noise_db_range")
+        #         else "13_15",
+        #         speech_volume_normalize=args.speech_volume_normalize
+        #         if hasattr(args, "rir_scp")
+        #         else None,
+        #     )
+        # else:
+        #     retval = None
+        # assert check_return_type(retval)
+        return None
 
     @classmethod
     def required_data_names(

From 3f2981bb8da44881460c8b290e62a3c6fce998d3 Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 14:27:44 +0800
Subject: [PATCH 17/37] update

---
 funasr/tasks/diar.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/funasr/tasks/diar.py b/funasr/tasks/diar.py
index 6204cb7d2..696291526 100644
--- a/funasr/tasks/diar.py
+++ b/funasr/tasks/diar.py
@@ -787,10 +787,10 @@ class EENDOLADiarTask(AbsTask):
             cls, train: bool = True, inference: bool = False
     ) -> Tuple[str, ...]:
         if not inference:
-            retval = ("speech", "profile", "binary_labels")
+            retval = ("speech", )
         else:
             # Recognition mode
-            retval = ("speech")
+            retval = ("speech", )
         return retval
 
     @classmethod

From 2cfe010d7b0f17877a271cc401e2c2f8f8d4c42c Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 14:42:32 +0800
Subject: [PATCH 18/37] update

---
 funasr/bin/eend_ola_inference.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index 1b3622005..96e7516e3 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -27,6 +27,8 @@ from funasr.utils.types import str2bool
 from funasr.utils.types import str2triple_str
 from funasr.utils.types import str_or_none
 
+from modelscope.utils.logger import get_logger
+logger = get_logger()
 
 class Speech2Diarization:
     """Speech2Diarlization class
@@ -209,6 +211,7 @@ def inference_modelscope(
             if isinstance(raw_inputs, torch.Tensor):
                 raw_inputs = raw_inputs.numpy()
             data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+        logger.info(data_path_and_name_and_type)
         loader = EENDOLADiarTask.build_streaming_iterator(
             data_path_and_name_and_type,
             dtype=dtype,
@@ -228,6 +231,8 @@ def inference_modelscope(
             output_writer = open("{}/result.txt".format(output_path), "w")
         result_list = []
         for keys, batch in loader:
+            logger.info("keys: {}".format(keys))
+            logger.info("batch: {}".format(batch))
             assert isinstance(batch, dict), type(batch)
             assert all(isinstance(s, str) for s in keys), keys
             _bs = len(next(iter(batch.values())))

From 85c1848286e206195a94993b49e8c32117cadc90 Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 15:17:27 +0800
Subject: [PATCH 19/37] update

---
 .../unit_test.py                                           | 7 +++----
 funasr/bin/eend_ola_inference.py                           | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
index 3cb31cfb7..5f4563dbc 100644
--- a/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
+++ b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
@@ -14,13 +14,12 @@ inference_diar_pipline = pipeline(
 )
 
 # 以 audio_list 作为输入，其中第一个音频为待检测语音，后面的音频为不同说话人的声纹注册语音
-audio_list = [[
+audio_list = [
     "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav",
     "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav",
     "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B.wav",
     "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B1.wav"
-]]
+]
 
 results = inference_diar_pipline(audio_in=audio_list)
-for rst in results:
-    print(rst["value"])
+print(results)
diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index 96e7516e3..2ff7eeff2 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -210,7 +210,7 @@ def inference_modelscope(
         if data_path_and_name_and_type is None and raw_inputs is not None:
             if isinstance(raw_inputs, torch.Tensor):
                 raw_inputs = raw_inputs.numpy()
-            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+            data_path_and_name_and_type = [raw_inputs[0], "speech", "bytes"]
         logger.info(data_path_and_name_and_type)
         loader = EENDOLADiarTask.build_streaming_iterator(
             data_path_and_name_and_type,

From 2f933cb101e56c3c12c76c38d368b94111b52f64 Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 15:23:08 +0800
Subject: [PATCH 20/37] update

---
 funasr/bin/eend_ola_inference.py   | 4 ----
 funasr/models/e2e_diar_eend_ola.py | 2 --
 2 files changed, 6 deletions(-)

diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index 2ff7eeff2..fbcfc7d97 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -27,9 +27,6 @@ from funasr.utils.types import str2bool
 from funasr.utils.types import str2triple_str
 from funasr.utils.types import str_or_none
 
-from modelscope.utils.logger import get_logger
-logger = get_logger()
-
 class Speech2Diarization:
     """Speech2Diarlization class
 
@@ -211,7 +208,6 @@ def inference_modelscope(
             if isinstance(raw_inputs, torch.Tensor):
                 raw_inputs = raw_inputs.numpy()
             data_path_and_name_and_type = [raw_inputs[0], "speech", "bytes"]
-        logger.info(data_path_and_name_and_type)
         loader = EENDOLADiarTask.build_streaming_iterator(
             data_path_and_name_and_type,
             dtype=dtype,
diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py
index f3e34bc0b..79cb61496 100644
--- a/funasr/models/e2e_diar_eend_ola.py
+++ b/funasr/models/e2e_diar_eend_ola.py
@@ -190,8 +190,6 @@ class DiarEENDOLAModel(AbsESPnetModel):
                             shuffle: bool = True,
                             threshold: float = 0.5,
                             **kwargs):
-        if self.frontend is not None:
-            speech = self.frontend(speech)
         speech = [s[:s_len] for s, s_len in zip(speech, speech_lengths)]
         emb = self.forward_encoder(speech, speech_lengths)
         if shuffle:

From e9f6703350fc6616b06c0e60944f6359a329e214 Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 15:29:31 +0800
Subject: [PATCH 21/37] update

---
 funasr/bin/eend_ola_inference.py       | 2 --
 funasr/models/frontend/wav_frontend.py | 5 +++++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index fbcfc7d97..2887b3754 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -227,8 +227,6 @@ def inference_modelscope(
             output_writer = open("{}/result.txt".format(output_path), "w")
         result_list = []
         for keys, batch in loader:
-            logger.info("keys: {}".format(keys))
-            logger.info("batch: {}".format(batch))
             assert isinstance(batch, dict), type(batch)
             assert all(isinstance(s, str) for s in keys), keys
             _bs = len(next(iter(batch.values())))
diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py
index f61d7dd17..8e17102ca 100644
--- a/funasr/models/frontend/wav_frontend.py
+++ b/funasr/models/frontend/wav_frontend.py
@@ -11,6 +11,8 @@ from typeguard import check_argument_types
 import funasr.models.frontend.eend_ola_feature as eend_ola_feature
 from funasr.models.frontend.abs_frontend import AbsFrontend
 
+from modelscope.utils.logger import get_logger
+logger = get_logger()
 
 def load_cmvn(cmvn_file):
     with open(cmvn_file, 'r', encoding='utf-8') as f:
@@ -485,6 +487,9 @@ class WavFrontendMel23(AbsFrontend):
         batch_size = input.size(0)
         feats = []
         feats_lens = []
+        logger.info("batch_size: {}".format(batch_size))
+        logger.info("input: {}".format(input))
+        logger.info("input_lengths: {}".format(input_lengths))
         for i in range(batch_size):
             waveform_length = input_lengths[i]
             waveform = input[i][:waveform_length]

From f691014c8a97f2ea27dc72c9d3b374bdd05aa6c9 Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 15:43:18 +0800
Subject: [PATCH 22/37] update

---
 funasr/models/frontend/wav_frontend.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py
index 8e17102ca..ca2217596 100644
--- a/funasr/models/frontend/wav_frontend.py
+++ b/funasr/models/frontend/wav_frontend.py
@@ -493,10 +493,10 @@ class WavFrontendMel23(AbsFrontend):
         for i in range(batch_size):
             waveform_length = input_lengths[i]
             waveform = input[i][:waveform_length]
-            waveform = waveform.unsqueeze(0).numpy()
+            waveform = waveform.numpy()
             mat = eend_ola_feature.stft(waveform, self.frame_length, self.frame_shift)
             mat = eend_ola_feature.transform(mat)
-            mat = mat.splice(mat, context_size=self.lfr_m)
+            mat = eend_ola_feature.splice(mat, context_size=self.lfr_m)
             mat = mat[::self.lfr_n]
             mat = torch.from_numpy(mat)
             feat_length = mat.size(0)

From 429ea5d3786fb77d1b53728307a59fe3d204d4ce Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 15:48:35 +0800
Subject: [PATCH 23/37] update

---
 funasr/bin/eend_ola_inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index 2887b3754..1a47c9224 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -145,7 +145,7 @@ def inference_modelscope(
         output_dir: Optional[str] = None,
         batch_size: int = 1,
         dtype: str = "float32",
-        ngpu: int = 0,
+        ngpu: int = 1,
         num_workers: int = 0,
         log_level: Union[int, str] = "INFO",
         key_file: Optional[str] = None,

From fbec0f003d4de9e4b6ccb6bb58d2d4926a0ff332 Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 15:58:43 +0800
Subject: [PATCH 24/37] update

---
 funasr/models/frontend/wav_frontend.py               | 11 ++---------
 funasr/modules/eend_ola/encoder_decoder_attractor.py |  5 ++++-
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py
index ca2217596..475a9398a 100644
--- a/funasr/models/frontend/wav_frontend.py
+++ b/funasr/models/frontend/wav_frontend.py
@@ -11,8 +11,6 @@ from typeguard import check_argument_types
 import funasr.models.frontend.eend_ola_feature as eend_ola_feature
 from funasr.models.frontend.abs_frontend import AbsFrontend
 
-from modelscope.utils.logger import get_logger
-logger = get_logger()
 
 def load_cmvn(cmvn_file):
     with open(cmvn_file, 'r', encoding='utf-8') as f:
@@ -425,10 +423,8 @@ class WavFrontendOnline(AbsFrontend):
                     reserve_frame_idx = lfr_splice_frame_idxs[0] - minus_frame
                     # print('reserve_frame_idx:  ' + str(reserve_frame_idx))
                     # print('frame_frame:  ' + str(frame_from_waveforms))
-                    self.reserve_waveforms = self.waveforms[:,
-                                             reserve_frame_idx * self.frame_shift_sample_length:frame_from_waveforms * self.frame_shift_sample_length]
-                    sample_length = (
-                                                frame_from_waveforms - 1) * self.frame_shift_sample_length + self.frame_sample_length
+                    self.reserve_waveforms = self.waveforms[:, reserve_frame_idx * self.frame_shift_sample_length:frame_from_waveforms * self.frame_shift_sample_length]
+                    sample_length = (frame_from_waveforms - 1) * self.frame_shift_sample_length + self.frame_sample_length
                     self.waveforms = self.waveforms[:, :sample_length]
             else:
                 # update self.reserve_waveforms and self.lfr_splice_cache
@@ -487,9 +483,6 @@ class WavFrontendMel23(AbsFrontend):
         batch_size = input.size(0)
         feats = []
         feats_lens = []
-        logger.info("batch_size: {}".format(batch_size))
-        logger.info("input: {}".format(input))
-        logger.info("input_lengths: {}".format(input_lengths))
         for i in range(batch_size):
             waveform_length = input_lengths[i]
             waveform = input[i][:waveform_length]
diff --git a/funasr/modules/eend_ola/encoder_decoder_attractor.py b/funasr/modules/eend_ola/encoder_decoder_attractor.py
index db01b0006..4e599ab31 100644
--- a/funasr/modules/eend_ola/encoder_decoder_attractor.py
+++ b/funasr/modules/eend_ola/encoder_decoder_attractor.py
@@ -2,7 +2,8 @@ import numpy as np
 import torch
 import torch.nn.functional as F
 from torch import nn
-
+from modelscope.utils.logger import get_logger
+logger = get_logger()
 
 class EncoderDecoderAttractor(nn.Module):
 
@@ -16,7 +17,9 @@ class EncoderDecoderAttractor(nn.Module):
         self.n_units = n_units
 
     def forward_core(self, xs, zeros):
+        logger.info("xs: ".format(xs))
         ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.float32).to(xs[0].device)
+        logger.info("ilens: ".format(ilens))
         xs = [self.enc0_dropout(x) for x in xs]
         xs = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=-1)
         xs = nn.utils.rnn.pack_padded_sequence(xs, ilens, batch_first=True, enforce_sorted=False)

From f33ebfd1c70859f38eaac22673ab0ee9682ea7c3 Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 16:11:44 +0800
Subject: [PATCH 25/37] update

---
 funasr/models/e2e_diar_eend_ola.py                 | 14 ++++++++++++--
 .../modules/eend_ola/encoder_decoder_attractor.py  | 11 ++++-------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py
index 79cb61496..097b23a57 100644
--- a/funasr/models/e2e_diar_eend_ola.py
+++ b/funasr/models/e2e_diar_eend_ola.py
@@ -76,7 +76,7 @@ class DiarEENDOLAModel(AbsESPnetModel):
     def forward_post_net(self, logits, ilens):
         maxlen = torch.max(ilens).to(torch.int).item()
         logits = nn.utils.rnn.pad_sequence(logits, batch_first=True, padding_value=-1)
-        logits = nn.utils.rnn.pack_padded_sequence(logits, ilens, batch_first=True, enforce_sorted=False)
+        logits = nn.utils.rnn.pack_padded_sequence(logits, ilens.cpu().to(torch.int64), batch_first=True, enforce_sorted=False)
         outputs, (_, _) = self.postnet(logits)
         outputs = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=-1, total_length=maxlen)[0]
         outputs = [output[:ilens[i].to(torch.int).item()] for i, output in enumerate(outputs)]
@@ -231,7 +231,7 @@ class DiarEENDOLAModel(AbsESPnetModel):
                 pred[i] = pred[i - 1]
             else:
                 pred[i] = 0
-        pred = [self.reporter.inv_mapping_func(i, self.mapping_dict) for i in pred]
+        pred = [self.inv_mapping_func(i) for i in pred]
         decisions = [bin(num)[2:].zfill(self.max_n_speaker)[::-1] for num in pred]
         decisions = torch.from_numpy(
             np.stack([np.array([int(i) for i in dec]) for dec in decisions], axis=0)).to(logit.device).to(
@@ -239,5 +239,15 @@ class DiarEENDOLAModel(AbsESPnetModel):
         decisions = decisions[:, :n_speaker]
         return decisions
 
+    def inv_mapping_func(self, label):
+        """Look up ``label`` in ``self.mapping_dict['label2dec']``.
+
+        A non-``int`` ``label`` is converted with ``int()`` first. Returns the
+        stored value, or ``-1`` when the table has no entry for the label.
+        """
+        key = label if isinstance(label, int) else int(label)
+        label2dec = self.mapping_dict['label2dec']
+        return label2dec.get(key, -1)
+
     def collect_feats(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]:
         pass
\ No newline at end of file
diff --git a/funasr/modules/eend_ola/encoder_decoder_attractor.py b/funasr/modules/eend_ola/encoder_decoder_attractor.py
index 4e599ab31..45ac98219 100644
--- a/funasr/modules/eend_ola/encoder_decoder_attractor.py
+++ b/funasr/modules/eend_ola/encoder_decoder_attractor.py
@@ -2,8 +2,7 @@ import numpy as np
 import torch
 import torch.nn.functional as F
 from torch import nn
-from modelscope.utils.logger import get_logger
-logger = get_logger()
+
 
 class EncoderDecoderAttractor(nn.Module):
 
@@ -17,14 +16,12 @@ class EncoderDecoderAttractor(nn.Module):
         self.n_units = n_units
 
     def forward_core(self, xs, zeros):
-        logger.info("xs: ".format(xs))
-        ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.float32).to(xs[0].device)
-        logger.info("ilens: ".format(ilens))
+        ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.int64)
         xs = [self.enc0_dropout(x) for x in xs]
         xs = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=-1)
         xs = nn.utils.rnn.pack_padded_sequence(xs, ilens, batch_first=True, enforce_sorted=False)
         _, (hx, cx) = self.encoder(xs)
-        zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.float32).to(zeros[0].device)
+        zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.int64)
         max_zlen = torch.max(zlens).to(torch.int).item()
         zeros = [self.enc0_dropout(z) for z in zeros]
         zeros = nn.utils.rnn.pad_sequence(zeros, batch_first=True, padding_value=-1)
@@ -50,4 +47,4 @@ class EncoderDecoderAttractor(nn.Module):
         zeros = [torch.zeros(max_n_speakers, self.n_units).to(torch.float32).to(xs[0].device) for _ in xs]
         attractors = self.forward_core(xs, zeros)
         probs = [torch.sigmoid(torch.flatten(self.counter(att))) for att in attractors]
-        return attractors, probs
\ No newline at end of file
+        return attractors, probs

From 7c6ed3830acf6413ab86fd9a5f38825db617f989 Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 16:39:40 +0800
Subject: [PATCH 26/37] update

---
 funasr/bin/eend_ola_inference.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index 1a47c9224..79e93a863 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -17,6 +17,7 @@ from typing import Union
 import numpy as np
 import torch
 from typeguard import check_argument_types
+from scipy.signal import medfilt
 
 from funasr.models.frontend.wav_frontend import WavFrontendMel23
 from funasr.tasks.diar import EENDOLADiarTask
@@ -234,9 +235,22 @@ def inference_modelscope(
             # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
 
             results = speech2diar(**batch)
+
+            # post process
+            a = medfilt(results[0], (11, 1))
+            rst = []
+            for spkid, frames in enumerate(a.T):
+                frames = np.pad(frames, (1, 1), 'constant')
+                changes, = np.where(np.diff(frames, axis=0) != 0)
+                fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
+                for s, e in zip(changes[::2], changes[1::2]):
+                    st = s / 10.
+                    ed = e / 10.
+                    rst.append(fmt.format(keys[0], st, ed, "{}_{}".format(keys[0],str(spkid))))
+
             # Only supporting batch_size==1
-            key, value = keys[0], output_results_str(results, keys[0])
-            item = {"key": key, "value": value}
+            value = "\n".join(rst)
+            item = {"key": keys[0], "value": value}
             result_list.append(item)
             if output_path is not None:
                 output_writer.write(value)

From dd4946a50db62a180ab11a5e371ea2ef44954c3b Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 16:44:59 +0800
Subject: [PATCH 27/37] update

---
 funasr/bin/eend_ola_inference.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index 79e93a863..b35824aaa 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -237,7 +237,8 @@ def inference_modelscope(
             results = speech2diar(**batch)
 
             # post process
-            a = medfilt(results[0], (11, 1))
+            a = results[0].cpu().numpy()
+            a = medfilt(a, (11, 1))
             rst = []
             for spkid, frames in enumerate(a.T):
                 frames = np.pad(frames, (1, 1), 'constant')

From b4598f30a54c3a8d5e6084d983fac0fa5a51992b Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 17:20:24 +0800
Subject: [PATCH 28/37] update

---
 .../infer.py                                             | 5 +++--
 funasr/bin/asr_inference_launch.py                       | 3 +++
 funasr/bin/eend_ola_inference.py                         | 9 +++++----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
index dfcb8e649..e0ac08ced 100644
--- a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
+++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
@@ -2,8 +2,9 @@ from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 
 inference_diar_pipline = pipeline(
-    task=Tasks.speaker_diarization,
+    task=Tasks.auto_speech_recognition,
     model='damo/speech_diarization_eend-ola-en-us-callhome-8k',
     model_revision="v1.0.0",
 )
-results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav"])
\ No newline at end of file
+results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record2.wav"])
+print(results)
\ No newline at end of file
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 1fae766ea..0ab6b1ad3 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -234,6 +234,9 @@ def inference_launch(**kwargs):
     elif mode == "rnnt":
         from funasr.bin.asr_inference_rnnt import inference_modelscope
         return inference_modelscope(**kwargs)
+    elif mode == "eend-ola":
+        from funasr.bin.eend_ola_inference import inference_modelscope
+        return inference_modelscope(mode=mode, **kwargs)
     else:
         logging.info("Unknown decoding mode: {}".format(mode))
         return None
diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index b35824aaa..048327856 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -16,8 +16,8 @@ from typing import Union
 
 import numpy as np
 import torch
-from typeguard import check_argument_types
 from scipy.signal import medfilt
+from typeguard import check_argument_types
 
 from funasr.models.frontend.wav_frontend import WavFrontendMel23
 from funasr.tasks.diar import EENDOLADiarTask
@@ -28,6 +28,7 @@ from funasr.utils.types import str2bool
 from funasr.utils.types import str2triple_str
 from funasr.utils.types import str_or_none
 
+
 class Speech2Diarization:
     """Speech2Diarlization class
 
@@ -237,7 +238,7 @@ def inference_modelscope(
             results = speech2diar(**batch)
 
             # post process
-            a = results[0].cpu().numpy()
+            a = results[0][0].cpu().numpy()
             a = medfilt(a, (11, 1))
             rst = []
             for spkid, frames in enumerate(a.T):
@@ -246,8 +247,8 @@ def inference_modelscope(
                 fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
                 for s, e in zip(changes[::2], changes[1::2]):
                     st = s / 10.
-                    ed = e / 10.
-                    rst.append(fmt.format(keys[0], st, ed, "{}_{}".format(keys[0],str(spkid))))
+                    dur = (e - s) / 10.
+                    rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid))))
 
             # Only supporting batch_size==1
             value = "\n".join(rst)

From 7ee716759b4a38e0776ebad3c5fac5fc969bec68 Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 15 Mar 2023 17:22:01 +0800
Subject: [PATCH 29/37] update

---
 .../speech_diarization_eend-ola-en-us-callhome-8k/infer.py     | 2 +-
 funasr/bin/asr_inference_launch.py                             | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
index e0ac08ced..81cb2c629 100644
--- a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
+++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
@@ -2,7 +2,7 @@ from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 
 inference_diar_pipline = pipeline(
-    task=Tasks.auto_speech_recognition,
+    task=Tasks.speaker_diarization,
     model='damo/speech_diarization_eend-ola-en-us-callhome-8k',
     model_revision="v1.0.0",
 )
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 0ab6b1ad3..1fae766ea 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -234,9 +234,6 @@ def inference_launch(**kwargs):
     elif mode == "rnnt":
         from funasr.bin.asr_inference_rnnt import inference_modelscope
         return inference_modelscope(**kwargs)
-    elif mode == "eend-ola":
-        from funasr.bin.eend_ola_inference import inference_modelscope
-        return inference_modelscope(mode=mode, **kwargs)
     else:
         logging.info("Unknown decoding mode: {}".format(mode))
         return None

From 06975be6bf1eb83c12666c6c93b7f5412e5749ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= <zhifu.gzf@alibaba-inc.com>
Date: Wed, 15 Mar 2023 21:39:47 +0800
Subject: [PATCH 30/37] benchmark cpu

---
 funasr/runtime/python/README.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 funasr/runtime/python/README.md

diff --git a/funasr/runtime/python/README.md b/funasr/runtime/python/README.md
new file mode 100644
index 000000000..999597459
--- /dev/null
+++ b/funasr/runtime/python/README.md
@@ -0,0 +1,21 @@
+Benchmark [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on Aishell1 test set , the total audio duration is 36108.919 seconds.
+
+(Note: The service has been fully warm up.)
+
+ Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz   16core-32processor    with avx512_vnni
+
+| concurrent-tasks | processing time(s) |  RTF   | Speedup Rate |
+|:----------------:|:------------------:|:------:|:------------:|
+|  1 (onnx fp32)   |        2806        | 0.0777 |     12.9     |
+|  1 (onnx int8)   |        1611        | 0.0446 |     22.4     |
+|  8 (onnx fp32)   |        538         | 0.0149 |     67.1     |
+|  8 (onnx int8)   |        210         | 0.0058 |    172.4     |
+|  16 (onnx fp32)  |        288         | 0.0080 |    125.2     |
+|  16 (onnx int8)  |        117         | 0.0032 |    309.9     |
+|  32 (onnx fp32)  |        167         | 0.0046 |    216.5     |
+|  32 (onnx int8)  |        107         | 0.0030 |    338.0     |
+|  64 (onnx fp32)  |        158         | 0.0044 |    228.1     |
+|  64 (onnx int8)  |         82         | 0.0023 |    442.8     |
+|  96 (onnx fp32)  |        151         | 0.0042 |    238.0     |
+|  96 (onnx int8)  |         80         | 0.0022 |    452.0     |
+

From 495e7071eab6b3280c2c06201907236a106c660e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= <zhifu.gzf@alibaba-inc.com>
Date: Wed, 15 Mar 2023 21:44:31 +0800
Subject: [PATCH 31/37] benchmark cpu

---
 funasr/runtime/python/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/funasr/runtime/python/README.md b/funasr/runtime/python/README.md
index 999597459..c47f8e787 100644
--- a/funasr/runtime/python/README.md
+++ b/funasr/runtime/python/README.md
@@ -1,4 +1,4 @@
-Benchmark [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on Aishell1 test set , the total audio duration is 36108.919 seconds.
+Benchmark [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on the Aishell1 test set; the total audio duration is 36108.919 seconds.
 
 (Note: The service has been fully warm up.)
 

From c3bce4c288f73a3bbf5559b019d4480f95acffaa Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Thu, 16 Mar 2023 10:44:15 +0800
Subject: [PATCH 32/37] update

---
 funasr/bin/eend_ola_inference.py   | 2 +-
 funasr/modules/eend_ola/encoder.py | 2 +-
 setup.py                           | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index 048327856..bc29fa206 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -209,7 +209,7 @@ def inference_modelscope(
         if data_path_and_name_and_type is None and raw_inputs is not None:
             if isinstance(raw_inputs, torch.Tensor):
                 raw_inputs = raw_inputs.numpy()
-            data_path_and_name_and_type = [raw_inputs[0], "speech", "bytes"]
+            data_path_and_name_and_type = [raw_inputs[0], "speech", "sound"]
         loader = EENDOLADiarTask.build_streaming_iterator(
             data_path_and_name_and_type,
             dtype=dtype,
diff --git a/funasr/modules/eend_ola/encoder.py b/funasr/modules/eend_ola/encoder.py
index 4999031b1..90a63f369 100644
--- a/funasr/modules/eend_ola/encoder.py
+++ b/funasr/modules/eend_ola/encoder.py
@@ -87,7 +87,7 @@ class EENDOLATransformerEncoder(nn.Module):
                  n_layers: int,
                  n_units: int,
                  e_units: int = 2048,
-                 h: int = 8,
+                 h: int = 4,
                  dropout_rate: float = 0.1,
                  use_pos_emb: bool = False):
         super(EENDOLATransformerEncoder, self).__init__()
diff --git a/setup.py b/setup.py
index 087d90d26..e6b9d38f6 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@ requirements = {
         "humanfriendly",
         "scipy>=1.4.1",
         # "filelock",
-        "librosa>=0.8.0",
+        "librosa==0.8.1",
         "jamo==0.4.1",  # For kss
         "PyYAML>=5.1.2",
         "soundfile>=0.10.2",

From 2ba4683eb2ce42eec91250debe88b424cbc2d67f Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Thu, 16 Mar 2023 11:14:42 +0800
Subject: [PATCH 33/37] update

---
 egs/aishell/conformer/run.sh                  |  2 +-
 .../data2vec_paraformer_finetune/run.sh       |  2 +-
 .../data2vec_transformer_finetune/run.sh      |  2 +-
 egs/aishell/paraformer/run.sh                 |  2 +-
 egs/aishell/paraformerbert/run.sh             |  2 +-
 egs/aishell/transformer/run.sh                |  2 +-
 egs/aishell2/conformer/run.sh                 |  2 +-
 egs/aishell2/paraformer/run.sh                |  2 +-
 egs/aishell2/paraformerbert/run.sh            |  2 +-
 egs/aishell2/transformer/run.sh               |  2 +-
 egs/aishell2/transformerLM/run.sh             |  2 +-
 .../diarization/sond/infer_alimeeting_test.py |  2 +-
 egs/alimeeting/diarization/sond/run.sh        |  6 ++--
 egs/alimeeting/diarization/sond/unit_test.py  |  8 ++---
 egs/callhome/diarization/sond/unit_test.py    |  8 ++---
 egs/mars/sd/local_run.sh                      |  2 +-
 .../README.md                                 |  2 +-
 .../infer_after_finetune.py                   |  2 +-
 .../README.md                                 |  2 +-
 .../infer_after_finetune.py                   |  2 +-
 .../README.md                                 |  2 +-
 .../infer_after_finetune.py                   |  2 +-
 .../infer_after_finetune.py                   |  2 +-
 .../infer_after_finetune.py                   |  2 +-
 .../README.md                                 |  2 +-
 .../infer_after_finetune.py                   |  2 +-
 .../infer_after_finetune.py                   |  2 +-
 .../README.md                                 |  2 +-
 .../infer_after_finetune.py                   |  2 +-
 .../README.md                                 |  2 +-
 .../infer_after_finetune.py                   |  2 +-
 .../README.md                                 |  2 +-
 .../infer_after_finetune.py                   |  2 +-
 .../README.md                                 |  2 +-
 .../infer_after_finetune.py                   |  2 +-
 .../README.md                                 |  3 +-
 .../infer_after_finetune.py                   |  2 +-
 .../README.md                                 |  2 +-
 .../infer_after_finetune.py                   |  2 +-
 funasr/bin/asr_inference.py                   |  2 +-
 funasr/bin/asr_inference_mfcca.py             |  2 +-
 funasr/bin/asr_inference_paraformer.py        |  2 +-
 .../bin/asr_inference_paraformer_vad_punc.py  |  2 +-
 funasr/bin/asr_inference_rnnt.py              |  2 +-
 funasr/bin/asr_inference_uniasr.py            |  2 +-
 funasr/bin/asr_inference_uniasr_vad.py        |  2 +-
 funasr/bin/diar_inference_launch.py           |  2 +-
 funasr/bin/eend_ola_inference.py              |  2 +-
 funasr/bin/sond_inference.py                  |  2 +-
 funasr/bin/sv_inference.py                    |  4 +--
 funasr/main_funcs/average_nbest_models.py     | 18 +++++-----
 funasr/main_funcs/pack_funcs.py               |  4 +--
 funasr/tasks/abs_task.py                      |  8 ++---
 funasr/tasks/asr.py                           |  4 +--
 funasr/tasks/diar.py                          |  2 +-
 funasr/tasks/sv.py                            |  2 +-
 funasr/torch_utils/load_pretrained_model.py   | 10 +++---
 funasr/train/trainer.py                       | 36 +++++++++----------
 58 files changed, 102 insertions(+), 101 deletions(-)

diff --git a/egs/aishell/conformer/run.sh b/egs/aishell/conformer/run.sh
index 41db45dfd..09ddab8a5 100755
--- a/egs/aishell/conformer/run.sh
+++ b/egs/aishell/conformer/run.sh
@@ -52,7 +52,7 @@ asr_config=conf/train_asr_conformer.yaml
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/data2vec_paraformer_finetune/run.sh b/egs/aishell/data2vec_paraformer_finetune/run.sh
index cada164dc..d033ce26a 100755
--- a/egs/aishell/data2vec_paraformer_finetune/run.sh
+++ b/egs/aishell/data2vec_paraformer_finetune/run.sh
@@ -55,7 +55,7 @@ asr_config=conf/train_asr_paraformer_transformer_12e_6d_3072_768.yaml
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/data2vec_transformer_finetune/run.sh b/egs/aishell/data2vec_transformer_finetune/run.sh
index 7ab8626bb..26222e666 100755
--- a/egs/aishell/data2vec_transformer_finetune/run.sh
+++ b/egs/aishell/data2vec_transformer_finetune/run.sh
@@ -55,7 +55,7 @@ asr_config=conf/train_asr_transformer_12e_6d_3072_768.yaml
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.cer_ctc.ave_10best.pth
+inference_asr_model=valid.cer_ctc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/paraformer/run.sh b/egs/aishell/paraformer/run.sh
index 2b0f1449b..53b5f906d 100755
--- a/egs/aishell/paraformer/run.sh
+++ b/egs/aishell/paraformer/run.sh
@@ -52,7 +52,7 @@ asr_config=conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/paraformerbert/run.sh b/egs/aishell/paraformerbert/run.sh
index 96310ab84..2487eacd8 100755
--- a/egs/aishell/paraformerbert/run.sh
+++ b/egs/aishell/paraformerbert/run.sh
@@ -56,7 +56,7 @@ asr_config=conf/train_asr_paraformerbert_conformer_12e_6d_2048_256.yaml
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/transformer/run.sh b/egs/aishell/transformer/run.sh
index 4c307b07c..f66a338ba 100755
--- a/egs/aishell/transformer/run.sh
+++ b/egs/aishell/transformer/run.sh
@@ -52,7 +52,7 @@ asr_config=conf/train_asr_conformer.yaml
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell2/conformer/run.sh b/egs/aishell2/conformer/run.sh
index bd6d81ea9..f9ea69ada 100755
--- a/egs/aishell2/conformer/run.sh
+++ b/egs/aishell2/conformer/run.sh
@@ -54,7 +54,7 @@ asr_config=conf/train_asr_conformer.yaml
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/paraformer/run.sh b/egs/aishell2/paraformer/run.sh
index 2b7d84131..e1ea4fe73 100755
--- a/egs/aishell2/paraformer/run.sh
+++ b/egs/aishell2/paraformer/run.sh
@@ -54,7 +54,7 @@ asr_config=conf/train_asr_paraformer_conformer_20e_1280_320_6d_1280_320.yaml
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/paraformerbert/run.sh b/egs/aishell2/paraformerbert/run.sh
index d0407d480..239a7e339 100755
--- a/egs/aishell2/paraformerbert/run.sh
+++ b/egs/aishell2/paraformerbert/run.sh
@@ -58,7 +58,7 @@ asr_config=conf/train_asr_paraformerbert_conformer_20e_6d_1280_320.yaml
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/transformer/run.sh b/egs/aishell2/transformer/run.sh
index a5a14ec09..6f2dd4d8d 100755
--- a/egs/aishell2/transformer/run.sh
+++ b/egs/aishell2/transformer/run.sh
@@ -54,7 +54,7 @@ asr_config=conf/train_asr_transformer.yaml
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/transformerLM/run.sh b/egs/aishell2/transformerLM/run.sh
index 28e376287..9e7a7135b 100755
--- a/egs/aishell2/transformerLM/run.sh
+++ b/egs/aishell2/transformerLM/run.sh
@@ -34,7 +34,7 @@ exp_dir=./data
 tag=exp1
 model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}"
 lm_exp=${exp_dir}/exp/${model_dir}
-inference_lm=valid.loss.ave.pth       # Language model path for decoding.
+inference_lm=valid.loss.ave.pb       # Language model path for decoding.
 
 stage=0
 stop_stage=3
diff --git a/egs/alimeeting/diarization/sond/infer_alimeeting_test.py b/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
index 0988f5d03..b4d534bee 100644
--- a/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
+++ b/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
@@ -4,7 +4,7 @@ import sys
 
 def main():
     diar_config_path = sys.argv[1] if len(sys.argv) > 1 else "sond_fbank.yaml"
-    diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pth"
+    diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pb"
     output_dir = sys.argv[3] if len(sys.argv) > 3 else "./outputs"
     data_path_and_name_and_type = [
         ("data/test_rmsil/feats.scp", "speech", "kaldi_ark"),
diff --git a/egs/alimeeting/diarization/sond/run.sh b/egs/alimeeting/diarization/sond/run.sh
index 7e9a7f7ba..19ae40cdd 100644
--- a/egs/alimeeting/diarization/sond/run.sh
+++ b/egs/alimeeting/diarization/sond/run.sh
@@ -17,9 +17,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
   echo "Downloading Pre-trained model..."
   git clone https://www.modelscope.cn/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch.git
   git clone https://www.modelscope.cn/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch.git
-  ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth ./sv.pth
+  ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb ./sv.pb
   cp speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.yaml ./sv.yaml
-  ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pth ./sond.pth
+  ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pb ./sond.pb
   cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond_fbank.yaml ./sond_fbank.yaml
   cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.yaml ./sond.yaml
   echo "Done."
@@ -30,7 +30,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
   echo "Calculating diarization results..."
-  python infer_alimeeting_test.py sond_fbank.yaml sond.pth outputs
+  python infer_alimeeting_test.py sond_fbank.yaml sond.pb outputs
   python local/convert_label_to_rttm.py \
     outputs/labels.txt \
     data/test_rmsil/raw_rmsil_map.scp \
diff --git a/egs/alimeeting/diarization/sond/unit_test.py b/egs/alimeeting/diarization/sond/unit_test.py
index 84a424762..0f40ab29e 100644
--- a/egs/alimeeting/diarization/sond/unit_test.py
+++ b/egs/alimeeting/diarization/sond/unit_test.py
@@ -4,7 +4,7 @@ import os
 
 def test_fbank_cpu_infer():
     diar_config_path = "config_fbank.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@ def test_fbank_cpu_infer():
 
 def test_fbank_gpu_infer():
     diar_config_path = "config_fbank.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@ def test_fbank_gpu_infer():
 
 def test_wav_gpu_infer():
     diar_config_path = "config.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@ def test_wav_gpu_infer():
 
 def test_without_profile_gpu_infer():
     diar_config_path = "config.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     raw_inputs = [[
         "data/unit_test/raw_inputs/record.wav",
diff --git a/egs/callhome/diarization/sond/unit_test.py b/egs/callhome/diarization/sond/unit_test.py
index 519ac5695..a48eda148 100644
--- a/egs/callhome/diarization/sond/unit_test.py
+++ b/egs/callhome/diarization/sond/unit_test.py
@@ -4,7 +4,7 @@ import os
 
 def test_fbank_cpu_infer():
     diar_config_path = "sond_fbank.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@ def test_fbank_cpu_infer():
 
 def test_fbank_gpu_infer():
     diar_config_path = "sond_fbank.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@ def test_fbank_gpu_infer():
 
 def test_wav_gpu_infer():
     diar_config_path = "config.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@ def test_wav_gpu_infer():
 
 def test_without_profile_gpu_infer():
     diar_config_path = "config.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     raw_inputs = [[
         "data/unit_test/raw_inputs/record.wav",
diff --git a/egs/mars/sd/local_run.sh b/egs/mars/sd/local_run.sh
index 3b319f46e..4516e9f96 100755
--- a/egs/mars/sd/local_run.sh
+++ b/egs/mars/sd/local_run.sh
@@ -49,7 +49,7 @@ asr_config=conf/train_asr_conformer.yaml
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
index c2e4354c1..053986d3d 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed~~~~
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
index 56c282ce2..b3260672c 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
@@ -48,5 +48,5 @@ if __name__ == '__main__':
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
+    params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
index c2e4354c1..053986d3d 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed~~~~
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
index e163999b7..2f038a85a 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
@@ -48,5 +48,5 @@ if __name__ == '__main__':
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
+    params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
index 9097e7ab9..16aeada4b 100644
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.sp.cer` and `
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
index e714a3d03..333b66a72 100755
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
@@ -63,5 +63,5 @@ if __name__ == '__main__':
     params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./example_data/validation"
-    params["decoding_model_name"] = "valid.acc.ave.pth"
+    params["decoding_model_name"] = "valid.acc.ave.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
index 6c34ed099..f1f29faff 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
@@ -49,5 +49,5 @@ if __name__ == '__main__':
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
index 6140bb71f..8cb537bd2 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
@@ -49,5 +49,5 @@ if __name__ == '__main__':
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index dfd509dd4..b68f1e921 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
index 94393ec5e..f26f2378b 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -49,5 +49,5 @@ if __name__ == '__main__':
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
index 96102ccfa..726009de7 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
@@ -49,5 +49,5 @@ if __name__ == '__main__':
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
index dfd509dd4..b68f1e921 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
index d91a40a6c..6593f4e3f 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
@@ -50,5 +50,5 @@ if __name__ == '__main__':
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
index dfd509dd4..b68f1e921 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
index f9fb0db8a..f067c8193 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
@@ -50,5 +50,5 @@ if __name__ == '__main__':
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
index dd947d329..9a84f9b57 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
index 030c2e278..d4df29e01 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
@@ -50,5 +50,5 @@ if __name__ == '__main__':
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
index dd947d329..9a84f9b57 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
index 3b39a1665..861fefb7f 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
@@ -49,5 +49,5 @@ if __name__ == '__main__':
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
index dd947d329..9a84f9b57 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
index 4860cf743..d73cae267 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
@@ -49,5 +49,5 @@ if __name__ == '__main__':
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index 1094bb5ff..94144efa7 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -34,7 +34,7 @@ Or you can use the finetuned model for inference directly.
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
index 5f171b419..3712cb828 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -53,5 +53,5 @@ if __name__ == '__main__':
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json", "punc/punc.pb", "punc/punc.yaml", "vad/vad.mvn", "vad/vad.pb", "vad/vad.yaml"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
     modelscope_infer_after_finetune(params)
diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py
index 318d3d7a2..f3b4d560a 100644
--- a/funasr/bin/asr_inference.py
+++ b/funasr/bin/asr_inference.py
@@ -52,7 +52,7 @@ class Speech2Text:
 
     Examples:
         >>> import soundfile
-        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py
index 4176ba6ab..888d4d2f8 100644
--- a/funasr/bin/asr_inference_mfcca.py
+++ b/funasr/bin/asr_inference_mfcca.py
@@ -55,7 +55,7 @@ class Speech2Text:
 
     Examples:
         >>> import soundfile
-        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 6413d92b0..e45e575ed 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -50,7 +50,7 @@ class Speech2Text:
 
     Examples:
             >>> import soundfile
-            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
             >>> audio, rate = soundfile.read("speech.wav")
             >>> speech2text(audio)
             [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index a0e7b47d2..3f5775195 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -58,7 +58,7 @@ class Speech2Text:
 
     Examples:
             >>> import soundfile
-            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
             >>> audio, rate = soundfile.read("speech.wav")
             >>> speech2text(audio)
             [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_rnnt.py b/funasr/bin/asr_inference_rnnt.py
index 6cd70613b..4a9ff0bda 100644
--- a/funasr/bin/asr_inference_rnnt.py
+++ b/funasr/bin/asr_inference_rnnt.py
@@ -49,7 +49,7 @@ class Speech2Text:
 
     Examples:
             >>> import soundfile
-            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
             >>> audio, rate = soundfile.read("speech.wav")
             >>> speech2text(audio)
             [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 8b31fad13..ac71538a6 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -46,7 +46,7 @@ class Speech2Text:
 
     Examples:
         >>> import soundfile
-        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_uniasr_vad.py b/funasr/bin/asr_inference_uniasr_vad.py
index e5815df11..7cb889b7d 100644
--- a/funasr/bin/asr_inference_uniasr_vad.py
+++ b/funasr/bin/asr_inference_uniasr_vad.py
@@ -46,7 +46,7 @@ class Speech2Text:
 
     Examples:
         >>> import soundfile
-        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py
index 70bb947b4..85e451836 100755
--- a/funasr/bin/diar_inference_launch.py
+++ b/funasr/bin/diar_inference_launch.py
@@ -133,7 +133,7 @@ def inference_launch(mode, **kwargs):
         param_dict = {
             "extract_profile": True,
             "sv_train_config": "sv.yaml",
-            "sv_model_file": "sv.pth",
+            "sv_model_file": "sv.pb",
         }
         if "param_dict" in kwargs and kwargs["param_dict"] is not None:
             for key in param_dict:
diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index bc29fa206..01d3f296a 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -35,7 +35,7 @@ class Speech2Diarization:
     Examples:
         >>> import soundfile
         >>> import numpy as np
-        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
+        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
         >>> profile = np.load("profiles.npy")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2diar(audio, profile)
diff --git a/funasr/bin/sond_inference.py b/funasr/bin/sond_inference.py
index ab6d26f45..936dc21f3 100755
--- a/funasr/bin/sond_inference.py
+++ b/funasr/bin/sond_inference.py
@@ -42,7 +42,7 @@ class Speech2Diarization:
     Examples:
         >>> import soundfile
         >>> import numpy as np
-        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
+        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
         >>> profile = np.load("profiles.npy")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2diar(audio, profile)
diff --git a/funasr/bin/sv_inference.py b/funasr/bin/sv_inference.py
index a78bccded..7e63bbd2d 100755
--- a/funasr/bin/sv_inference.py
+++ b/funasr/bin/sv_inference.py
@@ -36,7 +36,7 @@ class Speech2Xvector:
 
     Examples:
         >>> import soundfile
-        >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pth")
+        >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2xvector(audio)
         [(text, token, token_int, hypothesis object), ...]
@@ -169,7 +169,7 @@ def inference_modelscope(
         log_level: Union[int, str] = "INFO",
         key_file: Optional[str] = None,
         sv_train_config: Optional[str] = "sv.yaml",
-        sv_model_file: Optional[str] =  "sv.pth",
+        sv_model_file: Optional[str] =  "sv.pb",
         model_tag: Optional[str] = None,
         allow_variable_data_keys: bool = True,
         streaming: bool = False,
diff --git a/funasr/main_funcs/average_nbest_models.py b/funasr/main_funcs/average_nbest_models.py
index 53f956800..d8df94985 100644
--- a/funasr/main_funcs/average_nbest_models.py
+++ b/funasr/main_funcs/average_nbest_models.py
@@ -66,13 +66,13 @@ def average_nbest_models(
             elif n == 1:
                 # The averaged model is same as the best model
                 e, _ = epoch_and_values[0]
-                op = output_dir / f"{e}epoch.pth"
-                sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pth"
+                op = output_dir / f"{e}epoch.pb"
+                sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pb"
                 if sym_op.is_symlink() or sym_op.exists():
                     sym_op.unlink()
                 sym_op.symlink_to(op.name)
             else:
-                op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pth"
+                op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pb"
                 logging.info(
                     f"Averaging {n}best models: " f'criterion="{ph}.{cr}": {op}'
                 )
@@ -83,12 +83,12 @@ def average_nbest_models(
                     if e not in _loaded:
                         if oss_bucket is None:
                             _loaded[e] = torch.load(
-                                output_dir / f"{e}epoch.pth",
+                                output_dir / f"{e}epoch.pb",
                                 map_location="cpu",
                             )
                         else:
                             buffer = BytesIO(
-                                oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pth")).read())
+                                oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pb")).read())
                             _loaded[e] = torch.load(buffer)
                     states = _loaded[e]
 
@@ -115,13 +115,13 @@ def average_nbest_models(
                 else:
                     buffer = BytesIO()
                     torch.save(avg, buffer)
-                    oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pth"),
+                    oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pb"),
                                           buffer.getvalue())
 
-        # 3. *.*.ave.pth is a symlink to the max ave model
+        # 3. *.*.ave.pb is a symlink to the max ave model
         if oss_bucket is None:
-            op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pth"
-            sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pth"
+            op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pb"
+            sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pb"
             if sym_op.is_symlink() or sym_op.exists():
                 sym_op.unlink()
             sym_op.symlink_to(op.name)
diff --git a/funasr/main_funcs/pack_funcs.py b/funasr/main_funcs/pack_funcs.py
index ffa807e23..fe365d8e7 100644
--- a/funasr/main_funcs/pack_funcs.py
+++ b/funasr/main_funcs/pack_funcs.py
@@ -191,12 +191,12 @@ def unpack(
 
     Examples:
         tarfile:
-           model.pth
+           model.pb
            some1.file
            some2.file
 
         >>> unpack("tarfile", "out")
-        {'asr_model_file': 'out/model.pth'}
+        {'asr_model_file': 'out/model.pb'}
     """
     input_archive = Path(input_archive)
     outpath = Path(outpath)
diff --git a/funasr/tasks/abs_task.py b/funasr/tasks/abs_task.py
index e0884cef6..3f20b4f4c 100644
--- a/funasr/tasks/abs_task.py
+++ b/funasr/tasks/abs_task.py
@@ -639,12 +639,12 @@ class AbsTask(ABC):
                  "and exclude_keys excludes keys of model states for the initialization."
                  "e.g.\n"
                  "  # Load all parameters"
-                 "  --init_param some/where/model.pth\n"
+                 "  --init_param some/where/model.pb\n"
                  "  # Load only decoder parameters"
-                 "  --init_param some/where/model.pth:decoder:decoder\n"
+                 "  --init_param some/where/model.pb:decoder:decoder\n"
                  "  # Load only decoder parameters excluding decoder.embed"
-                 "  --init_param some/where/model.pth:decoder:decoder:decoder.embed\n"
-                 "  --init_param some/where/model.pth:decoder:decoder:decoder.embed\n",
+                 "  --init_param some/where/model.pb:decoder:decoder:decoder.embed\n"
+                 "  --init_param some/where/model.pb:decoder:decoder:decoder.embed\n",
         )
         group.add_argument(
             "--ignore_init_mismatch",
diff --git a/funasr/tasks/asr.py b/funasr/tasks/asr.py
index 36499a257..e15147332 100644
--- a/funasr/tasks/asr.py
+++ b/funasr/tasks/asr.py
@@ -826,7 +826,7 @@ class ASRTaskUniASR(ASRTask):
             if "model.ckpt-" in model_name or ".bin" in model_name:
                 model_name_pth = os.path.join(model_dir, model_name.replace('.bin',
                                                                             '.pb')) if ".bin" in model_name else os.path.join(
-                    model_dir, "{}.pth".format(model_name))
+                    model_dir, "{}.pb".format(model_name))
                 if os.path.exists(model_name_pth):
                     logging.info("model_file is load from pth: {}".format(model_name_pth))
                     model_dict = torch.load(model_name_pth, map_location=device)
@@ -1073,7 +1073,7 @@ class ASRTaskParaformer(ASRTask):
             if "model.ckpt-" in model_name or ".bin" in model_name:
                 model_name_pth = os.path.join(model_dir, model_name.replace('.bin',
                                                                             '.pb')) if ".bin" in model_name else os.path.join(
-                    model_dir, "{}.pth".format(model_name))
+                    model_dir, "{}.pb".format(model_name))
                 if os.path.exists(model_name_pth):
                     logging.info("model_file is load from pth: {}".format(model_name_pth))
                     model_dict = torch.load(model_name_pth, map_location=device)
diff --git a/funasr/tasks/diar.py b/funasr/tasks/diar.py
index 696291526..9875f6a45 100644
--- a/funasr/tasks/diar.py
+++ b/funasr/tasks/diar.py
@@ -553,7 +553,7 @@ class DiarTask(AbsTask):
                 if ".bin" in model_name:
                     model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
                 else:
-                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
+                    model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
                 if os.path.exists(model_name_pth):
                     logging.info("model_file is load from pth: {}".format(model_name_pth))
                     model_dict = torch.load(model_name_pth, map_location=device)
diff --git a/funasr/tasks/sv.py b/funasr/tasks/sv.py
index 1b08c4dad..bef5dc588 100644
--- a/funasr/tasks/sv.py
+++ b/funasr/tasks/sv.py
@@ -501,7 +501,7 @@ class SVTask(AbsTask):
                 if ".bin" in model_name:
                     model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
                 else:
-                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
+                    model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
                 if os.path.exists(model_name_pth):
                     logging.info("model_file is load from pth: {}".format(model_name_pth))
                     model_dict = torch.load(model_name_pth, map_location=device)
diff --git a/funasr/torch_utils/load_pretrained_model.py b/funasr/torch_utils/load_pretrained_model.py
index 8e3f05e1e..e9b18cd0d 100644
--- a/funasr/torch_utils/load_pretrained_model.py
+++ b/funasr/torch_utils/load_pretrained_model.py
@@ -52,13 +52,13 @@ def load_pretrained_model(
         init_param: <file_path>:<src_key>:<dst_key>:<exclude_Keys>
 
     Examples:
-        >>> load_pretrained_model("somewhere/model.pth", model)
-        >>> load_pretrained_model("somewhere/model.pth:decoder:decoder", model)
-        >>> load_pretrained_model("somewhere/model.pth:decoder:decoder:", model)
+        >>> load_pretrained_model("somewhere/model.pb", model)
+        >>> load_pretrained_model("somewhere/model.pb:decoder:decoder", model)
+        >>> load_pretrained_model("somewhere/model.pb:decoder:decoder:", model)
         >>> load_pretrained_model(
-        ...     "somewhere/model.pth:decoder:decoder:decoder.embed", model
+        ...     "somewhere/model.pb:decoder:decoder:decoder.embed", model
         ... )
-        >>> load_pretrained_model("somewhere/decoder.pth::decoder", model)
+        >>> load_pretrained_model("somewhere/decoder.pb::decoder", model)
     """
     sps = init_param.split(":", 4)
     if len(sps) == 4:
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index 50bce477a..efe2009c4 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -205,9 +205,9 @@ class Trainer:
         else:
             scaler = None
 
-        if trainer_options.resume and (output_dir / "checkpoint.pth").exists():
+        if trainer_options.resume and (output_dir / "checkpoint.pb").exists():
             cls.resume(
-                checkpoint=output_dir / "checkpoint.pth",
+                checkpoint=output_dir / "checkpoint.pb",
                 model=model,
                 optimizers=optimizers,
                 schedulers=schedulers,
@@ -361,7 +361,7 @@ class Trainer:
                         },
                         buffer,
                     )
-                    trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pth"), buffer.getvalue())
+                    trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pb"), buffer.getvalue())
                 else:
                     torch.save(
                         {
@@ -374,7 +374,7 @@ class Trainer:
                             ],
                             "scaler": scaler.state_dict() if scaler is not None else None,
                         },
-                        output_dir / "checkpoint.pth",
+                        output_dir / "checkpoint.pb",
                     )
 
                 # 5. Save and log the model and update the link to the best model
@@ -382,22 +382,22 @@ class Trainer:
                     buffer = BytesIO()
                     torch.save(model.state_dict(), buffer)
                     trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir,
-                                                                       f"{iepoch}epoch.pth"),buffer.getvalue())
+                                                                       f"{iepoch}epoch.pb"),buffer.getvalue())
                 else:
-                    torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")
+                    torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pb")
 
-                # Creates a sym link latest.pth -> {iepoch}epoch.pth
+                # Creates a sym link latest.pb -> {iepoch}epoch.pb
                 if trainer_options.use_pai:
-                    p = os.path.join(trainer_options.output_dir, "latest.pth")
+                    p = os.path.join(trainer_options.output_dir, "latest.pb")
                     if trainer_options.oss_bucket.object_exists(p):
                         trainer_options.oss_bucket.delete_object(p)
                     trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name,
-                                           os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"), p)
+                                           os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"), p)
                 else:
-                    p = output_dir / "latest.pth"
+                    p = output_dir / "latest.pb"
                     if p.is_symlink() or p.exists():
                         p.unlink()
-                    p.symlink_to(f"{iepoch}epoch.pth")
+                    p.symlink_to(f"{iepoch}epoch.pb")
 
                 _improved = []
                 for _phase, k, _mode in trainer_options.best_model_criterion:
@@ -407,16 +407,16 @@ class Trainer:
                         # Creates sym links if it's the best result
                         if best_epoch == iepoch:
                             if trainer_options.use_pai:
-                                p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pth")
+                                p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pb")
                                 if trainer_options.oss_bucket.object_exists(p):
                                     trainer_options.oss_bucket.delete_object(p)
                                 trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name,
-                                                       os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"),p)
+                                                       os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"),p)
                             else:
-                                p = output_dir / f"{_phase}.{k}.best.pth"
+                                p = output_dir / f"{_phase}.{k}.best.pb"
                                 if p.is_symlink() or p.exists():
                                     p.unlink()
-                                p.symlink_to(f"{iepoch}epoch.pth")
+                                p.symlink_to(f"{iepoch}epoch.pb")
                             _improved.append(f"{_phase}.{k}")
                 if len(_improved) == 0:
                     logging.info("There are no improvements in this epoch")
@@ -438,7 +438,7 @@ class Trainer:
                         type="model",
                         metadata={"improved": _improved},
                     )
-                    artifact.add_file(str(output_dir / f"{iepoch}epoch.pth"))
+                    artifact.add_file(str(output_dir / f"{iepoch}epoch.pb"))
                     aliases = [
                         f"epoch-{iepoch}",
                         "best" if best_epoch == iepoch else "",
@@ -473,12 +473,12 @@ class Trainer:
 
                 for e in range(1, iepoch):
                     if trainer_options.use_pai:
-                        p = os.path.join(trainer_options.output_dir, f"{e}epoch.pth")
+                        p = os.path.join(trainer_options.output_dir, f"{e}epoch.pb")
                         if trainer_options.oss_bucket.object_exists(p) and e not in nbests:
                             trainer_options.oss_bucket.delete_object(p)
                             _removed.append(str(p))
                     else:
-                        p = output_dir / f"{e}epoch.pth"
+                        p = output_dir / f"{e}epoch.pb"
                         if p.exists() and e not in nbests:
                             p.unlink()
                             _removed.append(str(p))

From f1273775414e2f0664e00c44dbd8bbf897ba0183 Mon Sep 17 00:00:00 2001
From: mayong <mayong@qianxin.com>
Date: Thu, 16 Mar 2023 11:44:13 +0800
Subject: [PATCH 34/37] Remove VAD.

---
 funasr/runtime/onnxruntime/src/Audio.cpp      |  5 +-
 .../onnxruntime/src/librapidasrapi.cpp        | 17 +++---
 funasr/runtime/onnxruntime/tester/tester.cpp  | 57 ++++++++++++++++---
 3 files changed, 63 insertions(+), 16 deletions(-)

diff --git a/funasr/runtime/onnxruntime/src/Audio.cpp b/funasr/runtime/onnxruntime/src/Audio.cpp
index 43dfb6b78..53bf9d02a 100644
--- a/funasr/runtime/onnxruntime/src/Audio.cpp
+++ b/funasr/runtime/onnxruntime/src/Audio.cpp
@@ -237,7 +237,7 @@ bool Audio::loadpcmwav(const char* buf, int nBufLen)
 
     size_t nOffset = 0;
 
-#define WAV_HEADER_SIZE 44
+
 
     speech_len = nBufLen / 2;
     speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
@@ -263,7 +263,8 @@ bool Audio::loadpcmwav(const char* buf, int nBufLen)
             speech_data[i] = (float)speech_buff[i] / scale;
         }
 
-
+        AudioFrame* frame = new AudioFrame(speech_len);
+        frame_queue.push(frame);
         return true;
 
     }
diff --git a/funasr/runtime/onnxruntime/src/librapidasrapi.cpp b/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
index 1f8f7ca63..f5f9d66be 100644
--- a/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
+++ b/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
@@ -26,8 +26,9 @@ extern "C" {
 			return nullptr;
 
 		Audio audio(1);
-		audio.loadwav(szBuf,nLen);
-		audio.split();
+		if (!audio.loadwav(szBuf, nLen))
+			return nullptr;
+		//audio.split();
 
 		float* buff;
 		int len;
@@ -58,8 +59,9 @@ extern "C" {
 			return nullptr;
 
 		Audio audio(1);
-		audio.loadpcmwav(szBuf, nLen);
-		audio.split();
+		if (!audio.loadpcmwav(szBuf, nLen))
+			return nullptr;
+		//audio.split();
 
 		float* buff;
 		int len;
@@ -91,8 +93,9 @@ extern "C" {
 			return nullptr;
 
 		Audio audio(1);
-		audio.loadpcmwav(szFileName);
-		audio.split();
+		if (!audio.loadpcmwav(szFileName))
+			return nullptr;
+		//audio.split();
 
 		float* buff;
 		int len;
@@ -125,7 +128,7 @@ extern "C" {
 		Audio audio(1);
 		if(!audio.loadwav(szWavfile))
 			return nullptr;
-		audio.split();
+		//audio.split();
 
 		float* buff;
 		int len;
diff --git a/funasr/runtime/onnxruntime/tester/tester.cpp b/funasr/runtime/onnxruntime/tester/tester.cpp
index b9a85b7c0..ba5c61ccb 100644
--- a/funasr/runtime/onnxruntime/tester/tester.cpp
+++ b/funasr/runtime/onnxruntime/tester/tester.cpp
@@ -8,7 +8,7 @@
 #include "librapidasrapi.h"
 
 #include <iostream>
-
+#include <fstream>
 using namespace std;
 
 int main(int argc, char *argv[])
@@ -40,10 +40,13 @@ int main(int argc, char *argv[])
 
 
     gettimeofday(&start, NULL);
-
-    RPASR_RESULT Result=RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
-    gettimeofday(&end, NULL);
     float snippet_time = 0.0f;
+
+
+     RPASR_RESULT Result=RapidAsrRecogFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+
+    gettimeofday(&end, NULL);
+   
     if (Result)
     {
         string msg = RapidAsrGetResult(Result, 0);
@@ -56,11 +59,51 @@ int main(int argc, char *argv[])
     }
     else
     {
-        cout <<("no return data!");
+        cout <<"no return data!";
     }
-  
-    printf("Audio length %lfs.\n", (double)snippet_time);
+ 
+ 
+    //char* buff = nullptr;
+    //int len = 0;
+    //ifstream ifs(argv[2], std::ios::binary | std::ios::in);
+    //if (ifs.is_open())
+    //{
+    //    ifs.seekg(0, std::ios::end);
+    //    len = ifs.tellg();
+    //    ifs.seekg(0, std::ios::beg);
 
+    //    buff = new char[len];
+
+    //    ifs.read(buff, len);
+
+
+    //    //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+
+    //    RPASR_RESULT Result=RapidAsrRecogPCMBuffer(AsrHanlde, buff,len, RASR_NONE, NULL);
+    //    //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+    //    gettimeofday(&end, NULL);
+    //   
+    //    if (Result)
+    //    {
+    //        string msg = RapidAsrGetResult(Result, 0);
+    //        setbuf(stdout, NULL);
+    //        cout << "Result: \"";
+    //        cout << msg << endl;
+    //        cout << "\"." << endl;
+    //        snippet_time = RapidAsrGetRetSnippetTime(Result);
+    //        RapidAsrFreeResult(Result);
+    //    }
+    //    else
+    //    {
+    //        cout <<"no return data!";
+    //    }
+  
+    //   
+    //delete[]buff;
+    //}
+
+ 
+    printf("Audio length %lfs.\n", (double)snippet_time);
     seconds = (end.tv_sec - start.tv_sec);
     long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
     printf("Model inference takes %lfs.\n", (double)taking_micros / 1000000);

From cc8e2638455c02235b376900ae330bb6608e4494 Mon Sep 17 00:00:00 2001
From: "shixian.shi" <shixian.shi@alibaba-inc.com>
Date: Thu, 16 Mar 2023 14:45:15 +0800
Subject: [PATCH 35/37] update setup.py

---
 setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup.py b/setup.py
index e6b9d38f6..c85476938 100644
--- a/setup.py
+++ b/setup.py
@@ -41,6 +41,8 @@ requirements = {
         # PAI
         "oss2",
         "kaldi-native-fbank",
+        # timestamp
+        "edit-distance"
     ],
     # train: The modules invoked when training only.
     "train": [

From 80cc48a676668e70b585f38d1487bf377361a3d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= <zhifu.gzf@alibaba-inc.com>
Date: Thu, 16 Mar 2023 15:02:00 +0800
Subject: [PATCH 36/37] readme

---
 README.md | 30 ++----------------------------
 1 file changed, 2 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 0d1079b36..7cf5a6cea 100644
--- a/README.md
+++ b/README.md
@@ -17,34 +17,8 @@
 
 ## What's new: 
 
-### 2023.2.17, funasr-0.2.0, modelscope-1.3.0
-- We support a new feature, export paraformer models into [onnx and torchscripts](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export) from modelscope. The local finetuned models are also supported.
-- We support a new feature, [onnxruntime](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python), you could deploy the runtime without modelscope or funasr, for the [paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) model, the rtf of onnxruntime is 3x speedup(0.110->0.038) on cpu, [details](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer#speed).
-- We support a new feature, [grpc](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc), you could build the ASR service with grpc, by deploying the modelscope pipeline or onnxruntime.
-- We release a new model [paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary), which supports the hotword customization based on the incentive enhancement, and improves the recall and precision of hotwords.
-- We optimize the timestamp alignment of [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), the prediction accuracy of timestamp is much improved, and achieving accumulated average shift (aas) of 74.7ms, [details](https://arxiv.org/abs/2301.12343).
-- We release a new model, [8k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of none-silence speech. It could be freely integrated with any ASR models in [modelscope](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
-- We release a new model, [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary), a multi-channel multi-speaker model which is independent of the number and geometry of microphones and supports Mandarin meeting transcription.
-- We release several new UniASR model: 
-[Southern Fujian Dialect model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/summary),
-[French model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary), 
-[German model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary), 
-[Vietnamese model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary), 
-[Persian model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary).
-- We release a new model, [paraformer-data2vec model](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/summary), an unsupervised pretraining model on AISHELL-2, which is inited for paraformer model and then finetune on AISHEL-1.
-- We release a new feature, the `VAD`, `ASR` and `PUNC` models could be integrated freely, which could be models from [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), or the local finetine models. The [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
-- We optimized the [punctuation common model](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), enhance the recall and precision, fix the badcases of missing punctuation marks.
-- Various new types of audio input types are now supported by modelscope inference pipeline, including: mp3、flac、ogg、opus...
-### 2023.1.16, funasr-0.1.6， modelscope-1.2.0
-- We release a new version model [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), which integrate the [VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) model, [ASR](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),
- [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) model and timestamp together. The model could take in several hours long inputs.
-- We release a new model, [16k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of none-silence speech. It could be freely integrated with any ASR models in [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
-- We release a new model, [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), which could predict the punctuation of ASR models's results. It could be freely integrated with any ASR models in [Model Zoo](docs/modelscope_models.md).
-- We release a new model, [Data2vec](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/summary), an unsupervised pretraining model which could be finetuned on ASR and other downstream tasks.
-- We release a new model, [Paraformer-Tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary), a lightweight Paraformer model which supports Mandarin command words recognition.
-- We release a new model, [SV](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary), which could extract speaker embeddings and further perform speaker verification on paired utterances. It will be supported for speaker diarization in the future version.
-- We improve the pipeline of modelscope to speedup the inference, by integrating the process of build model into build pipeline.
-- Various new types of audio input types are now supported by modelscope inference pipeline, including wav.scp, wav format, audio bytes, wave samples...
+## What's new: 
+For the release notes, please refer to [news](https://github.com/alibaba-damo-academy/FunASR/releases)
 
 ## Highlights
 - Many types of typical models are supported, e.g., [Tranformer](https://arxiv.org/abs/1706.03762), [Conformer](https://arxiv.org/abs/2005.08100), [Paraformer](https://arxiv.org/abs/2206.08317).

From 64bd637c301c3e4f771466808c30ec96d5531e45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= <zhifu.gzf@alibaba-inc.com>
Date: Thu, 16 Mar 2023 15:02:54 +0800
Subject: [PATCH 37/37] readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7cf5a6cea..23f1abec6 100644
--- a/README.md
+++ b/README.md
@@ -15,9 +15,9 @@
 | [**Model Zoo**](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
 | [**Contact**](#contact)
 
-## What's new: 
 
 ## What's new: 
+
 For the release notes, please refer to [news](https://github.com/alibaba-damo-academy/FunASR/releases)
 
 ## Highlights