From 72c8f70ef470a878901dedb575081660ac474133 Mon Sep 17 00:00:00 2001 From: onlybetheone Date: Fri, 10 Mar 2023 18:08:18 +0800 Subject: [PATCH 01/37] add egs_modelscope/uniasr/ he my ur examples --- .../finetune.py | 35 +++++++++++++++++++ .../infer.py | 13 +++++++ .../finetune.py | 35 +++++++++++++++++++ .../infer.py | 13 +++++++ .../finetune.py | 35 +++++++++++++++++++ .../infer.py | 13 +++++++ 6 files changed, 144 insertions(+) create mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py create mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py create mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py create mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py create mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py create mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py new file mode 100644 index 000000000..56fb58302 --- /dev/null +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py @@ -0,0 +1,35 @@ +import os +from modelscope.metainfo import Trainers +from modelscope.trainers import build_trainer +from funasr.datasets.ms_dataset import MsDataset + + +def modelscope_finetune(params): + if not os.path.exists(params["output_dir"]): + os.makedirs(params["output_dir"], exist_ok=True) + # dataset split ["train", "validation"] + ds_dict = MsDataset.load(params["data_dir"]) + kwargs = dict( + model=params["model"], + model_revision=params["model_revision"], + data_dir=ds_dict, + 
dataset_type=params["dataset_type"], + work_dir=params["output_dir"], + batch_bins=params["batch_bins"], + max_epoch=params["max_epoch"], + lr=params["lr"]) + trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) + trainer.train() + + +if __name__ == '__main__': + params = {} + params["output_dir"] = "./checkpoint" + params["data_dir"] = "./data" + params["batch_bins"] = 2000 + params["dataset_type"] = "small" + params["max_epoch"] = 50 + params["lr"] = 0.00005 + params["model"] = "damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch" + params["model_revision"] = None + modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py new file mode 100644 index 000000000..c54ab8c83 --- /dev/null +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py @@ -0,0 +1,13 @@ +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks + +if __name__ == "__main__": + audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_he.wav" + output_dir = "./results" + inference_pipline = pipeline( + task=Tasks.auto_speech_recognition, + model="damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch", + output_dir=output_dir, + ) + rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) + print(rec_result) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py new file mode 100644 index 000000000..8bbce606c --- /dev/null +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py @@ -0,0 +1,35 @@ +import os +from modelscope.metainfo import Trainers +from modelscope.trainers 
import build_trainer +from funasr.datasets.ms_dataset import MsDataset + + +def modelscope_finetune(params): + if not os.path.exists(params["output_dir"]): + os.makedirs(params["output_dir"], exist_ok=True) + # dataset split ["train", "validation"] + ds_dict = MsDataset.load(params["data_dir"]) + kwargs = dict( + model=params["model"], + model_revision=params["model_revision"], + data_dir=ds_dict, + dataset_type=params["dataset_type"], + work_dir=params["output_dir"], + batch_bins=params["batch_bins"], + max_epoch=params["max_epoch"], + lr=params["lr"]) + trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) + trainer.train() + + +if __name__ == '__main__': + params = {} + params["output_dir"] = "./checkpoint" + params["data_dir"] = "./data" + params["batch_bins"] = 2000 + params["dataset_type"] = "small" + params["max_epoch"] = 50 + params["lr"] = 0.00005 + params["model"] = "damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch" + params["model_revision"] = None + modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py new file mode 100644 index 000000000..cfd869f04 --- /dev/null +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py @@ -0,0 +1,13 @@ +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks + +if __name__ == "__main__": + audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_my.wav" + output_dir = "./results" + inference_pipline = pipeline( + task=Tasks.auto_speech_recognition, + model="damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch", + output_dir=output_dir, + ) + rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) + print(rec_result) diff --git 
a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py new file mode 100644 index 000000000..5e313e533 --- /dev/null +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py @@ -0,0 +1,35 @@ +import os +from modelscope.metainfo import Trainers +from modelscope.trainers import build_trainer +from funasr.datasets.ms_dataset import MsDataset + + +def modelscope_finetune(params): + if not os.path.exists(params["output_dir"]): + os.makedirs(params["output_dir"], exist_ok=True) + # dataset split ["train", "validation"] + ds_dict = MsDataset.load(params["data_dir"]) + kwargs = dict( + model=params["model"], + model_revision=params["model_revision"], + data_dir=ds_dict, + dataset_type=params["dataset_type"], + work_dir=params["output_dir"], + batch_bins=params["batch_bins"], + max_epoch=params["max_epoch"], + lr=params["lr"]) + trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) + trainer.train() + + +if __name__ == '__main__': + params = {} + params["output_dir"] = "./checkpoint" + params["data_dir"] = "./data" + params["batch_bins"] = 2000 + params["dataset_type"] = "small" + params["max_epoch"] = 50 + params["lr"] = 0.00005 + params["model"] = "damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch" + params["model_revision"] = None + modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py new file mode 100644 index 000000000..e8c5524f0 --- /dev/null +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py @@ -0,0 +1,13 @@ +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks + +if __name__ == "__main__": + audio_in = 
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ur.wav" + output_dir = "./results" + inference_pipline = pipeline( + task=Tasks.auto_speech_recognition, + model="damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch", + output_dir=output_dir, + ) + rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) + print(rec_result) From 71d466e7451435eefb604f22576aba04bd39e285 Mon Sep 17 00:00:00 2001 From: "shixian.shi" Date: Mon, 13 Mar 2023 19:47:42 +0800 Subject: [PATCH 02/37] update AverageShiftCalculator in utils --- funasr/utils/timestamp_tools.py | 139 ++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py index f5a238ea9..73f0c7afa 100644 --- a/funasr/utils/timestamp_tools.py +++ b/funasr/utils/timestamp_tools.py @@ -1,6 +1,10 @@ +from scipy.fftpack import shift import torch import copy +import codecs import logging +import edit_distance +import argparse import numpy as np from typing import Any, List, Tuple, Union @@ -121,4 +125,139 @@ def time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocess return res +class AverageShiftCalculator(): + def __init__(self): + logging.warning("Calculating average shift.") + def __call__(self, file1, file2): + uttid_list1, ts_dict1 = self.read_timestamps(file1) + uttid_list2, ts_dict2 = self.read_timestamps(file2) + uttid_intersection = self._intersection(uttid_list1, uttid_list2) + res = self.as_cal(uttid_intersection, ts_dict1, ts_dict2) + logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8])) + logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift_uttid)) + + def _intersection(list1, list2): + set1 = set(list1) + set2 = set(list2) + if set1 == set2: + logging.warning("Uttid same checked.") + return set1 + itsc = list(set1 & set2) + logging.warning("Uttid differs: file1 {}, 
file2 {}, lines same {}.".format(len(list1), len(list2), len(itsc))) + return itsc + + def read_timestamps(self, file): + # read timestamps file in standard format + uttid_list = [] + ts_dict = {} + with codecs.open(file, 'r') as fin: + for line in fin.readlines(): + text = '' + ts_list = [] + line = line.rstrip() + uttid = line.split()[0] + uttid_list.append(uttid) + body = " ".join(line.split()[1:]) + for pd in body.split(';'): + if not len(pd): continue + # pdb.set_trace() + char, start, end = pd.lstrip(" ").split(' ') + text += char + ',' + ts_list.append((float(start), float(end))) + # ts_lists.append(ts_list) + ts_dict[uttid] = (text[:-1], ts_list) + logging.warning("File {} read done.".format(file)) + return uttid_list, ts_dict + + def _shift(self, filtered_timestamp_list1, filtered_timestamp_list2): + for fts1, fts2 in zip(filtered_timestamp_list1, filtered_timestamp_list2): + shift_time = abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1]) + num_tokens = len(filtered_timestamp_list1) + return shift_time, num_tokens + + def as_cal(self, uttid_list, ts_dict1, ts_dict2): + # calculate average shift between timestamp1 and timestamp2 + # when characters differ, use edit distance alignment + # and calculate the error between the same characters + self._accumlated_shift = 0 + self._accumlated_tokens = 0 + self.max_shift = 0 + self.max_shift_uttid = None + for uttid in uttid_list: + (t1, ts1) = ts_dict1[uttid] + (t2, ts2) = ts_dict2[uttid] + _align, _align2, _align3 = [], [], [] + fts1, fts2 = [], [] + _t1, _t2 = [], [] + sm = edit_distance.SequenceMatcher(t1.split(','), t2.split(',')) + s = sm.get_opcodes() + for j in range(len(s)): + if s[j][0] == "replace" or s[j][0] == "insert": + _align.append(0) + if s[j][0] == "replace" or s[j][0] == "delete": + _align3.append(0) + elif s[j][0] == "equal": + _align.append(1) + _align3.append(1) + else: + continue + # use s to index t2 + for a, ts , t in zip(_align, ts2, t2.split(',')): + if a: + fts2.append(ts) + _t2.append(t) 
+ sm2 = edit_distance.SequenceMatcher(t2.split(','), t1.split(',')) + s = sm2.get_opcodes() + for j in range(len(s)): + if s[j][0] == "replace" or s[j][0] == "insert": + _align2.append(0) + elif s[j][0] == "equal": + _align2.append(1) + else: + continue + # use s2 tp index t1 + for a, ts, t in zip(_align3, ts1, t1.split(',')): + if a: + fts1.append(ts) + _t1.append(t) + if len(fts1) == len(fts2): + shift_time, num_tokens = self._shift(fts1, fts2) + self._accumlated_shift += shift_time + self._accumlated_tokens += num_tokens + if shift_time/num_tokens > self.max_shift: + self.max_shift = shift_time/num_tokens + self.max_shift_uttid = uttid + else: + logging.warning("length mismatch") + return self._accumlated_shift / self._accumlated_tokens + + +SUPPORTED_MODES = ['cal_aas'] + + +def main(args): + if args.mode == 'cal_aas': + asc = AverageShiftCalculator() + asc(args.input, args.input2) + else: + logging.error("Mode {} not in SUPPORTED_MODES: {}.".format(args.mode, SUPPORTED_MODES)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='timestamp tools') + parser.add_argument('--mode', + default=None, + type=str, + choices=SUPPORTED_MODES, + help='timestamp related toolbox') + parser.add_argument('--input', default=None, type=str, help='input file path') + parser.add_argument('--output', default=None, type=str, help='output file name') + parser.add_argument('--input2', default=None, type=str, help='input2 file path') + parser.add_argument('--kaldi-ts-type', + default='v2', + type=str, + choices=['v0', 'v1', 'v2'], + help='kaldi timestamp to write') + args = parser.parse_args() + main(args) From 4b16316d4917f1c8da434218949343ccf1a817c9 Mon Sep 17 00:00:00 2001 From: "shixian.shi" Date: Mon, 13 Mar 2023 19:53:33 +0800 Subject: [PATCH 03/37] bug fic --- funasr/utils/timestamp_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py index 73f0c7afa..27095a65a 
100644 --- a/funasr/utils/timestamp_tools.py +++ b/funasr/utils/timestamp_tools.py @@ -136,7 +136,7 @@ class AverageShiftCalculator(): logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8])) logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift_uttid)) - def _intersection(list1, list2): + def _intersection(self, list1, list2): set1 = set(list1) set2 = set(list2) if set1 == set2: From 9c21bbb96b95980ac059df991e442c437a69c828 Mon Sep 17 00:00:00 2001 From: "shixian.shi" Date: Mon, 13 Mar 2023 19:55:47 +0800 Subject: [PATCH 04/37] bug fix --- funasr/utils/timestamp_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py index 27095a65a..7ba3e0b42 100644 --- a/funasr/utils/timestamp_tools.py +++ b/funasr/utils/timestamp_tools.py @@ -134,7 +134,7 @@ class AverageShiftCalculator(): uttid_intersection = self._intersection(uttid_list1, uttid_list2) res = self.as_cal(uttid_intersection, ts_dict1, ts_dict2) logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8])) - logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift_uttid)) + logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift, self.max_shift_uttid)) def _intersection(self, list1, list2): set1 = set(list1) From 0b06794fde09bedfb75ee85504148cf3a4707e21 Mon Sep 17 00:00:00 2001 From: "shixian.shi" Date: Mon, 13 Mar 2023 20:14:41 +0800 Subject: [PATCH 05/37] bug fix --- funasr/utils/timestamp_tools.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py index 7ba3e0b42..2bccd50e6 100644 --- a/funasr/utils/timestamp_tools.py +++ b/funasr/utils/timestamp_tools.py @@ -170,8 +170,9 @@ class AverageShiftCalculator(): return uttid_list, ts_dict def _shift(self, filtered_timestamp_list1, 
filtered_timestamp_list2): + shift_time = 0 for fts1, fts2 in zip(filtered_timestamp_list1, filtered_timestamp_list2): - shift_time = abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1]) + shift_time += abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1]) num_tokens = len(filtered_timestamp_list1) return shift_time, num_tokens From 4d2bf9fe3cc385b441e94c3000b34a44cac8a8db Mon Sep 17 00:00:00 2001 From: speech_asr Date: Tue, 14 Mar 2023 17:13:25 +0800 Subject: [PATCH 06/37] update --- funasr/models/frontend/wav_frontend.py | 51 ++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py index 445efca24..c4b79104b 100644 --- a/funasr/models/frontend/wav_frontend.py +++ b/funasr/models/frontend/wav_frontend.py @@ -7,6 +7,7 @@ import numpy as np import torch import torchaudio.compliance.kaldi as kaldi from funasr.models.frontend.abs_frontend import AbsFrontend +import funasr.models.frontend.eend_ola_feature as eend_ola_feature from typeguard import check_argument_types from torch.nn.utils.rnn import pad_sequence @@ -444,3 +445,53 @@ class WavFrontendOnline(AbsFrontend): self.reserve_waveforms = None self.input_cache = None self.lfr_splice_cache = [] + + +class WavFrontendMel23(AbsFrontend): + """Conventional frontend structure for ASR. 
+ """ + + def __init__( + self, + fs: int = 16000, + frame_length: int = 25, + frame_shift: int = 10, + lfr_m: int = 1, + lfr_n: int = 1, + ): + assert check_argument_types() + super().__init__() + self.fs = fs + self.frame_length = frame_length + self.frame_shift = frame_shift + self.lfr_m = lfr_m + self.lfr_n = lfr_n + + def output_size(self) -> int: + return self.n_mels * self.lfr_m + + def forward( + self, + input: torch.Tensor, + input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + batch_size = input.size(0) + feats = [] + feats_lens = [] + for i in range(batch_size): + waveform_length = input_lengths[i] + waveform = input[i][:waveform_length] + waveform = waveform.unsqueeze(0).numpy() + mat = eend_ola_feature.stft(waveform, self.frame_length, self.frame_shift) + mat = eend_ola_feature.transform(mat) + mat = mat.splice(mat, context_size=self.lfr_m) + mat = mat[::self.lfr_n] + mat = torch.from_numpy(mat) + feat_length = mat.size(0) + feats.append(mat) + feats_lens.append(feat_length) + + feats_lens = torch.as_tensor(feats_lens) + feats_pad = pad_sequence(feats, + batch_first=True, + padding_value=0.0) + return feats_pad, feats_lens \ No newline at end of file From 46d9cc0b374470ca03339d63a38d213eb4fd889e Mon Sep 17 00:00:00 2001 From: speech_asr Date: Tue, 14 Mar 2023 22:54:24 +0800 Subject: [PATCH 07/37] update --- .../infer.py | 8 ++++++++ funasr/bin/diar_inference_launch.py | 3 +++ 2 files changed, 11 insertions(+) create mode 100644 egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py new file mode 100644 index 000000000..fa4e8bf04 --- /dev/null +++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py @@ -0,0 +1,8 @@ +from modelscope.pipelines import pipeline +from 
modelscope.utils.constant import Tasks + +inference_diar_pipline = pipeline( + task=Tasks.speaker_diarization, + model='damo/speech_diarization_eend-ola-en-us-callhome-8k', +) +results = inference_diar_pipline(audio_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav") \ No newline at end of file diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py index 7738f4f4f..70bb947b4 100755 --- a/funasr/bin/diar_inference_launch.py +++ b/funasr/bin/diar_inference_launch.py @@ -142,6 +142,9 @@ def inference_launch(mode, **kwargs): else: kwargs["param_dict"] = param_dict return inference_modelscope(mode=mode, **kwargs) + elif mode == "eend-ola": + from funasr.bin.eend_ola_inference import inference_modelscope + return inference_modelscope(mode=mode, **kwargs) else: logging.info("Unknown decoding mode: {}".format(mode)) return None From 1e4eba6a72ea97d9a9e733df3e3b1eb86e4fd44d Mon Sep 17 00:00:00 2001 From: speech_asr Date: Tue, 14 Mar 2023 23:22:31 +0800 Subject: [PATCH 08/37] update --- .../speech_diarization_eend-ola-en-us-callhome-8k/infer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py index fa4e8bf04..75f9c7346 100644 --- a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py +++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py @@ -4,5 +4,6 @@ from modelscope.utils.constant import Tasks inference_diar_pipline = pipeline( task=Tasks.speaker_diarization, model='damo/speech_diarization_eend-ola-en-us-callhome-8k', + model_revision="v1.0.0", ) results = inference_diar_pipline(audio_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav") \ No newline at end of file From f59a72d24e917fb2e9560fa646ae80285dba6c95 Mon Sep 
17 00:00:00 2001 From: "shixian.shi" Date: Wed, 15 Mar 2023 10:21:32 +0800 Subject: [PATCH 09/37] release timestasmp related tools --- funasr/utils/timestamp_tools.py | 50 +++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py index 2bccd50e6..09c3becfc 100644 --- a/funasr/utils/timestamp_tools.py +++ b/funasr/utils/timestamp_tools.py @@ -1,3 +1,4 @@ +from pydoc import TextRepr from scipy.fftpack import shift import torch import copy @@ -5,6 +6,7 @@ import codecs import logging import edit_distance import argparse +import pdb import numpy as np from typing import Any, List, Tuple, Union @@ -13,7 +15,8 @@ def ts_prediction_lfr6_standard(us_alphas, us_peaks, char_list, vad_offset=0.0, - force_time_shift=-1.5 + force_time_shift=-1.5, + sil_in_str=True ): if not len(char_list): return [] @@ -66,6 +69,8 @@ def ts_prediction_lfr6_standard(us_alphas, timestamp_list[i][1] = timestamp_list[i][1] + vad_offset / 1000.0 res_txt = "" for char, timestamp in zip(new_char_list, timestamp_list): + #if char != '': + if not sil_in_str and char == '': continue res_txt += "{} {} {};".format(char, str(timestamp[0]+0.0005)[:5], str(timestamp[1]+0.0005)[:5]) res = [] for char, timestamp in zip(new_char_list, timestamp_list): @@ -233,13 +238,54 @@ class AverageShiftCalculator(): return self._accumlated_shift / self._accumlated_tokens -SUPPORTED_MODES = ['cal_aas'] +def convert_external_alphas(alphas_file, text_file, output_file): + from funasr.models.predictor.cif import cif_wo_hidden + with open(alphas_file, 'r') as f1, open(text_file, 'r') as f2, open(output_file, 'w') as f3: + for line1, line2 in zip(f1.readlines(), f2.readlines()): + line1 = line1.rstrip() + line2 = line2.rstrip() + assert line1.split()[0] == line2.split()[0] + uttid = line1.split()[0] + alphas = [float(i) for i in line1.split()[1:]] + new_alphas = np.array(remove_chunk_padding(alphas)) + new_alphas[-1] += 1e-4 + 
text = line2.split()[1:] + if len(text) + 1 != int(new_alphas.sum()): + # force resize + new_alphas *= (len(text) + 1) / int(new_alphas.sum()) + peaks = cif_wo_hidden(torch.Tensor(new_alphas).unsqueeze(0), 1.0-1e-4) + if " " in text: + text = text.split() + else: + text = [i for i in text] + res_str, _ = ts_prediction_lfr6_standard(new_alphas, peaks[0], text, + force_time_shift=-7.0, + sil_in_str=False) + f3.write("{} {}\n".format(uttid, res_str)) + + +def remove_chunk_padding(alphas): + # remove the padding part in alphas if using chunk paraformer for GPU + START_ZERO = 45 + MID_ZERO = 75 + REAL_FRAMES = 360 # for chunk based encoder 10-120-10 and fsmn padding 5 + alphas = alphas[START_ZERO:] # remove the padding at beginning + new_alphas = [] + while True: + new_alphas = new_alphas + alphas[:REAL_FRAMES] + alphas = alphas[REAL_FRAMES+MID_ZERO:] + if len(alphas) < REAL_FRAMES: break + return new_alphas + +SUPPORTED_MODES = ['cal_aas', 'read_ext_alphas'] def main(args): if args.mode == 'cal_aas': asc = AverageShiftCalculator() asc(args.input, args.input2) + elif args.mode == 'read_ext_alphas': + convert_external_alphas(args.input, args.input2, args.output) else: logging.error("Mode {} not in SUPPORTED_MODES: {}.".format(args.mode, SUPPORTED_MODES)) From f63a72c52eada7c25fde2538f290ef1420c193fb Mon Sep 17 00:00:00 2001 From: "shixian.shi" Date: Wed, 15 Mar 2023 10:22:30 +0800 Subject: [PATCH 10/37] update tools --- funasr/utils/timestamp_tools.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py index 09c3becfc..423110cdc 100644 --- a/funasr/utils/timestamp_tools.py +++ b/funasr/utils/timestamp_tools.py @@ -1,5 +1,3 @@ -from pydoc import TextRepr -from scipy.fftpack import shift import torch import copy import codecs From 4d60eb6ada430098654cf58faf5b0758388e366a Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 11:12:51 +0800 Subject: [PATCH 11/37] update --- 
funasr/bin/eend_ola_inference.py | 1 - 1 file changed, 1 deletion(-) diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py index d65895f30..1b3622005 100755 --- a/funasr/bin/eend_ola_inference.py +++ b/funasr/bin/eend_ola_inference.py @@ -179,7 +179,6 @@ def inference_modelscope( diar_model_file=diar_model_file, device=device, dtype=dtype, - streaming=streaming, ) logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs)) speech2diar = Speech2Diarization.from_pretrained( From 6165c139182c31252e9d69e95837546637f9e2da Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 11:15:00 +0800 Subject: [PATCH 12/37] update --- funasr/models/frontend/wav_frontend.py | 38 ++++++++++++++++---------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py index c4b79104b..f61d7dd17 100644 --- a/funasr/models/frontend/wav_frontend.py +++ b/funasr/models/frontend/wav_frontend.py @@ -1,15 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. # Part of the implementation is borrowed from espnet/espnet. 
-from abc import ABC from typing import Tuple import numpy as np import torch import torchaudio.compliance.kaldi as kaldi -from funasr.models.frontend.abs_frontend import AbsFrontend -import funasr.models.frontend.eend_ola_feature as eend_ola_feature -from typeguard import check_argument_types from torch.nn.utils.rnn import pad_sequence +from typeguard import check_argument_types + +import funasr.models.frontend.eend_ola_feature as eend_ola_feature +from funasr.models.frontend.abs_frontend import AbsFrontend def load_cmvn(cmvn_file): @@ -276,7 +276,8 @@ class WavFrontendOnline(AbsFrontend): # inputs tensor has catted the cache tensor # def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, inputs_lfr_cache: torch.Tensor = None, # is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]: - def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]: + def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[ + torch.Tensor, torch.Tensor, int]: """ Apply lfr with data """ @@ -377,7 +378,8 @@ class WavFrontendOnline(AbsFrontend): if self.lfr_m != 1 or self.lfr_n != 1: # update self.lfr_splice_cache in self.apply_lfr # mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, self.lfr_splice_cache[i], - mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, is_final) + mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, + is_final) if self.cmvn_file is not None: mat = self.apply_cmvn(mat, self.cmvn) feat_length = mat.size(0) @@ -399,9 +401,10 @@ class WavFrontendOnline(AbsFrontend): assert batch_size == 1, 'we support to extract feature online only when the batch size is equal to 1 now' waveforms, feats, feats_lengths = self.forward_fbank(input, input_lengths) # input shape: B T D if feats.shape[0]: - #if self.reserve_waveforms 
is None and self.lfr_m > 1: + # if self.reserve_waveforms is None and self.lfr_m > 1: # self.reserve_waveforms = waveforms[:, :(self.lfr_m - 1) // 2 * self.frame_shift_sample_length] - self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat((self.reserve_waveforms, waveforms), dim=1) + self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat( + (self.reserve_waveforms, waveforms), dim=1) if not self.lfr_splice_cache: # 初始化splice_cache for i in range(batch_size): self.lfr_splice_cache.append(feats[i][0, :].unsqueeze(dim=0).repeat((self.lfr_m - 1) // 2, 1)) @@ -410,7 +413,8 @@ class WavFrontendOnline(AbsFrontend): lfr_splice_cache_tensor = torch.stack(self.lfr_splice_cache) # B T D feats = torch.cat((lfr_splice_cache_tensor, feats), dim=1) feats_lengths += lfr_splice_cache_tensor[0].shape[0] - frame_from_waveforms = int((self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1) + frame_from_waveforms = int( + (self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1) minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0 feats, feats_lengths, lfr_splice_frame_idxs = self.forward_lfr_cmvn(feats, feats_lengths, is_final) if self.lfr_m == 1: @@ -419,19 +423,22 @@ class WavFrontendOnline(AbsFrontend): reserve_frame_idx = lfr_splice_frame_idxs[0] - minus_frame # print('reserve_frame_idx: ' + str(reserve_frame_idx)) # print('frame_frame: ' + str(frame_from_waveforms)) - self.reserve_waveforms = self.waveforms[:, reserve_frame_idx * self.frame_shift_sample_length:frame_from_waveforms * self.frame_shift_sample_length] - sample_length = (frame_from_waveforms - 1) * self.frame_shift_sample_length + self.frame_sample_length + self.reserve_waveforms = self.waveforms[:, + reserve_frame_idx * self.frame_shift_sample_length:frame_from_waveforms * self.frame_shift_sample_length] + sample_length = ( + frame_from_waveforms - 1) * self.frame_shift_sample_length + 
self.frame_sample_length self.waveforms = self.waveforms[:, :sample_length] else: # update self.reserve_waveforms and self.lfr_splice_cache - self.reserve_waveforms = self.waveforms[:, :-(self.frame_sample_length - self.frame_shift_sample_length)] + self.reserve_waveforms = self.waveforms[:, + :-(self.frame_sample_length - self.frame_shift_sample_length)] for i in range(batch_size): self.lfr_splice_cache[i] = torch.cat((self.lfr_splice_cache[i], feats[i]), dim=0) return torch.empty(0), feats_lengths else: if is_final: self.waveforms = waveforms if self.reserve_waveforms is None else self.reserve_waveforms - feats = torch.stack(self.lfr_splice_cache) + feats = torch.stack(self.lfr_splice_cache) feats_lengths = torch.zeros(batch_size, dtype=torch.int) + feats.shape[1] feats, feats_lengths, _ = self.forward_lfr_cmvn(feats, feats_lengths, is_final) if is_final: @@ -466,9 +473,10 @@ class WavFrontendMel23(AbsFrontend): self.frame_shift = frame_shift self.lfr_m = lfr_m self.lfr_n = lfr_n + self.n_mels = 23 def output_size(self) -> int: - return self.n_mels * self.lfr_m + return self.n_mels * (2 * self.lfr_m + 1) def forward( self, @@ -494,4 +502,4 @@ class WavFrontendMel23(AbsFrontend): feats_pad = pad_sequence(feats, batch_first=True, padding_value=0.0) - return feats_pad, feats_lens \ No newline at end of file + return feats_pad, feats_lens From ab6d93b4eb3605738ca4af440dd1b296458fe485 Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 11:19:07 +0800 Subject: [PATCH 13/37] update --- funasr/tasks/diar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/tasks/diar.py b/funasr/tasks/diar.py index ae7ee9b40..67d0c59c5 100644 --- a/funasr/tasks/diar.py +++ b/funasr/tasks/diar.py @@ -823,7 +823,7 @@ class EENDOLADiarTask(AbsTask): # 2. Encoder encoder_class = encoder_choices.get_class(args.encoder) - encoder = encoder_class(input_size=input_size, **args.encoder_conf) + encoder = encoder_class(**args.encoder_conf) # 3. 
EncoderDecoderAttractor encoder_decoder_attractor_class = encoder_decoder_attractor_choices.get_class(args.encoder_decoder_attractor) From 6fe0d840f7908dd1ab74de839987819234890725 Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 11:23:29 +0800 Subject: [PATCH 14/37] update --- funasr/models/e2e_diar_eend_ola.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py index f589269c5..6835a6409 100644 --- a/funasr/models/e2e_diar_eend_ola.py +++ b/funasr/models/e2e_diar_eend_ola.py @@ -240,3 +240,6 @@ class DiarEENDOLAModel(AbsESPnetModel): torch.float32) decisions = decisions[:, :n_speaker] return decisions + + def collect_feats(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]: + pass \ No newline at end of file From 26b81480a88cc2868639c5160989394199acdcdd Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 11:35:18 +0800 Subject: [PATCH 15/37] update --- funasr/models/e2e_diar_eend_ola.py | 16 ++++++++-------- tests/test_asr_inference_pipeline.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py index 6835a6409..f3e34bc0b 100644 --- a/funasr/models/e2e_diar_eend_ola.py +++ b/funasr/models/e2e_diar_eend_ola.py @@ -52,15 +52,15 @@ class DiarEENDOLAModel(AbsESPnetModel): super().__init__() self.frontend = frontend - self.encoder = encoder - self.encoder_decoder_attractor = encoder_decoder_attractor + self.enc = encoder + self.eda = encoder_decoder_attractor self.attractor_loss_weight = attractor_loss_weight self.max_n_speaker = max_n_speaker if mapping_dict is None: mapping_dict = generate_mapping_dict(max_speaker_num=self.max_n_speaker) self.mapping_dict = mapping_dict # PostNet - self.PostNet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True) + self.postnet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True) self.output_layer = nn.Linear(n_units, 
mapping_dict['oov'] + 1) def forward_encoder(self, xs, ilens): @@ -68,7 +68,7 @@ class DiarEENDOLAModel(AbsESPnetModel): pad_shape = xs.shape xs_mask = [torch.ones(ilen).to(xs.device) for ilen in ilens] xs_mask = torch.nn.utils.rnn.pad_sequence(xs_mask, batch_first=True, padding_value=0).unsqueeze(-2) - emb = self.encoder(xs, xs_mask) + emb = self.enc(xs, xs_mask) emb = torch.split(emb.view(pad_shape[0], pad_shape[1], -1), 1, dim=0) emb = [e[0][:ilen] for e, ilen in zip(emb, ilens)] return emb @@ -77,7 +77,7 @@ class DiarEENDOLAModel(AbsESPnetModel): maxlen = torch.max(ilens).to(torch.int).item() logits = nn.utils.rnn.pad_sequence(logits, batch_first=True, padding_value=-1) logits = nn.utils.rnn.pack_padded_sequence(logits, ilens, batch_first=True, enforce_sorted=False) - outputs, (_, _) = self.PostNet(logits) + outputs, (_, _) = self.postnet(logits) outputs = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=-1, total_length=maxlen)[0] outputs = [output[:ilens[i].to(torch.int).item()] for i, output in enumerate(outputs)] outputs = [self.output_layer(output) for output in outputs] @@ -112,7 +112,7 @@ class DiarEENDOLAModel(AbsESPnetModel): text = text[:, : text_lengths.max()] # 1. 
Encoder - encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) + encoder_out, encoder_out_lens = self.enc(speech, speech_lengths) intermediate_outs = None if isinstance(encoder_out, tuple): intermediate_outs = encoder_out[1] @@ -198,10 +198,10 @@ class DiarEENDOLAModel(AbsESPnetModel): orders = [np.arange(e.shape[0]) for e in emb] for order in orders: np.random.shuffle(order) - attractors, probs = self.encoder_decoder_attractor.estimate( + attractors, probs = self.eda.estimate( [e[torch.from_numpy(order).to(torch.long).to(speech[0].device)] for e, order in zip(emb, orders)]) else: - attractors, probs = self.encoder_decoder_attractor.estimate(emb) + attractors, probs = self.eda.estimate(emb) attractors_active = [] for p, att, e in zip(probs, attractors, emb): if n_speakers and n_speakers >= 0: diff --git a/tests/test_asr_inference_pipeline.py b/tests/test_asr_inference_pipeline.py index 70dbe8952..32b8af5ec 100644 --- a/tests/test_asr_inference_pipeline.py +++ b/tests/test_asr_inference_pipeline.py @@ -451,7 +451,7 @@ class TestUniasrInferencePipelines(unittest.TestCase): def test_uniasr_2pass_zhcn_16k_common_vocab8358_offline(self): inference_pipeline = pipeline( - task=Tasks., + task=Tasks.auto_speech_recognition, model='damo/speech_UniASauto_speech_recognitionR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline') rec_result = inference_pipeline( audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav', From 36e9d36997a7ed21080997d99c13ffbc5bdda279 Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 12:15:49 +0800 Subject: [PATCH 16/37] update --- .../infer.py | 2 +- funasr/tasks/diar.py | 62 +++++++++---------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py index 75f9c7346..dfcb8e649 100644 
--- a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py +++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py @@ -6,4 +6,4 @@ inference_diar_pipline = pipeline( model='damo/speech_diarization_eend-ola-en-us-callhome-8k', model_revision="v1.0.0", ) -results = inference_diar_pipline(audio_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav") \ No newline at end of file +results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav"]) \ No newline at end of file diff --git a/funasr/tasks/diar.py b/funasr/tasks/diar.py index 67d0c59c5..6204cb7d2 100644 --- a/funasr/tasks/diar.py +++ b/funasr/tasks/diar.py @@ -750,37 +750,37 @@ class EENDOLADiarTask(AbsTask): cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: assert check_argument_types() - if args.use_preprocessor: - retval = CommonPreprocessor( - train=train, - token_type=args.token_type, - token_list=args.token_list, - bpemodel=None, - non_linguistic_symbols=None, - text_cleaner=None, - g2p_type=None, - split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False, - seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None, - # NOTE(kamo): Check attribute existence for backward compatibility - rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None, - rir_apply_prob=args.rir_apply_prob - if hasattr(args, "rir_apply_prob") - else 1.0, - noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None, - noise_apply_prob=args.noise_apply_prob - if hasattr(args, "noise_apply_prob") - else 1.0, - noise_db_range=args.noise_db_range - if hasattr(args, "noise_db_range") - else "13_15", - speech_volume_normalize=args.speech_volume_normalize - if hasattr(args, "rir_scp") - else None, - ) - else: - retval = None - assert 
check_return_type(retval) - return retval + # if args.use_preprocessor: + # retval = CommonPreprocessor( + # train=train, + # token_type=args.token_type, + # token_list=args.token_list, + # bpemodel=None, + # non_linguistic_symbols=None, + # text_cleaner=None, + # g2p_type=None, + # split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False, + # seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None, + # # NOTE(kamo): Check attribute existence for backward compatibility + # rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None, + # rir_apply_prob=args.rir_apply_prob + # if hasattr(args, "rir_apply_prob") + # else 1.0, + # noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None, + # noise_apply_prob=args.noise_apply_prob + # if hasattr(args, "noise_apply_prob") + # else 1.0, + # noise_db_range=args.noise_db_range + # if hasattr(args, "noise_db_range") + # else "13_15", + # speech_volume_normalize=args.speech_volume_normalize + # if hasattr(args, "rir_scp") + # else None, + # ) + # else: + # retval = None + # assert check_return_type(retval) + return None @classmethod def required_data_names( From 3f2981bb8da44881460c8b290e62a3c6fce998d3 Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 14:27:44 +0800 Subject: [PATCH 17/37] update --- funasr/tasks/diar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/funasr/tasks/diar.py b/funasr/tasks/diar.py index 6204cb7d2..696291526 100644 --- a/funasr/tasks/diar.py +++ b/funasr/tasks/diar.py @@ -787,10 +787,10 @@ class EENDOLADiarTask(AbsTask): cls, train: bool = True, inference: bool = False ) -> Tuple[str, ...]: if not inference: - retval = ("speech", "profile", "binary_labels") + retval = ("speech", ) else: # Recognition mode - retval = ("speech") + retval = ("speech", ) return retval @classmethod From 2cfe010d7b0f17877a271cc401e2c2f8f8d4c42c Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 14:42:32 +0800 
Subject: [PATCH 18/37] update --- funasr/bin/eend_ola_inference.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py index 1b3622005..96e7516e3 100755 --- a/funasr/bin/eend_ola_inference.py +++ b/funasr/bin/eend_ola_inference.py @@ -27,6 +27,8 @@ from funasr.utils.types import str2bool from funasr.utils.types import str2triple_str from funasr.utils.types import str_or_none +from modelscope.utils.logger import get_logger +logger = get_logger() class Speech2Diarization: """Speech2Diarlization class @@ -209,6 +211,7 @@ def inference_modelscope( if isinstance(raw_inputs, torch.Tensor): raw_inputs = raw_inputs.numpy() data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] + logger.info(data_path_and_name_and_type) loader = EENDOLADiarTask.build_streaming_iterator( data_path_and_name_and_type, dtype=dtype, @@ -228,6 +231,8 @@ def inference_modelscope( output_writer = open("{}/result.txt".format(output_path), "w") result_list = [] for keys, batch in loader: + logger.info("keys: {}".format(keys)) + logger.info("batch: {}".format(batch)) assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) From 85c1848286e206195a94993b49e8c32117cadc90 Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 15:17:27 +0800 Subject: [PATCH 19/37] update --- .../unit_test.py | 7 +++---- funasr/bin/eend_ola_inference.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py index 3cb31cfb7..5f4563dbc 100644 --- a/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py +++ 
b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py @@ -14,13 +14,12 @@ inference_diar_pipline = pipeline( ) # 以 audio_list 作为输入,其中第一个音频为待检测语音,后面的音频为不同说话人的声纹注册语音 -audio_list = [[ +audio_list = [ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav", "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav", "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B.wav", "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B1.wav" -]] +] results = inference_diar_pipline(audio_in=audio_list) -for rst in results: - print(rst["value"]) +print(results) diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py index 96e7516e3..2ff7eeff2 100755 --- a/funasr/bin/eend_ola_inference.py +++ b/funasr/bin/eend_ola_inference.py @@ -210,7 +210,7 @@ def inference_modelscope( if data_path_and_name_and_type is None and raw_inputs is not None: if isinstance(raw_inputs, torch.Tensor): raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] + data_path_and_name_and_type = [raw_inputs[0], "speech", "bytes"] logger.info(data_path_and_name_and_type) loader = EENDOLADiarTask.build_streaming_iterator( data_path_and_name_and_type, From 2f933cb101e56c3c12c76c38d368b94111b52f64 Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 15:23:08 +0800 Subject: [PATCH 20/37] update --- funasr/bin/eend_ola_inference.py | 4 ---- funasr/models/e2e_diar_eend_ola.py | 2 -- 2 files changed, 6 deletions(-) diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py index 2ff7eeff2..fbcfc7d97 100755 --- a/funasr/bin/eend_ola_inference.py +++ b/funasr/bin/eend_ola_inference.py @@ -27,9 +27,6 @@ from funasr.utils.types import str2bool from funasr.utils.types import str2triple_str from funasr.utils.types import str_or_none -from modelscope.utils.logger import get_logger 
-logger = get_logger() - class Speech2Diarization: """Speech2Diarlization class @@ -211,7 +208,6 @@ def inference_modelscope( if isinstance(raw_inputs, torch.Tensor): raw_inputs = raw_inputs.numpy() data_path_and_name_and_type = [raw_inputs[0], "speech", "bytes"] - logger.info(data_path_and_name_and_type) loader = EENDOLADiarTask.build_streaming_iterator( data_path_and_name_and_type, dtype=dtype, diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py index f3e34bc0b..79cb61496 100644 --- a/funasr/models/e2e_diar_eend_ola.py +++ b/funasr/models/e2e_diar_eend_ola.py @@ -190,8 +190,6 @@ class DiarEENDOLAModel(AbsESPnetModel): shuffle: bool = True, threshold: float = 0.5, **kwargs): - if self.frontend is not None: - speech = self.frontend(speech) speech = [s[:s_len] for s, s_len in zip(speech, speech_lengths)] emb = self.forward_encoder(speech, speech_lengths) if shuffle: From e9f6703350fc6616b06c0e60944f6359a329e214 Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 15:29:31 +0800 Subject: [PATCH 21/37] update --- funasr/bin/eend_ola_inference.py | 2 -- funasr/models/frontend/wav_frontend.py | 5 +++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py index fbcfc7d97..2887b3754 100755 --- a/funasr/bin/eend_ola_inference.py +++ b/funasr/bin/eend_ola_inference.py @@ -227,8 +227,6 @@ def inference_modelscope( output_writer = open("{}/result.txt".format(output_path), "w") result_list = [] for keys, batch in loader: - logger.info("keys: {}".format(keys)) - logger.info("batch: {}".format(batch)) assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py index f61d7dd17..8e17102ca 100644 --- a/funasr/models/frontend/wav_frontend.py +++ b/funasr/models/frontend/wav_frontend.py @@ -11,6 +11,8 
@@ from typeguard import check_argument_types import funasr.models.frontend.eend_ola_feature as eend_ola_feature from funasr.models.frontend.abs_frontend import AbsFrontend +from modelscope.utils.logger import get_logger +logger = get_logger() def load_cmvn(cmvn_file): with open(cmvn_file, 'r', encoding='utf-8') as f: @@ -485,6 +487,9 @@ class WavFrontendMel23(AbsFrontend): batch_size = input.size(0) feats = [] feats_lens = [] + logger.info("batch_size: {}".format(batch_size)) + logger.info("input: {}".format(input)) + logger.info("input_lengths: {}".format(input_lengths)) for i in range(batch_size): waveform_length = input_lengths[i] waveform = input[i][:waveform_length] From f691014c8a97f2ea27dc72c9d3b374bdd05aa6c9 Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 15:43:18 +0800 Subject: [PATCH 22/37] update --- funasr/models/frontend/wav_frontend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py index 8e17102ca..ca2217596 100644 --- a/funasr/models/frontend/wav_frontend.py +++ b/funasr/models/frontend/wav_frontend.py @@ -493,10 +493,10 @@ class WavFrontendMel23(AbsFrontend): for i in range(batch_size): waveform_length = input_lengths[i] waveform = input[i][:waveform_length] - waveform = waveform.unsqueeze(0).numpy() + waveform = waveform.numpy() mat = eend_ola_feature.stft(waveform, self.frame_length, self.frame_shift) mat = eend_ola_feature.transform(mat) - mat = mat.splice(mat, context_size=self.lfr_m) + mat = eend_ola_feature.splice(mat, context_size=self.lfr_m) mat = mat[::self.lfr_n] mat = torch.from_numpy(mat) feat_length = mat.size(0) From 429ea5d3786fb77d1b53728307a59fe3d204d4ce Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 15:48:35 +0800 Subject: [PATCH 23/37] update --- funasr/bin/eend_ola_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/bin/eend_ola_inference.py 
b/funasr/bin/eend_ola_inference.py index 2887b3754..1a47c9224 100755 --- a/funasr/bin/eend_ola_inference.py +++ b/funasr/bin/eend_ola_inference.py @@ -145,7 +145,7 @@ def inference_modelscope( output_dir: Optional[str] = None, batch_size: int = 1, dtype: str = "float32", - ngpu: int = 0, + ngpu: int = 1, num_workers: int = 0, log_level: Union[int, str] = "INFO", key_file: Optional[str] = None, From fbec0f003d4de9e4b6ccb6bb58d2d4926a0ff332 Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 15:58:43 +0800 Subject: [PATCH 24/37] update --- funasr/models/frontend/wav_frontend.py | 11 ++--------- funasr/modules/eend_ola/encoder_decoder_attractor.py | 5 ++++- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py index ca2217596..475a9398a 100644 --- a/funasr/models/frontend/wav_frontend.py +++ b/funasr/models/frontend/wav_frontend.py @@ -11,8 +11,6 @@ from typeguard import check_argument_types import funasr.models.frontend.eend_ola_feature as eend_ola_feature from funasr.models.frontend.abs_frontend import AbsFrontend -from modelscope.utils.logger import get_logger -logger = get_logger() def load_cmvn(cmvn_file): with open(cmvn_file, 'r', encoding='utf-8') as f: @@ -425,10 +423,8 @@ class WavFrontendOnline(AbsFrontend): reserve_frame_idx = lfr_splice_frame_idxs[0] - minus_frame # print('reserve_frame_idx: ' + str(reserve_frame_idx)) # print('frame_frame: ' + str(frame_from_waveforms)) - self.reserve_waveforms = self.waveforms[:, - reserve_frame_idx * self.frame_shift_sample_length:frame_from_waveforms * self.frame_shift_sample_length] - sample_length = ( - frame_from_waveforms - 1) * self.frame_shift_sample_length + self.frame_sample_length + self.reserve_waveforms = self.waveforms[:, reserve_frame_idx * self.frame_shift_sample_length:frame_from_waveforms * self.frame_shift_sample_length] + sample_length = (frame_from_waveforms - 1) * self.frame_shift_sample_length + 
self.frame_sample_length self.waveforms = self.waveforms[:, :sample_length] else: # update self.reserve_waveforms and self.lfr_splice_cache @@ -487,9 +483,6 @@ class WavFrontendMel23(AbsFrontend): batch_size = input.size(0) feats = [] feats_lens = [] - logger.info("batch_size: {}".format(batch_size)) - logger.info("input: {}".format(input)) - logger.info("input_lengths: {}".format(input_lengths)) for i in range(batch_size): waveform_length = input_lengths[i] waveform = input[i][:waveform_length] diff --git a/funasr/modules/eend_ola/encoder_decoder_attractor.py b/funasr/modules/eend_ola/encoder_decoder_attractor.py index db01b0006..4e599ab31 100644 --- a/funasr/modules/eend_ola/encoder_decoder_attractor.py +++ b/funasr/modules/eend_ola/encoder_decoder_attractor.py @@ -2,7 +2,8 @@ import numpy as np import torch import torch.nn.functional as F from torch import nn - +from modelscope.utils.logger import get_logger +logger = get_logger() class EncoderDecoderAttractor(nn.Module): @@ -16,7 +17,9 @@ class EncoderDecoderAttractor(nn.Module): self.n_units = n_units def forward_core(self, xs, zeros): + logger.info("xs: ".format(xs)) ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.float32).to(xs[0].device) + logger.info("ilens: ".format(ilens)) xs = [self.enc0_dropout(x) for x in xs] xs = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=-1) xs = nn.utils.rnn.pack_padded_sequence(xs, ilens, batch_first=True, enforce_sorted=False) From f33ebfd1c70859f38eaac22673ab0ee9682ea7c3 Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 16:11:44 +0800 Subject: [PATCH 25/37] update --- funasr/models/e2e_diar_eend_ola.py | 14 ++++++++++++-- .../modules/eend_ola/encoder_decoder_attractor.py | 11 ++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py index 79cb61496..097b23a57 100644 --- a/funasr/models/e2e_diar_eend_ola.py +++ 
b/funasr/models/e2e_diar_eend_ola.py @@ -76,7 +76,7 @@ class DiarEENDOLAModel(AbsESPnetModel): def forward_post_net(self, logits, ilens): maxlen = torch.max(ilens).to(torch.int).item() logits = nn.utils.rnn.pad_sequence(logits, batch_first=True, padding_value=-1) - logits = nn.utils.rnn.pack_padded_sequence(logits, ilens, batch_first=True, enforce_sorted=False) + logits = nn.utils.rnn.pack_padded_sequence(logits, ilens.cpu().to(torch.int64), batch_first=True, enforce_sorted=False) outputs, (_, _) = self.postnet(logits) outputs = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=-1, total_length=maxlen)[0] outputs = [output[:ilens[i].to(torch.int).item()] for i, output in enumerate(outputs)] @@ -231,7 +231,7 @@ class DiarEENDOLAModel(AbsESPnetModel): pred[i] = pred[i - 1] else: pred[i] = 0 - pred = [self.reporter.inv_mapping_func(i, self.mapping_dict) for i in pred] + pred = [self.inv_mapping_func(i) for i in pred] decisions = [bin(num)[2:].zfill(self.max_n_speaker)[::-1] for num in pred] decisions = torch.from_numpy( np.stack([np.array([int(i) for i in dec]) for dec in decisions], axis=0)).to(logit.device).to( @@ -239,5 +239,15 @@ class DiarEENDOLAModel(AbsESPnetModel): decisions = decisions[:, :n_speaker] return decisions + def inv_mapping_func(self, label): + + if not isinstance(label, int): + label = int(label) + if label in self.mapping_dict['label2dec'].keys(): + num = self.mapping_dict['label2dec'][label] + else: + num = -1 + return num + def collect_feats(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]: pass \ No newline at end of file diff --git a/funasr/modules/eend_ola/encoder_decoder_attractor.py b/funasr/modules/eend_ola/encoder_decoder_attractor.py index 4e599ab31..45ac98219 100644 --- a/funasr/modules/eend_ola/encoder_decoder_attractor.py +++ b/funasr/modules/eend_ola/encoder_decoder_attractor.py @@ -2,8 +2,7 @@ import numpy as np import torch import torch.nn.functional as F from torch import nn -from 
modelscope.utils.logger import get_logger -logger = get_logger() + class EncoderDecoderAttractor(nn.Module): @@ -17,14 +16,12 @@ class EncoderDecoderAttractor(nn.Module): self.n_units = n_units def forward_core(self, xs, zeros): - logger.info("xs: ".format(xs)) - ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.float32).to(xs[0].device) - logger.info("ilens: ".format(ilens)) + ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.int64) xs = [self.enc0_dropout(x) for x in xs] xs = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=-1) xs = nn.utils.rnn.pack_padded_sequence(xs, ilens, batch_first=True, enforce_sorted=False) _, (hx, cx) = self.encoder(xs) - zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.float32).to(zeros[0].device) + zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.int64) max_zlen = torch.max(zlens).to(torch.int).item() zeros = [self.enc0_dropout(z) for z in zeros] zeros = nn.utils.rnn.pad_sequence(zeros, batch_first=True, padding_value=-1) @@ -50,4 +47,4 @@ class EncoderDecoderAttractor(nn.Module): zeros = [torch.zeros(max_n_speakers, self.n_units).to(torch.float32).to(xs[0].device) for _ in xs] attractors = self.forward_core(xs, zeros) probs = [torch.sigmoid(torch.flatten(self.counter(att))) for att in attractors] - return attractors, probs \ No newline at end of file + return attractors, probs From 7c6ed3830acf6413ab86fd9a5f38825db617f989 Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 16:39:40 +0800 Subject: [PATCH 26/37] update --- funasr/bin/eend_ola_inference.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py index 1a47c9224..79e93a863 100755 --- a/funasr/bin/eend_ola_inference.py +++ b/funasr/bin/eend_ola_inference.py @@ -17,6 +17,7 @@ from typing import Union import numpy as np import torch from typeguard import 
check_argument_types +from scipy.signal import medfilt from funasr.models.frontend.wav_frontend import WavFrontendMel23 from funasr.tasks.diar import EENDOLADiarTask @@ -234,9 +235,22 @@ def inference_modelscope( # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} results = speech2diar(**batch) + + # post process + a = medfilt(results[0], (11, 1)) + rst = [] + for spkid, frames in enumerate(a.T): + frames = np.pad(frames, (1, 1), 'constant') + changes, = np.where(np.diff(frames, axis=0) != 0) + fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} {:s} " + for s, e in zip(changes[::2], changes[1::2]): + st = s / 10. + ed = e / 10. + rst.append(fmt.format(keys[0], st, ed, "{}_{}".format(keys[0],str(spkid)))) + # Only supporting batch_size==1 - key, value = keys[0], output_results_str(results, keys[0]) - item = {"key": key, "value": value} + value = "\n".join(rst) + item = {"key": keys[0], "value": value} result_list.append(item) if output_path is not None: output_writer.write(value) From dd4946a50db62a180ab11a5e371ea2ef44954c3b Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 16:44:59 +0800 Subject: [PATCH 27/37] update --- funasr/bin/eend_ola_inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py index 79e93a863..b35824aaa 100755 --- a/funasr/bin/eend_ola_inference.py +++ b/funasr/bin/eend_ola_inference.py @@ -237,7 +237,8 @@ def inference_modelscope( results = speech2diar(**batch) # post process - a = medfilt(results[0], (11, 1)) + a = results[0].cpu().numpy() + a = medfilt(a, (11, 1)) rst = [] for spkid, frames in enumerate(a.T): frames = np.pad(frames, (1, 1), 'constant') From b4598f30a54c3a8d5e6084d983fac0fa5a51992b Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 17:20:24 +0800 Subject: [PATCH 28/37] update --- .../infer.py | 5 +++-- funasr/bin/asr_inference_launch.py | 3 +++ funasr/bin/eend_ola_inference.py | 9 +++++---- 3 
files changed, 11 insertions(+), 6 deletions(-) diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py index dfcb8e649..e0ac08ced 100644 --- a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py +++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py @@ -2,8 +2,9 @@ from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks inference_diar_pipline = pipeline( - task=Tasks.speaker_diarization, + task=Tasks.auto_speech_recognition, model='damo/speech_diarization_eend-ola-en-us-callhome-8k', model_revision="v1.0.0", ) -results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav"]) \ No newline at end of file +results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record2.wav"]) +print(results) \ No newline at end of file diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py index 1fae766ea..0ab6b1ad3 100644 --- a/funasr/bin/asr_inference_launch.py +++ b/funasr/bin/asr_inference_launch.py @@ -234,6 +234,9 @@ def inference_launch(**kwargs): elif mode == "rnnt": from funasr.bin.asr_inference_rnnt import inference_modelscope return inference_modelscope(**kwargs) + elif mode == "eend-ola": + from funasr.bin.eend_ola_inference import inference_modelscope + return inference_modelscope(mode=mode, **kwargs) else: logging.info("Unknown decoding mode: {}".format(mode)) return None diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py index b35824aaa..048327856 100755 --- a/funasr/bin/eend_ola_inference.py +++ b/funasr/bin/eend_ola_inference.py @@ -16,8 +16,8 @@ from typing import Union import numpy as np import torch -from typeguard import check_argument_types 
from scipy.signal import medfilt +from typeguard import check_argument_types from funasr.models.frontend.wav_frontend import WavFrontendMel23 from funasr.tasks.diar import EENDOLADiarTask @@ -28,6 +28,7 @@ from funasr.utils.types import str2bool from funasr.utils.types import str2triple_str from funasr.utils.types import str_or_none + class Speech2Diarization: """Speech2Diarlization class @@ -237,7 +238,7 @@ def inference_modelscope( results = speech2diar(**batch) # post process - a = results[0].cpu().numpy() + a = results[0][0].cpu().numpy() a = medfilt(a, (11, 1)) rst = [] for spkid, frames in enumerate(a.T): @@ -246,8 +247,8 @@ def inference_modelscope( fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} {:s} " for s, e in zip(changes[::2], changes[1::2]): st = s / 10. - ed = e / 10. - rst.append(fmt.format(keys[0], st, ed, "{}_{}".format(keys[0],str(spkid)))) + dur = (e - s) / 10. + rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid)))) # Only supporting batch_size==1 value = "\n".join(rst) From 7ee716759b4a38e0776ebad3c5fac5fc969bec68 Mon Sep 17 00:00:00 2001 From: speech_asr Date: Wed, 15 Mar 2023 17:22:01 +0800 Subject: [PATCH 29/37] update --- .../speech_diarization_eend-ola-en-us-callhome-8k/infer.py | 2 +- funasr/bin/asr_inference_launch.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py index e0ac08ced..81cb2c629 100644 --- a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py +++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py @@ -2,7 +2,7 @@ from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks inference_diar_pipline = pipeline( - task=Tasks.auto_speech_recognition, + task=Tasks.speaker_diarization, 
model='damo/speech_diarization_eend-ola-en-us-callhome-8k', model_revision="v1.0.0", ) diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py index 0ab6b1ad3..1fae766ea 100644 --- a/funasr/bin/asr_inference_launch.py +++ b/funasr/bin/asr_inference_launch.py @@ -234,9 +234,6 @@ def inference_launch(**kwargs): elif mode == "rnnt": from funasr.bin.asr_inference_rnnt import inference_modelscope return inference_modelscope(**kwargs) - elif mode == "eend-ola": - from funasr.bin.eend_ola_inference import inference_modelscope - return inference_modelscope(mode=mode, **kwargs) else: logging.info("Unknown decoding mode: {}".format(mode)) return None From 06975be6bf1eb83c12666c6c93b7f5412e5749ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= Date: Wed, 15 Mar 2023 21:39:47 +0800 Subject: [PATCH 30/37] benchmark cpu --- funasr/runtime/python/README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 funasr/runtime/python/README.md diff --git a/funasr/runtime/python/README.md b/funasr/runtime/python/README.md new file mode 100644 index 000000000..999597459 --- /dev/null +++ b/funasr/runtime/python/README.md @@ -0,0 +1,21 @@ +Benchmark [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on Aishell1 test set , the total audio duration is 36108.919 seconds. + +(Note: The service has been fully warm up.) 
+ + Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni + +| concurrent-tasks | processing time(s) | RTF | Speedup Rate | +|:----------------:|:------------------:|:------:|:------------:| +| 1 (onnx fp32) | 2806 | 0.0777 | 12.9 | +| 1 (onnx int8) | 1611 | 0.0446 | 22.4 | +| 8 (onnx fp32) | 538 | 0.0149 | 67.1 | +| 8 (onnx int8) | 210 | 0.0058 | 172.4 | +| 16 (onnx fp32) | 288 | 0.0080 | 125.2 | +| 16 (onnx int8) | 117 | 0.0032 | 309.9 | +| 32 (onnx fp32) | 167 | 0.0046 | 216.5 | +| 32 (onnx int8) | 107 | 0.0030 | 338.0 | +| 64 (onnx fp32) | 158 | 0.0044 | 228.1 | +| 64 (onnx int8) | 82 | 0.0023 | 442.8 | +| 96 (onnx fp32) | 151 | 0.0042 | 238.0 | +| 96 (onnx int8) | 80 | 0.0022 | 452.0 | + From 495e7071eab6b3280c2c06201907236a106c660e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= Date: Wed, 15 Mar 2023 21:44:31 +0800 Subject: [PATCH 31/37] benchmark cpu --- funasr/runtime/python/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/runtime/python/README.md b/funasr/runtime/python/README.md index 999597459..c47f8e787 100644 --- a/funasr/runtime/python/README.md +++ b/funasr/runtime/python/README.md @@ -1,4 +1,4 @@ -Benchmark [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on Aishell1 test set , the total audio duration is 36108.919 seconds. +Benchmark [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on Aishell1 test set , the total audio duration is 36108.919 seconds. (Note: The service has been fully warm up.) 
From c3bce4c288f73a3bbf5559b019d4480f95acffaa Mon Sep 17 00:00:00 2001 From: speech_asr Date: Thu, 16 Mar 2023 10:44:15 +0800 Subject: [PATCH 32/37] update --- funasr/bin/eend_ola_inference.py | 2 +- funasr/modules/eend_ola/encoder.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py index 048327856..bc29fa206 100755 --- a/funasr/bin/eend_ola_inference.py +++ b/funasr/bin/eend_ola_inference.py @@ -209,7 +209,7 @@ def inference_modelscope( if data_path_and_name_and_type is None and raw_inputs is not None: if isinstance(raw_inputs, torch.Tensor): raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs[0], "speech", "bytes"] + data_path_and_name_and_type = [raw_inputs[0], "speech", "sound"] loader = EENDOLADiarTask.build_streaming_iterator( data_path_and_name_and_type, dtype=dtype, diff --git a/funasr/modules/eend_ola/encoder.py b/funasr/modules/eend_ola/encoder.py index 4999031b1..90a63f369 100644 --- a/funasr/modules/eend_ola/encoder.py +++ b/funasr/modules/eend_ola/encoder.py @@ -87,7 +87,7 @@ class EENDOLATransformerEncoder(nn.Module): n_layers: int, n_units: int, e_units: int = 2048, - h: int = 8, + h: int = 4, dropout_rate: float = 0.1, use_pos_emb: bool = False): super(EENDOLATransformerEncoder, self).__init__() diff --git a/setup.py b/setup.py index 087d90d26..e6b9d38f6 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ requirements = { "humanfriendly", "scipy>=1.4.1", # "filelock", - "librosa>=0.8.0", + "librosa==0.8.1", "jamo==0.4.1", # For kss "PyYAML>=5.1.2", "soundfile>=0.10.2", From 2ba4683eb2ce42eec91250debe88b424cbc2d67f Mon Sep 17 00:00:00 2001 From: speech_asr Date: Thu, 16 Mar 2023 11:14:42 +0800 Subject: [PATCH 33/37] update --- egs/aishell/conformer/run.sh | 2 +- .../data2vec_paraformer_finetune/run.sh | 2 +- .../data2vec_transformer_finetune/run.sh | 2 +- egs/aishell/paraformer/run.sh | 2 +- 
egs/aishell/paraformerbert/run.sh | 2 +- egs/aishell/transformer/run.sh | 2 +- egs/aishell2/conformer/run.sh | 2 +- egs/aishell2/paraformer/run.sh | 2 +- egs/aishell2/paraformerbert/run.sh | 2 +- egs/aishell2/transformer/run.sh | 2 +- egs/aishell2/transformerLM/run.sh | 2 +- .../diarization/sond/infer_alimeeting_test.py | 2 +- egs/alimeeting/diarization/sond/run.sh | 6 ++-- egs/alimeeting/diarization/sond/unit_test.py | 8 ++--- egs/callhome/diarization/sond/unit_test.py | 8 ++--- egs/mars/sd/local_run.sh | 2 +- .../README.md | 2 +- .../infer_after_finetune.py | 2 +- .../README.md | 2 +- .../infer_after_finetune.py | 2 +- .../README.md | 2 +- .../infer_after_finetune.py | 2 +- .../infer_after_finetune.py | 2 +- .../infer_after_finetune.py | 2 +- .../README.md | 2 +- .../infer_after_finetune.py | 2 +- .../infer_after_finetune.py | 2 +- .../README.md | 2 +- .../infer_after_finetune.py | 2 +- .../README.md | 2 +- .../infer_after_finetune.py | 2 +- .../README.md | 2 +- .../infer_after_finetune.py | 2 +- .../README.md | 2 +- .../infer_after_finetune.py | 2 +- .../README.md | 3 +- .../infer_after_finetune.py | 2 +- .../README.md | 2 +- .../infer_after_finetune.py | 2 +- funasr/bin/asr_inference.py | 2 +- funasr/bin/asr_inference_mfcca.py | 2 +- funasr/bin/asr_inference_paraformer.py | 2 +- .../bin/asr_inference_paraformer_vad_punc.py | 2 +- funasr/bin/asr_inference_rnnt.py | 2 +- funasr/bin/asr_inference_uniasr.py | 2 +- funasr/bin/asr_inference_uniasr_vad.py | 2 +- funasr/bin/diar_inference_launch.py | 2 +- funasr/bin/eend_ola_inference.py | 2 +- funasr/bin/sond_inference.py | 2 +- funasr/bin/sv_inference.py | 4 +-- funasr/main_funcs/average_nbest_models.py | 18 +++++----- funasr/main_funcs/pack_funcs.py | 4 +-- funasr/tasks/abs_task.py | 8 ++--- funasr/tasks/asr.py | 4 +-- funasr/tasks/diar.py | 2 +- funasr/tasks/sv.py | 2 +- funasr/torch_utils/load_pretrained_model.py | 10 +++--- funasr/train/trainer.py | 36 +++++++++---------- 58 files changed, 102 insertions(+), 101 
deletions(-) diff --git a/egs/aishell/conformer/run.sh b/egs/aishell/conformer/run.sh index 41db45dfd..09ddab8a5 100755 --- a/egs/aishell/conformer/run.sh +++ b/egs/aishell/conformer/run.sh @@ -52,7 +52,7 @@ asr_config=conf/train_asr_conformer.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer.yaml -inference_asr_model=valid.acc.ave_10best.pth +inference_asr_model=valid.acc.ave_10best.pb # you can set gpu num for decoding here gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default diff --git a/egs/aishell/data2vec_paraformer_finetune/run.sh b/egs/aishell/data2vec_paraformer_finetune/run.sh index cada164dc..d033ce26a 100755 --- a/egs/aishell/data2vec_paraformer_finetune/run.sh +++ b/egs/aishell/data2vec_paraformer_finetune/run.sh @@ -55,7 +55,7 @@ asr_config=conf/train_asr_paraformer_transformer_12e_6d_3072_768.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer_noctc_1best.yaml -inference_asr_model=valid.acc.ave_10best.pth +inference_asr_model=valid.acc.ave_10best.pb # you can set gpu num for decoding here gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default diff --git a/egs/aishell/data2vec_transformer_finetune/run.sh b/egs/aishell/data2vec_transformer_finetune/run.sh index 7ab8626bb..26222e666 100755 --- a/egs/aishell/data2vec_transformer_finetune/run.sh +++ b/egs/aishell/data2vec_transformer_finetune/run.sh @@ -55,7 +55,7 @@ asr_config=conf/train_asr_transformer_12e_6d_3072_768.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer.yaml -inference_asr_model=valid.cer_ctc.ave_10best.pth +inference_asr_model=valid.cer_ctc.ave_10best.pb # you can set gpu num for decoding here 
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default diff --git a/egs/aishell/paraformer/run.sh b/egs/aishell/paraformer/run.sh index 2b0f1449b..53b5f906d 100755 --- a/egs/aishell/paraformer/run.sh +++ b/egs/aishell/paraformer/run.sh @@ -52,7 +52,7 @@ asr_config=conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer_noctc_1best.yaml -inference_asr_model=valid.acc.ave_10best.pth +inference_asr_model=valid.acc.ave_10best.pb # you can set gpu num for decoding here gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default diff --git a/egs/aishell/paraformerbert/run.sh b/egs/aishell/paraformerbert/run.sh index 96310ab84..2487eacd8 100755 --- a/egs/aishell/paraformerbert/run.sh +++ b/egs/aishell/paraformerbert/run.sh @@ -56,7 +56,7 @@ asr_config=conf/train_asr_paraformerbert_conformer_12e_6d_2048_256.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer_noctc_1best.yaml -inference_asr_model=valid.acc.ave_10best.pth +inference_asr_model=valid.acc.ave_10best.pb # you can set gpu num for decoding here gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default diff --git a/egs/aishell/transformer/run.sh b/egs/aishell/transformer/run.sh index 4c307b07c..f66a338ba 100755 --- a/egs/aishell/transformer/run.sh +++ b/egs/aishell/transformer/run.sh @@ -52,7 +52,7 @@ asr_config=conf/train_asr_conformer.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer.yaml -inference_asr_model=valid.acc.ave_10best.pth +inference_asr_model=valid.acc.ave_10best.pb # you can set gpu num for decoding here gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for 
decoding, the same as training stage by default diff --git a/egs/aishell2/conformer/run.sh b/egs/aishell2/conformer/run.sh index bd6d81ea9..f9ea69ada 100755 --- a/egs/aishell2/conformer/run.sh +++ b/egs/aishell2/conformer/run.sh @@ -54,7 +54,7 @@ asr_config=conf/train_asr_conformer.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer.yaml -inference_asr_model=valid.acc.ave_10best.pth +inference_asr_model=valid.acc.ave_10best.pb # you can set gpu num for decoding here gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default diff --git a/egs/aishell2/paraformer/run.sh b/egs/aishell2/paraformer/run.sh index 2b7d84131..e1ea4fe73 100755 --- a/egs/aishell2/paraformer/run.sh +++ b/egs/aishell2/paraformer/run.sh @@ -54,7 +54,7 @@ asr_config=conf/train_asr_paraformer_conformer_20e_1280_320_6d_1280_320.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer_noctc_1best.yaml -inference_asr_model=valid.acc.ave_10best.pth +inference_asr_model=valid.acc.ave_10best.pb # you can set gpu num for decoding here gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default diff --git a/egs/aishell2/paraformerbert/run.sh b/egs/aishell2/paraformerbert/run.sh index d0407d480..239a7e339 100755 --- a/egs/aishell2/paraformerbert/run.sh +++ b/egs/aishell2/paraformerbert/run.sh @@ -58,7 +58,7 @@ asr_config=conf/train_asr_paraformerbert_conformer_20e_6d_1280_320.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer_noctc_1best.yaml -inference_asr_model=valid.acc.ave_10best.pth +inference_asr_model=valid.acc.ave_10best.pb # you can set gpu num for decoding here gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus 
for decoding, e.g., gpuid_list=2,3, the same as training stage by default diff --git a/egs/aishell2/transformer/run.sh b/egs/aishell2/transformer/run.sh index a5a14ec09..6f2dd4d8d 100755 --- a/egs/aishell2/transformer/run.sh +++ b/egs/aishell2/transformer/run.sh @@ -54,7 +54,7 @@ asr_config=conf/train_asr_transformer.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer.yaml -inference_asr_model=valid.acc.ave_10best.pth +inference_asr_model=valid.acc.ave_10best.pb # you can set gpu num for decoding here gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default diff --git a/egs/aishell2/transformerLM/run.sh b/egs/aishell2/transformerLM/run.sh index 28e376287..9e7a7135b 100755 --- a/egs/aishell2/transformerLM/run.sh +++ b/egs/aishell2/transformerLM/run.sh @@ -34,7 +34,7 @@ exp_dir=./data tag=exp1 model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}" lm_exp=${exp_dir}/exp/${model_dir} -inference_lm=valid.loss.ave.pth # Language model path for decoding. +inference_lm=valid.loss.ave.pb # Language model path for decoding. 
stage=0 stop_stage=3 diff --git a/egs/alimeeting/diarization/sond/infer_alimeeting_test.py b/egs/alimeeting/diarization/sond/infer_alimeeting_test.py index 0988f5d03..b4d534bee 100644 --- a/egs/alimeeting/diarization/sond/infer_alimeeting_test.py +++ b/egs/alimeeting/diarization/sond/infer_alimeeting_test.py @@ -4,7 +4,7 @@ import sys def main(): diar_config_path = sys.argv[1] if len(sys.argv) > 1 else "sond_fbank.yaml" - diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pth" + diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pb" output_dir = sys.argv[3] if len(sys.argv) > 3 else "./outputs" data_path_and_name_and_type = [ ("data/test_rmsil/feats.scp", "speech", "kaldi_ark"), diff --git a/egs/alimeeting/diarization/sond/run.sh b/egs/alimeeting/diarization/sond/run.sh index 7e9a7f7ba..19ae40cdd 100644 --- a/egs/alimeeting/diarization/sond/run.sh +++ b/egs/alimeeting/diarization/sond/run.sh @@ -17,9 +17,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then echo "Downloading Pre-trained model..." git clone https://www.modelscope.cn/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch.git git clone https://www.modelscope.cn/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch.git - ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth ./sv.pth + ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb ./sv.pb cp speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.yaml ./sv.yaml - ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pth ./sond.pth + ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pb ./sond.pb cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond_fbank.yaml ./sond_fbank.yaml cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.yaml ./sond.yaml echo "Done." @@ -30,7 +30,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "Calculating diarization results..." 
- python infer_alimeeting_test.py sond_fbank.yaml sond.pth outputs + python infer_alimeeting_test.py sond_fbank.yaml sond.pb outputs python local/convert_label_to_rttm.py \ outputs/labels.txt \ data/test_rmsil/raw_rmsil_map.scp \ diff --git a/egs/alimeeting/diarization/sond/unit_test.py b/egs/alimeeting/diarization/sond/unit_test.py index 84a424762..0f40ab29e 100644 --- a/egs/alimeeting/diarization/sond/unit_test.py +++ b/egs/alimeeting/diarization/sond/unit_test.py @@ -4,7 +4,7 @@ import os def test_fbank_cpu_infer(): diar_config_path = "config_fbank.yaml" - diar_model_path = "sond.pth" + diar_model_path = "sond.pb" output_dir = "./outputs" data_path_and_name_and_type = [ ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"), @@ -24,7 +24,7 @@ def test_fbank_cpu_infer(): def test_fbank_gpu_infer(): diar_config_path = "config_fbank.yaml" - diar_model_path = "sond.pth" + diar_model_path = "sond.pb" output_dir = "./outputs" data_path_and_name_and_type = [ ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"), @@ -45,7 +45,7 @@ def test_fbank_gpu_infer(): def test_wav_gpu_infer(): diar_config_path = "config.yaml" - diar_model_path = "sond.pth" + diar_model_path = "sond.pb" output_dir = "./outputs" data_path_and_name_and_type = [ ("data/unit_test/test_wav.scp", "speech", "sound"), @@ -66,7 +66,7 @@ def test_wav_gpu_infer(): def test_without_profile_gpu_infer(): diar_config_path = "config.yaml" - diar_model_path = "sond.pth" + diar_model_path = "sond.pb" output_dir = "./outputs" raw_inputs = [[ "data/unit_test/raw_inputs/record.wav", diff --git a/egs/callhome/diarization/sond/unit_test.py b/egs/callhome/diarization/sond/unit_test.py index 519ac5695..a48eda148 100644 --- a/egs/callhome/diarization/sond/unit_test.py +++ b/egs/callhome/diarization/sond/unit_test.py @@ -4,7 +4,7 @@ import os def test_fbank_cpu_infer(): diar_config_path = "sond_fbank.yaml" - diar_model_path = "sond.pth" + diar_model_path = "sond.pb" output_dir = "./outputs" data_path_and_name_and_type 
= [ ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"), @@ -24,7 +24,7 @@ def test_fbank_cpu_infer(): def test_fbank_gpu_infer(): diar_config_path = "sond_fbank.yaml" - diar_model_path = "sond.pth" + diar_model_path = "sond.pb" output_dir = "./outputs" data_path_and_name_and_type = [ ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"), @@ -45,7 +45,7 @@ def test_fbank_gpu_infer(): def test_wav_gpu_infer(): diar_config_path = "config.yaml" - diar_model_path = "sond.pth" + diar_model_path = "sond.pb" output_dir = "./outputs" data_path_and_name_and_type = [ ("data/unit_test/test_wav.scp", "speech", "sound"), @@ -66,7 +66,7 @@ def test_wav_gpu_infer(): def test_without_profile_gpu_infer(): diar_config_path = "config.yaml" - diar_model_path = "sond.pth" + diar_model_path = "sond.pb" output_dir = "./outputs" raw_inputs = [[ "data/unit_test/raw_inputs/record.wav", diff --git a/egs/mars/sd/local_run.sh b/egs/mars/sd/local_run.sh index 3b319f46e..4516e9f96 100755 --- a/egs/mars/sd/local_run.sh +++ b/egs/mars/sd/local_run.sh @@ -49,7 +49,7 @@ asr_config=conf/train_asr_conformer.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer.yaml -inference_asr_model=valid.acc.ave_10best.pth +inference_asr_model=valid.acc.ave_10best.pb # you can set gpu num for decoding here gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md index c2e4354c1..053986d3d 100644 --- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md +++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md @@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, 
which i - Modify inference related parameters in `infer_after_finetune.py` - output_dir: # result dir - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed~~~~ - - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth` + - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` - Then you can run the pipeline to finetune with: ```python diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py index 56c282ce2..b3260672c 100644 --- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py +++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py @@ -48,5 +48,5 @@ if __name__ == '__main__': params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./data/test" - params["decoding_model_name"] = "valid.cer_ctc.ave.pth" + params["decoding_model_name"] = "valid.cer_ctc.ave.pb" modelscope_infer_after_finetune(params) diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md index c2e4354c1..053986d3d 100644 --- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md +++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md @@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i - Modify inference related parameters in `infer_after_finetune.py` - output_dir: # result dir - data_dir: # the dataset dir needs to include `test/wav.scp`. 
If `test/text` is also exists, CER will be computed~~~~ - - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth` + - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` - Then you can run the pipeline to finetune with: ```python diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py index e163999b7..2f038a85a 100644 --- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py +++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py @@ -48,5 +48,5 @@ if __name__ == '__main__': params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./data/test" - params["decoding_model_name"] = "valid.cer_ctc.ave.pth" + params["decoding_model_name"] = "valid.cer_ctc.ave.pb" modelscope_infer_after_finetune(params) diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md index 9097e7ab9..16aeada4b 100644 --- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md +++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md @@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.sp.cer` and ` - Modify inference related parameters in `infer_after_finetune.py` - output_dir: # result dir - data_dir: # the dataset dir needs to include `test/wav.scp`. 
If `test/text` is also exists, CER will be computed - - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth` + - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` - Then you can run the pipeline to finetune with: ```python diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py index e714a3d03..333b66a72 100755 --- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py +++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py @@ -63,5 +63,5 @@ if __name__ == '__main__': params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./example_data/validation" - params["decoding_model_name"] = "valid.acc.ave.pth" + params["decoding_model_name"] = "valid.acc.ave.pb" modelscope_infer_after_finetune(params) diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py index 6c34ed099..f1f29faff 100644 --- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py +++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py @@ -49,5 +49,5 @@ if __name__ == '__main__': params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./data/test" - params["decoding_model_name"] = "valid.acc.ave_10best.pth" + params["decoding_model_name"] = "valid.acc.ave_10best.pb" 
modelscope_infer_after_finetune(params) diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py index 6140bb71f..8cb537bd2 100644 --- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py +++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py @@ -49,5 +49,5 @@ if __name__ == '__main__': params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./data/test" - params["decoding_model_name"] = "valid.acc.ave_10best.pth" + params["decoding_model_name"] = "valid.acc.ave_10best.pb" modelscope_infer_after_finetune(params) diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md index dfd509dd4..b68f1e921 100644 --- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md +++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md @@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i - Modify inference related parameters in `infer_after_finetune.py` - output_dir: # result dir - data_dir: # the dataset dir needs to include `test/wav.scp`. 
If `test/text` is also exists, CER will be computed - - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth` + - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` - Then you can run the pipeline to finetune with: ```python diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py index 94393ec5e..f26f2378b 100644 --- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py +++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py @@ -49,5 +49,5 @@ if __name__ == '__main__': params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./data/test" - params["decoding_model_name"] = "valid.acc.ave_10best.pth" + params["decoding_model_name"] = "valid.acc.ave_10best.pb" modelscope_infer_after_finetune(params) diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py index 96102ccfa..726009de7 100644 --- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py +++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py @@ -49,5 +49,5 @@ if __name__ == '__main__': params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./data/test" - params["decoding_model_name"] = "valid.acc.ave_10best.pth" + 
params["decoding_model_name"] = "valid.acc.ave_10best.pb" modelscope_infer_after_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md index dfd509dd4..b68f1e921 100644 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md @@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i - Modify inference related parameters in `infer_after_finetune.py` - output_dir: # result dir - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed - - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth` + - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` - Then you can run the pipeline to finetune with: ```python diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py index d91a40a6c..6593f4e3f 100644 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py @@ -50,5 +50,5 @@ if __name__ == '__main__': params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./data/test" - params["decoding_model_name"] = "20epoch.pth" + params["decoding_model_name"] = "20epoch.pb" modelscope_infer_after_finetune(params) diff --git 
a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md index dfd509dd4..b68f1e921 100644 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md @@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i - Modify inference related parameters in `infer_after_finetune.py` - output_dir: # result dir - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed - - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth` + - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` - Then you can run the pipeline to finetune with: ```python diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py index f9fb0db8a..f067c8193 100644 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py @@ -50,5 +50,5 @@ if __name__ == '__main__': params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./data/test" - params["decoding_model_name"] = "20epoch.pth" + params["decoding_model_name"] = "20epoch.pb" modelscope_infer_after_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md 
b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md index dd947d329..9a84f9b57 100644 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md @@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i - Modify inference related parameters in `infer_after_finetune.py` - output_dir: # result dir - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed - - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth` + - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` - Then you can run the pipeline to finetune with: ```python diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py index 030c2e278..d4df29e01 100644 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py @@ -50,5 +50,5 @@ if __name__ == '__main__': params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./data/test" - params["decoding_model_name"] = "20epoch.pth" + params["decoding_model_name"] = "20epoch.pb" modelscope_infer_after_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md index dd947d329..9a84f9b57 100644 --- 
a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md @@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i - Modify inference related parameters in `infer_after_finetune.py` - output_dir: # result dir - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed - - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth` + - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` - Then you can run the pipeline to finetune with: ```python diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py index 3b39a1665..861fefb7f 100644 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py @@ -49,5 +49,5 @@ if __name__ == '__main__': params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./data/test" - params["decoding_model_name"] = "20epoch.pth" + params["decoding_model_name"] = "20epoch.pb" modelscope_infer_after_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md index dd947d329..eff933e8d 100644 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md +++ 
b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md @@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i - Modify inference related parameters in `infer_after_finetune.py` - output_dir: # result dir - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed - - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth` + - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` - Then you can run the pipeline to finetune with: ```python diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py index 4860cf743..d73cae267 100644 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py +++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py @@ -49,5 +49,5 @@ if __name__ == '__main__': params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./data/test" - params["decoding_model_name"] = "20epoch.pth" + params["decoding_model_name"] = "20epoch.pb" modelscope_infer_after_finetune(params) diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md index 1094bb5ff..94144efa7 100644 --- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md +++
b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md @@ -34,7 +34,7 @@ Or you can use the finetuned model for inference directly. - Modify inference related parameters in `infer_after_finetune.py` - output_dir: # result dir - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed - - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth` + - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` - Then you can run the pipeline to finetune with: ```python diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py index 5f171b419..3712cb828 100644 --- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py +++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py @@ -53,5 +53,5 @@ if __name__ == '__main__': params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json", "punc/punc.pb", "punc/punc.yaml", "vad/vad.mvn", "vad/vad.pb", "vad/vad.yaml"] params["output_dir"] = "./checkpoint" params["data_dir"] = "./data/test" - params["decoding_model_name"] = "valid.acc.ave_10best.pth" + params["decoding_model_name"] = "valid.acc.ave_10best.pb" modelscope_infer_after_finetune(params) diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py index 318d3d7a2..f3b4d560a 100644 --- a/funasr/bin/asr_inference.py +++ b/funasr/bin/asr_inference.py @@ -52,7 +52,7 @@ class Speech2Text: Examples: >>> import soundfile - >>> speech2text = Speech2Text("asr_config.yml", "asr.pth") + >>> speech2text = 
Speech2Text("asr_config.yml", "asr.pb") >>> audio, rate = soundfile.read("speech.wav") >>> speech2text(audio) [(text, token, token_int, hypothesis object), ...] diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py index 4176ba6ab..888d4d2f8 100644 --- a/funasr/bin/asr_inference_mfcca.py +++ b/funasr/bin/asr_inference_mfcca.py @@ -55,7 +55,7 @@ class Speech2Text: Examples: >>> import soundfile - >>> speech2text = Speech2Text("asr_config.yml", "asr.pth") + >>> speech2text = Speech2Text("asr_config.yml", "asr.pb") >>> audio, rate = soundfile.read("speech.wav") >>> speech2text(audio) [(text, token, token_int, hypothesis object), ...] diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py index 6413d92b0..e45e575ed 100644 --- a/funasr/bin/asr_inference_paraformer.py +++ b/funasr/bin/asr_inference_paraformer.py @@ -50,7 +50,7 @@ class Speech2Text: Examples: >>> import soundfile - >>> speech2text = Speech2Text("asr_config.yml", "asr.pth") + >>> speech2text = Speech2Text("asr_config.yml", "asr.pb") >>> audio, rate = soundfile.read("speech.wav") >>> speech2text(audio) [(text, token, token_int, hypothesis object), ...] diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py index a0e7b47d2..3f5775195 100644 --- a/funasr/bin/asr_inference_paraformer_vad_punc.py +++ b/funasr/bin/asr_inference_paraformer_vad_punc.py @@ -58,7 +58,7 @@ class Speech2Text: Examples: >>> import soundfile - >>> speech2text = Speech2Text("asr_config.yml", "asr.pth") + >>> speech2text = Speech2Text("asr_config.yml", "asr.pb") >>> audio, rate = soundfile.read("speech.wav") >>> speech2text(audio) [(text, token, token_int, hypothesis object), ...] 
diff --git a/funasr/bin/asr_inference_rnnt.py b/funasr/bin/asr_inference_rnnt.py index 6cd70613b..4a9ff0bda 100644 --- a/funasr/bin/asr_inference_rnnt.py +++ b/funasr/bin/asr_inference_rnnt.py @@ -49,7 +49,7 @@ class Speech2Text: Examples: >>> import soundfile - >>> speech2text = Speech2Text("asr_config.yml", "asr.pth") + >>> speech2text = Speech2Text("asr_config.yml", "asr.pb") >>> audio, rate = soundfile.read("speech.wav") >>> speech2text(audio) [(text, token, token_int, hypothesis object), ...] diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py index 8b31fad13..ac71538a6 100644 --- a/funasr/bin/asr_inference_uniasr.py +++ b/funasr/bin/asr_inference_uniasr.py @@ -46,7 +46,7 @@ class Speech2Text: Examples: >>> import soundfile - >>> speech2text = Speech2Text("asr_config.yml", "asr.pth") + >>> speech2text = Speech2Text("asr_config.yml", "asr.pb") >>> audio, rate = soundfile.read("speech.wav") >>> speech2text(audio) [(text, token, token_int, hypothesis object), ...] diff --git a/funasr/bin/asr_inference_uniasr_vad.py b/funasr/bin/asr_inference_uniasr_vad.py index e5815df11..7cb889b7d 100644 --- a/funasr/bin/asr_inference_uniasr_vad.py +++ b/funasr/bin/asr_inference_uniasr_vad.py @@ -46,7 +46,7 @@ class Speech2Text: Examples: >>> import soundfile - >>> speech2text = Speech2Text("asr_config.yml", "asr.pth") + >>> speech2text = Speech2Text("asr_config.yml", "asr.pb") >>> audio, rate = soundfile.read("speech.wav") >>> speech2text(audio) [(text, token, token_int, hypothesis object), ...] 
diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py index 70bb947b4..85e451836 100755 --- a/funasr/bin/diar_inference_launch.py +++ b/funasr/bin/diar_inference_launch.py @@ -133,7 +133,7 @@ def inference_launch(mode, **kwargs): param_dict = { "extract_profile": True, "sv_train_config": "sv.yaml", - "sv_model_file": "sv.pth", + "sv_model_file": "sv.pb", } if "param_dict" in kwargs and kwargs["param_dict"] is not None: for key in param_dict: diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py index bc29fa206..01d3f296a 100755 --- a/funasr/bin/eend_ola_inference.py +++ b/funasr/bin/eend_ola_inference.py @@ -35,7 +35,7 @@ class Speech2Diarization: Examples: >>> import soundfile >>> import numpy as np - >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth") + >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb") >>> profile = np.load("profiles.npy") >>> audio, rate = soundfile.read("speech.wav") >>> speech2diar(audio, profile) diff --git a/funasr/bin/sond_inference.py b/funasr/bin/sond_inference.py index ab6d26f45..936dc21f3 100755 --- a/funasr/bin/sond_inference.py +++ b/funasr/bin/sond_inference.py @@ -42,7 +42,7 @@ class Speech2Diarization: Examples: >>> import soundfile >>> import numpy as np - >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth") + >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb") >>> profile = np.load("profiles.npy") >>> audio, rate = soundfile.read("speech.wav") >>> speech2diar(audio, profile) diff --git a/funasr/bin/sv_inference.py b/funasr/bin/sv_inference.py index a78bccded..7e63bbd2d 100755 --- a/funasr/bin/sv_inference.py +++ b/funasr/bin/sv_inference.py @@ -36,7 +36,7 @@ class Speech2Xvector: Examples: >>> import soundfile - >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pth") + >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb") >>> audio, rate = 
soundfile.read("speech.wav") >>> speech2xvector(audio) [(text, token, token_int, hypothesis object), ...] @@ -169,7 +169,7 @@ def inference_modelscope( log_level: Union[int, str] = "INFO", key_file: Optional[str] = None, sv_train_config: Optional[str] = "sv.yaml", - sv_model_file: Optional[str] = "sv.pth", + sv_model_file: Optional[str] = "sv.pb", model_tag: Optional[str] = None, allow_variable_data_keys: bool = True, streaming: bool = False, diff --git a/funasr/main_funcs/average_nbest_models.py b/funasr/main_funcs/average_nbest_models.py index 53f956800..d8df94985 100644 --- a/funasr/main_funcs/average_nbest_models.py +++ b/funasr/main_funcs/average_nbest_models.py @@ -66,13 +66,13 @@ def average_nbest_models( elif n == 1: # The averaged model is same as the best model e, _ = epoch_and_values[0] - op = output_dir / f"{e}epoch.pth" - sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pth" + op = output_dir / f"{e}epoch.pb" + sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pb" if sym_op.is_symlink() or sym_op.exists(): sym_op.unlink() sym_op.symlink_to(op.name) else: - op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pth" + op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pb" logging.info( f"Averaging {n}best models: " f'criterion="{ph}.{cr}": {op}' ) @@ -83,12 +83,12 @@ def average_nbest_models( if e not in _loaded: if oss_bucket is None: _loaded[e] = torch.load( - output_dir / f"{e}epoch.pth", + output_dir / f"{e}epoch.pb", map_location="cpu", ) else: buffer = BytesIO( - oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pth")).read()) + oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pb")).read()) _loaded[e] = torch.load(buffer) states = _loaded[e] @@ -115,13 +115,13 @@ def average_nbest_models( else: buffer = BytesIO() torch.save(avg, buffer) - oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pth"), + oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pb"), 
buffer.getvalue()) - # 3. *.*.ave.pth is a symlink to the max ave model + # 3. *.*.ave.pb is a symlink to the max ave model if oss_bucket is None: - op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pth" - sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pth" + op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pb" + sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pb" if sym_op.is_symlink() or sym_op.exists(): sym_op.unlink() sym_op.symlink_to(op.name) diff --git a/funasr/main_funcs/pack_funcs.py b/funasr/main_funcs/pack_funcs.py index ffa807e23..fe365d8e7 100644 --- a/funasr/main_funcs/pack_funcs.py +++ b/funasr/main_funcs/pack_funcs.py @@ -191,12 +191,12 @@ def unpack( Examples: tarfile: - model.pth + model.pb some1.file some2.file >>> unpack("tarfile", "out") - {'asr_model_file': 'out/model.pth'} + {'asr_model_file': 'out/model.pb'} """ input_archive = Path(input_archive) outpath = Path(outpath) diff --git a/funasr/tasks/abs_task.py b/funasr/tasks/abs_task.py index e0884cef6..3f20b4f4c 100644 --- a/funasr/tasks/abs_task.py +++ b/funasr/tasks/abs_task.py @@ -639,12 +639,12 @@ class AbsTask(ABC): "and exclude_keys excludes keys of model states for the initialization." 
"e.g.\n" " # Load all parameters" - " --init_param some/where/model.pth\n" + " --init_param some/where/model.pb\n" " # Load only decoder parameters" - " --init_param some/where/model.pth:decoder:decoder\n" + " --init_param some/where/model.pb:decoder:decoder\n" " # Load only decoder parameters excluding decoder.embed" - " --init_param some/where/model.pth:decoder:decoder:decoder.embed\n" - " --init_param some/where/model.pth:decoder:decoder:decoder.embed\n", + " --init_param some/where/model.pb:decoder:decoder:decoder.embed\n" + " --init_param some/where/model.pb:decoder:decoder:decoder.embed\n", ) group.add_argument( "--ignore_init_mismatch", diff --git a/funasr/tasks/asr.py b/funasr/tasks/asr.py index 36499a257..e15147332 100644 --- a/funasr/tasks/asr.py +++ b/funasr/tasks/asr.py @@ -826,7 +826,7 @@ class ASRTaskUniASR(ASRTask): if "model.ckpt-" in model_name or ".bin" in model_name: model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb')) if ".bin" in model_name else os.path.join( - model_dir, "{}.pth".format(model_name)) + model_dir, "{}.pb".format(model_name)) if os.path.exists(model_name_pth): logging.info("model_file is load from pth: {}".format(model_name_pth)) model_dict = torch.load(model_name_pth, map_location=device) @@ -1073,7 +1073,7 @@ class ASRTaskParaformer(ASRTask): if "model.ckpt-" in model_name or ".bin" in model_name: model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb')) if ".bin" in model_name else os.path.join( - model_dir, "{}.pth".format(model_name)) + model_dir, "{}.pb".format(model_name)) if os.path.exists(model_name_pth): logging.info("model_file is load from pth: {}".format(model_name_pth)) model_dict = torch.load(model_name_pth, map_location=device) diff --git a/funasr/tasks/diar.py b/funasr/tasks/diar.py index 696291526..9875f6a45 100644 --- a/funasr/tasks/diar.py +++ b/funasr/tasks/diar.py @@ -553,7 +553,7 @@ class DiarTask(AbsTask): if ".bin" in model_name: model_name_pth = 
os.path.join(model_dir, model_name.replace('.bin', '.pb')) else: - model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name)) + model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name)) if os.path.exists(model_name_pth): logging.info("model_file is load from pth: {}".format(model_name_pth)) model_dict = torch.load(model_name_pth, map_location=device) diff --git a/funasr/tasks/sv.py b/funasr/tasks/sv.py index 1b08c4dad..bef5dc588 100644 --- a/funasr/tasks/sv.py +++ b/funasr/tasks/sv.py @@ -501,7 +501,7 @@ class SVTask(AbsTask): if ".bin" in model_name: model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb')) else: - model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name)) + model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name)) if os.path.exists(model_name_pth): logging.info("model_file is load from pth: {}".format(model_name_pth)) model_dict = torch.load(model_name_pth, map_location=device) diff --git a/funasr/torch_utils/load_pretrained_model.py b/funasr/torch_utils/load_pretrained_model.py index 8e3f05e1e..e9b18cd0d 100644 --- a/funasr/torch_utils/load_pretrained_model.py +++ b/funasr/torch_utils/load_pretrained_model.py @@ -52,13 +52,13 @@ def load_pretrained_model( init_param: ::: Examples: - >>> load_pretrained_model("somewhere/model.pth", model) - >>> load_pretrained_model("somewhere/model.pth:decoder:decoder", model) - >>> load_pretrained_model("somewhere/model.pth:decoder:decoder:", model) + >>> load_pretrained_model("somewhere/model.pb", model) + >>> load_pretrained_model("somewhere/model.pb:decoder:decoder", model) + >>> load_pretrained_model("somewhere/model.pb:decoder:decoder:", model) >>> load_pretrained_model( - ... "somewhere/model.pth:decoder:decoder:decoder.embed", model + ... "somewhere/model.pb:decoder:decoder:decoder.embed", model ... 
) - >>> load_pretrained_model("somewhere/decoder.pth::decoder", model) + >>> load_pretrained_model("somewhere/decoder.pb::decoder", model) """ sps = init_param.split(":", 4) if len(sps) == 4: diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py index 50bce477a..efe2009c4 100644 --- a/funasr/train/trainer.py +++ b/funasr/train/trainer.py @@ -205,9 +205,9 @@ class Trainer: else: scaler = None - if trainer_options.resume and (output_dir / "checkpoint.pth").exists(): + if trainer_options.resume and (output_dir / "checkpoint.pb").exists(): cls.resume( - checkpoint=output_dir / "checkpoint.pth", + checkpoint=output_dir / "checkpoint.pb", model=model, optimizers=optimizers, schedulers=schedulers, @@ -361,7 +361,7 @@ class Trainer: }, buffer, ) - trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pth"), buffer.getvalue()) + trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pb"), buffer.getvalue()) else: torch.save( { @@ -374,7 +374,7 @@ class Trainer: ], "scaler": scaler.state_dict() if scaler is not None else None, }, - output_dir / "checkpoint.pth", + output_dir / "checkpoint.pb", ) # 5. 
Save and log the model and update the link to the best model @@ -382,22 +382,22 @@ class Trainer: buffer = BytesIO() torch.save(model.state_dict(), buffer) trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, - f"{iepoch}epoch.pth"),buffer.getvalue()) + f"{iepoch}epoch.pb"),buffer.getvalue()) else: - torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth") + torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pb") - # Creates a sym link latest.pth -> {iepoch}epoch.pth + # Creates a sym link latest.pb -> {iepoch}epoch.pb if trainer_options.use_pai: - p = os.path.join(trainer_options.output_dir, "latest.pth") + p = os.path.join(trainer_options.output_dir, "latest.pb") if trainer_options.oss_bucket.object_exists(p): trainer_options.oss_bucket.delete_object(p) trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name, - os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"), p) + os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"), p) else: - p = output_dir / "latest.pth" + p = output_dir / "latest.pb" if p.is_symlink() or p.exists(): p.unlink() - p.symlink_to(f"{iepoch}epoch.pth") + p.symlink_to(f"{iepoch}epoch.pb") _improved = [] for _phase, k, _mode in trainer_options.best_model_criterion: @@ -407,16 +407,16 @@ class Trainer: # Creates sym links if it's the best result if best_epoch == iepoch: if trainer_options.use_pai: - p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pth") + p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pb") if trainer_options.oss_bucket.object_exists(p): trainer_options.oss_bucket.delete_object(p) trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name, - os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"),p) + os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"),p) else: - p = output_dir / f"{_phase}.{k}.best.pth" + p = output_dir / f"{_phase}.{k}.best.pb" if p.is_symlink() or p.exists(): 
p.unlink() - p.symlink_to(f"{iepoch}epoch.pth") + p.symlink_to(f"{iepoch}epoch.pb") _improved.append(f"{_phase}.{k}") if len(_improved) == 0: logging.info("There are no improvements in this epoch") @@ -438,7 +438,7 @@ class Trainer: type="model", metadata={"improved": _improved}, ) - artifact.add_file(str(output_dir / f"{iepoch}epoch.pth")) + artifact.add_file(str(output_dir / f"{iepoch}epoch.pb")) aliases = [ f"epoch-{iepoch}", "best" if best_epoch == iepoch else "", @@ -473,12 +473,12 @@ class Trainer: for e in range(1, iepoch): if trainer_options.use_pai: - p = os.path.join(trainer_options.output_dir, f"{e}epoch.pth") + p = os.path.join(trainer_options.output_dir, f"{e}epoch.pb") if trainer_options.oss_bucket.object_exists(p) and e not in nbests: trainer_options.oss_bucket.delete_object(p) _removed.append(str(p)) else: - p = output_dir / f"{e}epoch.pth" + p = output_dir / f"{e}epoch.pb" if p.exists() and e not in nbests: p.unlink() _removed.append(str(p)) From f1273775414e2f0664e00c44dbd8bbf897ba0183 Mon Sep 17 00:00:00 2001 From: mayong Date: Thu, 16 Mar 2023 11:44:13 +0800 Subject: [PATCH 34/37] Remove VAD. 
--- funasr/runtime/onnxruntime/src/Audio.cpp | 5 +- .../onnxruntime/src/librapidasrapi.cpp | 17 +++--- funasr/runtime/onnxruntime/tester/tester.cpp | 57 ++++++++++++++++--- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/funasr/runtime/onnxruntime/src/Audio.cpp b/funasr/runtime/onnxruntime/src/Audio.cpp index 43dfb6b78..53bf9d02a 100644 --- a/funasr/runtime/onnxruntime/src/Audio.cpp +++ b/funasr/runtime/onnxruntime/src/Audio.cpp @@ -237,7 +237,7 @@ bool Audio::loadpcmwav(const char* buf, int nBufLen) size_t nOffset = 0; -#define WAV_HEADER_SIZE 44 + speech_len = nBufLen / 2; speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size); @@ -263,7 +263,8 @@ bool Audio::loadpcmwav(const char* buf, int nBufLen) speech_data[i] = (float)speech_buff[i] / scale; } - + AudioFrame* frame = new AudioFrame(speech_len); + frame_queue.push(frame); return true; } diff --git a/funasr/runtime/onnxruntime/src/librapidasrapi.cpp b/funasr/runtime/onnxruntime/src/librapidasrapi.cpp index 1f8f7ca63..f5f9d66be 100644 --- a/funasr/runtime/onnxruntime/src/librapidasrapi.cpp +++ b/funasr/runtime/onnxruntime/src/librapidasrapi.cpp @@ -26,8 +26,9 @@ extern "C" { return nullptr; Audio audio(1); - audio.loadwav(szBuf,nLen); - audio.split(); + if (!audio.loadwav(szBuf, nLen)) + return nullptr; + //audio.split(); float* buff; int len; @@ -58,8 +59,9 @@ extern "C" { return nullptr; Audio audio(1); - audio.loadpcmwav(szBuf, nLen); - audio.split(); + if (!audio.loadpcmwav(szBuf, nLen)) + return nullptr; + //audio.split(); float* buff; int len; @@ -91,8 +93,9 @@ extern "C" { return nullptr; Audio audio(1); - audio.loadpcmwav(szFileName); - audio.split(); + if (!audio.loadpcmwav(szFileName)) + return nullptr; + //audio.split(); float* buff; int len; @@ -125,7 +128,7 @@ extern "C" { Audio audio(1); if(!audio.loadwav(szWavfile)) return nullptr; - audio.split(); + //audio.split(); float* buff; int len; diff --git a/funasr/runtime/onnxruntime/tester/tester.cpp 
b/funasr/runtime/onnxruntime/tester/tester.cpp index b9a85b7c0..ba5c61ccb 100644 --- a/funasr/runtime/onnxruntime/tester/tester.cpp +++ b/funasr/runtime/onnxruntime/tester/tester.cpp @@ -8,7 +8,7 @@ #include "librapidasrapi.h" #include - +#include using namespace std; int main(int argc, char *argv[]) @@ -40,10 +40,13 @@ int main(int argc, char *argv[]) gettimeofday(&start, NULL); - - RPASR_RESULT Result=RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL); - gettimeofday(&end, NULL); float snippet_time = 0.0f; + + + RPASR_RESULT Result=RapidAsrRecogFile(AsrHanlde, argv[2], RASR_NONE, NULL); + + gettimeofday(&end, NULL); + if (Result) { string msg = RapidAsrGetResult(Result, 0); @@ -56,11 +59,51 @@ int main(int argc, char *argv[]) } else { - cout <<("no return data!"); + cout <<"no return data!"; } - - printf("Audio length %lfs.\n", (double)snippet_time); + + + //char* buff = nullptr; + //int len = 0; + //ifstream ifs(argv[2], std::ios::binary | std::ios::in); + //if (ifs.is_open()) + //{ + // ifs.seekg(0, std::ios::end); + // len = ifs.tellg(); + // ifs.seekg(0, std::ios::beg); + // buff = new char[len]; + + // ifs.read(buff, len); + + + // //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL); + + // RPASR_RESULT Result=RapidAsrRecogPCMBuffer(AsrHanlde, buff,len, RASR_NONE, NULL); + // //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL); + // gettimeofday(&end, NULL); + // + // if (Result) + // { + // string msg = RapidAsrGetResult(Result, 0); + // setbuf(stdout, NULL); + // cout << "Result: \""; + // cout << msg << endl; + // cout << "\"." 
<< endl; + // snippet_time = RapidAsrGetRetSnippetTime(Result); + // RapidAsrFreeResult(Result); + // } + // else + // { + // cout <<"no return data!"; + // } + + // + //delete[]buff; + //} + + + printf("Audio length %lfs.\n", (double)snippet_time); seconds = (end.tv_sec - start.tv_sec); long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); printf("Model inference takes %lfs.\n", (double)taking_micros / 1000000); From cc8e2638455c02235b376900ae330bb6608e4494 Mon Sep 17 00:00:00 2001 From: "shixian.shi" Date: Thu, 16 Mar 2023 14:45:15 +0800 Subject: [PATCH 35/37] update setup.py --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index e6b9d38f6..c85476938 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,8 @@ requirements = { # PAI "oss2", "kaldi-native-fbank", + # timestamp + "edit-distance" ], # train: The modules invoked when training only. "train": [ From 80cc48a676668e70b585f38d1487bf377361a3d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= Date: Thu, 16 Mar 2023 15:02:00 +0800 Subject: [PATCH 36/37] readme --- README.md | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 0d1079b36..7cf5a6cea 100644 --- a/README.md +++ b/README.md @@ -17,34 +17,8 @@ ## What's new: -### 2023.2.17, funasr-0.2.0, modelscope-1.3.0 -- We support a new feature, export paraformer models into [onnx and torchscripts](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export) from modelscope. The local finetuned models are also supported. 
-- We support a new feature, [onnxruntime](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python), you could deploy the runtime without modelscope or funasr, for the [paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) model, the rtf of onnxruntime is 3x speedup(0.110->0.038) on cpu, [details](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer#speed). -- We support a new feature, [grpc](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc), you could build the ASR service with grpc, by deploying the modelscope pipeline or onnxruntime. -- We release a new model [paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary), which supports the hotword customization based on the incentive enhancement, and improves the recall and precision of hotwords. -- We optimize the timestamp alignment of [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), the prediction accuracy of timestamp is much improved, and achieving accumulated average shift (aas) of 74.7ms, [details](https://arxiv.org/abs/2301.12343). -- We release a new model, [8k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of none-silence speech. It could be freely integrated with any ASR models in [modelscope](https://github.com/alibaba-damo-academy/FunASR/discussions/134). -- We release a new model, [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary), a multi-channel multi-speaker model which is independent of the number and geometry of microphones and supports Mandarin meeting transcription. 
-- We release several new UniASR model: -[Southern Fujian Dialect model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/summary), -[French model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary), -[German model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary), -[Vietnamese model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary), -[Persian model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary). -- We release a new model, [paraformer-data2vec model](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/summary), an unsupervised pretraining model on AISHELL-2, which is inited for paraformer model and then finetune on AISHEL-1. -- We release a new feature, the `VAD`, `ASR` and `PUNC` models could be integrated freely, which could be models from [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), or the local finetine models. The [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/134). -- We optimized the [punctuation common model](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), enhance the recall and precision, fix the badcases of missing punctuation marks. -- Various new types of audio input types are now supported by modelscope inference pipeline, including: mp3、flac、ogg、opus... 
-### 2023.1.16, funasr-0.1.6, modelscope-1.2.0 -- We release a new version model [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), which integrate the [VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) model, [ASR](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), - [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) model and timestamp together. The model could take in several hours long inputs. -- We release a new model, [16k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of none-silence speech. It could be freely integrated with any ASR models in [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary). -- We release a new model, [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), which could predict the punctuation of ASR models's results. It could be freely integrated with any ASR models in [Model Zoo](docs/modelscope_models.md). -- We release a new model, [Data2vec](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/summary), an unsupervised pretraining model which could be finetuned on ASR and other downstream tasks. -- We release a new model, [Paraformer-Tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary), a lightweight Paraformer model which supports Mandarin command words recognition. 
-- We release a new model, [SV](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary), which could extract speaker embeddings and further perform speaker verification on paired utterances. It will be supported for speaker diarization in the future version. -- We improve the pipeline of modelscope to speedup the inference, by integrating the process of build model into build pipeline. -- Various new types of audio input types are now supported by modelscope inference pipeline, including wav.scp, wav format, audio bytes, wave samples... +## What's new: +For the release notes, please refer to [news](https://github.com/alibaba-damo-academy/FunASR/releases) ## Highlights - Many types of typical models are supported, e.g., [Tranformer](https://arxiv.org/abs/1706.03762), [Conformer](https://arxiv.org/abs/2005.08100), [Paraformer](https://arxiv.org/abs/2206.08317). From 64bd637c301c3e4f771466808c30ec96d5531e45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= Date: Thu, 16 Mar 2023 15:02:54 +0800 Subject: [PATCH 37/37] readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7cf5a6cea..23f1abec6 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,9 @@ | [**Model Zoo**](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) | [**Contact**](#contact) -## What's new: ## What's new: + For the release notes, please refer to [news](https://github.com/alibaba-damo-academy/FunASR/releases) ## Highlights