diff --git a/docs/images/dingding.jpg b/docs/images/dingding.jpg
index e12e95296..5e6caced5 100644
Binary files a/docs/images/dingding.jpg and b/docs/images/dingding.jpg differ
diff --git a/docs/images/wechat.png b/docs/images/wechat.png
index 9fc6f4e17..8aec4cc11 100644
Binary files a/docs/images/wechat.png and b/docs/images/wechat.png differ
diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py
index ca0201a75..d419018ec 100644
--- a/funasr/bin/asr_inference.py
+++ b/funasr/bin/asr_inference.py
@@ -46,11 +46,6 @@ from funasr.models.frontend.wav_frontend import WavFrontend
 header_colors = '\033[95m'
 end_colors = '\033[0m'
 
-global_asr_language: str = 'zh-cn'
-global_sample_rate: Union[int, Dict[Any, int]] = {
-    'audio_fs': 16000,
-    'model_fs': 16000
-}
 
 class Speech2Text:
     """Speech2Text class
@@ -256,142 +251,6 @@ class Speech2Text:
         assert check_return_type(results)
         return results
 
-
-# def inference(
-#     maxlenratio: float,
-#     minlenratio: float,
-#     batch_size: int,
-#     beam_size: int,
-#     ngpu: int,
-#     ctc_weight: float,
-#     lm_weight: float,
-#     penalty: float,
-#     log_level: Union[int, str],
-#     data_path_and_name_and_type,
-#     asr_train_config: Optional[str],
-#     asr_model_file: Optional[str],
-#     cmvn_file: Optional[str] = None,
-#     lm_train_config: Optional[str] = None,
-#     lm_file: Optional[str] = None,
-#     token_type: Optional[str] = None,
-#     key_file: Optional[str] = None,
-#     word_lm_train_config: Optional[str] = None,
-#     bpemodel: Optional[str] = None,
-#     allow_variable_data_keys: bool = False,
-#     streaming: bool = False,
-#     output_dir: Optional[str] = None,
-#     dtype: str = "float32",
-#     seed: int = 0,
-#     ngram_weight: float = 0.9,
-#     nbest: int = 1,
-#     num_workers: int = 1,
-#     **kwargs,
-# ):
-#     assert check_argument_types()
-#     if batch_size > 1:
-#         raise NotImplementedError("batch decoding is not implemented")
-#     if word_lm_train_config is not None:
-#         raise NotImplementedError("Word LM is not implemented")
-#     if ngpu > 1:
-#         raise NotImplementedError("only single GPU decoding is supported")
-#
-#     logging.basicConfig(
-#         level=log_level,
-#         format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-#     )
-#
-#     if ngpu >= 1 and torch.cuda.is_available():
-#         device = "cuda"
-#     else:
-#         device = "cpu"
-#
-#     # 1. Set random-seed
-#     set_all_random_seed(seed)
-#
-#     # 2. Build speech2text
-#     speech2text_kwargs = dict(
-#         asr_train_config=asr_train_config,
-#         asr_model_file=asr_model_file,
-#         cmvn_file=cmvn_file,
-#         lm_train_config=lm_train_config,
-#         lm_file=lm_file,
-#         token_type=token_type,
-#         bpemodel=bpemodel,
-#         device=device,
-#         maxlenratio=maxlenratio,
-#         minlenratio=minlenratio,
-#         dtype=dtype,
-#         beam_size=beam_size,
-#         ctc_weight=ctc_weight,
-#         lm_weight=lm_weight,
-#         ngram_weight=ngram_weight,
-#         penalty=penalty,
-#         nbest=nbest,
-#         streaming=streaming,
-#     )
-#     logging.info("speech2text_kwargs: {}".format(speech2text_kwargs))
-#     speech2text = Speech2Text(**speech2text_kwargs)
-#
-#     # 3. Build data-iterator
-#     loader = ASRTask.build_streaming_iterator(
-#         data_path_and_name_and_type,
-#         dtype=dtype,
-#         batch_size=batch_size,
-#         key_file=key_file,
-#         num_workers=num_workers,
-#         preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
-#         collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
-#         allow_variable_data_keys=allow_variable_data_keys,
-#         inference=True,
-#     )
-#
-#     finish_count = 0
-#     file_count = 1
-#     # 7 .Start for-loop
-#     # FIXME(kamo): The output format should be discussed about
-#     asr_result_list = []
-#     if output_dir is not None:
-#         writer = DatadirWriter(output_dir)
-#     else:
-#         writer = None
-#
-#     for keys, batch in loader:
-#         assert isinstance(batch, dict), type(batch)
-#         assert all(isinstance(s, str) for s in keys), keys
-#         _bs = len(next(iter(batch.values())))
-#         assert len(keys) == _bs, f"{len(keys)} != {_bs}"
-#         #batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
-#
-#         # N-best list of (text, token, token_int, hyp_object)
-#         try:
-#             results = speech2text(**batch)
-#         except TooShortUttError as e:
-#             logging.warning(f"Utterance {keys} {e}")
-#             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-#             results = [[" ", ["sil"], [2], hyp]] * nbest
-#
-#         # Only supporting batch_size==1
-#         key = keys[0]
-#         for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
-#             # Create a directory: outdir/{n}best_recog
-#             if writer is not None:
-#                 ibest_writer = writer[f"{n}best_recog"]
-#
-#                 # Write the result to each file
-#                 ibest_writer["token"][key] = " ".join(token)
-#                 ibest_writer["token_int"][key] = " ".join(map(str, token_int))
-#                 ibest_writer["score"][key] = str(hyp.score)
-#
-#             if text is not None:
-#                 text_postprocessed = postprocess_utils.sentence_postprocess(token)
-#                 item = {'key': key, 'value': text_postprocessed}
-#                 asr_result_list.append(item)
-#                 finish_count += 1
-#                 asr_utils.print_progress(finish_count / file_count)
-#                 if writer is not None:
-#                     ibest_writer["text"][key] = text
-#     return asr_result_list
-
 def inference(
     maxlenratio: float,
     minlenratio: float,
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 14555173b..5d7d6ea77 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -280,162 +280,6 @@ class Speech2Text:
         return results
 
-
-# def inference(
-#     maxlenratio: float,
-#     minlenratio: float,
-#     batch_size: int,
-#     beam_size: int,
-#     ngpu: int,
-#     ctc_weight: float,
-#     lm_weight: float,
-#     penalty: float,
-#     log_level: Union[int, str],
-#     data_path_and_name_and_type,
-#     asr_train_config: Optional[str],
-#     asr_model_file: Optional[str],
-#     cmvn_file: Optional[str] = None,
-#     raw_inputs: Union[np.ndarray, torch.Tensor] = None,
-#     lm_train_config: Optional[str] = None,
-#     lm_file: Optional[str] = None,
-#     token_type: Optional[str] = None,
-#     key_file: Optional[str] = None,
-#     word_lm_train_config: Optional[str] = None,
-#     bpemodel: Optional[str] = None,
-#     allow_variable_data_keys: bool = False,
-#     streaming: bool = False,
-#     output_dir: Optional[str] = None,
-#     dtype: str = "float32",
-#     seed: int = 0,
-#     ngram_weight: float = 0.9,
-#     nbest: int = 1,
-#     num_workers: int = 1,
-#     frontend_conf: dict = None,
-#     fs: Union[dict, int] = 16000,
-#     lang: Optional[str] = None,
-#     **kwargs,
-# ):
-#     assert check_argument_types()
-#
-#     if word_lm_train_config is not None:
-#         raise NotImplementedError("Word LM is not implemented")
-#     if ngpu > 1:
-#         raise NotImplementedError("only single GPU decoding is supported")
-#
-#     logging.basicConfig(
-#         level=log_level,
-#         format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-#     )
-#
-#     if ngpu >= 1 and torch.cuda.is_available():
-#         device = "cuda"
-#     else:
-#         device = "cpu"
-#
-#     # 1. Set random-seed
-#     set_all_random_seed(seed)
-#
-#     # 2. Build speech2text
-#     speech2text_kwargs = dict(
-#         asr_train_config=asr_train_config,
-#         asr_model_file=asr_model_file,
-#         cmvn_file=cmvn_file,
-#         lm_train_config=lm_train_config,
-#         lm_file=lm_file,
-#         token_type=token_type,
-#         bpemodel=bpemodel,
-#         device=device,
-#         maxlenratio=maxlenratio,
-#         minlenratio=minlenratio,
-#         dtype=dtype,
-#         beam_size=beam_size,
-#         ctc_weight=ctc_weight,
-#         lm_weight=lm_weight,
-#         ngram_weight=ngram_weight,
-#         penalty=penalty,
-#         nbest=nbest,
-#         frontend_conf=frontend_conf,
-#     )
-#     speech2text = Speech2Text(**speech2text_kwargs)
-#
-#     # 3. Build data-iterator
-#     loader = ASRTask.build_streaming_iterator(
-#         data_path_and_name_and_type,
-#         dtype=dtype,
-#         batch_size=batch_size,
-#         key_file=key_file,
-#         num_workers=num_workers,
-#         preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
-#         collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
-#         allow_variable_data_keys=allow_variable_data_keys,
-#         inference=True,
-#     )
-#
-#     forward_time_total = 0.0
-#     length_total = 0.0
-#     finish_count = 0
-#     file_count = 1
-#     # 7 .Start for-loop
-#     # FIXME(kamo): The output format should be discussed about
-#     asr_result_list = []
-#     if output_dir is not None:
-#         writer = DatadirWriter(output_dir)
-#     else:
-#         writer = None
-#
-#     for keys, batch in loader:
-#         assert isinstance(batch, dict), type(batch)
-#         assert all(isinstance(s, str) for s in keys), keys
-#         _bs = len(next(iter(batch.values())))
-#         assert len(keys) == _bs, f"{len(keys)} != {_bs}"
-#         # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}
-#
-#         logging.info("decoding, utt_id: {}".format(keys))
-#         # N-best list of (text, token, token_int, hyp_object)
-#
-#         time_beg = time.time()
-#         results = speech2text(**batch)
-#         if len(results) < 1:
-#             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-#             results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
-#         time_end = time.time()
-#         forward_time = time_end - time_beg
-#         lfr_factor = results[0][-1]
-#         length = results[0][-2]
-#         forward_time_total += forward_time
-#         length_total += length
-#         logging.info(
-#             "decoding, feature length: {}, forward_time: {:.4f}, rtf: {:.4f}".
-#                 format(length, forward_time, 100 * forward_time / (length*lfr_factor)))
-#
-#         for batch_id in range(_bs):
-#             result = [results[batch_id][:-2]]
-#
-#             key = keys[batch_id]
-#             for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), result):
-#                 # Create a directory: outdir/{n}best_recog
-#                 if writer is not None:
-#                     ibest_writer = writer[f"{n}best_recog"]
-#
-#                     # Write the result to each file
-#                     ibest_writer["token"][key] = " ".join(token)
-#                     ibest_writer["token_int"][key] = " ".join(map(str, token_int))
-#                     ibest_writer["score"][key] = str(hyp.score)
-#
-#                 if text is not None:
-#                     text_postprocessed = postprocess_utils.sentence_postprocess(token)
-#                     item = {'key': key, 'value': text_postprocessed}
-#                     asr_result_list.append(item)
-#                     finish_count += 1
-#                     # asr_utils.print_progress(finish_count / file_count)
-#                     if writer is not None:
-#                         ibest_writer["text"][key] = text
-#
-#                 logging.info("decoding, utt: {}, predictions: {}".format(key, text))
-#
-#     logging.info("decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".
-#                  format(length_total, forward_time_total, 100 * forward_time_total / (length_total*lfr_factor)))
-#     return asr_result_list
-
 def inference(
     maxlenratio: float,
     minlenratio: float,
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index 27026177f..9001f1d6a 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -40,18 +40,10 @@ from funasr.models.frontend.wav_frontend import WavFrontend
 from funasr.tasks.vad import VADTask
 from funasr.utils.timestamp_tools import time_stamp_lfr6
 from funasr.bin.punctuation_infer import Text2Punc
-from funasr.torch_utils.forward_adaptor import ForwardAdaptor
-from funasr.datasets.preprocessor import CommonPreprocessor
-from funasr.punctuation.text_preprocessor import split_to_mini_sentence
 
 header_colors = '\033[95m'
 end_colors = '\033[0m'
 
-global_asr_language: str = 'zh-cn'
-global_sample_rate: Union[int, Dict[Any, int]] = {
-    'audio_fs': 16000,
-    'model_fs': 16000
-}
 
 class Speech2Text:
     """Speech2Text class
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 8a9905830..cfec9a00c 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -272,150 +272,6 @@ class Speech2Text:
         return results
 
-
-# def inference(
-#     maxlenratio: float,
-#     minlenratio: float,
-#     batch_size: int,
-#     beam_size: int,
-#     ngpu: int,
-#     ctc_weight: float,
-#     lm_weight: float,
-#     penalty: float,
-#     log_level: Union[int, str],
-#     data_path_and_name_and_type,
-#     asr_train_config: Optional[str],
-#     asr_model_file: Optional[str],
-#     ngram_file: Optional[str] = None,
-#     cmvn_file: Optional[str] = None,
-#     raw_inputs: Union[np.ndarray, torch.Tensor] = None,
-#     lm_train_config: Optional[str] = None,
-#     lm_file: Optional[str] = None,
-#     token_type: Optional[str] = None,
-#     key_file: Optional[str] = None,
-#     word_lm_train_config: Optional[str] = None,
-#     bpemodel: Optional[str] = None,
-#     allow_variable_data_keys: bool = False,
-#     streaming: bool = False,
-#     output_dir: Optional[str] = None,
-#     dtype: str = "float32",
-#     seed: int = 0,
-#     ngram_weight: float = 0.9,
-#     nbest: int = 1,
-#     num_workers: int = 1,
-#     token_num_relax: int = 1,
-#     decoding_ind: int = 0,
-#     decoding_mode: str = "model1",
-#     **kwargs,
-# ):
-#     assert check_argument_types()
-#     if batch_size > 1:
-#         raise NotImplementedError("batch decoding is not implemented")
-#     if word_lm_train_config is not None:
-#         raise NotImplementedError("Word LM is not implemented")
-#     if ngpu > 1:
-#         raise NotImplementedError("only single GPU decoding is supported")
-#
-#     logging.basicConfig(
-#         level=log_level,
-#         format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-#     )
-#
-#     if ngpu >= 1 and torch.cuda.is_available():
-#         device = "cuda"
-#     else:
-#         device = "cpu"
-#
-#     # 1. Set random-seed
-#     set_all_random_seed(seed)
-#
-#     # 2. Build speech2text
-#     speech2text_kwargs = dict(
-#         asr_train_config=asr_train_config,
-#         asr_model_file=asr_model_file,
-#         cmvn_file=cmvn_file,
-#         lm_train_config=lm_train_config,
-#         lm_file=lm_file,
-#         ngram_file=ngram_file,
-#         token_type=token_type,
-#         bpemodel=bpemodel,
-#         device=device,
-#         maxlenratio=maxlenratio,
-#         minlenratio=minlenratio,
-#         dtype=dtype,
-#         beam_size=beam_size,
-#         ctc_weight=ctc_weight,
-#         lm_weight=lm_weight,
-#         ngram_weight=ngram_weight,
-#         penalty=penalty,
-#         nbest=nbest,
-#         streaming=streaming,
-#         token_num_relax=token_num_relax,
-#         decoding_ind=decoding_ind,
-#         decoding_mode=decoding_mode,
-#     )
-#     speech2text = Speech2Text(**speech2text_kwargs)
-#
-#     # 3. Build data-iterator
-#     loader = ASRTask.build_streaming_iterator(
-#         data_path_and_name_and_type,
-#         dtype=dtype,
-#         batch_size=batch_size,
-#         key_file=key_file,
-#         num_workers=num_workers,
-#         preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
-#         collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
-#         allow_variable_data_keys=allow_variable_data_keys,
-#         inference=True,
-#     )
-#
-#     finish_count = 0
-#     file_count = 1
-#     # 7 .Start for-loop
-#     # FIXME(kamo): The output format should be discussed about
-#     asr_result_list = []
-#     if output_dir is not None:
-#         writer = DatadirWriter(output_dir)
-#     else:
-#         writer = None
-#
-#     for keys, batch in loader:
-#         assert isinstance(batch, dict), type(batch)
-#         assert all(isinstance(s, str) for s in keys), keys
-#         _bs = len(next(iter(batch.values())))
-#         assert len(keys) == _bs, f"{len(keys)} != {_bs}"
-#         #batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
-#
-#         # N-best list of (text, token, token_int, hyp_object)
-#         try:
-#             results = speech2text(**batch)
-#         except TooShortUttError as e:
-#             logging.warning(f"Utterance {keys} {e}")
-#             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-#             results = [[" ", ["sil"], [2], hyp]] * nbest
-#
-#         # Only supporting batch_size==1
-#         key = keys[0]
-#         logging.info(f"Utterance: {key}")
-#         for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
-#             # Create a directory: outdir/{n}best_recog
-#             if writer is not None:
-#                 ibest_writer = writer[f"{n}best_recog"]
-#
-#                 # Write the result to each file
-#                 ibest_writer["token"][key] = " ".join(token)
-#                 ibest_writer["token_int"][key] = " ".join(map(str, token_int))
-#                 ibest_writer["score"][key] = str(hyp.score)
-#
-#             if text is not None:
-#                 text_postprocessed = postprocess_utils.sentence_postprocess(token)
-#                 item = {'key': key, 'value': text_postprocessed}
-#                 asr_result_list.append(item)
-#                 finish_count += 1
-#                 asr_utils.print_progress(finish_count / file_count)
-#                 if writer is not None:
-#                     ibest_writer["text"][key] = text
-#     return asr_result_list
-
 def inference(
     maxlenratio: float,
     minlenratio: float,
diff --git a/funasr/bin/sv_inference.py b/funasr/bin/sv_inference.py
index 6da696a4e..57ce91d6d 100755
--- a/funasr/bin/sv_inference.py
+++ b/funasr/bin/sv_inference.py
@@ -214,6 +214,7 @@ def inference_modelscope(
         data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
         raw_inputs: Union[np.ndarray, torch.Tensor] = None,
         output_dir_v2: Optional[str] = None,
+        fs: dict = None,
         param_dict: Optional[dict] = None,
 ):
     logging.info("param_dict: {}".format(param_dict))
diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py
index eb51400bf..9f1d0f310 100644
--- a/funasr/bin/vad_inference.py
+++ b/funasr/bin/vad_inference.py
@@ -116,90 +116,6 @@ class Speech2VadSegment:
         return segments
 
-
-#def inference(
-#    batch_size: int,
-#    ngpu: int,
-#    log_level: Union[int, str],
-#    data_path_and_name_and_type,
-#    vad_infer_config: Optional[str],
-#    vad_model_file: Optional[str],
-#    vad_cmvn_file: Optional[str] = None,
-#    raw_inputs: Union[np.ndarray, torch.Tensor] = None,
-#    key_file: Optional[str] = None,
-#    allow_variable_data_keys: bool = False,
-#    output_dir: Optional[str] = None,
-#    dtype: str = "float32",
-#    seed: int = 0,
-#    num_workers: int = 1,
-#    fs: Union[dict, int] = 16000,
-#    **kwargs,
-#):
-#    assert check_argument_types()
-#    if batch_size > 1:
-#        raise NotImplementedError("batch decoding is not implemented")
-#    if ngpu > 1:
-#        raise NotImplementedError("only single GPU decoding is supported")
-#
-#    logging.basicConfig(
-#        level=log_level,
-#        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-#    )
-#
-#    if ngpu >= 1 and torch.cuda.is_available():
-#        device = "cuda"
-#    else:
-#        device = "cpu"
-#
-#    # 1. Set random-seed
-#    set_all_random_seed(seed)
-#
-#    # 2. Build speech2vadsegment
-#    speech2vadsegment_kwargs = dict(
-#        vad_infer_config=vad_infer_config,
-#        vad_model_file=vad_model_file,
-#        vad_cmvn_file=vad_cmvn_file,
-#        device=device,
-#        dtype=dtype,
-#    )
-#    logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
-#    speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs)
-#    # 3. Build data-iterator
-#    loader = VADTask.build_streaming_iterator(
-#        data_path_and_name_and_type,
-#        dtype=dtype,
-#        batch_size=batch_size,
-#        key_file=key_file,
-#        num_workers=num_workers,
-#        preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
-#        collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
-#        allow_variable_data_keys=allow_variable_data_keys,
-#        inference=True,
-#    )
-#
-#    finish_count = 0
-#    file_count = 1
-#    # 7 .Start for-loop
-#    # FIXME(kamo): The output format should be discussed about
-#    if output_dir is not None:
-#        writer = DatadirWriter(output_dir)
-#    else:
-#        writer = None
-#
-#    vad_results = []
-#    for keys, batch in loader:
-#        assert isinstance(batch, dict), type(batch)
-#        assert all(isinstance(s, str) for s in keys), keys
-#        _bs = len(next(iter(batch.values())))
-#        assert len(keys) == _bs, f"{len(keys)} != {_bs}"
-#        # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
-#
-#        # do vad segment
-#        results = speech2vadsegment(**batch)
-#        for i, _ in enumerate(keys):
-#            item = {'key': keys[i], 'value': results[i]}
-#            vad_results.append(item)
-#
-#    return vad_results
 def inference(