diff --git a/examples/industrial_data_pretraining/bicif_paraformer/demo.py b/examples/industrial_data_pretraining/bicif_paraformer/demo.py
index 83e024e08..4a5e33389 100644
--- a/examples/industrial_data_pretraining/bicif_paraformer/demo.py
+++ b/examples/industrial_data_pretraining/bicif_paraformer/demo.py
@@ -5,10 +5,13 @@
 
 from funasr import AutoModel
 
-model = AutoModel(model="../modelscope_models/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
-                  vad_model="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-                  punc_model="../modelscope_models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+model = AutoModel(model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+                  model_revision="v2.0.0",
+                  vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+                  vad_model_revision="v2.0.0",
+                  punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+                  punc_model_revision="v2.0.0",
                   )
 
-res = model(input="../modelscope_models/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", batch_size_s=300, batch_size_threshold_s=60)
+res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav", batch_size_s=300, batch_size_threshold_s=60)
 print(res)
\ No newline at end of file
diff --git a/examples/industrial_data_pretraining/bicif_paraformer/infer.sh b/examples/industrial_data_pretraining/bicif_paraformer/infer.sh
index 2ec237b23..04cb6f2aa 100644
--- a/examples/industrial_data_pretraining/bicif_paraformer/infer.sh
+++ b/examples/industrial_data_pretraining/bicif_paraformer/infer.sh
@@ -1,27 +1,21 @@
-# download model
-local_path_root=../modelscope_models
-mkdir -p ${local_path_root}
-
-local_path=${local_path_root}/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
-git clone https://www.modelscope.cn/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path}
-
-local_path_vad=${local_path_root}/speech_fsmn_vad_zh-cn-16k-common-pytorch
-git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git ${local_path_vad}
-
-local_path_punc=${local_path_root}/punc_ct-transformer_zh-cn-common-vocab272727-pytorch
-git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git ${local_path_punc}
-
+model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model_revision="v2.0.0"
+vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+vad_model_revision="v2.0.0"
+punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+punc_model_revision="v2.0.0"
 
 python funasr/bin/inference.py \
-+model="${local_path}" \
-+vad_model="${local_path_vad}" \
-+punc_model="${local_path_punc}" \
-+input="${local_path}/example/asr_example.wav" \
++model=${model} \
++model_revision=${model_revision} \
++vad_model=${vad_model} \
++vad_model_revision=${vad_model_revision} \
++punc_model=${punc_model} \
++punc_model_revision=${punc_model_revision} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav" \
 +output_dir="./outputs/debug" \
 +device="cpu" \
 +batch_size_s=300 \
-+batch_size_threshold_s=60 \
-+debug="true" \
-+"hotword='达摩院 魔搭'"
++batch_size_threshold_s=60
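Note on the pattern above (it repeats in every demo.py and infer.sh below): "damo/<model-name>" plus a model_revision / *_model_revision pin is a ModelScope hub id, so AutoModel resolves and caches the checkpoint itself and the manual git-clone bootstrap disappears. A minimal sketch of the equivalence, with the old clone path shown only for comparison:

    from funasr import AutoModel

    # before: clone by hand, then point AutoModel at the local checkout
    #   git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git ../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch
    #   model = AutoModel(model="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch")
    # after: pass the hub id and a pinned revision; the download happens on first use
    model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                      model_revision="v2.0.0")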
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo.py b/examples/industrial_data_pretraining/contextual_paraformer/demo.py
index 2e6d2034d..c705ca80f 100644
--- a/examples/industrial_data_pretraining/contextual_paraformer/demo.py
+++ b/examples/industrial_data_pretraining/contextual_paraformer/demo.py
@@ -5,8 +5,8 @@
 
 from funasr import AutoModel
 
-model = AutoModel(model="../modelscope_models/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404")
+model = AutoModel(model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404", model_revision="v2.0.0")
 
-res = model(input="../modelscope_models/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/example/asr_example.wav",
+res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
             hotword='达摩院 魔搭')
 print(res)
\ No newline at end of file
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/infer.sh b/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
index 92d1a8bfd..158ce8ac7 100644
--- a/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
+++ b/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
@@ -1,14 +1,11 @@
-# download model
-local_path_root=../modelscope_models
-mkdir -p ${local_path_root}
-local_path=${local_path_root}/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404
-git clone https://www.modelscope.cn/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404.git ${local_path}
-
+model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
+model_revision="v2.0.0"
 
 python funasr/bin/inference.py \
-+model="${local_path}" \
-+input="${local_path}/example/asr_example.wav" \
++model=${model} \
++model_revision=${model_revision} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
 +output_dir="./outputs/debug" \
 +device="cpu" \
 +"hotword='达摩院 魔搭'"
diff --git a/examples/industrial_data_pretraining/ct_transformer/demo.py b/examples/industrial_data_pretraining/ct_transformer/demo.py
index d3b63db22..58ebd2aef 100644
--- a/examples/industrial_data_pretraining/ct_transformer/demo.py
+++ b/examples/industrial_data_pretraining/ct_transformer/demo.py
@@ -5,7 +5,7 @@
 
 from funasr import AutoModel
 
-model = AutoModel(model="/Users/zhifu/Downloads/modelscope_models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")
+model = AutoModel(model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", model_revision="v2.0.0")
 
-res = model(input="/Users/zhifu/Downloads/modelscope_models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/example/punc_example.txt")
+res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")
 print(res)
\ No newline at end of file
diff --git a/examples/industrial_data_pretraining/ct_transformer/infer.sh b/examples/industrial_data_pretraining/ct_transformer/infer.sh
index bd8ac0567..a48d56208 100644
--- a/examples/industrial_data_pretraining/ct_transformer/infer.sh
+++ b/examples/industrial_data_pretraining/ct_transformer/infer.sh
@@ -1,13 +1,10 @@
-# download model
-local_path_root=../modelscope_models
-mkdir -p ${local_path_root}
-local_path=${local_path_root}/punc_ct-transformer_zh-cn-common-vocab272727-pytorch
-git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git ${local_path}
-
+model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+model_revision="v2.0.0"
 
 python funasr/bin/inference.py \
-+model="${local_path}" \
-+input="${local_path}/example/punc_example.txt" \
++model=${model} \
++model_revision=${model_revision} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt" \
 +output_dir="./outputs/debug" \
 +device="cpu"
-+input="${local_path}/example/punc_example.txt" \ ++model=${model} \ ++model_revision=${model_revision} \ ++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt" \ +output_dir="./outputs/debug" \ +device="cpu" diff --git a/examples/industrial_data_pretraining/emotion2vec/demo.py b/examples/industrial_data_pretraining/emotion2vec/demo.py index 4ce7025b6..365331340 100644 --- a/examples/industrial_data_pretraining/emotion2vec/demo.py +++ b/examples/industrial_data_pretraining/emotion2vec/demo.py @@ -5,7 +5,7 @@ from funasr import AutoModel -model = AutoModel(model="../modelscope_models/emotion2vec_base") +model = AutoModel(model="damo/emotion2vec_base", model_revision="v2.0.0") -res = model(input="../modelscope_models/emotion2vec_base/example/test.wav", output_dir="./outputs") +res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", output_dir="./outputs") print(res) \ No newline at end of file diff --git a/examples/industrial_data_pretraining/emotion2vec/infer.sh b/examples/industrial_data_pretraining/emotion2vec/infer.sh index 99600caea..9b98715bd 100644 --- a/examples/industrial_data_pretraining/emotion2vec/infer.sh +++ b/examples/industrial_data_pretraining/emotion2vec/infer.sh @@ -1,14 +1,10 @@ -# download model -local_path_root=../modelscope_models -mkdir -p ${local_path_root} -local_path=${local_path_root}/emotion2vec_base -git clone https://www.modelscope.cn/damo/emotion2vec_base.git ${local_path} -#local_path=/Users/zhifu/Downloads/modelscope_models/emotion2vec_base +model="damo/emotion2vec_base" +model_revision="v2.0.0" python funasr/bin/inference.py \ -+model="${local_path}" \ -+input="${local_path}/example/test.wav" \ ++model=${model} \ ++model_revision=${model_revision} \ ++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \ +output_dir="./outputs/debug" \ +device="cpu" \ -+debug=true diff --git a/examples/industrial_data_pretraining/fsmn_vad/demo.py b/examples/industrial_data_pretraining/fsmn_vad/demo.py index 6c112e2fd..2a157ee23 100644 --- a/examples/industrial_data_pretraining/fsmn_vad/demo.py +++ b/examples/industrial_data_pretraining/fsmn_vad/demo.py @@ -5,7 +5,7 @@ from funasr import AutoModel -model = AutoModel(model="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch") +model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.0") -res = model(input="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav") +res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav") print(res) \ No newline at end of file diff --git a/examples/industrial_data_pretraining/fsmn_vad/infer.sh b/examples/industrial_data_pretraining/fsmn_vad/infer.sh index 94e1b3db7..dedd14abb 100644 --- a/examples/industrial_data_pretraining/fsmn_vad/infer.sh +++ b/examples/industrial_data_pretraining/fsmn_vad/infer.sh @@ -1,13 +1,11 @@ -# download model -local_path_root=../modelscope_models -mkdir -p ${local_path_root} -local_path=${local_path_root}/speech_fsmn_vad_zh-cn-16k-common-pytorch -git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git ${local_path} +model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" +model_revision="v2.0.0" python funasr/bin/inference.py \ -+model="${local_path}" \ -+input="${local_path}/example/vad_example.wav" \ ++model=${model} \ ++model_revision=${model_revision} \ 
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav" \ +output_dir="./outputs/debug" \ +device="cpu" \ diff --git a/examples/industrial_data_pretraining/monotonic_aligner/demo.py b/examples/industrial_data_pretraining/monotonic_aligner/demo.py index 149d488e0..f5df457e5 100644 --- a/examples/industrial_data_pretraining/monotonic_aligner/demo.py +++ b/examples/industrial_data_pretraining/monotonic_aligner/demo.py @@ -5,9 +5,9 @@ from funasr import AutoModel -model = AutoModel(model="../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") +model = AutoModel(model="damo/speech_timestamp_prediction-v1-16k-offline", model_revision="v2.0.0") -res = model(input=("../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", +res = model(input=("https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", "欢迎大家来到魔搭社区进行体验"), data_type=("sound", "text"), batch_size=2, diff --git a/examples/industrial_data_pretraining/monotonic_aligner/infer.sh b/examples/industrial_data_pretraining/monotonic_aligner/infer.sh index 179e7148a..34fd1f904 100644 --- a/examples/industrial_data_pretraining/monotonic_aligner/infer.sh +++ b/examples/industrial_data_pretraining/monotonic_aligner/infer.sh @@ -1,14 +1,11 @@ -# download model -local_path_root=../modelscope_models -mkdir -p ${local_path_root} -local_path=${local_path_root}/speech_timestamp_prediction-v1-16k-offline - git clone https://www.modelscope.cn/damo/speech_timestamp_prediction-v1-16k-offline.git ${local_path} - +model="damo/speech_timestamp_prediction-v1-16k-offline" +model_revision="v2.0.0" python funasr/bin/inference.py \ -+model="${local_path}" \ -+input='["../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", "欢迎大家来到魔搭社区进行体验"]' \ ++model=${model} \ ++model_revision=${model_revision} \ ++input='["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", "欢迎大家来到魔搭社区进行体验"]' \ +data_type='["sound", "text"]' \ +output_dir="../outputs/debug" \ +device="cpu" \ diff --git a/examples/industrial_data_pretraining/paraformer/demo.py b/examples/industrial_data_pretraining/paraformer/demo.py index 119e14fcc..1f3b9a125 100644 --- a/examples/industrial_data_pretraining/paraformer/demo.py +++ b/examples/industrial_data_pretraining/paraformer/demo.py @@ -5,17 +5,17 @@ from funasr import AutoModel -model = AutoModel(model="../modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") +model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revison="v2.0.0") -res = model(input="../modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav") +res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav") print(res) from funasr import AutoFrontend -frontend = AutoFrontend(model="../modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") +frontend = AutoFrontend(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revison="v2.0.0") -fbanks = frontend(input="../modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", batch_size=2) +fbanks = 
frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", batch_size=2) for batch_idx, fbank_dict in enumerate(fbanks): res = model(**fbank_dict) diff --git a/examples/industrial_data_pretraining/paraformer/infer.sh b/examples/industrial_data_pretraining/paraformer/infer.sh index a9bd8cd87..9436628b7 100644 --- a/examples/industrial_data_pretraining/paraformer/infer.sh +++ b/examples/industrial_data_pretraining/paraformer/infer.sh @@ -1,14 +1,11 @@ -# download model -local_path_root=../modelscope_models -mkdir -p ${local_path_root} -local_path=${local_path_root}/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch -git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path} - +model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" +model_revision="v2.0.0" python funasr/bin/inference.py \ -+model="${local_path}" \ -+input="${local_path}/example/asr_example.wav" \ ++model=${model} \ ++model_revision=${model_revision} \ ++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \ +output_dir="./outputs/debug" \ +device="cpu" \ diff --git a/examples/industrial_data_pretraining/seaco_paraformer/demo.py b/examples/industrial_data_pretraining/seaco_paraformer/demo.py index 9aec94a08..84be0d8ff 100644 --- a/examples/industrial_data_pretraining/seaco_paraformer/demo.py +++ b/examples/industrial_data_pretraining/seaco_paraformer/demo.py @@ -5,12 +5,14 @@ from funasr import AutoModel -model = AutoModel(model="../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", +model = AutoModel(model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", + model_revision="v2.0.0", + vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", + vad_model_revision="v2.0.0", + punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", + punc_model_revision="v2.0.0", ) -#vad_model="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch", -#punc_model="../modelscope_models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" - -res = model(input="../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", +res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", hotword='达摩院 磨搭') print(res) \ No newline at end of file diff --git a/examples/industrial_data_pretraining/seaco_paraformer/infer.sh b/examples/industrial_data_pretraining/seaco_paraformer/infer.sh index 121c61006..e92d59873 100644 --- a/examples/industrial_data_pretraining/seaco_paraformer/infer.sh +++ b/examples/industrial_data_pretraining/seaco_paraformer/infer.sh @@ -1,23 +1,19 @@ -# download model -local_path_root=../modelscope_models -mkdir -p ${local_path_root} - -local_path=${local_path_root}/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch -git clone https://www.modelscope.cn/damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path} - -#local_path_vad=${local_path_root}/speech_fsmn_vad_zh-cn-16k-common-pytorch -#git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git ${local_path_vad} - -#local_path_punc=${local_path_root}/punc_ct-transformer_zh-cn-common-vocab272727-pytorch -#git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git 
diff --git a/funasr/bin/inference.py b/funasr/bin/inference.py
index 5b58907e2..dedaf7d48 100644
--- a/funasr/bin/inference.py
+++ b/funasr/bin/inference.py
@@ -20,6 +20,7 @@ from funasr.register import tables
 from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
 from funasr.utils.vad_utils import slice_padding_audio_samples
 from funasr.utils.timestamp_tools import time_stamp_sentence
+from funasr.download.file import download_from_url
 
 def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
     """
@@ -35,7 +36,8 @@ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
 
     filelist = [".scp", ".txt", ".json", ".jsonl"]
     chars = string.ascii_letters + string.digits
-
+    if isinstance(data_in, str) and data_in.startswith('http'): # url
+        data_in = download_from_url(data_in)
     if isinstance(data_in, str) and os.path.exists(data_in): # wav_path; filelist: wav.scp, file.jsonl;text.txt;
         _, file_extension = os.path.splitext(data_in)
         file_extension = file_extension.lower()
@@ -59,7 +61,7 @@ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
         data_list = [data_in]
         key_list = [key]
     elif isinstance(data_in, (list, tuple)):
-        if data_type is not None and isinstance(data_type, (list, tuple)):
+        if data_type is not None and isinstance(data_type, (list, tuple)): # multiple inputs
             data_list_tmp = []
             for data_in_i, data_type_i in zip(data_in, data_type):
                 key_list, data_list_i = prepare_data_iterator(data_in=data_in_i, data_type=data_type_i)
@@ -68,7 +70,7 @@ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
             for item in zip(*data_list_tmp):
                 data_list.append(item)
         else:
-            # [audio sample point, fbank]
+            # [audio sample point, fbank, text]
             data_list = data_in
             key_list = ["rand_key_" + ''.join(random.choice(chars) for _ in range(13)) for _ in range(len(data_in))]
     else: # raw text; audio sample point, fbank; bytes
@@ -198,13 +200,12 @@ class AutoModel:
         kwargs = self.kwargs if kwargs is None else kwargs
         kwargs.update(cfg)
         model = self.model if model is None else model
-
-        data_type = kwargs.get("data_type", "sound")
+
         batch_size = kwargs.get("batch_size", 1)
         # if kwargs.get("device", "cpu") == "cpu":
         #     batch_size = 1
 
-        key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=data_type, key=key)
+        key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key)
 
         speed_stats = {}
         asr_result_list = []
@@ -268,8 +269,8 @@
         batch_size = int(kwargs.get("batch_size_s", 300))*1000
         batch_size_threshold_ms = int(kwargs.get("batch_size_threshold_s", 60))*1000
         kwargs["batch_size"] = batch_size
-        data_type = kwargs.get("data_type", "sound")
-        key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=data_type)
+
+        key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None))
 
         results_ret_list = []
         time_speech_total_all_samples = 0.0
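With the prepare_data_iterator change above, a URL is a first-class input: it is downloaded once up front and then flows through the existing local-file branches. Both call sites also now pass kwargs.get("data_type", None) instead of hard-coding "sound", deferring the sound/text decision to load_audio_text_image_video. A sketch of the resulting call path (illustrative, not part of the patch):

    from funasr import AutoModel

    model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                      model_revision="v2.0.0")
    # prepare_data_iterator() sees the 'http' prefix, calls download_from_url(),
    # and hands the temp-file path to the normal os.path.exists() branch.
    res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
    print(res)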
int(kwargs.get("batch_size_threshold_s", 60))*1000 kwargs["batch_size"] = batch_size - data_type = kwargs.get("data_type", "sound") - key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=data_type) + + key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None)) results_ret_list = [] time_speech_total_all_samples = 0.0 diff --git a/funasr/download/download_from_hub.py b/funasr/download/download_from_hub.py index abf3ba0f8..8a4044d54 100644 --- a/funasr/download/download_from_hub.py +++ b/funasr/download/download_from_hub.py @@ -21,8 +21,6 @@ def download_fr_ms(**kwargs): config = os.path.join(model_or_path, "config.yaml") if os.path.exists(config) and os.path.exists(os.path.join(model_or_path, "model.pb")): - # config = os.path.join(model_or_path, "config.yaml") - # assert os.path.exists(config), "{} is not exist!".format(config) cfg = OmegaConf.load(config) kwargs = OmegaConf.merge(cfg, kwargs) init_param = os.path.join(model_or_path, "model.pb") @@ -42,10 +40,10 @@ def download_fr_ms(**kwargs): assert os.path.exists(os.path.join(model_or_path, "configuration.json")) with open(os.path.join(model_or_path, "configuration.json"), 'r', encoding='utf-8') as f: conf_json = json.load(f) - config = os.path.join(model_or_path, conf_json["model"]["model_config"]) + config = os.path.join(model_or_path, conf_json["model_config"]) cfg = OmegaConf.load(config) kwargs = OmegaConf.merge(cfg, kwargs) - init_param = os.path.join(model_or_path, conf_json["model"]["model_name"]) + init_param = os.path.join(model_or_path, conf_json["model_file"]) kwargs["init_param"] = init_param kwargs["model"] = cfg["model"] return OmegaConf.to_container(kwargs, resolve=True) diff --git a/funasr/download/file.py b/funasr/download/file.py index d93f24c96..da4958aae 100644 --- a/funasr/download/file.py +++ b/funasr/download/file.py @@ -8,7 +8,23 @@ from pathlib import Path from typing import Generator, Union import requests +from urllib.parse import urlparse +def download_from_url(url): + result = urlparse(url) + file_path = None + if result.scheme is not None and len(result.scheme) > 0: + storage = HTTPStorage() + # bytes + data = storage.read(url) + work_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(work_dir): + os.makedirs(work_dir) + file_path = os.path.join(work_dir, os.path.basename(url)) + with open(file_path, 'wb') as fb: + fb.write(data) + assert file_path is not None, f"failed to download: {url}" + return file_path class Storage(metaclass=ABCMeta): """Abstract class of storage. 
diff --git a/funasr/models/ct_transformer/model.py b/funasr/models/ct_transformer/model.py
index 24a6aea68..e32aa2564 100644
--- a/funasr/models/ct_transformer/model.py
+++ b/funasr/models/ct_transformer/model.py
@@ -11,6 +11,7 @@ from funasr.train_utils.device_funcs import to_device
 import torch
 import torch.nn as nn
 from funasr.models.ct_transformer.utils import split_to_mini_sentence, split_words
+from funasr.utils.load_utils import load_audio_text_image_video
 from funasr.register import tables
 
 
@@ -219,10 +220,10 @@
                   **kwargs,
                   ):
         assert len(data_in) == 1
-
+        text = load_audio_text_image_video(data_in, data_type=kwargs.get("data_type", "text"))[0]
         vad_indexes = kwargs.get("vad_indexes", None)
-        text = data_in[0]
-        text_lengths = data_lengths[0] if data_lengths is not None else None
+        # text = data_in[0]
+        # text_lengths = data_lengths[0] if data_lengths is not None else None
         split_size = kwargs.get("split_size", 20)
 
         tokens = split_words(text)
diff --git a/funasr/models/monotonic_aligner/model.py b/funasr/models/monotonic_aligner/model.py
index a0d745f2d..584b692bb 100644
--- a/funasr/models/monotonic_aligner/model.py
+++ b/funasr/models/monotonic_aligner/model.py
@@ -188,9 +188,12 @@
             text_postprocessed, time_stamp_postprocessed, _ = postprocess_utils.sentence_postprocess(token, timestamp)
             result_i = {"key": key[i], "text": text_postprocessed,
                         "timestamp": time_stamp_postprocessed,
-                        }
-            # ibest_writer["token"][key[i]] = " ".join(token)
-            ibest_writer["timestamp_list"][key[i]] = time_stamp_postprocessed
-            ibest_writer["timestamp_str"][key[i]] = timestamp_str
+                        }
             results.append(result_i)
+
+            if ibest_writer:
+                # ibest_writer["token"][key[i]] = " ".join(token)
+                ibest_writer["timestamp_list"][key[i]] = time_stamp_postprocessed
+                ibest_writer["timestamp_str"][key[i]] = timestamp_str
+
         return results, meta_data
\ No newline at end of file
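Two effects of the model changes above: CTTransformer now resolves its input through load_audio_text_image_video, so a raw string, a local .txt file (read by prepare_data_iterator), and, via the inference.py change, a URL all normalize to plain text before split_words(); and MonotonicAligner only writes timestamp files when ibest_writer exists, presumably the case only when an output_dir is configured. A hedged sketch of the punctuation entry points (not from a real run):

    from funasr import AutoModel

    model = AutoModel(model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                      model_revision="v2.0.0")
    res = model(input="欢迎大家来到魔搭社区进行体验")  # raw text string
    res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")  # URL to a .txt file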
diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py
index c5c3ffcf9..c82987f71 100644
--- a/funasr/utils/load_utils.py
+++ b/funasr/utils/load_utils.py
@@ -10,29 +10,13 @@ import time
 import logging
 from torch.nn.utils.rnn import pad_sequence
 try:
-    from urllib.parse import urlparse
-    from funasr.download.file import HTTPStorage
-    import tempfile
+    from funasr.download.file import download_from_url
 except:
     print("urllib is not installed, if you infer from url, please install it first.")
 
-# def load_audio(data_or_path_or_list, fs: int=16000, audio_fs: int=16000):
-#
-#     if isinstance(data_or_path_or_list, (list, tuple)):
-#         return [load_audio(audio, fs=fs, audio_fs=audio_fs) for audio in data_or_path_or_list]
-#
-#     if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list):
-#         data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list)
-#         data_or_path_or_list = data_or_path_or_list[0, :]
-#     elif isinstance(data_or_path_or_list, np.ndarray): # audio sample point
-#         data_or_path_or_list = np.squeeze(data_or_path_or_list) #[n_samples,]
-#
-#     if audio_fs != fs:
-#         resampler = torchaudio.transforms.Resample(audio_fs, fs)
-#         data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :]
-#     return data_or_path_or_list
-def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type=None, tokenizer=None):
+
+def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type="sound", tokenizer=None):
 
     if isinstance(data_or_path_or_list, (list, tuple)):
         if data_type is not None and isinstance(data_type, (list, tuple)):
@@ -47,16 +31,22 @@
             return data_or_path_or_list_ret
         else:
-            return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs) for audio in data_or_path_or_list]
+            return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type) for audio in data_or_path_or_list]
 
     if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'):
         data_or_path_or_list = download_from_url(data_or_path_or_list)
 
     if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list):
-        data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list)
-        data_or_path_or_list = data_or_path_or_list[0, :]
+        if data_type is None or data_type == "sound":
+            data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list)
+            data_or_path_or_list = data_or_path_or_list[0, :]
+        # elif data_type == "text" and tokenizer is not None:
+        #     data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
+    elif isinstance(data_or_path_or_list, str) and data_type == "text" and tokenizer is not None:
+        data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
     elif isinstance(data_or_path_or_list, np.ndarray): # audio sample point
         data_or_path_or_list = np.squeeze(data_or_path_or_list) # [n_samples,]
-    elif isinstance(data_or_path_or_list, str) and data_type is not None and data_type == "text" and tokenizer is not None:
-        data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
+    else:
+        pass
+        # print(f"unsupported data type: {data_or_path_or_list}, return raw data")
 
     if audio_fs != fs and data_type != "text":
         resampler = torchaudio.transforms.Resample(audio_fs, fs)
         data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :]
@@ -107,19 +97,3 @@ def extract_fbank(data, data_len = None, data_type: str="sound", frontend=None):
         data_len = torch.tensor([data_len])
 
     return data.to(torch.float32), data_len.to(torch.int32)
-def download_from_url(url):
-
-    result = urlparse(url)
-    file_path = None
-    if result.scheme is not None and len(result.scheme) > 0:
-        storage = HTTPStorage()
-        # bytes
-        data = storage.read(url)
-        work_dir = tempfile.TemporaryDirectory().name
-        if not os.path.exists(work_dir):
-            os.makedirs(work_dir)
-        file_path = os.path.join(work_dir, os.path.basename(url))
-        with open(file_path, 'wb') as fb:
-            fb.write(data)
-    assert file_path is not None, f"failed to download: {url}"
-    return file_path
\ No newline at end of file
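Taken together, the load_utils.py rewrite makes data_type the single dispatch point of load_audio_text_image_video: "sound" (or None) goes through torchaudio, "text" through the tokenizer, ndarrays are squeezed, and everything else passes through unchanged. A condensed restatement of the new branching (a sketch, not the patch verbatim; load_one is a hypothetical name):

    import os
    import numpy as np
    import torchaudio
    from funasr.download.file import download_from_url

    def load_one(item, data_type="sound", tokenizer=None):
        if isinstance(item, str) and item.startswith("http"):
            item = download_from_url(item)               # URL -> local temp file
        if isinstance(item, str) and os.path.exists(item):
            if data_type is None or data_type == "sound":
                item, _audio_fs = torchaudio.load(item)  # audio file -> tensor
                item = item[0, :]                        # first channel, [n_samples]
        elif isinstance(item, str) and data_type == "text" and tokenizer is not None:
            item = tokenizer.encode(item)                # raw text -> token ids
        elif isinstance(item, np.ndarray):
            item = np.squeeze(item)                      # raw samples -> [n_samples]
        # anything else (fbank tensors, bytes, text without a tokenizer) is returned as-is
        return item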