From ea71c0f891ab2307dd71322f83e470e216af81fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?=
Date: Wed, 21 Feb 2024 14:41:39 +0800
Subject: [PATCH 001/101] add test

---
 .gitignore | 1 +
 .../contextual_paraformer/demo.py | 0
 .../contextual_paraformer/demo2.py | 9 +++++++++
 .../contextual_paraformer/infer.sh | 2 +-
 .../contextual_paraformer/path.sh | 6 ++++++
 5 files changed, 17 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 examples/industrial_data_pretraining/contextual_paraformer/demo.py
 create mode 100644 examples/industrial_data_pretraining/contextual_paraformer/demo2.py
 mode change 100644 => 100755 examples/industrial_data_pretraining/contextual_paraformer/infer.sh
 create mode 100755 examples/industrial_data_pretraining/contextual_paraformer/path.sh

diff --git a/.gitignore b/.gitignore
index 6bdfd5d06..23864c3e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,4 @@ samples
 outputs*
 emotion2vec*
 GPT-SoVITS*
+examples/*/*/outputs
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo.py b/examples/industrial_data_pretraining/contextual_paraformer/demo.py
old mode 100644
new mode 100755
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo2.py b/examples/industrial_data_pretraining/contextual_paraformer/demo2.py
new file mode 100644
index 000000000..30bb76fbc
--- /dev/null
+++ b/examples/industrial_data_pretraining/contextual_paraformer/demo2.py
@@ -0,0 +1,9 @@
+python -m funasr.bin.inference \
+--config-path="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3" \
+--config-name="config.yaml" \
+++init_param="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3/model.pt.ep38" \
+++tokenizer_conf.token_list="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/zh_token_list/char/tokens.txt" \
+++frontend_conf.cmvn_file="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/train/am.mvn" \
+++input="/mnt/nfs/zhifu.gzf/data/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav" \
+++output_dir="./outputs/debug" \
+++device="cuda:0" \
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/infer.sh b/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
old mode 100644
new mode 100755
index 8fc66f34f..1bd4f7f5b
--- a/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
+++ b/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
@@ -2,7 +2,7 @@
 model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
 model_revision="v2.0.4"
 
-python funasr/bin/inference.py \
+python ../../../funasr/bin/inference.py \
 +model=${model} \
 +model_revision=${model_revision} \
 +input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/path.sh b/examples/industrial_data_pretraining/contextual_paraformer/path.sh
new file mode 100755
index 000000000..1a6d67e08
--- /dev/null
+++ b/examples/industrial_data_pretraining/contextual_paraformer/path.sh
@@ -0,0 +1,6 @@
+export FUNASR_DIR=$PWD/../../../
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
+export PYTHONPATH=$FUNASR_DIR/funasr/bin:$FUNASR_DIR/funasr:$FUNASR_DIR:$PYTHONPATH

From 691f3235cdfaea38bc92f52b5f9e14cc7ea98dcf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?=
Date: Wed, 21 Feb 2024 15:10:00 +0800
Subject: [PATCH 002/101] add test

---
 .../contextual_paraformer/demo2.py | 9 ---------
 .../contextual_paraformer/demo2.sh | 9 +++++++++
 2 files changed, 9 insertions(+), 9 deletions(-)
 delete mode 100644 examples/industrial_data_pretraining/contextual_paraformer/demo2.py
 create mode 100644 examples/industrial_data_pretraining/contextual_paraformer/demo2.sh

diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo2.py b/examples/industrial_data_pretraining/contextual_paraformer/demo2.py
deleted file mode 100644
index 30bb76fbc..000000000
--- a/examples/industrial_data_pretraining/contextual_paraformer/demo2.py
+++ /dev/null
@@ -1,9 +0,0 @@
-python -m funasr.bin.inference \
---config-path="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3" \
---config-name="config.yaml" \
-++init_param="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3/model.pt.ep38" \
-++tokenizer_conf.token_list="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/zh_token_list/char/tokens.txt" \
-++frontend_conf.cmvn_file="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/train/am.mvn" \
-++input="/mnt/nfs/zhifu.gzf/data/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav" \
-++output_dir="./outputs/debug" \
-++device="cuda:0" \
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo2.sh b/examples/industrial_data_pretraining/contextual_paraformer/demo2.sh
new file mode 100644
index 000000000..282f4f1f2
--- /dev/null
+++ b/examples/industrial_data_pretraining/contextual_paraformer/demo2.sh
@@ -0,0 +1,9 @@
+python -m funasr.bin.inference \
+--config-path="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404" \
+--config-name="config.yaml" \
+++init_param="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/model.pb" \
+++tokenizer_conf.token_list="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/tokens.txt" \
+++frontend_conf.cmvn_file="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/am.mvn" \
+++input="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/asr_example_zh.wav" \
+++output_dir="./outputs/debug2" \
+++device="" \

From 50ee4bafdcf6f0fca6b31ddf208f9575821a5455 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?=
Date: Wed, 21 Feb 2024 15:33:27 +0800
Subject: [PATCH 003/101] new

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 23864c3e0..bdfe70f1a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,4 @@ outputs*
 emotion2vec*
 GPT-SoVITS*
 examples/*/*/outputs
+cmd_read

From 49bec4052b766dd57580ef83aababaab02b64f5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?=
Date: Wed, 21 Feb 2024 15:43:52 +0800
Subject: [PATCH 004/101] add test

---
 new | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 new

diff --git a/new b/new
new file mode 100644
index 000000000..9daeafb98
--- /dev/null
+++ b/new
@@ -0,0 +1 @@
+test

From eeccdc9a5d72f496f5e7b2a0e3dd381bebcc6ff9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?=
Date: 
Wed, 21 Feb 2024 15:50:16 +0800 Subject: [PATCH 005/101] new --- new | 1 + 1 file changed, 1 insertion(+) diff --git a/new b/new index 9daeafb98..ae69a4cd4 100644 --- a/new +++ b/new @@ -1 +1,2 @@ test +sda From a4df88c96fb6e44bc4f684a1289aca99dd8d7eb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 21 Feb 2024 16:12:05 +0800 Subject: [PATCH 006/101] test --- new | 2 -- new2 | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 new create mode 100644 new2 diff --git a/new b/new deleted file mode 100644 index ae69a4cd4..000000000 --- a/new +++ /dev/null @@ -1,2 +0,0 @@ -test -sda diff --git a/new2 b/new2 new file mode 100644 index 000000000..fbb91086d --- /dev/null +++ b/new2 @@ -0,0 +1,2 @@ +sdsd + From 0a6eacc54c6b2564aaa048076c2b2a1202b9c6a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 21 Feb 2024 16:20:57 +0800 Subject: [PATCH 007/101] test --- funasr/models/contextual_paraformer/model.py | 27 ++++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/funasr/models/contextual_paraformer/model.py b/funasr/models/contextual_paraformer/model.py index 3f79eedf2..598c074e2 100644 --- a/funasr/models/contextual_paraformer/model.py +++ b/funasr/models/contextual_paraformer/model.py @@ -29,7 +29,7 @@ from funasr.train_utils.device_funcs import force_gatherable from funasr.models.transformer.utils.add_sos_eos import add_sos_eos from funasr.models.transformer.utils.nets_utils import make_pad_mask, pad_list from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank - +import pdb if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): from torch.cuda.amp import autocast @@ -63,7 +63,7 @@ class ContextualParaformer(Paraformer): crit_attn_smooth = kwargs.get("crit_attn_smooth", 0.0) bias_encoder_dropout_rate = kwargs.get("bias_encoder_dropout_rate", 0.0) - + pdb.set_trace() if bias_encoder_type == 'lstm': self.bias_encoder = torch.nn.LSTM(inner_dim, inner_dim, 1, batch_first=True, dropout=bias_encoder_dropout_rate) self.bias_embed = torch.nn.Embedding(self.vocab_size, inner_dim) @@ -81,6 +81,7 @@ class ContextualParaformer(Paraformer): if self.crit_attn_weight > 0: self.attn_loss = torch.nn.L1Loss() self.crit_attn_smooth = crit_attn_smooth + pdb.set_trace() def forward( @@ -103,17 +104,17 @@ class ContextualParaformer(Paraformer): text_lengths = text_lengths[:, 0] if len(speech_lengths.size()) > 1: speech_lengths = speech_lengths[:, 0] - + pdb.set_trace() batch_size = speech.shape[0] hotword_pad = kwargs.get("hotword_pad") hotword_lengths = kwargs.get("hotword_lengths") dha_pad = kwargs.get("dha_pad") - + pdb.set_trace() # 1. Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) - + pdb.set_trace() loss_ctc, cer_ctc = None, None stats = dict() @@ -128,12 +129,12 @@ class ContextualParaformer(Paraformer): stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None stats["cer_ctc"] = cer_ctc - + pdb.set_trace() # 2b. Attention decoder branch loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss( encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths ) - + pdb.set_trace() # 3. 
CTC-Att loss definition if self.ctc_weight == 0.0: loss = loss_att + loss_pre * self.predictor_weight @@ -171,22 +172,26 @@ class ContextualParaformer(Paraformer): ): encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to( encoder_out.device) + pdb.set_trace() if self.predictor_bias == 1: _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id) ys_pad_lens = ys_pad_lens + self.predictor_bias + pdb.set_trace() pre_acoustic_embeds, pre_token_length, _, _ = self.predictor(encoder_out, ys_pad, encoder_out_mask, ignore_id=self.ignore_id) - + pdb.set_trace() # -1. bias encoder if self.use_decoder_embedding: hw_embed = self.decoder.embed(hotword_pad) else: hw_embed = self.bias_embed(hotword_pad) + pdb.set_trace() hw_embed, (_, _) = self.bias_encoder(hw_embed) + pdb.set_trace() _ind = np.arange(0, hotword_pad.shape[0]).tolist() selected = hw_embed[_ind, [i - 1 for i in hotword_lengths.detach().cpu().tolist()]] contextual_info = selected.squeeze(0).repeat(ys_pad.shape[0], 1, 1).to(ys_pad.device) - + pdb.set_trace() # 0. sampler decoder_out_1st = None if self.sampling_ratio > 0.0: @@ -198,7 +203,7 @@ class ContextualParaformer(Paraformer): if self.step_cur < 2: logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio)) sematic_embeds = pre_acoustic_embeds - + pdb.set_trace() # 1. Forward decoder decoder_outs = self.decoder( encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info @@ -214,7 +219,7 @@ class ContextualParaformer(Paraformer): loss_ideal = None ''' loss_ideal = None - + pdb.set_trace() if decoder_out_1st is None: decoder_out_1st = decoder_out # 2. Compute attention loss From a0ffe57b05679d91e56227ce1109a5d725d93192 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 21 Feb 2024 16:48:33 +0800 Subject: [PATCH 008/101] test --- funasr/auto/auto_model.py | 11 +++++++-- funasr/models/contextual_paraformer/model.py | 24 ++++++++++++++++---- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index e95cfd8d1..4cc52a50c 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -24,7 +24,7 @@ try: from funasr.models.campplus.cluster_backend import ClusterBackend except: print("If you want to use the speaker diarization, please `pip install hdbscan`") - +import pdb def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): """ @@ -210,13 +210,15 @@ class AutoModel: kwargs.update(cfg) model = self.model if model is None else model model.eval() + pdb.set_trace() batch_size = kwargs.get("batch_size", 1) # if kwargs.get("device", "cpu") == "cpu": # batch_size = 1 key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key) - + pdb.set_trace() + speed_stats = {} asr_result_list = [] num_samples = len(data_list) @@ -224,20 +226,25 @@ class AutoModel: pbar = tqdm(colour="blue", total=num_samples, dynamic_ncols=True) if not disable_pbar else None time_speech_total = 0.0 time_escape_total = 0.0 + pdb.set_trace() for beg_idx in range(0, num_samples, batch_size): + pdb.set_trace() end_idx = min(num_samples, beg_idx + batch_size) data_batch = data_list[beg_idx:end_idx] key_batch = key_list[beg_idx:end_idx] batch = {"data_in": data_batch, "key": key_batch} + pdb.set_trace() if (end_idx - beg_idx) == 1 and kwargs.get("data_type", None) == "fbank": # fbank batch["data_in"] = data_batch[0] batch["data_lengths"] = 
input_len time1 = time.perf_counter() with torch.no_grad(): + pdb.set_trace() results, meta_data = model.inference(**batch, **kwargs) time2 = time.perf_counter() + pdb.set_trace() asr_result_list.extend(results) # batch_data_time = time_per_frame_s * data_batch_i["speech_lengths"].sum().item() diff --git a/funasr/models/contextual_paraformer/model.py b/funasr/models/contextual_paraformer/model.py index 598c074e2..655ca6f58 100644 --- a/funasr/models/contextual_paraformer/model.py +++ b/funasr/models/contextual_paraformer/model.py @@ -63,7 +63,6 @@ class ContextualParaformer(Paraformer): crit_attn_smooth = kwargs.get("crit_attn_smooth", 0.0) bias_encoder_dropout_rate = kwargs.get("bias_encoder_dropout_rate", 0.0) - pdb.set_trace() if bias_encoder_type == 'lstm': self.bias_encoder = torch.nn.LSTM(inner_dim, inner_dim, 1, batch_first=True, dropout=bias_encoder_dropout_rate) self.bias_embed = torch.nn.Embedding(self.vocab_size, inner_dim) @@ -81,7 +80,6 @@ class ContextualParaformer(Paraformer): if self.crit_attn_weight > 0: self.attn_loss = torch.nn.L1Loss() self.crit_attn_smooth = crit_attn_smooth - pdb.set_trace() def forward( @@ -313,20 +311,24 @@ class ContextualParaformer(Paraformer): **kwargs, ): # init beamsearch + pdb.set_trace() is_use_ctc = kwargs.get("decoding_ctc_weight", 0.0) > 0.00001 and self.ctc != None is_use_lm = kwargs.get("lm_weight", 0.0) > 0.00001 and kwargs.get("lm_file", None) is not None if self.beam_search is None and (is_use_lm or is_use_ctc): logging.info("enable beam_search") self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) - + pdb.set_trace() meta_data = {} # extract fbank feats time1 = time.perf_counter() + pdb.set_trace() audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000)) + pdb.set_trace() time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" + pdb.set_trace() speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() @@ -334,38 +336,50 @@ class ContextualParaformer(Paraformer): meta_data[ "batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 + pdb.set_trace() speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) # hotword + pdb.set_trace() self.hotword_list = self.generate_hotwords_list(kwargs.get("hotword", None), tokenizer=tokenizer, frontend=frontend) + pdb.set_trace() + # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - + pdb.set_trace() + + # predictor predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens) pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \ predictor_outs[2], predictor_outs[3] + pdb.set_trace() pre_token_length = pre_token_length.round().long() if torch.max(pre_token_length) < 1: return [] - + pdb.set_trace() + decoder_outs = self.cal_decoder_with_predictor(encoder_out, encoder_out_lens, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list, clas_scale=kwargs.get("clas_scale", 1.0)) + pdb.set_trace() decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] + pdb.set_trace() results = [] b, n, d = decoder_out.size() + pdb.set_trace() for i in range(b): x = encoder_out[i, :encoder_out_lens[i], :] am_scores = decoder_out[i, :pre_token_length[i], :] + pdb.set_trace() if self.beam_search is not None: 
nbest_hyps = self.beam_search( x=x, am_scores=am_scores, maxlenratio=kwargs.get("maxlenratio", 0.0), From d2f1cf39f8fedc19d0e14fac269a413d62375359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 21 Feb 2024 17:03:29 +0800 Subject: [PATCH 009/101] test --- .../industrial_data_pretraining/contextual_paraformer/demo2.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 examples/industrial_data_pretraining/contextual_paraformer/demo2.sh diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo2.sh b/examples/industrial_data_pretraining/contextual_paraformer/demo2.sh old mode 100644 new mode 100755 From 0a7384a1ec540c38b2b584e373fd516f61e2e86d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 21 Feb 2024 19:07:25 +0800 Subject: [PATCH 010/101] test --- funasr/models/contextual_paraformer/model.py | 25 +++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/funasr/models/contextual_paraformer/model.py b/funasr/models/contextual_paraformer/model.py index 5ccc611d6..10bbf9d00 100644 --- a/funasr/models/contextual_paraformer/model.py +++ b/funasr/models/contextual_paraformer/model.py @@ -294,10 +294,11 @@ class ContextualParaformer(Paraformer): enforce_sorted=False) _, (h_n, _) = self.bias_encoder(hw_embed) hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1) - + pdb.set_trace() decoder_outs = self.decoder( encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=hw_embed, clas_scale=clas_scale ) + pdb.set_trace() decoder_out = decoder_outs[0] decoder_out = torch.log_softmax(decoder_out, dim=-1) return decoder_out, ys_pad_lens @@ -311,65 +312,55 @@ class ContextualParaformer(Paraformer): **kwargs, ): # init beamsearch - pdb.set_trace() + is_use_ctc = kwargs.get("decoding_ctc_weight", 0.0) > 0.00001 and self.ctc != None is_use_lm = kwargs.get("lm_weight", 0.0) > 0.00001 and kwargs.get("lm_file", None) is not None if self.beam_search is None and (is_use_lm or is_use_ctc): logging.info("enable beam_search") self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) - pdb.set_trace() + meta_data = {} # extract fbank feats time1 = time.perf_counter() - pdb.set_trace() + audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000)) - pdb.set_trace() + time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" - pdb.set_trace() + speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() meta_data["extract_feat"] = f"{time3 - time2:0.3f}" meta_data[ "batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 - - pdb.set_trace() + speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) # hotword - pdb.set_trace() self.hotword_list = self.generate_hotwords_list(kwargs.get("hotword", None), tokenizer=tokenizer, frontend=frontend) - pdb.set_trace() - # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - pdb.set_trace() - # predictor predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens) pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \ predictor_outs[2], predictor_outs[3] - pdb.set_trace() pre_token_length = pre_token_length.round().long() if torch.max(pre_token_length) < 1: 
return [] - - pdb.set_trace() decoder_outs = self.cal_decoder_with_predictor(encoder_out, encoder_out_lens, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list, clas_scale=kwargs.get("clas_scale", 1.0)) - pdb.set_trace() decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] pdb.set_trace() From 62178770dccdbf5da42e831898ea32adeeacba45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 21 Feb 2024 20:04:01 +0800 Subject: [PATCH 011/101] test --- funasr/auto/auto_model.py | 6 +-- funasr/models/contextual_paraformer/model.py | 29 +++++------ funasr/models/seaco_paraformer/model.py | 51 +++++++++++++++++--- 3 files changed, 55 insertions(+), 31 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 60aeb1600..a3202fdb4 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -209,14 +209,12 @@ class AutoModel: kwargs.update(cfg) model = self.model if model is None else model model.eval() - pdb.set_trace() batch_size = kwargs.get("batch_size", 1) # if kwargs.get("device", "cpu") == "cpu": # batch_size = 1 key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key) - pdb.set_trace() speed_stats = {} asr_result_list = [] @@ -225,14 +223,12 @@ class AutoModel: pbar = tqdm(colour="blue", total=num_samples, dynamic_ncols=True) if not disable_pbar else None time_speech_total = 0.0 time_escape_total = 0.0 - pdb.set_trace() for beg_idx in range(0, num_samples, batch_size): - pdb.set_trace() end_idx = min(num_samples, beg_idx + batch_size) data_batch = data_list[beg_idx:end_idx] key_batch = key_list[beg_idx:end_idx] batch = {"data_in": data_batch, "key": key_batch} - pdb.set_trace() + if (end_idx - beg_idx) == 1 and kwargs.get("data_type", None) == "fbank": # fbank batch["data_in"] = data_batch[0] batch["data_lengths"] = input_len diff --git a/funasr/models/contextual_paraformer/model.py b/funasr/models/contextual_paraformer/model.py index 10bbf9d00..1c0805ab0 100644 --- a/funasr/models/contextual_paraformer/model.py +++ b/funasr/models/contextual_paraformer/model.py @@ -102,17 +102,16 @@ class ContextualParaformer(Paraformer): text_lengths = text_lengths[:, 0] if len(speech_lengths.size()) > 1: speech_lengths = speech_lengths[:, 0] - pdb.set_trace() + batch_size = speech.shape[0] hotword_pad = kwargs.get("hotword_pad") hotword_lengths = kwargs.get("hotword_lengths") dha_pad = kwargs.get("dha_pad") - pdb.set_trace() + # 1. Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) - pdb.set_trace() loss_ctc, cer_ctc = None, None stats = dict() @@ -127,12 +126,11 @@ class ContextualParaformer(Paraformer): stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None stats["cer_ctc"] = cer_ctc - pdb.set_trace() # 2b. Attention decoder branch loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss( encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths ) - pdb.set_trace() + # 3. 
CTC-Att loss definition if self.ctc_weight == 0.0: loss = loss_att + loss_pre * self.predictor_weight @@ -170,26 +168,24 @@ class ContextualParaformer(Paraformer): ): encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to( encoder_out.device) - pdb.set_trace() + if self.predictor_bias == 1: _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id) ys_pad_lens = ys_pad_lens + self.predictor_bias - pdb.set_trace() + pre_acoustic_embeds, pre_token_length, _, _ = self.predictor(encoder_out, ys_pad, encoder_out_mask, ignore_id=self.ignore_id) - pdb.set_trace() # -1. bias encoder if self.use_decoder_embedding: hw_embed = self.decoder.embed(hotword_pad) else: hw_embed = self.bias_embed(hotword_pad) - pdb.set_trace() + hw_embed, (_, _) = self.bias_encoder(hw_embed) - pdb.set_trace() _ind = np.arange(0, hotword_pad.shape[0]).tolist() selected = hw_embed[_ind, [i - 1 for i in hotword_lengths.detach().cpu().tolist()]] contextual_info = selected.squeeze(0).repeat(ys_pad.shape[0], 1, 1).to(ys_pad.device) - pdb.set_trace() + # 0. sampler decoder_out_1st = None if self.sampling_ratio > 0.0: @@ -201,7 +197,7 @@ class ContextualParaformer(Paraformer): if self.step_cur < 2: logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio)) sematic_embeds = pre_acoustic_embeds - pdb.set_trace() + # 1. Forward decoder decoder_outs = self.decoder( encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info @@ -217,7 +213,7 @@ class ContextualParaformer(Paraformer): loss_ideal = None ''' loss_ideal = None - pdb.set_trace() + if decoder_out_1st is None: decoder_out_1st = decoder_out # 2. Compute attention loss @@ -294,11 +290,11 @@ class ContextualParaformer(Paraformer): enforce_sorted=False) _, (h_n, _) = self.bias_encoder(hw_embed) hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1) - pdb.set_trace() + decoder_outs = self.decoder( encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=hw_embed, clas_scale=clas_scale ) - pdb.set_trace() + decoder_out = decoder_outs[0] decoder_out = torch.log_softmax(decoder_out, dim=-1) return decoder_out, ys_pad_lens @@ -363,14 +359,11 @@ class ContextualParaformer(Paraformer): clas_scale=kwargs.get("clas_scale", 1.0)) decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] - pdb.set_trace() results = [] b, n, d = decoder_out.size() - pdb.set_trace() for i in range(b): x = encoder_out[i, :encoder_out_lens[i], :] am_scores = decoder_out[i, :pre_token_length[i], :] - pdb.set_trace() if self.beam_search is not None: nbest_hyps = self.beam_search( x=x, am_scores=am_scores, maxlenratio=kwargs.get("maxlenratio", 0.0), diff --git a/funasr/models/seaco_paraformer/model.py b/funasr/models/seaco_paraformer/model.py index caf2b15c7..b3b913344 100644 --- a/funasr/models/seaco_paraformer/model.py +++ b/funasr/models/seaco_paraformer/model.py @@ -32,7 +32,7 @@ from funasr.models.transformer.utils.add_sos_eos import add_sos_eos from funasr.models.transformer.utils.nets_utils import make_pad_mask, pad_list from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank - +import pdb if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): from torch.cuda.amp import autocast else: @@ -130,7 +130,7 @@ class SeacoParaformer(BiCifParaformer, Paraformer): hotword_pad = kwargs.get("hotword_pad") hotword_lengths = kwargs.get("hotword_lengths") dha_pad = kwargs.get("dha_pad") - + batch_size = speech.shape[0] self.step_cur += 1 # for data-parallel @@ 
-212,58 +212,87 @@ class SeacoParaformer(BiCifParaformer, Paraformer): nfilter=50, seaco_weight=1.0): # decoder forward + pdb.set_trace() decoder_out, decoder_hidden, _ = self.decoder(encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, return_hidden=True, return_both=True) + pdb.set_trace() decoder_pred = torch.log_softmax(decoder_out, dim=-1) if hw_list is not None: + pdb.set_trace() hw_lengths = [len(i) for i in hw_list] hw_list_ = [torch.Tensor(i).long() for i in hw_list] hw_list_pad = pad_list(hw_list_, 0).to(encoder_out.device) + pdb.set_trace() selected = self._hotword_representation(hw_list_pad, torch.Tensor(hw_lengths).int().to(encoder_out.device)) + pdb.set_trace() contextual_info = selected.squeeze(0).repeat(encoder_out.shape[0], 1, 1).to(encoder_out.device) + pdb.set_trace() num_hot_word = contextual_info.shape[1] _contextual_length = torch.Tensor([num_hot_word]).int().repeat(encoder_out.shape[0]).to(encoder_out.device) - + pdb.set_trace() # ASF Core if nfilter > 0 and nfilter < num_hot_word: for dec in self.seaco_decoder.decoders: dec.reserve_attn = True + pdb.set_trace() # cif_attended, _ = self.decoder2(contextual_info, _contextual_length, sematic_embeds, ys_pad_lens) dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens) # cif_filter = torch.topk(self.decoder2.decoders[-1].attn_mat[0][0].sum(0).sum(0)[:-1], min(nfilter, num_hot_word-1))[1].tolist() + pdb.set_trace() hotword_scores = self.seaco_decoder.decoders[-1].attn_mat[0][0].sum(0).sum(0)[:-1] # hotword_scores /= torch.sqrt(torch.tensor(hw_lengths)[:-1].float()).to(hotword_scores.device) + pdb.set_trace() dec_filter = torch.topk(hotword_scores, min(nfilter, num_hot_word-1))[1].tolist() + pdb.set_trace() add_filter = dec_filter + pdb.set_trace() add_filter.append(len(hw_list_pad)-1) # filter hotword embedding + pdb.set_trace() selected = selected[add_filter] # again + pdb.set_trace() contextual_info = selected.squeeze(0).repeat(encoder_out.shape[0], 1, 1).to(encoder_out.device) + pdb.set_trace() num_hot_word = contextual_info.shape[1] _contextual_length = torch.Tensor([num_hot_word]).int().repeat(encoder_out.shape[0]).to(encoder_out.device) + pdb.set_trace() for dec in self.seaco_decoder.decoders: dec.attn_mat = [] dec.reserve_attn = False - + pdb.set_trace() # SeACo Core cif_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, sematic_embeds, ys_pad_lens) + pdb.set_trace() dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens) + pdb.set_trace() merged = self._merge(cif_attended, dec_attended) - + pdb.set_trace() + dha_output = self.hotword_output_layer(merged) # remove the last token in loss calculation + pdb.set_trace() dha_pred = torch.log_softmax(dha_output, dim=-1) + pdb.set_trace() def _merge_res(dec_output, dha_output): + pdb.set_trace() lmbd = torch.Tensor([seaco_weight] * dha_output.shape[0]) + pdb.set_trace() dha_ids = dha_output.max(-1)[-1]# [0] + pdb.set_trace() dha_mask = (dha_ids == 8377).int().unsqueeze(-1) + pdb.set_trace() a = (1 - lmbd) / lmbd b = 1 / lmbd + pdb.set_trace() a, b = a.to(dec_output.device), b.to(dec_output.device) + pdb.set_trace() dha_mask = (dha_mask + a.reshape(-1, 1, 1)) / b.reshape(-1, 1, 1) # logits = dec_output * dha_mask + dha_output[:,:,:-1] * (1-dha_mask) + pdb.set_trace() logits = dec_output * dha_mask + dha_output[:,:,:] * (1-dha_mask) return logits + merged_pred = _merge_res(decoder_pred, dha_pred) + pdb.set_trace() # import pdb; pdb.set_trace() return 
merged_pred else: @@ -318,7 +347,7 @@ class SeacoParaformer(BiCifParaformer, Paraformer): logging.info("enable beam_search") self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) - + pdb.set_trace() meta_data = {} # extract fbank feats @@ -326,6 +355,7 @@ class SeacoParaformer(BiCifParaformer, Paraformer): audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000)) time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" + pdb.set_trace() speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() @@ -336,14 +366,18 @@ class SeacoParaformer(BiCifParaformer, Paraformer): speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) + pdb.set_trace() # hotword self.hotword_list = self.generate_hotwords_list(kwargs.get("hotword", None), tokenizer=tokenizer, frontend=frontend) + pdb.set_trace() # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] + + pdb.set_trace() # predictor predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens) pre_acoustic_embeds, pre_token_length, _, _ = predictor_outs[0], predictor_outs[1], \ @@ -352,15 +386,16 @@ class SeacoParaformer(BiCifParaformer, Paraformer): if torch.max(pre_token_length) < 1: return [] - + pdb.set_trace() decoder_out = self._seaco_decode_with_ASF(encoder_out, encoder_out_lens, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list) + pdb.set_trace() # decoder_out, _ = decoder_outs[0], decoder_outs[1] _, _, us_alphas, us_peaks = self.calc_predictor_timestamp(encoder_out, encoder_out_lens, pre_token_length) - + pdb.set_trace() results = [] b, n, d = decoder_out.size() for i in range(b): From e943de2cb128074ca71bcee69fc262ac43420860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 10:13:29 +0800 Subject: [PATCH 012/101] test --- .../seaco_paraformer/demo.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/industrial_data_pretraining/seaco_paraformer/demo.py b/examples/industrial_data_pretraining/seaco_paraformer/demo.py index a44c649ae..551dd8bf8 100644 --- a/examples/industrial_data_pretraining/seaco_paraformer/demo.py +++ b/examples/industrial_data_pretraining/seaco_paraformer/demo.py @@ -7,10 +7,10 @@ from funasr import AutoModel model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.4", - vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", - vad_model_revision="v2.0.4", - punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", - punc_model_revision="v2.0.4", + # vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", + # vad_model_revision="v2.0.4", + # punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", + # punc_model_revision="v2.0.4", # spk_model="damo/speech_campplus_sv_zh-cn_16k-common", # spk_model_revision="v2.0.2", ) @@ -43,4 +43,4 @@ import soundfile wav_file = os.path.join(model.model_path, "example/asr_example.wav") speech, sample_rate = soundfile.read(wav_file) res = model.generate(input=[speech], batch_size_s=300, is_final=True) -''' \ No newline at end of file +''' From eba89467c819857f16f1883ff87c4d2e79e4a17b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 11:49:34 +0800 Subject: [PATCH 
013/101] test --- funasr/models/seaco_paraformer/model.py | 46 ++++--------------------- 1 file changed, 7 insertions(+), 39 deletions(-) diff --git a/funasr/models/seaco_paraformer/model.py b/funasr/models/seaco_paraformer/model.py index b3b913344..e0467b3c4 100644 --- a/funasr/models/seaco_paraformer/model.py +++ b/funasr/models/seaco_paraformer/model.py @@ -212,88 +212,63 @@ class SeacoParaformer(BiCifParaformer, Paraformer): nfilter=50, seaco_weight=1.0): # decoder forward - pdb.set_trace() + decoder_out, decoder_hidden, _ = self.decoder(encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, return_hidden=True, return_both=True) - pdb.set_trace() + decoder_pred = torch.log_softmax(decoder_out, dim=-1) if hw_list is not None: - pdb.set_trace() hw_lengths = [len(i) for i in hw_list] hw_list_ = [torch.Tensor(i).long() for i in hw_list] hw_list_pad = pad_list(hw_list_, 0).to(encoder_out.device) - pdb.set_trace() selected = self._hotword_representation(hw_list_pad, torch.Tensor(hw_lengths).int().to(encoder_out.device)) - pdb.set_trace() + contextual_info = selected.squeeze(0).repeat(encoder_out.shape[0], 1, 1).to(encoder_out.device) - pdb.set_trace() num_hot_word = contextual_info.shape[1] _contextual_length = torch.Tensor([num_hot_word]).int().repeat(encoder_out.shape[0]).to(encoder_out.device) - pdb.set_trace() + # ASF Core if nfilter > 0 and nfilter < num_hot_word: for dec in self.seaco_decoder.decoders: dec.reserve_attn = True - pdb.set_trace() + # cif_attended, _ = self.decoder2(contextual_info, _contextual_length, sematic_embeds, ys_pad_lens) dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens) # cif_filter = torch.topk(self.decoder2.decoders[-1].attn_mat[0][0].sum(0).sum(0)[:-1], min(nfilter, num_hot_word-1))[1].tolist() - pdb.set_trace() + hotword_scores = self.seaco_decoder.decoders[-1].attn_mat[0][0].sum(0).sum(0)[:-1] # hotword_scores /= torch.sqrt(torch.tensor(hw_lengths)[:-1].float()).to(hotword_scores.device) - pdb.set_trace() dec_filter = torch.topk(hotword_scores, min(nfilter, num_hot_word-1))[1].tolist() - pdb.set_trace() add_filter = dec_filter - pdb.set_trace() add_filter.append(len(hw_list_pad)-1) # filter hotword embedding - pdb.set_trace() selected = selected[add_filter] # again - pdb.set_trace() contextual_info = selected.squeeze(0).repeat(encoder_out.shape[0], 1, 1).to(encoder_out.device) - pdb.set_trace() num_hot_word = contextual_info.shape[1] _contextual_length = torch.Tensor([num_hot_word]).int().repeat(encoder_out.shape[0]).to(encoder_out.device) - pdb.set_trace() for dec in self.seaco_decoder.decoders: dec.attn_mat = [] dec.reserve_attn = False - pdb.set_trace() # SeACo Core cif_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, sematic_embeds, ys_pad_lens) - pdb.set_trace() dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens) - pdb.set_trace() merged = self._merge(cif_attended, dec_attended) - pdb.set_trace() dha_output = self.hotword_output_layer(merged) # remove the last token in loss calculation - pdb.set_trace() dha_pred = torch.log_softmax(dha_output, dim=-1) - pdb.set_trace() def _merge_res(dec_output, dha_output): - pdb.set_trace() lmbd = torch.Tensor([seaco_weight] * dha_output.shape[0]) - pdb.set_trace() dha_ids = dha_output.max(-1)[-1]# [0] - pdb.set_trace() dha_mask = (dha_ids == 8377).int().unsqueeze(-1) - pdb.set_trace() a = (1 - lmbd) / lmbd b = 1 / lmbd - pdb.set_trace() a, b = a.to(dec_output.device), 
b.to(dec_output.device) - pdb.set_trace() dha_mask = (dha_mask + a.reshape(-1, 1, 1)) / b.reshape(-1, 1, 1) # logits = dec_output * dha_mask + dha_output[:,:,:-1] * (1-dha_mask) - pdb.set_trace() logits = dec_output * dha_mask + dha_output[:,:,:] * (1-dha_mask) return logits merged_pred = _merge_res(decoder_pred, dha_pred) - pdb.set_trace() - # import pdb; pdb.set_trace() return merged_pred else: return decoder_pred @@ -347,7 +322,6 @@ class SeacoParaformer(BiCifParaformer, Paraformer): logging.info("enable beam_search") self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) - pdb.set_trace() meta_data = {} # extract fbank feats @@ -355,7 +329,6 @@ class SeacoParaformer(BiCifParaformer, Paraformer): audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000)) time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" - pdb.set_trace() speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() @@ -366,18 +339,15 @@ class SeacoParaformer(BiCifParaformer, Paraformer): speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) - pdb.set_trace() # hotword self.hotword_list = self.generate_hotwords_list(kwargs.get("hotword", None), tokenizer=tokenizer, frontend=frontend) - pdb.set_trace() # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - pdb.set_trace() # predictor predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens) pre_acoustic_embeds, pre_token_length, _, _ = predictor_outs[0], predictor_outs[1], \ @@ -386,16 +356,14 @@ class SeacoParaformer(BiCifParaformer, Paraformer): if torch.max(pre_token_length) < 1: return [] - pdb.set_trace() decoder_out = self._seaco_decode_with_ASF(encoder_out, encoder_out_lens, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list) - pdb.set_trace() + # decoder_out, _ = decoder_outs[0], decoder_outs[1] _, _, us_alphas, us_peaks = self.calc_predictor_timestamp(encoder_out, encoder_out_lens, pre_token_length) - pdb.set_trace() results = [] b, n, d = decoder_out.size() for i in range(b): From 0871fa6e0d986115e3056878d2eec9dcac2ba43d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 14:15:35 +0800 Subject: [PATCH 014/101] atsr --- funasr/models/lcbnet/model.py | 454 ++++++++++++++++++++++++++++++++++ 1 file changed, 454 insertions(+) create mode 100644 funasr/models/lcbnet/model.py diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py new file mode 100644 index 000000000..c68ccd7ba --- /dev/null +++ b/funasr/models/lcbnet/model.py @@ -0,0 +1,454 @@ +import logging +from typing import Union, Dict, List, Tuple, Optional + +import time +import torch +import torch.nn as nn +from torch.cuda.amp import autocast + +from funasr.losses.label_smoothing_loss import LabelSmoothingLoss +from funasr.models.ctc.ctc import CTC +from funasr.models.transformer.utils.add_sos_eos import add_sos_eos +from funasr.metrics.compute_acc import th_accuracy +# from funasr.models.e2e_asr_common import ErrorCalculator +from funasr.train_utils.device_funcs import force_gatherable +from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank +from funasr.utils import postprocess_utils +from funasr.utils.datadir_writer import DatadirWriter +from funasr.register import tables + 
+@tables.register("model_classes", "Transformer") +class Transformer(nn.Module): + """CTC-attention hybrid Encoder-Decoder model""" + + + def __init__( + self, + specaug: str = None, + specaug_conf: dict = None, + normalize: str = None, + normalize_conf: dict = None, + encoder: str = None, + encoder_conf: dict = None, + decoder: str = None, + decoder_conf: dict = None, + ctc: str = None, + ctc_conf: dict = None, + ctc_weight: float = 0.5, + interctc_weight: float = 0.0, + input_size: int = 80, + vocab_size: int = -1, + ignore_id: int = -1, + blank_id: int = 0, + sos: int = 1, + eos: int = 2, + lsm_weight: float = 0.0, + length_normalized_loss: bool = False, + report_cer: bool = True, + report_wer: bool = True, + sym_space: str = "", + sym_blank: str = "", + # extract_feats_in_collect_stats: bool = True, + share_embedding: bool = False, + # preencoder: Optional[AbsPreEncoder] = None, + # postencoder: Optional[AbsPostEncoder] = None, + **kwargs, + ): + + super().__init__() + + if specaug is not None: + specaug_class = tables.specaug_classes.get(specaug) + specaug = specaug_class(**specaug_conf) + if normalize is not None: + normalize_class = tables.normalize_classes.get(normalize) + normalize = normalize_class(**normalize_conf) + encoder_class = tables.encoder_classes.get(encoder) + encoder = encoder_class(input_size=input_size, **encoder_conf) + encoder_output_size = encoder.output_size() + if decoder is not None: + decoder_class = tables.decoder_classes.get(decoder) + decoder = decoder_class( + vocab_size=vocab_size, + encoder_output_size=encoder_output_size, + **decoder_conf, + ) + if ctc_weight > 0.0: + + if ctc_conf is None: + ctc_conf = {} + + ctc = CTC( + odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf + ) + + self.blank_id = blank_id + self.sos = sos if sos is not None else vocab_size - 1 + self.eos = eos if eos is not None else vocab_size - 1 + self.vocab_size = vocab_size + self.ignore_id = ignore_id + self.ctc_weight = ctc_weight + self.specaug = specaug + self.normalize = normalize + self.encoder = encoder + + if not hasattr(self.encoder, "interctc_use_conditioning"): + self.encoder.interctc_use_conditioning = False + if self.encoder.interctc_use_conditioning: + self.encoder.conditioning_layer = torch.nn.Linear( + vocab_size, self.encoder.output_size() + ) + self.interctc_weight = interctc_weight + + # self.error_calculator = None + if ctc_weight == 1.0: + self.decoder = None + else: + self.decoder = decoder + + self.criterion_att = LabelSmoothingLoss( + size=vocab_size, + padding_idx=ignore_id, + smoothing=lsm_weight, + normalize_length=length_normalized_loss, + ) + # + # if report_cer or report_wer: + # self.error_calculator = ErrorCalculator( + # token_list, sym_space, sym_blank, report_cer, report_wer + # ) + # + self.error_calculator = None + if ctc_weight == 0.0: + self.ctc = None + else: + self.ctc = ctc + + self.share_embedding = share_embedding + if self.share_embedding: + self.decoder.embed = None + + self.length_normalized_loss = length_normalized_loss + self.beam_search = None + + def forward( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + text: torch.Tensor, + text_lengths: torch.Tensor, + **kwargs, + ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: + """Encoder + Decoder + Calc loss + Args: + speech: (Batch, Length, ...) 
+ speech_lengths: (Batch, ) + text: (Batch, Length) + text_lengths: (Batch,) + """ + # import pdb; + # pdb.set_trace() + if len(text_lengths.size()) > 1: + text_lengths = text_lengths[:, 0] + if len(speech_lengths.size()) > 1: + speech_lengths = speech_lengths[:, 0] + + batch_size = speech.shape[0] + + # 1. Encoder + encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) + intermediate_outs = None + if isinstance(encoder_out, tuple): + intermediate_outs = encoder_out[1] + encoder_out = encoder_out[0] + + loss_att, acc_att, cer_att, wer_att = None, None, None, None + loss_ctc, cer_ctc = None, None + stats = dict() + + # decoder: CTC branch + if self.ctc_weight != 0.0: + loss_ctc, cer_ctc = self._calc_ctc_loss( + encoder_out, encoder_out_lens, text, text_lengths + ) + + # Collect CTC branch stats + stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None + stats["cer_ctc"] = cer_ctc + + # Intermediate CTC (optional) + loss_interctc = 0.0 + if self.interctc_weight != 0.0 and intermediate_outs is not None: + for layer_idx, intermediate_out in intermediate_outs: + # we assume intermediate_out has the same length & padding + # as those of encoder_out + loss_ic, cer_ic = self._calc_ctc_loss( + intermediate_out, encoder_out_lens, text, text_lengths + ) + loss_interctc = loss_interctc + loss_ic + + # Collect Intermedaite CTC stats + stats["loss_interctc_layer{}".format(layer_idx)] = ( + loss_ic.detach() if loss_ic is not None else None + ) + stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic + + loss_interctc = loss_interctc / len(intermediate_outs) + + # calculate whole encoder loss + loss_ctc = ( + 1 - self.interctc_weight + ) * loss_ctc + self.interctc_weight * loss_interctc + + # decoder: Attention decoder branch + loss_att, acc_att, cer_att, wer_att = self._calc_att_loss( + encoder_out, encoder_out_lens, text, text_lengths + ) + + # 3. CTC-Att loss definition + if self.ctc_weight == 0.0: + loss = loss_att + elif self.ctc_weight == 1.0: + loss = loss_ctc + else: + loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + + # Collect Attn branch stats + stats["loss_att"] = loss_att.detach() if loss_att is not None else None + stats["acc"] = acc_att + stats["cer"] = cer_att + stats["wer"] = wer_att + + # Collect total loss stats + stats["loss"] = torch.clone(loss.detach()) + + # force_gatherable: to-device and to-tensor if scalar for DataParallel + if self.length_normalized_loss: + batch_size = int((text_lengths + 1).sum()) + loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device) + return loss, stats, weight + + + def encode( + self, speech: torch.Tensor, speech_lengths: torch.Tensor, **kwargs, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Frontend + Encoder. Note that this method is used by asr_inference.py + Args: + speech: (Batch, Length, ...) + speech_lengths: (Batch, ) + ind: int + """ + with autocast(False): + + # Data augmentation + if self.specaug is not None and self.training: + speech, speech_lengths = self.specaug(speech, speech_lengths) + + # Normalization for feature: e.g. 
Global-CMVN, Utterance-CMVN + if self.normalize is not None: + speech, speech_lengths = self.normalize(speech, speech_lengths) + + # Forward encoder + # feats: (Batch, Length, Dim) + # -> encoder_out: (Batch, Length2, Dim2) + if self.encoder.interctc_use_conditioning: + encoder_out, encoder_out_lens, _ = self.encoder( + speech, speech_lengths, ctc=self.ctc + ) + else: + encoder_out, encoder_out_lens, _ = self.encoder(speech, speech_lengths) + intermediate_outs = None + if isinstance(encoder_out, tuple): + intermediate_outs = encoder_out[1] + encoder_out = encoder_out[0] + + if intermediate_outs is not None: + return (encoder_out, intermediate_outs), encoder_out_lens + + return encoder_out, encoder_out_lens + + def _calc_att_loss( + self, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + ys_pad: torch.Tensor, + ys_pad_lens: torch.Tensor, + ): + ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id) + ys_in_lens = ys_pad_lens + 1 + + # 1. Forward decoder + decoder_out, _ = self.decoder( + encoder_out, encoder_out_lens, ys_in_pad, ys_in_lens + ) + + # 2. Compute attention loss + loss_att = self.criterion_att(decoder_out, ys_out_pad) + acc_att = th_accuracy( + decoder_out.view(-1, self.vocab_size), + ys_out_pad, + ignore_label=self.ignore_id, + ) + + # Compute cer/wer using attention-decoder + if self.training or self.error_calculator is None: + cer_att, wer_att = None, None + else: + ys_hat = decoder_out.argmax(dim=-1) + cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu()) + + return loss_att, acc_att, cer_att, wer_att + + def _calc_ctc_loss( + self, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + ys_pad: torch.Tensor, + ys_pad_lens: torch.Tensor, + ): + # Calc CTC loss + loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens) + + # Calc CER using CTC + cer_ctc = None + if not self.training and self.error_calculator is not None: + ys_hat = self.ctc.argmax(encoder_out).data + cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True) + return loss_ctc, cer_ctc + + def init_beam_search(self, + **kwargs, + ): + from funasr.models.transformer.search import BeamSearch + from funasr.models.transformer.scorers.ctc import CTCPrefixScorer + from funasr.models.transformer.scorers.length_bonus import LengthBonus + + # 1. Build ASR model + scorers = {} + + if self.ctc != None: + ctc = CTCPrefixScorer(ctc=self.ctc, eos=self.eos) + scorers.update( + ctc=ctc + ) + token_list = kwargs.get("token_list") + scorers.update( + decoder=self.decoder, + length_bonus=LengthBonus(len(token_list)), + ) + + + # 3. 
Build ngram model + # ngram is not supported now + ngram = None + scorers["ngram"] = ngram + + weights = dict( + decoder=1.0 - kwargs.get("decoding_ctc_weight", 0.5), + ctc=kwargs.get("decoding_ctc_weight", 0.5), + lm=kwargs.get("lm_weight", 0.0), + ngram=kwargs.get("ngram_weight", 0.0), + length_bonus=kwargs.get("penalty", 0.0), + ) + beam_search = BeamSearch( + beam_size=kwargs.get("beam_size", 10), + weights=weights, + scorers=scorers, + sos=self.sos, + eos=self.eos, + vocab_size=len(token_list), + token_list=token_list, + pre_beam_score_key=None if self.ctc_weight == 1.0 else "full", + ) + + self.beam_search = beam_search + + def inference(self, + data_in, + data_lengths=None, + key: list=None, + tokenizer=None, + frontend=None, + **kwargs, + ): + + if kwargs.get("batch_size", 1) > 1: + raise NotImplementedError("batch decoding is not implemented") + + # init beamsearch + if self.beam_search is None: + logging.info("enable beam_search") + self.init_beam_search(**kwargs) + self.nbest = kwargs.get("nbest", 1) + + meta_data = {} + if isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank": # fbank + speech, speech_lengths = data_in, data_lengths + if len(speech.shape) < 3: + speech = speech[None, :, :] + if speech_lengths is None: + speech_lengths = speech.shape[1] + else: + # extract fbank feats + time1 = time.perf_counter() + audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), + data_type=kwargs.get("data_type", "sound"), + tokenizer=tokenizer) + time2 = time.perf_counter() + meta_data["load_data"] = f"{time2 - time1:0.3f}" + speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), + frontend=frontend) + time3 = time.perf_counter() + meta_data["extract_feat"] = f"{time3 - time2:0.3f}" + meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 + + speech = speech.to(device=kwargs["device"]) + speech_lengths = speech_lengths.to(device=kwargs["device"]) + # Encoder + encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) + if isinstance(encoder_out, tuple): + encoder_out = encoder_out[0] + + # c. 
Passed the encoder result and the beam search + nbest_hyps = self.beam_search( + x=encoder_out[0], maxlenratio=kwargs.get("maxlenratio", 0.0), minlenratio=kwargs.get("minlenratio", 0.0) + ) + + nbest_hyps = nbest_hyps[: self.nbest] + + + results = [] + b, n, d = encoder_out.size() + for i in range(b): + + for nbest_idx, hyp in enumerate(nbest_hyps): + ibest_writer = None + if kwargs.get("output_dir") is not None: + if not hasattr(self, "writer"): + self.writer = DatadirWriter(kwargs.get("output_dir")) + ibest_writer = self.writer[f"{nbest_idx + 1}best_recog"] + + # remove sos/eos and get results + last_pos = -1 + if isinstance(hyp.yseq, list): + token_int = hyp.yseq[1:last_pos] + else: + token_int = hyp.yseq[1:last_pos].tolist() + + # remove blank symbol id, which is assumed to be 0 + token_int = list(filter(lambda x: x != self.eos and x != self.sos and x != self.blank_id, token_int)) + + # Change integer-ids to tokens + token = tokenizer.ids2tokens(token_int) + text = tokenizer.tokens2text(token) + + text_postprocessed, _ = postprocess_utils.sentence_postprocess(token) + result_i = {"key": key[i], "token": token, "text": text_postprocessed} + results.append(result_i) + + if ibest_writer is not None: + ibest_writer["token"][key[i]] = " ".join(token) + ibest_writer["text"][key[i]] = text_postprocessed + + return results, meta_data + From 0e416eacbfea112a76860223ca99937cb4a909c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 14:24:54 +0800 Subject: [PATCH 015/101] test --- funasr/models/lcbnet/model.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index c68ccd7ba..6a028b2f7 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. 
+# MIT License (https://opensource.org/licenses/MIT) + import logging from typing import Union, Dict, List, Tuple, Optional @@ -17,10 +22,13 @@ from funasr.utils import postprocess_utils from funasr.utils.datadir_writer import DatadirWriter from funasr.register import tables -@tables.register("model_classes", "Transformer") -class Transformer(nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - +@tables.register("model_classes", "LCBNet") +class LCBNet(nn.Module): + """ + Author: Speech Lab of DAMO Academy, Alibaba Group + LCB-NET: LONG-CONTEXT BIASING FOR AUDIO-VISUAL SPEECH RECOGNITION + https://arxiv.org/abs/2401.06390 + """ def __init__( self, @@ -32,10 +40,19 @@ class Transformer(nn.Module): encoder_conf: dict = None, decoder: str = None, decoder_conf: dict = None, + text_encoder: str = None, + text_encoder_conf: dict = None, + bias_predictor: str = None, + bias_predictor_conf: dict = None, + fusion_encoder: str = None, + fusion_encoder_conf: dict = None, ctc: str = None, ctc_conf: dict = None, ctc_weight: float = 0.5, interctc_weight: float = 0.0, + select_num: int = 2, + select_length: int = 3, + insert_blank: bool = True, input_size: int = 80, vocab_size: int = -1, ignore_id: int = -1, @@ -66,6 +83,15 @@ class Transformer(nn.Module): encoder_class = tables.encoder_classes.get(encoder) encoder = encoder_class(input_size=input_size, **encoder_conf) encoder_output_size = encoder.output_size() + + # lcbnet modules: text encoder, fusion encoder and bias predictor + text_encoder_class = tables.encoder_classes.get(text_encoder) + text_encoder = text_encoder_class(input_size=vocab_size, **text_encoder_conf) + fusion_encoder_class = tables.encoder_classes.get(fusion_encoder) + fusion_encoder = fusion_encoder_class(**fusion_encoder_conf) + bias_predictor_class = tables.encoder_classes.get_class(bias_predictor) + bias_predictor = bias_predictor_class(args.bias_predictor_conf) + if decoder is not None: decoder_class = tables.decoder_classes.get(decoder) decoder = decoder_class( From 6d4a5c19310be72e4dc12dc9471670868451dda6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 14:30:57 +0800 Subject: [PATCH 016/101] test --- funasr/models/lcbnet/model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 6a028b2f7..9646e1e0d 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -117,6 +117,9 @@ class LCBNet(nn.Module): self.specaug = specaug self.normalize = normalize self.encoder = encoder + self.text_encoder = text_encoder + self.fusion_encoder = fusion_encoder + self.bias_predictor = bias_predictor if not hasattr(self.encoder, "interctc_use_conditioning"): self.encoder.interctc_use_conditioning = False From 044199f80279825baba0831380c5fc0369abd298 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 14:33:17 +0800 Subject: [PATCH 017/101] test --- funasr/models/lcbnet/model.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 9646e1e0d..563ff26e9 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -21,7 +21,7 @@ from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank from funasr.utils import postprocess_utils from funasr.utils.datadir_writer import DatadirWriter from funasr.register import tables - +import pdb @tables.register("model_classes", "LCBNet") class 
LCBNet(nn.Module): """ @@ -90,7 +90,7 @@ class LCBNet(nn.Module): fusion_encoder_class = tables.encoder_classes.get(fusion_encoder) fusion_encoder = fusion_encoder_class(**fusion_encoder_conf) bias_predictor_class = tables.encoder_classes.get_class(bias_predictor) - bias_predictor = bias_predictor_class(args.bias_predictor_conf) + bias_predictor = bias_predictor_class(bias_predictor_conf) if decoder is not None: decoder_class = tables.decoder_classes.get(decoder) @@ -117,9 +117,13 @@ class LCBNet(nn.Module): self.specaug = specaug self.normalize = normalize self.encoder = encoder + # lcbnet self.text_encoder = text_encoder self.fusion_encoder = fusion_encoder self.bias_predictor = bias_predictor + self.select_num = select_num + self.select_length = select_length + self.insert_blank = insert_blank if not hasattr(self.encoder, "interctc_use_conditioning"): self.encoder.interctc_use_conditioning = False @@ -409,7 +413,8 @@ class LCBNet(nn.Module): logging.info("enable beam_search") self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) - + pdb.set_trace() + meta_data = {} if isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank": # fbank speech, speech_lengths = data_in, data_lengths From a70bd4b9ff593648de6b939a908caaaf18df5719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 14:39:11 +0800 Subject: [PATCH 018/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100755 examples/industrial_data_pretraining/lcbnet/demo2.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh new file mode 100755 index 000000000..5fd2eccdc --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -0,0 +1,12 @@ +file_dir=./exp/ + + +python -m funasr.bin.inference \ +--config-path=$file_dir \ +--config-name="config.yaml" \ +++init_param="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/model.pb" \ +++tokenizer_conf.token_list="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/tokens.txt" \ +++frontend_conf.cmvn_file="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/am.mvn" \ +++input="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/asr_example_zh.wav" \ +++output_dir="./outputs/debug2" \ +++device="" \ From dce85a25d3c0f444b7e7825f186e483af4646760 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 15:38:39 +0800 Subject: [PATCH 019/101] test --- .../industrial_data_pretraining/lcbnet/demo2.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 5fd2eccdc..20b003b33 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -1,12 +1,14 @@ -file_dir=./exp/ +file_dir=./exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch python -m funasr.bin.inference \ ---config-path=$file_dir \ +--config-path=${file_dir} \ --config-name="config.yaml" \ 
-++init_param="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/model.pb" \ -++tokenizer_conf.token_list="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/tokens.txt" \ -++frontend_conf.cmvn_file="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/am.mvn" \ -++input="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/asr_example_zh.wav" \ -++output_dir="./outputs/debug2" \ +++init_param=${file_dir}/model.pb \ +++tokenizer_conf.token_list=${file_dir}/tokens.txt \ +++frontend_conf.cmvn_file=${file_dir}/am.mvn \ +++input=${file_dir}/wav.scp \ +++input=${file_dir}/ocr_text \ +++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ +++output_dir="./outputs/debug" \ ++device="" \ From 7f0a06946f9accf61264ba8befe84a5cadb9f6a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 15:43:12 +0800 Subject: [PATCH 020/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 20b003b33..10ba5aed8 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -9,6 +9,7 @@ python -m funasr.bin.inference \ ++frontend_conf.cmvn_file=${file_dir}/am.mvn \ ++input=${file_dir}/wav.scp \ ++input=${file_dir}/ocr_text \ ++data_type='["sound", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ ++output_dir="./outputs/debug" \ ++device="" \ From 733073d2693de593cef2eacc902c49990e067cef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:08:51 +0800 Subject: [PATCH 021/101] test --- funasr/auto/auto_model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index a3202fdb4..9db8c015d 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -153,15 +153,18 @@ class AutoModel: # build tokenizer tokenizer = kwargs.get("tokenizer", None) + pdb.set_trace() if tokenizer is not None: tokenizer_class = tables.tokenizer_classes.get(tokenizer) + pdb.set_trace() tokenizer = tokenizer_class(**kwargs["tokenizer_conf"]) + pdb.set_trace() kwargs["tokenizer"] = tokenizer kwargs["token_list"] = tokenizer.token_list vocab_size = len(tokenizer.token_list) else: vocab_size = -1 - + pdb.set_trace() # build frontend frontend = kwargs.get("frontend", None) if frontend is not None: From c4fa4c5efd4965b4514194179cfed6e1faa76c42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:10:12 +0800 Subject: [PATCH 022/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 10ba5aed8..a5afa9c7d 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -1,4 +1,4 @@ -file_dir=./exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch +file_dir="./exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" python -m funasr.bin.inference \ From 
0b317f6d8f11de02c1348f0828e01f63bfad3626 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:13:18 +0800 Subject: [PATCH 023/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index a5afa9c7d..36a692856 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -1,4 +1,4 @@ -file_dir="./exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" python -m funasr.bin.inference \ From 491b2af1ecdf26f1513ac6a83f3490bf1b265449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:21:37 +0800 Subject: [PATCH 024/101] test --- funasr/auto/auto_model.py | 3 --- funasr/frontends/default.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 9db8c015d..68559d121 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -153,12 +153,9 @@ class AutoModel: # build tokenizer tokenizer = kwargs.get("tokenizer", None) - pdb.set_trace() if tokenizer is not None: tokenizer_class = tables.tokenizer_classes.get(tokenizer) - pdb.set_trace() tokenizer = tokenizer_class(**kwargs["tokenizer_conf"]) - pdb.set_trace() kwargs["tokenizer"] = tokenizer kwargs["token_list"] = tokenizer.token_list vocab_size = len(tokenizer.token_list) diff --git a/funasr/frontends/default.py b/funasr/frontends/default.py index 8ac1ca853..15cc35a27 100644 --- a/funasr/frontends/default.py +++ b/funasr/frontends/default.py @@ -17,7 +17,7 @@ from funasr.frontends.utils.stft import Stft from funasr.frontends.utils.frontend import Frontend from funasr.models.transformer.utils.nets_utils import make_pad_mask - +@tables.register("frontend_classes", "DefaultFrontend") class DefaultFrontend(nn.Module): """Conventional frontend structure for ASR. 
Stft -> WPE -> MVDR-Beamformer -> Power-spec -> Mel-Fbank -> CMVN From 8cfee2db5cf7a32f8865f393184d8a48dd6bd38d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:22:46 +0800 Subject: [PATCH 025/101] test --- funasr/auto/auto_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 68559d121..e6e08b8cd 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -169,7 +169,7 @@ class AutoModel: frontend = frontend_class(**kwargs["frontend_conf"]) kwargs["frontend"] = frontend kwargs["input_size"] = frontend.output_size() - + pdb.set_trace() # build model model_class = tables.model_classes.get(kwargs["model"]) model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=vocab_size) From cfc18a90476675a04baa4edf62f756ff408f3551 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:24:17 +0800 Subject: [PATCH 026/101] test --- funasr/auto/auto_model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index e6e08b8cd..7c8630356 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -164,9 +164,12 @@ class AutoModel: pdb.set_trace() # build frontend frontend = kwargs.get("frontend", None) + pdb.set_trace() if frontend is not None: + pdb.set_trace() frontend_class = tables.frontend_classes.get(frontend) frontend = frontend_class(**kwargs["frontend_conf"]) + pdb.set_trace() kwargs["frontend"] = frontend kwargs["input_size"] = frontend.output_size() pdb.set_trace() From c1c337ef9a6916d9fb12898983f54b5f3630ff0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:28:59 +0800 Subject: [PATCH 027/101] test --- funasr/frontends/default.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/funasr/frontends/default.py b/funasr/frontends/default.py index 15cc35a27..70638e29c 100644 --- a/funasr/frontends/default.py +++ b/funasr/frontends/default.py @@ -16,6 +16,8 @@ from funasr.frontends.utils.log_mel import LogMel from funasr.frontends.utils.stft import Stft from funasr.frontends.utils.frontend import Frontend from funasr.models.transformer.utils.nets_utils import make_pad_mask +from funasr.register import tables + @tables.register("frontend_classes", "DefaultFrontend") class DefaultFrontend(nn.Module): @@ -40,6 +42,7 @@ class DefaultFrontend(nn.Module): frontend_conf: Optional[dict] = None, apply_stft: bool = True, use_channel: int = None, + **kwargs, ): super().__init__() if isinstance(fs, str): From 060d18ee4b86729e11f31ff16c822f3be33503ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:36:40 +0800 Subject: [PATCH 028/101] test --- funasr/frontends/default.py | 1 - 1 file changed, 1 deletion(-) diff --git a/funasr/frontends/default.py b/funasr/frontends/default.py index 70638e29c..ab5b73166 100644 --- a/funasr/frontends/default.py +++ b/funasr/frontends/default.py @@ -3,7 +3,6 @@ from typing import Optional from typing import Tuple from typing import Union import logging -import humanfriendly import numpy as np import torch import torch.nn as nn From c1e136f639a650cd40c2df9599935bb7f4c307ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:38:29 +0800 Subject: [PATCH 029/101] test --- funasr/models/lcbnet/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 funasr/models/lcbnet/__init__.py diff --git 
a/funasr/models/lcbnet/__init__.py b/funasr/models/lcbnet/__init__.py new file mode 100644 index 000000000..e69de29bb From b13b4c1bf5dbf5337539dbc017820ae20d0f2dc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:42:18 +0800 Subject: [PATCH 030/101] test --- funasr/frontends/default.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/funasr/frontends/default.py b/funasr/frontends/default.py index ab5b73166..66d42f71c 100644 --- a/funasr/frontends/default.py +++ b/funasr/frontends/default.py @@ -26,7 +26,7 @@ class DefaultFrontend(nn.Module): def __init__( self, - fs: Union[int, str] = 16000, + fs: int = 16000, n_fft: int = 512, win_length: int = None, hop_length: int = 128, @@ -44,8 +44,6 @@ class DefaultFrontend(nn.Module): **kwargs, ): super().__init__() - if isinstance(fs, str): - fs = humanfriendly.parse_size(fs) # Deepcopy (In general, dict shouldn't be used as default arg) frontend_conf = copy.deepcopy(frontend_conf) @@ -147,7 +145,7 @@ class MultiChannelFrontend(nn.Module): def __init__( self, - fs: Union[int, str] = 16000, + fs: int = 16000, n_fft: int = 512, win_length: int = None, hop_length: int = None, @@ -170,9 +168,6 @@ class MultiChannelFrontend(nn.Module): mc: bool = True ): super().__init__() - if isinstance(fs, str): - fs = humanfriendly.parse_size(fs) - # Deepcopy (In general, dict shouldn't be used as default arg) frontend_conf = copy.deepcopy(frontend_conf) if win_length is None and hop_length is None: From cdc70650084f9a69bacd842b7434a008354e2ea0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 17:23:20 +0800 Subject: [PATCH 031/101] test --- funasr/auto/auto_model.py | 7 +- funasr/models/lcbnet/attention.py | 112 +++++++++ funasr/models/lcbnet/encoder.py | 392 ++++++++++++++++++++++++++++++ 3 files changed, 506 insertions(+), 5 deletions(-) create mode 100644 funasr/models/lcbnet/attention.py create mode 100644 funasr/models/lcbnet/encoder.py diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 7c8630356..a5341eacf 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -161,18 +161,15 @@ class AutoModel: vocab_size = len(tokenizer.token_list) else: vocab_size = -1 - pdb.set_trace() # build frontend frontend = kwargs.get("frontend", None) - pdb.set_trace() + if frontend is not None: - pdb.set_trace() frontend_class = tables.frontend_classes.get(frontend) frontend = frontend_class(**kwargs["frontend_conf"]) - pdb.set_trace() kwargs["frontend"] = frontend kwargs["input_size"] = frontend.output_size() - pdb.set_trace() + # build model model_class = tables.model_classes.get(kwargs["model"]) model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=vocab_size) diff --git a/funasr/models/lcbnet/attention.py b/funasr/models/lcbnet/attention.py new file mode 100644 index 000000000..8e8c5943a --- /dev/null +++ b/funasr/models/lcbnet/attention.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2024 yufan +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Multi-Head Attention Return Weight layer definition.""" + +import math + +import torch +from torch import nn + +class MultiHeadedAttentionReturnWeight(nn.Module): + """Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. 
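+
+    Unlike the standard MultiHeadedAttention, ``forward`` returns both the
+    attention output and the attention weights. Illustrative usage (tensor
+    names and shapes below are examples only, not part of the API):
+
+        >>> attn = MultiHeadedAttentionReturnWeight(4, 256, 0.1)
+        >>> out, w = attn(q, k, v, mask)
+        >>> # out: (batch, time1, 256), w: (batch, 4, time1, time2)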
+ + """ + + def __init__(self, n_head, n_feat, dropout_rate): + """Construct an MultiHeadedAttentionReturnWeight object.""" + super(MultiHeadedAttentionReturnWeight, self).__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k = nn.Linear(n_feat, n_feat) + self.linear_v = nn.Linear(n_feat, n_feat) + self.linear_out = nn.Linear(n_feat, n_feat) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv(self, query, key, value): + """Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). + + """ + n_batch = query.size(0) + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) + k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + q = q.transpose(1, 2) # (batch, head, time1, d_k) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + + return q, k, v + + def forward_attention(self, value, scores, mask): + """Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). + + """ + n_batch = value.size(0) + if mask is not None: + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + min_value = torch.finfo(scores.dtype).min + scores = scores.masked_fill(mask, min_value) + self.attn = torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0 + ) # (batch, head, time1, time2) + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(self.attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + + return self.linear_out(x), self.attn # (batch, time1, d_model) + + def forward(self, query, key, value, mask): + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). 
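+            torch.Tensor: Attention weight tensor (#batch, n_head, time1, time2).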
+ + """ + q, k, v = self.forward_qkv(query, key, value) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, mask) + + diff --git a/funasr/models/lcbnet/encoder.py b/funasr/models/lcbnet/encoder.py new file mode 100644 index 000000000..d2464f1de --- /dev/null +++ b/funasr/models/lcbnet/encoder.py @@ -0,0 +1,392 @@ +# Copyright 2019 Shigeki Karita +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Transformer encoder definition.""" + +from typing import List +from typing import Optional +from typing import Tuple + +import torch +from torch import nn +import logging + +from funasr.models.transformer.attention import MultiHeadedAttention +from funasr.models.lcbnet.attention import MultiHeadedAttentionReturnWeight +from funasr.models.transformer.embedding import PositionalEncoding +from funasr.models.transformer.layer_norm import LayerNorm + +from funasr.models.transformer.utils.nets_utils import make_pad_mask +from funasr.models.transformer.positionwise_feed_forward import PositionwiseFeedForward +from funasr.models.transformer.utils.repeat import repeat +from funasr.register import tables + +class EncoderLayer(nn.Module): + """Encoder layer module. + + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance + can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + stochastic_depth_rate (float): Proability to skip this layer. + During training, the layer may skip residual computation and return input + as-is with given probability. + """ + + def __init__( + self, + size, + self_attn, + feed_forward, + dropout_rate, + normalize_before=True, + concat_after=False, + stochastic_depth_rate=0.0, + ): + """Construct an EncoderLayer object.""" + super(EncoderLayer, self).__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.norm1 = LayerNorm(size) + self.norm2 = LayerNorm(size) + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size) + self.stochastic_depth_rate = stochastic_depth_rate + + def forward(self, x, mask, cache=None): + """Compute encoded features. + + Args: + x_input (torch.Tensor): Input tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). + + """ + skip_layer = False + # with stochastic depth, residual connection `x + f(x)` becomes + # `x <- x + 1 / (1 - p) * f(x)` at training time. 
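+        # At inference time (self.training is False) the layer is never skipped
+        # and the scaling coefficient stays at 1.0, so the block reduces to a
+        # standard residual Transformer encoder layer.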
+ stoch_layer_coeff = 1.0 + if self.training and self.stochastic_depth_rate > 0: + skip_layer = torch.rand(1).item() < self.stochastic_depth_rate + stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) + + if skip_layer: + if cache is not None: + x = torch.cat([cache, x], dim=1) + return x, mask + + residual = x + if self.normalize_before: + x = self.norm1(x) + + if cache is None: + x_q = x + else: + assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) + x_q = x[:, -1:, :] + residual = residual[:, -1:, :] + mask = None if mask is None else mask[:, -1:, :] + + if self.concat_after: + x_concat = torch.cat((x, self.self_attn(x_q, x, x, mask)), dim=-1) + x = residual + stoch_layer_coeff * self.concat_linear(x_concat) + else: + x = residual + stoch_layer_coeff * self.dropout( + self.self_attn(x_q, x, x, mask) + ) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm2(x) + + if cache is not None: + x = torch.cat([cache, x], dim=1) + + return x, mask + +@tables.register("encoder_classes", "TransformerTextEncoder") +class TransformerTextEncoder(nn.Module): + """Transformer text encoder module. + + Args: + input_size: input dim + output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the number of units of position-wise feed forward + num_blocks: the number of decoder blocks + dropout_rate: dropout rate + attention_dropout_rate: dropout rate in attention + positional_dropout_rate: dropout rate after adding positional encoding + input_layer: input layer type + pos_enc_class: PositionalEncoding or ScaledPositionalEncoding + normalize_before: whether to use layer_norm before the first block + concat_after: whether to concat attention layer's input and output + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. + i.e. 
x -> x + att(x) + positionwise_layer_type: linear of conv1d + positionwise_conv_kernel_size: kernel size of positionwise conv1d layer + padding_idx: padding_idx for input_layer=embed + """ + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + pos_enc_class=PositionalEncoding, + normalize_before: bool = True, + concat_after: bool = False, + ): + super().__init__() + self._output_size = output_size + + self.embed = torch.nn.Sequential( + torch.nn.Embedding(input_size, output_size), + pos_enc_class(output_size, positional_dropout_rate), + ) + + self.normalize_before = normalize_before + + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + ) + self.encoders = repeat( + num_blocks, + lambda lnum: EncoderLayer( + output_size, + MultiHeadedAttention( + attention_heads, output_size, attention_dropout_rate + ), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + normalize_before, + concat_after, + ), + ) + if self.normalize_before: + self.after_norm = LayerNorm(output_size) + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs_pad: torch.Tensor, + ilens: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + """Embed positions in tensor. + + Args: + xs_pad: input tensor (B, L, D) + ilens: input length (B) + Returns: + position embedded tensor and mask + """ + masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) + xs_pad = self.embed(xs_pad) + + xs_pad, masks = self.encoders(xs_pad, masks) + + if self.normalize_before: + xs_pad = self.after_norm(xs_pad) + + olens = masks.squeeze(1).sum(1) + return xs_pad, olens, None + + + + +@tables.register("encoder_classes", "FusionSANEncoder") +class SelfSrcAttention(nn.Module): + """Single decoder layer module. + + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + src_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. 
x -> x + att(x) + + + """ + def __init__( + self, + size, + attention_heads, + attention_dim, + linear_units, + self_attention_dropout_rate, + src_attention_dropout_rate, + positional_dropout_rate, + dropout_rate, + normalize_before=True, + concat_after=False, + ): + """Construct an SelfSrcAttention object.""" + super(SelfSrcAttention, self).__init__() + self.size = size + self.self_attn = MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate) + self.src_attn = MultiHeadedAttentionReturnWeight(attention_heads, attention_dim, src_attention_dropout_rate) + self.feed_forward = PositionwiseFeedForward(attention_dim, linear_units, positional_dropout_rate) + self.norm1 = LayerNorm(size) + self.norm2 = LayerNorm(size) + self.norm3 = LayerNorm(size) + self.dropout = nn.Dropout(dropout_rate) + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear1 = nn.Linear(size + size, size) + self.concat_linear2 = nn.Linear(size + size, size) + + def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None): + """Compute decoded features. + + Args: + tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). + tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out). + memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size). + memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in). + cache (List[torch.Tensor]): List of cached tensors. + Each tensor shape should be (#batch, maxlen_out - 1, size). + + Returns: + torch.Tensor: Output tensor(#batch, maxlen_out, size). + torch.Tensor: Mask for output tensor (#batch, maxlen_out). + torch.Tensor: Encoded memory (#batch, maxlen_in, size). + torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
+ + """ + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + + if cache is None: + tgt_q = tgt + tgt_q_mask = tgt_mask + else: + # compute only the last frame query keeping dim: max_time_out -> 1 + assert cache.shape == ( + tgt.shape[0], + tgt.shape[1] - 1, + self.size, + ), f"{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" + tgt_q = tgt[:, -1:, :] + residual = residual[:, -1:, :] + tgt_q_mask = None + if tgt_mask is not None: + tgt_q_mask = tgt_mask[:, -1:, :] + + if self.concat_after: + tgt_concat = torch.cat( + (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1 + ) + x = residual + self.concat_linear1(tgt_concat) + else: + x = residual + self.dropout(self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + if self.concat_after: + x_concat = torch.cat( + (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1 + ) + x = residual + self.concat_linear2(x_concat) + else: + x, score = self.src_attn(x, memory, memory, memory_mask) + x = residual + self.dropout(x) + if not self.normalize_before: + x = self.norm2(x) + + residual = x + if self.normalize_before: + x = self.norm3(x) + x = residual + self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm3(x) + + if cache is not None: + x = torch.cat([cache, x], dim=1) + + return x, tgt_mask, memory, memory_mask + + + +class ConvPredictor(nn.Module): + def __init__(self, size=256, l_order=3, r_order=3, attention_heads=4, attention_dropout_rate=0.1, linear_units=2048): + super().__init__() + self.atten = MultiHeadedAttention(attention_heads, size, attention_dropout_rate) + self.norm1 = LayerNorm(size) + self.feed_forward = PositionwiseFeedForward(size, linear_units, attention_dropout_rate) + self.norm2 = LayerNorm(size) + self.pad = nn.ConstantPad1d((l_order, r_order), 0) + self.conv1d = nn.Conv1d(size, size, l_order + r_order + 1, groups=size) + self.output_linear = nn.Linear(size, 1) + + + def forward(self, text_enc, asr_enc): + # stage1 cross-attention + residual = text_enc + text_enc = residual + self.atten(text_enc, asr_enc, asr_enc, None) + + # stage2 FFN + residual = text_enc + text_enc = self.norm1(text_enc) + text_enc = residual + self.feed_forward(text_enc) + + # stage Conv predictor + text_enc = self.norm2(text_enc) + context = text_enc.transpose(1, 2) + queries = self.pad(context) + memory = self.conv1d(queries) + output = memory + context + output = output.transpose(1, 2) + output = torch.relu(output) + output = self.output_linear(output) + if output.dim()==3: + output = output.squeeze(2) + return output From b349739f5d6302048c179eeaadb4432acc541cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 17:27:02 +0800 Subject: [PATCH 032/101] test --- funasr/auto/auto_model.py | 4 +++- funasr/train_utils/load_pretrained_model.py | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index a5341eacf..23b80d72a 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -172,12 +172,14 @@ class AutoModel: # build model model_class = tables.model_classes.get(kwargs["model"]) + pdb.set_trace() model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=vocab_size) - + pdb.set_trace() model.to(device) # init_param init_param = kwargs.get("init_param", None) + pdb.set_trace() if init_param is not None: logging.info(f"Loading pretrained params 
from {init_param}") load_pretrained_model( diff --git a/funasr/train_utils/load_pretrained_model.py b/funasr/train_utils/load_pretrained_model.py index 5ba9bb7dc..aec31e3cc 100644 --- a/funasr/train_utils/load_pretrained_model.py +++ b/funasr/train_utils/load_pretrained_model.py @@ -7,7 +7,7 @@ import logging import torch import torch.nn import torch.optim - +import pdb def filter_state_dict( dst_state: Dict[str, Union[float, torch.Tensor]], @@ -99,14 +99,16 @@ def load_pretrained_model( # import pdb; # pdb.set_trace() print(f"ckpt: {path}") + pdb.set_trace() if oss_bucket is None: src_state = torch.load(path, map_location=map_location) else: buffer = BytesIO(oss_bucket.get_object(path).read()) src_state = torch.load(buffer, map_location=map_location) + pdb.set_trace() if "state_dict" in src_state: src_state = src_state["state_dict"] - + pdb.set_trace() for k in dst_state.keys(): if not k.startswith("module.") and "module." + k in src_state.keys(): k_ddp = "module." + k @@ -116,7 +118,7 @@ def load_pretrained_model( dst_state[k] = src_state[k_ddp] else: print(f"Miss key in ckpt: model: {k}, ckpt: {k_ddp}") - + pdb.set_trace() flag = obj.load_state_dict(dst_state, strict=True) # print(flag) From e615585fd3e40531fb714586d98c6a307a95c03d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 17:31:57 +0800 Subject: [PATCH 033/101] test --- funasr/models/lcbnet/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 563ff26e9..bbc99fdba 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -89,7 +89,7 @@ class LCBNet(nn.Module): text_encoder = text_encoder_class(input_size=vocab_size, **text_encoder_conf) fusion_encoder_class = tables.encoder_classes.get(fusion_encoder) fusion_encoder = fusion_encoder_class(**fusion_encoder_conf) - bias_predictor_class = tables.encoder_classes.get_class(bias_predictor) + bias_predictor_class = tables.encoder_classes.get(bias_predictor) bias_predictor = bias_predictor_class(bias_predictor_conf) if decoder is not None: @@ -414,7 +414,7 @@ class LCBNet(nn.Module): self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) pdb.set_trace() - + meta_data = {} if isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank": # fbank speech, speech_lengths = data_in, data_lengths From 5c1308e3cf5dd63c1d1c0b5299bd79b3064bca7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 23 Feb 2024 16:13:23 +0800 Subject: [PATCH 034/101] test --- funasr/models/lcbnet/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index bbc99fdba..555d4e658 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -90,7 +90,7 @@ class LCBNet(nn.Module): fusion_encoder_class = tables.encoder_classes.get(fusion_encoder) fusion_encoder = fusion_encoder_class(**fusion_encoder_conf) bias_predictor_class = tables.encoder_classes.get(bias_predictor) - bias_predictor = bias_predictor_class(bias_predictor_conf) + bias_predictor = bias_predictor_class(**bias_predictor_conf) if decoder is not None: decoder_class = tables.decoder_classes.get(decoder) From 54bd357b0857b94f761e270dbed5f90ca4e77d51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 23 Feb 2024 16:17:18 +0800 Subject: [PATCH 035/101] test --- funasr/models/lcbnet/encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/funasr/models/lcbnet/encoder.py b/funasr/models/lcbnet/encoder.py index d2464f1de..c65823cb0 100644 --- a/funasr/models/lcbnet/encoder.py +++ b/funasr/models/lcbnet/encoder.py @@ -355,7 +355,7 @@ class SelfSrcAttention(nn.Module): return x, tgt_mask, memory, memory_mask - +@tables.register("encoder_classes", "ConvBiasPredictor") class ConvPredictor(nn.Module): def __init__(self, size=256, l_order=3, r_order=3, attention_heads=4, attention_dropout_rate=0.1, linear_units=2048): super().__init__() From d60306e7a435053a1ed626213f9fa6fe12af2b3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 23 Feb 2024 16:47:15 +0800 Subject: [PATCH 036/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 3 +-- funasr/auto/auto_model.py | 3 --- funasr/train_utils/load_pretrained_model.py | 9 +++------ 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 36a692856..3e4d22393 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -7,8 +7,7 @@ python -m funasr.bin.inference \ ++init_param=${file_dir}/model.pb \ ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ ++frontend_conf.cmvn_file=${file_dir}/am.mvn \ -++input=${file_dir}/wav.scp \ -++input=${file_dir}/ocr_text \ +++input=[${file_dir}/wav.scp,${file_dir}/ocr_text] \ +data_type='["sound", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ ++output_dir="./outputs/debug" \ diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 23b80d72a..87c7e2d03 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -172,14 +172,11 @@ class AutoModel: # build model model_class = tables.model_classes.get(kwargs["model"]) - pdb.set_trace() model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=vocab_size) - pdb.set_trace() model.to(device) # init_param init_param = kwargs.get("init_param", None) - pdb.set_trace() if init_param is not None: logging.info(f"Loading pretrained params from {init_param}") load_pretrained_model( diff --git a/funasr/train_utils/load_pretrained_model.py b/funasr/train_utils/load_pretrained_model.py index aec31e3cc..9127e2fe1 100644 --- a/funasr/train_utils/load_pretrained_model.py +++ b/funasr/train_utils/load_pretrained_model.py @@ -96,19 +96,17 @@ def load_pretrained_model( obj = model dst_state = obj.state_dict() - # import pdb; - # pdb.set_trace() print(f"ckpt: {path}") - pdb.set_trace() + if oss_bucket is None: src_state = torch.load(path, map_location=map_location) else: buffer = BytesIO(oss_bucket.get_object(path).read()) src_state = torch.load(buffer, map_location=map_location) - pdb.set_trace() + if "state_dict" in src_state: src_state = src_state["state_dict"] - pdb.set_trace() + for k in dst_state.keys(): if not k.startswith("module.") and "module." + k in src_state.keys(): k_ddp = "module." 
+ k @@ -118,7 +116,6 @@ def load_pretrained_model( dst_state[k] = src_state[k_ddp] else: print(f"Miss key in ckpt: model: {k}, ckpt: {k_ddp}") - pdb.set_trace() flag = obj.load_state_dict(dst_state, strict=True) # print(flag) From 6a8c943435edf25f252d9d4db0095d4a01c7a3cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 23 Feb 2024 16:56:02 +0800 Subject: [PATCH 037/101] test --- funasr/models/lcbnet/model.py | 1 + funasr/utils/load_utils.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 555d4e658..54fba1cb2 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -425,6 +425,7 @@ class LCBNet(nn.Module): else: # extract fbank feats time1 = time.perf_counter() + pdb.set_trace() audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer) diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 7748172f6..cdd378de6 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -13,30 +13,34 @@ try: from funasr.download.file import download_from_url except: print("urllib is not installed, if you infer from url, please install it first.") - +import pdb def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type="sound", tokenizer=None, **kwargs): + pdb.set_trace() if isinstance(data_or_path_or_list, (list, tuple)): if data_type is not None and isinstance(data_type, (list, tuple)): - + pdb.set_trace() data_types = [data_type] * len(data_or_path_or_list) data_or_path_or_list_ret = [[] for d in data_type] + pdb.set_trace() for i, (data_type_i, data_or_path_or_list_i) in enumerate(zip(data_types, data_or_path_or_list)): for j, (data_type_j, data_or_path_or_list_j) in enumerate(zip(data_type_i, data_or_path_or_list_i)): - + pdb.set_trace() data_or_path_or_list_j = load_audio_text_image_video(data_or_path_or_list_j, fs=fs, audio_fs=audio_fs, data_type=data_type_j, tokenizer=tokenizer, **kwargs) + pdb.set_trace() data_or_path_or_list_ret[j].append(data_or_path_or_list_j) return data_or_path_or_list_ret else: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] - + pdb.set_trace() if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) - + pdb.set_trace() if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file + pdb.set_trace() if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) if kwargs.get("reduce_channels", True): @@ -59,7 +63,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: else: pass # print(f"unsupport data type: {data_or_path_or_list}, return raw data") - + pdb.set_trace() if audio_fs != fs and data_type != "text": resampler = torchaudio.transforms.Resample(audio_fs, fs) data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :] From 5130d2406df1aa567d13eec49eea8f9e392c6790 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 23 Feb 2024 17:01:41 +0800 Subject: [PATCH 038/101] test --- funasr/frontends/default.py | 1 + 1 file changed, 1 insertion(+) diff --git a/funasr/frontends/default.py 
b/funasr/frontends/default.py index 66d42f71c..364c8bbb9 100644 --- a/funasr/frontends/default.py +++ b/funasr/frontends/default.py @@ -48,6 +48,7 @@ class DefaultFrontend(nn.Module): # Deepcopy (In general, dict shouldn't be used as default arg) frontend_conf = copy.deepcopy(frontend_conf) self.hop_length = hop_length + self.fs = fs if apply_stft: self.stft = Stft( From 70a236b652b3c2a4377bd551f4b7c9d4c49cb61c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 23 Feb 2024 17:38:54 +0800 Subject: [PATCH 039/101] test --- .../lcbnet/demo2.sh | 2 +- funasr/utils/load_utils.py | 22 ++++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 3e4d22393..cfb5b235e 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -8,7 +8,7 @@ python -m funasr.bin.inference \ ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ ++frontend_conf.cmvn_file=${file_dir}/am.mvn \ ++input=[${file_dir}/wav.scp,${file_dir}/ocr_text] \ -+data_type='["sound", "text"]' \ ++data_type='["kaldi_ark", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ ++output_dir="./outputs/debug" \ ++device="" \ diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index cdd378de6..b7d0200cc 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -17,35 +17,28 @@ import pdb def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type="sound", tokenizer=None, **kwargs): - pdb.set_trace() if isinstance(data_or_path_or_list, (list, tuple)): if data_type is not None and isinstance(data_type, (list, tuple)): - pdb.set_trace() data_types = [data_type] * len(data_or_path_or_list) data_or_path_or_list_ret = [[] for d in data_type] - pdb.set_trace() for i, (data_type_i, data_or_path_or_list_i) in enumerate(zip(data_types, data_or_path_or_list)): - for j, (data_type_j, data_or_path_or_list_j) in enumerate(zip(data_type_i, data_or_path_or_list_i)): - pdb.set_trace() data_or_path_or_list_j = load_audio_text_image_video(data_or_path_or_list_j, fs=fs, audio_fs=audio_fs, data_type=data_type_j, tokenizer=tokenizer, **kwargs) - pdb.set_trace() data_or_path_or_list_ret[j].append(data_or_path_or_list_j) return data_or_path_or_list_ret else: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] - pdb.set_trace() if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) pdb.set_trace() if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file - pdb.set_trace() if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) if kwargs.get("reduce_channels", True): data_or_path_or_list = data_or_path_or_list.mean(0) elif data_type == "text" and tokenizer is not None: + pdb.set_trace() data_or_path_or_list = tokenizer.encode(data_or_path_or_list) elif data_type == "image": # undo pass @@ -60,6 +53,19 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: data_or_path_or_list = tokenizer.encode(data_or_path_or_list) elif isinstance(data_or_path_or_list, np.ndarray): # audio sample point data_or_path_or_list = 
torch.from_numpy(data_or_path_or_list).squeeze() # [n_samples,] + elif isinstance(data_or_path_or_list, str) and data_type == "kaldi_ark": + data_mat = kaldiio.load_mat(data_or_path_or_list) + if isinstance(data_mat, tuple): + sampling_rate, mat = data_mat + assert sampling_rate == audio_fs + else: + mat = data_mat + if mat.dtype == 'int16' or mat.dtype == 'int32': + mat = mat.astype(np.float64) + mat = mat / 32768 + if mat.ndim ==2: + mat = mat[:,0] + data_or_path_or_list = mat else: pass # print(f"unsupport data type: {data_or_path_or_list}, return raw data") From 5ecd13bd0460c4317e9a585e4204731791e5e9db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 11:23:51 +0800 Subject: [PATCH 040/101] test --- funasr/utils/load_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index b7d0200cc..20fa0fd2e 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -56,8 +56,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: elif isinstance(data_or_path_or_list, str) and data_type == "kaldi_ark": data_mat = kaldiio.load_mat(data_or_path_or_list) if isinstance(data_mat, tuple): - sampling_rate, mat = data_mat - assert sampling_rate == audio_fs + audio_fs, mat = data_mat else: mat = data_mat if mat.dtype == 'int16' or mat.dtype == 'int32': From 343a281ca14809153e2ab1df49ca0c5ffdb01abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 13:56:32 +0800 Subject: [PATCH 041/101] test --- funasr/models/lcbnet/model.py | 2 +- funasr/utils/load_utils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 54fba1cb2..d1ebc5ce5 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -413,7 +413,6 @@ class LCBNet(nn.Module): logging.info("enable beam_search") self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) - pdb.set_trace() meta_data = {} if isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank": # fbank @@ -431,6 +430,7 @@ class LCBNet(nn.Module): tokenizer=tokenizer) time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" + pdb.set_trace() speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 20fa0fd2e..963f5c258 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -31,14 +31,13 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) - pdb.set_trace() + if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) if kwargs.get("reduce_channels", True): data_or_path_or_list = data_or_path_or_list.mean(0) elif data_type == "text" and tokenizer is not None: - pdb.set_trace() data_or_path_or_list = tokenizer.encode(data_or_path_or_list) elif data_type == "image": 
# undo pass @@ -68,7 +67,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: else: pass # print(f"unsupport data type: {data_or_path_or_list}, return raw data") - pdb.set_trace() + if audio_fs != fs and data_type != "text": resampler = torchaudio.transforms.Resample(audio_fs, fs) data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :] @@ -112,6 +111,7 @@ def extract_fbank(data, data_len = None, data_type: str="sound", frontend=None, # import pdb; # pdb.set_trace() # if data_type == "sound": + pdb.set_trace() data, data_len = frontend(data, data_len, **kwargs) if isinstance(data_len, (list, tuple)): From 0d32e02c79d751ae15af8fb767df32564e34cbf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 14:02:03 +0800 Subject: [PATCH 042/101] test --- funasr/models/lcbnet/model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index d1ebc5ce5..1acce785f 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -425,14 +425,17 @@ class LCBNet(nn.Module): # extract fbank feats time1 = time.perf_counter() pdb.set_trace() - audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), + sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer) time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" + audio_sample_list = sample_list[0] + ocr_sample_list = sample_list[1] pdb.set_trace() speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) + pdb.set_trace() time3 = time.perf_counter() meta_data["extract_feat"] = f"{time3 - time2:0.3f}" meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 From ab4a31201c218b212ac52cbd529024c5858a9f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 14:25:00 +0800 Subject: [PATCH 043/101] test --- funasr/frontends/default.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/funasr/frontends/default.py b/funasr/frontends/default.py index 364c8bbb9..c4bdbd774 100644 --- a/funasr/frontends/default.py +++ b/funasr/frontends/default.py @@ -85,8 +85,12 @@ class DefaultFrontend(nn.Module): return self.n_mels def forward( - self, input: torch.Tensor, input_lengths: torch.Tensor + self, input: torch.Tensor, input_lengths: Union[torch.Tensor, list] ) -> Tuple[torch.Tensor, torch.Tensor]: + if isinstance(input_lengths, list): + input_lengths = torch.tensor(input_lengths) + if input.dtype == torch.float64: + input = input.float() # 1. Domain-conversion: e.g. 
Stft: time -> time-freq if self.stft is not None: input_stft, feats_lens = self._compute_stft(input, input_lengths) From 2bffe1d5392b291c071cde0ffcc03860abdfc230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 14:52:05 +0800 Subject: [PATCH 044/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 1 - funasr/models/lcbnet/model.py | 11 +++++++---- funasr/utils/load_utils.py | 5 +---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index cfb5b235e..0d5a4f031 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -6,7 +6,6 @@ python -m funasr.bin.inference \ --config-name="config.yaml" \ ++init_param=${file_dir}/model.pb \ ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -++frontend_conf.cmvn_file=${file_dir}/am.mvn \ ++input=[${file_dir}/wav.scp,${file_dir}/ocr_text] \ +data_type='["kaldi_ark", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 1acce785f..f45e71d6f 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -21,6 +21,7 @@ from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank from funasr.utils import postprocess_utils from funasr.utils.datadir_writer import DatadirWriter from funasr.register import tables + import pdb @tables.register("model_classes", "LCBNet") class LCBNet(nn.Module): @@ -92,6 +93,7 @@ class LCBNet(nn.Module): bias_predictor_class = tables.encoder_classes.get(bias_predictor) bias_predictor = bias_predictor_class(**bias_predictor_conf) + if decoder is not None: decoder_class = tables.decoder_classes.get(decoder) decoder = decoder_class( @@ -272,15 +274,15 @@ class LCBNet(nn.Module): ind: int """ with autocast(False): - + pdb.set_trace() # Data augmentation if self.specaug is not None and self.training: speech, speech_lengths = self.specaug(speech, speech_lengths) - + pdb.set_trace() # Normalization for feature: e.g. 
Global-CMVN, Utterance-CMVN if self.normalize is not None: speech, speech_lengths = self.normalize(speech, speech_lengths) - + pdb.set_trace() # Forward encoder # feats: (Batch, Length, Dim) # -> encoder_out: (Batch, Length2, Dim2) @@ -297,7 +299,7 @@ class LCBNet(nn.Module): if intermediate_outs is not None: return (encoder_out, intermediate_outs), encoder_out_lens - + pdb.set_trace() return encoder_out, encoder_out_lens def _calc_att_loss( @@ -442,6 +444,7 @@ class LCBNet(nn.Module): speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) + pdb.set_trace() # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 963f5c258..644af2324 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -108,10 +108,7 @@ def extract_fbank(data, data_len = None, data_type: str="sound", frontend=None, data_list.append(data_i) data_len.append(data_i.shape[0]) data = pad_sequence(data_list, batch_first=True) # data: [batch, N] - # import pdb; - # pdb.set_trace() - # if data_type == "sound": - pdb.set_trace() + data, data_len = frontend(data, data_len, **kwargs) if isinstance(data_len, (list, tuple)): From 19103386dc4f52619aba21af4008a9d082ea4a67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:05:41 +0800 Subject: [PATCH 045/101] test --- funasr/models/lcbnet/model.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index f45e71d6f..45b1ee5d1 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -426,7 +426,6 @@ class LCBNet(nn.Module): else: # extract fbank feats time1 = time.perf_counter() - pdb.set_trace() sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer) @@ -434,13 +433,12 @@ class LCBNet(nn.Module): meta_data["load_data"] = f"{time2 - time1:0.3f}" audio_sample_list = sample_list[0] ocr_sample_list = sample_list[1] - pdb.set_trace() speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) - pdb.set_trace() time3 = time.perf_counter() meta_data["extract_feat"] = f"{time3 - time2:0.3f}" - meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 + frame_shift = 10 + meta_data["batch_data_time"] = speech_lengths.sum().item() * frame_shift / 1000 speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) From eb92e79fb94e7b3df8f27c8ce3e607a70dff2a2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:21:32 +0800 Subject: [PATCH 046/101] test --- funasr/models/conformer/encoder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/funasr/models/conformer/encoder.py b/funasr/models/conformer/encoder.py index 1d252c206..443d30912 100644 --- a/funasr/models/conformer/encoder.py +++ b/funasr/models/conformer/encoder.py @@ -47,7 +47,7 @@ from funasr.models.transformer.utils.subsampling import check_short_utt from funasr.models.transformer.utils.subsampling import Conv2dSubsamplingPad from funasr.models.transformer.utils.subsampling import StreamingConvInput from funasr.register import tables - +import pdb class ConvolutionModule(nn.Module): 
"""ConvolutionModule in Conformer model. @@ -573,7 +573,7 @@ class ConformerEncoder(nn.Module): xs_pad, masks = self.embed(xs_pad, masks) else: xs_pad = self.embed(xs_pad) - + pdb.set_trace() intermediate_outs = [] if len(self.interctc_layer_idx) == 0: xs_pad, masks = self.encoders(xs_pad, masks) @@ -601,12 +601,12 @@ class ConformerEncoder(nn.Module): xs_pad = (x, pos_emb) else: xs_pad = xs_pad + self.conditioning_layer(ctc_out) - + pdb.set_trace() if isinstance(xs_pad, tuple): xs_pad = xs_pad[0] if self.normalize_before: xs_pad = self.after_norm(xs_pad) - + pdb.set_trace() olens = masks.squeeze(1).sum(1) if len(intermediate_outs) > 0: return (xs_pad, intermediate_outs), olens, None From debafeac37c0259bc3cf7642700f05adea34e047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:23:07 +0800 Subject: [PATCH 047/101] test --- funasr/auto/auto_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 87c7e2d03..5cb2e6e48 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -141,7 +141,7 @@ class AutoModel: kwargs = download_model(**kwargs) set_all_random_seed(kwargs.get("seed", 0)) - + pdb.set_trace() device = kwargs.get("device", "cuda") if not torch.cuda.is_available() or kwargs.get("ngpu", 1) == 0: device = "cpu" From 52fee96d71ba96fd09ad453dbae1926a1d601a56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:31:14 +0800 Subject: [PATCH 048/101] test --- funasr/auto/auto_model.py | 2 +- funasr/models/lcbnet/model.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 5cb2e6e48..ba7dcabaa 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -141,7 +141,7 @@ class AutoModel: kwargs = download_model(**kwargs) set_all_random_seed(kwargs.get("seed", 0)) - pdb.set_trace() + device = kwargs.get("device", "cuda") if not torch.cuda.is_available() or kwargs.get("ngpu", 1) == 0: device = "cpu" diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 45b1ee5d1..f8bbf7af1 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -274,15 +274,12 @@ class LCBNet(nn.Module): ind: int """ with autocast(False): - pdb.set_trace() # Data augmentation if self.specaug is not None and self.training: speech, speech_lengths = self.specaug(speech, speech_lengths) - pdb.set_trace() # Normalization for feature: e.g. 
Global-CMVN, Utterance-CMVN if self.normalize is not None: speech, speech_lengths = self.normalize(speech, speech_lengths) - pdb.set_trace() # Forward encoder # feats: (Batch, Length, Dim) # -> encoder_out: (Batch, Length2, Dim2) From e2425cc0675cc6fd7685067a27eabd1d32ca7fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:44:07 +0800 Subject: [PATCH 049/101] test --- funasr/models/conformer/encoder.py | 6 +++--- funasr/models/lcbnet/model.py | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/funasr/models/conformer/encoder.py b/funasr/models/conformer/encoder.py index 443d30912..be973c641 100644 --- a/funasr/models/conformer/encoder.py +++ b/funasr/models/conformer/encoder.py @@ -573,7 +573,7 @@ class ConformerEncoder(nn.Module): xs_pad, masks = self.embed(xs_pad, masks) else: xs_pad = self.embed(xs_pad) - pdb.set_trace() + intermediate_outs = [] if len(self.interctc_layer_idx) == 0: xs_pad, masks = self.encoders(xs_pad, masks) @@ -601,12 +601,12 @@ class ConformerEncoder(nn.Module): xs_pad = (x, pos_emb) else: xs_pad = xs_pad + self.conditioning_layer(ctc_out) - pdb.set_trace() + if isinstance(xs_pad, tuple): xs_pad = xs_pad[0] if self.normalize_before: xs_pad = self.after_norm(xs_pad) - pdb.set_trace() + olens = masks.squeeze(1).sum(1) if len(intermediate_outs) > 0: return (xs_pad, intermediate_outs), olens, None diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index f8bbf7af1..8070aa378 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -296,7 +296,6 @@ class LCBNet(nn.Module): if intermediate_outs is not None: return (encoder_out, intermediate_outs), encoder_out_lens - pdb.set_trace() return encoder_out, encoder_out_lens def _calc_att_loss( @@ -444,7 +443,11 @@ class LCBNet(nn.Module): encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - + pdb.set_trace() + ocr = ocr_sample_list[0] + ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)) + ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths) + pdb.set_trace() # c. 
Passed the encoder result and the beam search nbest_hyps = self.beam_search( x=encoder_out[0], maxlenratio=kwargs.get("maxlenratio", 0.0), minlenratio=kwargs.get("minlenratio", 0.0) From 5b9c073f43dbecc3ae9d771af50a8f52f87931e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:49:41 +0800 Subject: [PATCH 050/101] test --- funasr/models/lcbnet/model.py | 1 - funasr/utils/load_utils.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 8070aa378..b4a206bed 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -438,7 +438,6 @@ class LCBNet(nn.Module): speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) - pdb.set_trace() # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 644af2324..8b75cbdb6 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -31,7 +31,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) - + pdb.set_trace() if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) From 6e47d42ea00e6d10746b59a86d6455465464ed83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:55:23 +0800 Subject: [PATCH 051/101] test --- funasr/models/lcbnet/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index b4a206bed..3b8f3c96e 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -422,6 +422,7 @@ class LCBNet(nn.Module): else: # extract fbank feats time1 = time.perf_counter() + pdb.set_trace() sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer) From ecd9e74b6e177e5dd584609c04570870f15af63b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:00:44 +0800 Subject: [PATCH 052/101] test --- funasr/auto/auto_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index ba7dcabaa..d5225dee8 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -39,11 +39,13 @@ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): filelist = [".scp", ".txt", ".json", ".jsonl"] chars = string.ascii_letters + string.digits + pdb.set_trace() if isinstance(data_in, str) and data_in.startswith('http'): # url data_in = download_from_url(data_in) if isinstance(data_in, str) and os.path.exists(data_in): # wav_path; filelist: wav.scp, file.jsonl;text.txt; _, file_extension = os.path.splitext(data_in) file_extension = file_extension.lower() + pdb.set_trace() if file_extension in filelist: #filelist: wav.scp, file.jsonl;text.txt; with open(data_in, encoding='utf-8') as fin: for line in fin: From 
a88b51c5442efba7bf1e8d91881f69279b27224d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:04:35 +0800 Subject: [PATCH 053/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 2 +- funasr/auto/auto_model.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 0d5a4f031..9ba176be6 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -6,7 +6,7 @@ python -m funasr.bin.inference \ --config-name="config.yaml" \ ++init_param=${file_dir}/model.pb \ ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -++input=[${file_dir}/wav.scp,${file_dir}/ocr_text] \ +++input=[${file_dir}/wav.scp,${file_dir}/ocr.txt] \ +data_type='["kaldi_ark", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ ++output_dir="./outputs/debug" \ diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index d5225dee8..ba7dcabaa 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -39,13 +39,11 @@ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): filelist = [".scp", ".txt", ".json", ".jsonl"] chars = string.ascii_letters + string.digits - pdb.set_trace() if isinstance(data_in, str) and data_in.startswith('http'): # url data_in = download_from_url(data_in) if isinstance(data_in, str) and os.path.exists(data_in): # wav_path; filelist: wav.scp, file.jsonl;text.txt; _, file_extension = os.path.splitext(data_in) file_extension = file_extension.lower() - pdb.set_trace() if file_extension in filelist: #filelist: wav.scp, file.jsonl;text.txt; with open(data_in, encoding='utf-8') as fin: for line in fin: From 31e2eb39ad3965931f9df22fce86c708f4d9da95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:14:57 +0800 Subject: [PATCH 054/101] test --- funasr/models/lcbnet/model.py | 7 ++++--- funasr/utils/load_utils.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 3b8f3c96e..f4caee8a4 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -422,7 +422,6 @@ class LCBNet(nn.Module): else: # extract fbank feats time1 = time.perf_counter() - pdb.set_trace() sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer) @@ -443,9 +442,11 @@ class LCBNet(nn.Module): encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - pdb.set_trace() - ocr = ocr_sample_list[0] + + ocr_list_new = [[x + 1 if x != 0 else x for x in sublist] for sublist in ocr_sample_list] + ocr = torch.tensor(ocr_list_new) ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)) + pdb.set_trace() ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths) pdb.set_trace() # c. 
Passed the encoder result and the beam search diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 8b75cbdb6..87412bd87 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -31,7 +31,6 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) - pdb.set_trace() if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) From 5b93a56a7b15ae236317f78c60b67a5e95488b38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:22:51 +0800 Subject: [PATCH 055/101] test --- funasr/models/lcbnet/model.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index f4caee8a4..422956f04 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -446,9 +446,10 @@ class LCBNet(nn.Module): ocr_list_new = [[x + 1 if x != 0 else x for x in sublist] for sublist in ocr_sample_list] ocr = torch.tensor(ocr_list_new) ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)) - pdb.set_trace() ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths) - pdb.set_trace() + fusion_out, _, _, _ = self.fusion_encoder(encoder_out,None, ocr, None) + encoder_out = encoder_out + fusion_out + # c. Passed the encoder result and the beam search nbest_hyps = self.beam_search( x=encoder_out[0], maxlenratio=kwargs.get("maxlenratio", 0.0), minlenratio=kwargs.get("minlenratio", 0.0) @@ -456,7 +457,7 @@ class LCBNet(nn.Module): nbest_hyps = nbest_hyps[: self.nbest] - + pdb.set_trace(0) results = [] b, n, d = encoder_out.size() for i in range(b): @@ -478,9 +479,12 @@ class LCBNet(nn.Module): # remove blank symbol id, which is assumed to be 0 token_int = list(filter(lambda x: x != self.eos and x != self.sos and x != self.blank_id, token_int)) + pdb.set_trace() # Change integer-ids to tokens token = tokenizer.ids2tokens(token_int) + pdb.set_trace() text = tokenizer.tokens2text(token) + pdb.set_trace() text_postprocessed, _ = postprocess_utils.sentence_postprocess(token) result_i = {"key": key[i], "token": token, "text": text_postprocessed} From 77c2c933a221c4b04f211eeacb7981abccee3c24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:30:10 +0800 Subject: [PATCH 056/101] test --- funasr/models/lcbnet/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 422956f04..6de69846b 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -449,7 +449,7 @@ class LCBNet(nn.Module): ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths) fusion_out, _, _, _ = self.fusion_encoder(encoder_out,None, ocr, None) encoder_out = encoder_out + fusion_out - + pdb.set_trace() # c. 
Passed the encoder result and the beam search nbest_hyps = self.beam_search( x=encoder_out[0], maxlenratio=kwargs.get("maxlenratio", 0.0), minlenratio=kwargs.get("minlenratio", 0.0) @@ -485,7 +485,7 @@ class LCBNet(nn.Module): pdb.set_trace() text = tokenizer.tokens2text(token) pdb.set_trace() - + text_postprocessed, _ = postprocess_utils.sentence_postprocess(token) result_i = {"key": key[i], "token": token, "text": text_postprocessed} results.append(result_i) From 8992750f02bdc37da40b2e56831b12cc7b3cf756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:30:55 +0800 Subject: [PATCH 057/101] test --- funasr/models/lcbnet/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 6de69846b..6ee5342d4 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -457,7 +457,7 @@ class LCBNet(nn.Module): nbest_hyps = nbest_hyps[: self.nbest] - pdb.set_trace(0) + pdb.set_trace() results = [] b, n, d = encoder_out.size() for i in range(b): From 39de3adfbc12bc491f6da9eb9ffdc5122a3f623d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:39:15 +0800 Subject: [PATCH 058/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 9ba176be6..20af1f57d 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -1,6 +1,6 @@ file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" - +CUDA_VISIBLE_DEVICES="" \ python -m funasr.bin.inference \ --config-path=${file_dir} \ --config-name="config.yaml" \ From 12d8bd77a6686e29f6840e8de3909f3aaf96afa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:48:11 +0800 Subject: [PATCH 059/101] test --- funasr/models/lcbnet/model.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 6ee5342d4..82a1b787d 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -111,8 +111,8 @@ class LCBNet(nn.Module): ) self.blank_id = blank_id - self.sos = sos if sos is not None else vocab_size - 1 - self.eos = eos if eos is not None else vocab_size - 1 + self.sos = vocab_size - 1 + self.eos = vocab_size - 1 self.vocab_size = vocab_size self.ignore_id = ignore_id self.ctc_weight = ctc_weight @@ -375,14 +375,14 @@ class LCBNet(nn.Module): scorers["ngram"] = ngram weights = dict( - decoder=1.0 - kwargs.get("decoding_ctc_weight", 0.5), - ctc=kwargs.get("decoding_ctc_weight", 0.5), + decoder=1.0 - kwargs.get("decoding_ctc_weight", 0.3), + ctc=kwargs.get("decoding_ctc_weight", 0.3), lm=kwargs.get("lm_weight", 0.0), ngram=kwargs.get("ngram_weight", 0.0), length_bonus=kwargs.get("penalty", 0.0), ) beam_search = BeamSearch( - beam_size=kwargs.get("beam_size", 10), + beam_size=kwargs.get("beam_size", 20), weights=weights, scorers=scorers, sos=self.sos, From e59ec16e6a1306d27056d48f7426b6c9a18ae669 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:56:58 +0800 Subject: [PATCH 060/101] test --- funasr/auto/auto_model.py | 2 -- funasr/models/lcbnet/model.py | 8 +------- 2 files 
changed, 1 insertion(+), 9 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index ba7dcabaa..3f99e4d17 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -234,11 +234,9 @@ class AutoModel: time1 = time.perf_counter() with torch.no_grad(): - pdb.set_trace() results, meta_data = model.inference(**batch, **kwargs) time2 = time.perf_counter() - pdb.set_trace() asr_result_list.extend(results) # batch_data_time = time_per_frame_s * data_batch_i["speech_lengths"].sum().item() diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 82a1b787d..deddf73df 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -181,8 +181,7 @@ class LCBNet(nn.Module): text: (Batch, Length) text_lengths: (Batch,) """ - # import pdb; - # pdb.set_trace() + if len(text_lengths.size()) > 1: text_lengths = text_lengths[:, 0] if len(speech_lengths.size()) > 1: @@ -449,7 +448,6 @@ class LCBNet(nn.Module): ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths) fusion_out, _, _, _ = self.fusion_encoder(encoder_out,None, ocr, None) encoder_out = encoder_out + fusion_out - pdb.set_trace() # c. Passed the encoder result and the beam search nbest_hyps = self.beam_search( x=encoder_out[0], maxlenratio=kwargs.get("maxlenratio", 0.0), minlenratio=kwargs.get("minlenratio", 0.0) @@ -457,7 +455,6 @@ class LCBNet(nn.Module): nbest_hyps = nbest_hyps[: self.nbest] - pdb.set_trace() results = [] b, n, d = encoder_out.size() for i in range(b): @@ -479,12 +476,9 @@ class LCBNet(nn.Module): # remove blank symbol id, which is assumed to be 0 token_int = list(filter(lambda x: x != self.eos and x != self.sos and x != self.blank_id, token_int)) - pdb.set_trace() # Change integer-ids to tokens token = tokenizer.ids2tokens(token_int) - pdb.set_trace() text = tokenizer.tokens2text(token) - pdb.set_trace() text_postprocessed, _ = postprocess_utils.sentence_postprocess(token) result_i = {"key": key[i], "token": token, "text": text_postprocessed} From e0fca115cbae19e8280eb0b31286195d5f5473f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 17:10:49 +0800 Subject: [PATCH 061/101] test --- funasr/models/lcbnet/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index deddf73df..ab557e6d8 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -443,8 +443,8 @@ class LCBNet(nn.Module): encoder_out = encoder_out[0] ocr_list_new = [[x + 1 if x != 0 else x for x in sublist] for sublist in ocr_sample_list] - ocr = torch.tensor(ocr_list_new) - ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)) + ocr = torch.tensor(ocr_list_new).to(device=kwargs["device"]) + ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)).to(device=kwargs["device"]) ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths) fusion_out, _, _, _ = self.fusion_encoder(encoder_out,None, ocr, None) encoder_out = encoder_out + fusion_out From 0a4e01bd7d789504cc5986fa848e5822bef4dfc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 17:18:23 +0800 Subject: [PATCH 062/101] atsr --- .../lcbnet/{demo2.sh => demo.sh} | 4 ++-- .../industrial_data_pretraining/lcbnet/demo_nj.sh | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) rename examples/industrial_data_pretraining/lcbnet/{demo2.sh => demo.sh} (92%) create mode 100755 
examples/industrial_data_pretraining/lcbnet/demo_nj.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo.sh similarity index 92% rename from examples/industrial_data_pretraining/lcbnet/demo2.sh rename to examples/industrial_data_pretraining/lcbnet/demo.sh index 20af1f57d..9515f985d 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo.sh @@ -1,6 +1,6 @@ file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="" \ +#CUDA_VISIBLE_DEVICES="" \ python -m funasr.bin.inference \ --config-path=${file_dir} \ --config-name="config.yaml" \ @@ -10,4 +10,4 @@ python -m funasr.bin.inference \ +data_type='["kaldi_ark", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ ++output_dir="./outputs/debug" \ -++device="" \ +++device="cpu" \ diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh new file mode 100755 index 000000000..9515f985d --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -0,0 +1,13 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" + +#CUDA_VISIBLE_DEVICES="" \ +python -m funasr.bin.inference \ +--config-path=${file_dir} \ +--config-name="config.yaml" \ +++init_param=${file_dir}/model.pb \ +++tokenizer_conf.token_list=${file_dir}/tokens.txt \ +++input=[${file_dir}/wav.scp,${file_dir}/ocr.txt] \ ++data_type='["kaldi_ark", "text"]' \ +++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ +++output_dir="./outputs/debug" \ +++device="cpu" \ From 2d71d8f679894ab49374b10784547db001bba7be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 17:30:27 +0800 Subject: [PATCH 063/101] test --- .../lcbnet/demo_nj.sh | 80 ++++++++++++++++--- 1 file changed, 69 insertions(+), 11 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh index 9515f985d..51ffad71c 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -1,13 +1,71 @@ file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1" +inference_device="cuda" -#CUDA_VISIBLE_DEVICES="" \ -python -m funasr.bin.inference \ ---config-path=${file_dir} \ ---config-name="config.yaml" \ -++init_param=${file_dir}/model.pb \ -++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -++input=[${file_dir}/wav.scp,${file_dir}/ocr.txt] \ -+data_type='["kaldi_ark", "text"]' \ -++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ -++output_dir="./outputs/debug" \ -++device="cpu" \ +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done +fi + +inference_dir="outputs/test" +_logdir="${inference_dir}/logdir" +echo "inference_dir: ${inference_dir}" + +mkdir -p "${_logdir}" +key_file1=${file_dir}/wav.scp +key_file2=${file_dir}/ocr.txt +split_scps1= +split_scps2= +for JOB in $(seq "${nj}"); do + split_scps1+=" 
${_logdir}/wav.${JOB}.scp" + split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +done +utils/split_scp.pl "${key_file1}" ${split_scps1} +utils/split_scp.pl "${key_file2}" ${split_scps2} + +gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} + + export CUDA_VISIBLE_DEVICES=${gpuid} + + python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pb \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + +data_type='["kaldi_ark", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true &> ${_logdir}/log.${JOB}.txt + + }& +done +wait + + +mkdir -p ${inference_dir}/1best_recog +for f in token score text; do + if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then + for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/${f}" + done | sort -k1 >"${inference_dir}/1best_recog/${f}" + fi +done + +echo "Computing WER ..." +echo "Computing WER ..." +python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc +python utils/postprocess_text_zh.py ${data_dir}/text ${inference_dir}/1best_recog/text.ref +python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer +tail -n 3 ${inference_dir}/1best_recog/text.cer \ No newline at end of file From 59ae516f6762077ed9933128e2d804f9a65066a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 17:30:41 +0800 Subject: [PATCH 064/101] test --- examples/industrial_data_pretraining/lcbnet/demo_nj.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh index 51ffad71c..5e634c315 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -1,5 +1,5 @@ file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" inference_device="cuda" if [ ${inference_device} == "cuda" ]; then From 179a3f99c45d21cec3ea17e3b9265bcf1e49c617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:30:15 +0800 Subject: [PATCH 065/101] test --- .../lcbnet/demo_nj.sh | 77 ++++++++++--------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh index 5e634c315..d9f42a033 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -16,46 +16,46 @@ inference_dir="outputs/test" _logdir="${inference_dir}/logdir" echo "inference_dir: ${inference_dir}" -mkdir -p "${_logdir}" -key_file1=${file_dir}/wav.scp -key_file2=${file_dir}/ocr.txt -split_scps1= -split_scps2= -for JOB in $(seq "${nj}"); do - split_scps1+=" ${_logdir}/wav.${JOB}.scp" - split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -done -utils/split_scp.pl "${key_file1}" ${split_scps1} -utils/split_scp.pl "${key_file2}" ${split_scps2} +# mkdir -p "${_logdir}" +# 
key_file1=${file_dir}/wav.scp +# key_file2=${file_dir}/ocr.txt +# split_scps1= +# split_scps2= +# for JOB in $(seq "${nj}"); do +# split_scps1+=" ${_logdir}/wav.${JOB}.scp" +# split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +# done +# utils/split_scp.pl "${key_file1}" ${split_scps1} +# utils/split_scp.pl "${key_file2}" ${split_scps2} -gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -for JOB in $(seq ${nj}); do - { - id=$((JOB-1)) - gpuid=${gpuid_list_array[$id]} +# gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +# for JOB in $(seq ${nj}); do +# { +# id=$((JOB-1)) +# gpuid=${gpuid_list_array[$id]} - export CUDA_VISIBLE_DEVICES=${gpuid} +# export CUDA_VISIBLE_DEVICES=${gpuid} - python -m funasr.bin.inference \ - --config-path=${file_dir} \ - --config-name="config.yaml" \ - ++init_param=${file_dir}/model.pb \ - ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ - +data_type='["kaldi_ark", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ - ++output_dir="${inference_dir}/${JOB}" \ - ++device="${inference_device}" \ - ++ncpu=1 \ - ++disable_log=true &> ${_logdir}/log.${JOB}.txt +# python -m funasr.bin.inference \ +# --config-path=${file_dir} \ +# --config-name="config.yaml" \ +# ++init_param=${file_dir}/model.pb \ +# ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ +# ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ +# +data_type='["kaldi_ark", "text"]' \ +# ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ +# ++output_dir="${inference_dir}/${JOB}" \ +# ++device="${inference_device}" \ +# ++ncpu=1 \ +# ++disable_log=true &> ${_logdir}/log.${JOB}.txt - }& -done -wait +# }& +# done +# wait -mkdir -p ${inference_dir}/1best_recog -for f in token score text; do +#mkdir -p ${inference_dir}/1best_recog +for f in token; do if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then for JOB in $(seq "${nj}"); do cat "${inference_dir}/${JOB}/1best_recog/${f}" @@ -65,7 +65,8 @@ done echo "Computing WER ..." echo "Computing WER ..." 
-python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc -python utils/postprocess_text_zh.py ${data_dir}/text ${inference_dir}/1best_recog/text.ref -python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer -tail -n 3 ${inference_dir}/1best_recog/text.cer \ No newline at end of file +#python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc + +#cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref +#python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer +#tail -n 3 ${inference_dir}/1best_recog/text.cer \ No newline at end of file From 5f91acae0d8be4b3223bcb4732bad2796d654547 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:35:32 +0800 Subject: [PATCH 066/101] test --- .../lcbnet/demo_nj.sh | 72 ------------------- .../lcbnet/demo_nj2.sh | 72 +++++++++++++++++++ .../industrial_data_pretraining/lcbnet/utils | 1 + 3 files changed, 73 insertions(+), 72 deletions(-) delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo_nj.sh create mode 100644 examples/industrial_data_pretraining/lcbnet/demo_nj2.sh create mode 120000 examples/industrial_data_pretraining/lcbnet/utils diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh deleted file mode 100755 index d9f42a033..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ /dev/null @@ -1,72 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -inference_device="cuda" - -if [ ${inference_device} == "cuda" ]; then - nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -else - inference_batch_size=1 - CUDA_VISIBLE_DEVICES="" - for JOB in $(seq ${nj}); do - CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," - done -fi - -inference_dir="outputs/test" -_logdir="${inference_dir}/logdir" -echo "inference_dir: ${inference_dir}" - -# mkdir -p "${_logdir}" -# key_file1=${file_dir}/wav.scp -# key_file2=${file_dir}/ocr.txt -# split_scps1= -# split_scps2= -# for JOB in $(seq "${nj}"); do -# split_scps1+=" ${_logdir}/wav.${JOB}.scp" -# split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -# done -# utils/split_scp.pl "${key_file1}" ${split_scps1} -# utils/split_scp.pl "${key_file2}" ${split_scps2} - -# gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -# for JOB in $(seq ${nj}); do -# { -# id=$((JOB-1)) -# gpuid=${gpuid_list_array[$id]} - -# export CUDA_VISIBLE_DEVICES=${gpuid} - -# python -m funasr.bin.inference \ -# --config-path=${file_dir} \ -# --config-name="config.yaml" \ -# ++init_param=${file_dir}/model.pb \ -# ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -# ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ -# +data_type='["kaldi_ark", "text"]' \ -# ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ -# ++output_dir="${inference_dir}/${JOB}" \ -# ++device="${inference_device}" \ -# ++ncpu=1 \ -# ++disable_log=true &> ${_logdir}/log.${JOB}.txt - -# }& -# done -# wait - - -#mkdir -p ${inference_dir}/1best_recog -for f in token; do - if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then - for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/${f}" - done | sort 
-k1 >"${inference_dir}/1best_recog/${f}" - fi -done - -echo "Computing WER ..." -echo "Computing WER ..." -#python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc - -#cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref -#python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer -#tail -n 3 ${inference_dir}/1best_recog/text.cer \ No newline at end of file diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj2.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj2.sh new file mode 100644 index 000000000..205c28fa3 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj2.sh @@ -0,0 +1,72 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +inference_device="cuda" + +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') + else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done + fi + + inference_dir="outputs/test" + _logdir="${inference_dir}/logdir" + echo "inference_dir: ${inference_dir}" + + # mkdir -p "${_logdir}" + # key_file1=${file_dir}/wav.scp + # key_file2=${file_dir}/ocr.txt + # split_scps1= + # split_scps2= + # for JOB in $(seq "${nj}"); do + # split_scps1+=" ${_logdir}/wav.${JOB}.scp" + # split_scps2+=" ${_logdir}/ocr.${JOB}.txt" + # done + # utils/split_scp.pl "${key_file1}" ${split_scps1} + # utils/split_scp.pl "${key_file2}" ${split_scps2} + + # gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) + # for JOB in $(seq ${nj}); do + # { + # id=$((JOB-1)) + # gpuid=${gpuid_list_array[$id]} + + # export CUDA_VISIBLE_DEVICES=${gpuid} + + # python -m funasr.bin.inference \ + # --config-path=${file_dir} \ + # --config-name="config.yaml" \ + # ++init_param=${file_dir}/model.pb \ + # ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + # ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + # +data_type='["kaldi_ark", "text"]' \ + # ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + # ++output_dir="${inference_dir}/${JOB}" \ + # ++device="${inference_device}" \ + # ++ncpu=1 \ + # ++disable_log=true &> ${_logdir}/log.${JOB}.txt + + # }& + # done + # wait + + + #mkdir -p ${inference_dir}/1best_recog + for f in token; do + if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then + for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/${f}" + done | sort -k1 >"${inference_dir}/1best_recog/${f}" + fi + done + + echo "Computing WER ..." + echo "Computing WER ..." 
+ #python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc + + #cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref + #python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer + #tail -n 3 ${inference_dir}/1best_recog/text.cer diff --git a/examples/industrial_data_pretraining/lcbnet/utils b/examples/industrial_data_pretraining/lcbnet/utils new file mode 120000 index 000000000..be5e5a322 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/utils @@ -0,0 +1 @@ +../../aishell/paraformer/utils \ No newline at end of file From e702cad2fb38d8458d57b8ee7639e35ef84f0967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:36:19 +0800 Subject: [PATCH 067/101] test --- .../lcbnet/demo_nj.sh | 72 +++++++++++++++++++ .../lcbnet/demo_nj2.sh | 72 ------------------- 2 files changed, 72 insertions(+), 72 deletions(-) create mode 100644 examples/industrial_data_pretraining/lcbnet/demo_nj.sh delete mode 100644 examples/industrial_data_pretraining/lcbnet/demo_nj2.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh new file mode 100644 index 000000000..d9f42a033 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -0,0 +1,72 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +inference_device="cuda" + +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done +fi + +inference_dir="outputs/test" +_logdir="${inference_dir}/logdir" +echo "inference_dir: ${inference_dir}" + +# mkdir -p "${_logdir}" +# key_file1=${file_dir}/wav.scp +# key_file2=${file_dir}/ocr.txt +# split_scps1= +# split_scps2= +# for JOB in $(seq "${nj}"); do +# split_scps1+=" ${_logdir}/wav.${JOB}.scp" +# split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +# done +# utils/split_scp.pl "${key_file1}" ${split_scps1} +# utils/split_scp.pl "${key_file2}" ${split_scps2} + +# gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +# for JOB in $(seq ${nj}); do +# { +# id=$((JOB-1)) +# gpuid=${gpuid_list_array[$id]} + +# export CUDA_VISIBLE_DEVICES=${gpuid} + +# python -m funasr.bin.inference \ +# --config-path=${file_dir} \ +# --config-name="config.yaml" \ +# ++init_param=${file_dir}/model.pb \ +# ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ +# ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ +# +data_type='["kaldi_ark", "text"]' \ +# ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ +# ++output_dir="${inference_dir}/${JOB}" \ +# ++device="${inference_device}" \ +# ++ncpu=1 \ +# ++disable_log=true &> ${_logdir}/log.${JOB}.txt + +# }& +# done +# wait + + +#mkdir -p ${inference_dir}/1best_recog +for f in token; do + if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then + for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/${f}" + done | sort -k1 >"${inference_dir}/1best_recog/${f}" + fi +done + +echo "Computing WER ..." +echo "Computing WER ..." 
+#python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc + +#cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref +#python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer +#tail -n 3 ${inference_dir}/1best_recog/text.cer \ No newline at end of file diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj2.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj2.sh deleted file mode 100644 index 205c28fa3..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj2.sh +++ /dev/null @@ -1,72 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -inference_device="cuda" - -if [ ${inference_device} == "cuda" ]; then - nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - else - inference_batch_size=1 - CUDA_VISIBLE_DEVICES="" - for JOB in $(seq ${nj}); do - CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," - done - fi - - inference_dir="outputs/test" - _logdir="${inference_dir}/logdir" - echo "inference_dir: ${inference_dir}" - - # mkdir -p "${_logdir}" - # key_file1=${file_dir}/wav.scp - # key_file2=${file_dir}/ocr.txt - # split_scps1= - # split_scps2= - # for JOB in $(seq "${nj}"); do - # split_scps1+=" ${_logdir}/wav.${JOB}.scp" - # split_scps2+=" ${_logdir}/ocr.${JOB}.txt" - # done - # utils/split_scp.pl "${key_file1}" ${split_scps1} - # utils/split_scp.pl "${key_file2}" ${split_scps2} - - # gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) - # for JOB in $(seq ${nj}); do - # { - # id=$((JOB-1)) - # gpuid=${gpuid_list_array[$id]} - - # export CUDA_VISIBLE_DEVICES=${gpuid} - - # python -m funasr.bin.inference \ - # --config-path=${file_dir} \ - # --config-name="config.yaml" \ - # ++init_param=${file_dir}/model.pb \ - # ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - # ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ - # +data_type='["kaldi_ark", "text"]' \ - # ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ - # ++output_dir="${inference_dir}/${JOB}" \ - # ++device="${inference_device}" \ - # ++ncpu=1 \ - # ++disable_log=true &> ${_logdir}/log.${JOB}.txt - - # }& - # done - # wait - - - #mkdir -p ${inference_dir}/1best_recog - for f in token; do - if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then - for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/${f}" - done | sort -k1 >"${inference_dir}/1best_recog/${f}" - fi - done - - echo "Computing WER ..." - echo "Computing WER ..." 
- #python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc - - #cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref - #python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer - #tail -n 3 ${inference_dir}/1best_recog/text.cer From 0fbebf114a84fd25170b4fdf997e6dc69556f299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:39:00 +0800 Subject: [PATCH 068/101] test --- examples/industrial_data_pretraining/lcbnet/demo_nj.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 examples/industrial_data_pretraining/lcbnet/demo_nj.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh old mode 100644 new mode 100755 From e66b05020b30ccb1df04b1383ae21098591fe827 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:39:07 +0800 Subject: [PATCH 069/101] test --- .../industrial_data_pretraining/lcbnet/demo_nj.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) mode change 100755 => 100644 examples/industrial_data_pretraining/lcbnet/demo_nj.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh old mode 100755 new mode 100644 index d9f42a033..9d7755f2e --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -55,13 +55,12 @@ echo "inference_dir: ${inference_dir}" #mkdir -p ${inference_dir}/1best_recog -for f in token; do - if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then - for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/${f}" - done | sort -k1 >"${inference_dir}/1best_recog/${f}" - fi -done + +if [ -f "${inference_dir}/${JOB}/1best_recog/token" ]; then + for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" + done +fi echo "Computing WER ..." echo "Computing WER ..." 
From 61597039b4e5f4b28ff1762f67d4a79f93f9c3b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:39:56 +0800 Subject: [PATCH 070/101] test --- examples/industrial_data_pretraining/lcbnet/demo_nj.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 examples/industrial_data_pretraining/lcbnet/demo_nj.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh old mode 100644 new mode 100755 From 47823c9007c7040dd05367f1a170a7be9fef188b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:42:16 +0800 Subject: [PATCH 071/101] test --- examples/industrial_data_pretraining/lcbnet/demo_nj.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh index 9d7755f2e..c7e17594c 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -56,11 +56,9 @@ echo "inference_dir: ${inference_dir}" #mkdir -p ${inference_dir}/1best_recog -if [ -f "${inference_dir}/${JOB}/1best_recog/token" ]; then - for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" - done -fi +for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" +done echo "Computing WER ..." echo "Computing WER ..." From 7904f2782697768e0d74b04ccf214c156b101696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:46:38 +0800 Subject: [PATCH 072/101] test --- .../lcbnet/demo_nj.sh | 78 +++++++++---------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh index c7e17594c..4aae9e5ed 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -16,54 +16,52 @@ inference_dir="outputs/test" _logdir="${inference_dir}/logdir" echo "inference_dir: ${inference_dir}" -# mkdir -p "${_logdir}" -# key_file1=${file_dir}/wav.scp -# key_file2=${file_dir}/ocr.txt -# split_scps1= -# split_scps2= -# for JOB in $(seq "${nj}"); do -# split_scps1+=" ${_logdir}/wav.${JOB}.scp" -# split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -# done -# utils/split_scp.pl "${key_file1}" ${split_scps1} -# utils/split_scp.pl "${key_file2}" ${split_scps2} +mkdir -p "${_logdir}" +key_file1=${file_dir}/wav.scp +key_file2=${file_dir}/ocr.txt +split_scps1= +split_scps2= +for JOB in $(seq "${nj}"); do + split_scps1+=" ${_logdir}/wav.${JOB}.scp" + split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +done +utils/split_scp.pl "${key_file1}" ${split_scps1} +utils/split_scp.pl "${key_file2}" ${split_scps2} -# gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -# for JOB in $(seq ${nj}); do -# { -# id=$((JOB-1)) -# gpuid=${gpuid_list_array[$id]} +gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} -# export CUDA_VISIBLE_DEVICES=${gpuid} + export CUDA_VISIBLE_DEVICES=${gpuid} -# python -m funasr.bin.inference \ -# --config-path=${file_dir} \ -# --config-name="config.yaml" \ -# ++init_param=${file_dir}/model.pb \ -# ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -# 
++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ -# +data_type='["kaldi_ark", "text"]' \ -# ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ -# ++output_dir="${inference_dir}/${JOB}" \ -# ++device="${inference_device}" \ -# ++ncpu=1 \ -# ++disable_log=true &> ${_logdir}/log.${JOB}.txt + python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pb \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + +data_type='["kaldi_ark", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true &> ${_logdir}/log.${JOB}.txt -# }& -# done -# wait + }& +done +wait -#mkdir -p ${inference_dir}/1best_recog +mkdir -p ${inference_dir}/1best_recog for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" done echo "Computing WER ..." -echo "Computing WER ..." -#python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc - -#cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref -#python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer -#tail -n 3 ${inference_dir}/1best_recog/text.cer \ No newline at end of file +sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc +cp ${file_dir}/text ${inference_dir}/1best_recog/token.ref +python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer +tail -n 3 ${inference_dir}/1best_recog/token.cer \ No newline at end of file From 574155be137b7e0af4f874d4025d15c85b265e22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 29 Feb 2024 16:07:49 +0800 Subject: [PATCH 073/101] atsr --- .../lcbnet/compute_wer_details.py | 702 ++++++++++++++++++ .../lcbnet/demo.sh | 80 +- .../lcbnet/demo_nj.sh | 67 -- .../lcbnet/run_bwer_recall.sh | 11 + 4 files changed, 782 insertions(+), 78 deletions(-) create mode 100755 examples/industrial_data_pretraining/lcbnet/compute_wer_details.py delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo_nj.sh create mode 100755 examples/industrial_data_pretraining/lcbnet/run_bwer_recall.sh diff --git a/examples/industrial_data_pretraining/lcbnet/compute_wer_details.py b/examples/industrial_data_pretraining/lcbnet/compute_wer_details.py new file mode 100755 index 000000000..e72d87155 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/compute_wer_details.py @@ -0,0 +1,702 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +from enum import Enum +import re, sys, unicodedata +import codecs +import argparse +from tqdm import tqdm +import os +import pdb +remove_tag = False +spacelist = [" ", "\t", "\r", "\n"] +puncts = [ + "!", + ",", + "?", + "、", + "。", + "!", + ",", + ";", + "?", + ":", + "「", + "」", + "︰", + "『", + "』", + "《", + "》", +] + + +class Code(Enum): + match = 1 + substitution = 2 + insertion = 3 + deletion = 4 + + +class WordError(object): + def __init__(self): + self.errors = { + Code.substitution: 0, + Code.insertion: 0, + Code.deletion: 0, + } + self.ref_words = 0 + + def 
get_wer(self): + assert self.ref_words != 0 + errors = ( + self.errors[Code.substitution] + + self.errors[Code.insertion] + + self.errors[Code.deletion] + ) + return 100.0 * errors / self.ref_words + + def get_result_string(self): + return ( + f"error_rate={self.get_wer():.4f}, " + f"ref_words={self.ref_words}, " + f"subs={self.errors[Code.substitution]}, " + f"ins={self.errors[Code.insertion]}, " + f"dels={self.errors[Code.deletion]}" + ) + + +def characterize(string): + res = [] + i = 0 + while i < len(string): + char = string[i] + if char in puncts: + i += 1 + continue + cat1 = unicodedata.category(char) + # https://unicodebook.readthedocs.io/unicode.html#unicode-categories + if cat1 == "Zs" or cat1 == "Cn" or char in spacelist: # space or not assigned + i += 1 + continue + if cat1 == "Lo": # letter-other + res.append(char) + i += 1 + else: + # some input looks like: , we want to separate it to two words. + sep = " " + if char == "<": + sep = ">" + j = i + 1 + while j < len(string): + c = string[j] + if ord(c) >= 128 or (c in spacelist) or (c == sep): + break + j += 1 + if j < len(string) and string[j] == ">": + j += 1 + res.append(string[i:j]) + i = j + return res + + +def stripoff_tags(x): + if not x: + return "" + chars = [] + i = 0 + T = len(x) + while i < T: + if x[i] == "<": + while i < T and x[i] != ">": + i += 1 + i += 1 + else: + chars.append(x[i]) + i += 1 + return "".join(chars) + + +def normalize(sentence, ignore_words, cs, split=None): + """sentence, ignore_words are both in unicode""" + new_sentence = [] + for token in sentence: + x = token + if not cs: + x = x.upper() + if x in ignore_words: + continue + if remove_tag: + x = stripoff_tags(x) + if not x: + continue + if split and x in split: + new_sentence += split[x] + else: + new_sentence.append(x) + return new_sentence + + +class Calculator: + def __init__(self): + self.data = {} + self.space = [] + self.cost = {} + self.cost["cor"] = 0 + self.cost["sub"] = 1 + self.cost["del"] = 1 + self.cost["ins"] = 1 + + def calculate(self, lab, rec): + # Initialization + lab.insert(0, "") + rec.insert(0, "") + while len(self.space) < len(lab): + self.space.append([]) + for row in self.space: + for element in row: + element["dist"] = 0 + element["error"] = "non" + while len(row) < len(rec): + row.append({"dist": 0, "error": "non"}) + for i in range(len(lab)): + self.space[i][0]["dist"] = i + self.space[i][0]["error"] = "del" + for j in range(len(rec)): + self.space[0][j]["dist"] = j + self.space[0][j]["error"] = "ins" + self.space[0][0]["error"] = "non" + for token in lab: + if token not in self.data and len(token) > 0: + self.data[token] = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0} + for token in rec: + if token not in self.data and len(token) > 0: + self.data[token] = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0} + # Computing edit distance + for i, lab_token in enumerate(lab): + for j, rec_token in enumerate(rec): + if i == 0 or j == 0: + continue + min_dist = sys.maxsize + min_error = "none" + dist = self.space[i - 1][j]["dist"] + self.cost["del"] + error = "del" + if dist < min_dist: + min_dist = dist + min_error = error + dist = self.space[i][j - 1]["dist"] + self.cost["ins"] + error = "ins" + if dist < min_dist: + min_dist = dist + min_error = error + if lab_token == rec_token.replace("", ""): + dist = self.space[i - 1][j - 1]["dist"] + self.cost["cor"] + error = "cor" + else: + dist = self.space[i - 1][j - 1]["dist"] + self.cost["sub"] + error = "sub" + if dist < min_dist: + min_dist = dist + min_error = error + 
self.space[i][j]["dist"] = min_dist + self.space[i][j]["error"] = min_error + # Tracing back + result = { + "lab": [], + "rec": [], + "code": [], + "all": 0, + "cor": 0, + "sub": 0, + "ins": 0, + "del": 0, + } + i = len(lab) - 1 + j = len(rec) - 1 + while True: + if self.space[i][j]["error"] == "cor": # correct + if len(lab[i]) > 0: + self.data[lab[i]]["all"] = self.data[lab[i]]["all"] + 1 + self.data[lab[i]]["cor"] = self.data[lab[i]]["cor"] + 1 + result["all"] = result["all"] + 1 + result["cor"] = result["cor"] + 1 + result["lab"].insert(0, lab[i]) + result["rec"].insert(0, rec[j]) + result["code"].insert(0, Code.match) + i = i - 1 + j = j - 1 + elif self.space[i][j]["error"] == "sub": # substitution + if len(lab[i]) > 0: + self.data[lab[i]]["all"] = self.data[lab[i]]["all"] + 1 + self.data[lab[i]]["sub"] = self.data[lab[i]]["sub"] + 1 + result["all"] = result["all"] + 1 + result["sub"] = result["sub"] + 1 + result["lab"].insert(0, lab[i]) + result["rec"].insert(0, rec[j]) + result["code"].insert(0, Code.substitution) + i = i - 1 + j = j - 1 + elif self.space[i][j]["error"] == "del": # deletion + if len(lab[i]) > 0: + self.data[lab[i]]["all"] = self.data[lab[i]]["all"] + 1 + self.data[lab[i]]["del"] = self.data[lab[i]]["del"] + 1 + result["all"] = result["all"] + 1 + result["del"] = result["del"] + 1 + result["lab"].insert(0, lab[i]) + result["rec"].insert(0, "") + result["code"].insert(0, Code.deletion) + i = i - 1 + elif self.space[i][j]["error"] == "ins": # insertion + if len(rec[j]) > 0: + self.data[rec[j]]["ins"] = self.data[rec[j]]["ins"] + 1 + result["ins"] = result["ins"] + 1 + result["lab"].insert(0, "") + result["rec"].insert(0, rec[j]) + result["code"].insert(0, Code.insertion) + j = j - 1 + elif self.space[i][j]["error"] == "non": # starting point + break + else: # shouldn't reach here + print( + "this should not happen , i = {i} , j = {j} , error = {error}".format( + i=i, j=j, error=self.space[i][j]["error"] + ) + ) + return result + + def overall(self): + result = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0} + for token in self.data: + result["all"] = result["all"] + self.data[token]["all"] + result["cor"] = result["cor"] + self.data[token]["cor"] + result["sub"] = result["sub"] + self.data[token]["sub"] + result["ins"] = result["ins"] + self.data[token]["ins"] + result["del"] = result["del"] + self.data[token]["del"] + return result + + def cluster(self, data): + result = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0} + for token in data: + if token in self.data: + result["all"] = result["all"] + self.data[token]["all"] + result["cor"] = result["cor"] + self.data[token]["cor"] + result["sub"] = result["sub"] + self.data[token]["sub"] + result["ins"] = result["ins"] + self.data[token]["ins"] + result["del"] = result["del"] + self.data[token]["del"] + return result + + def keys(self): + return list(self.data.keys()) + + +def width(string): + return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) + + +def default_cluster(word): + unicode_names = [unicodedata.name(char) for char in word] + for i in reversed(range(len(unicode_names))): + if unicode_names[i].startswith("DIGIT"): # 1 + unicode_names[i] = "Number" # 'DIGIT' + elif unicode_names[i].startswith("CJK UNIFIED IDEOGRAPH") or unicode_names[ + i + ].startswith("CJK COMPATIBILITY IDEOGRAPH"): + # 明 / 郎 + unicode_names[i] = "Mandarin" # 'CJK IDEOGRAPH' + elif unicode_names[i].startswith("LATIN CAPITAL LETTER") or unicode_names[ + i + ].startswith("LATIN SMALL LETTER"): + # A / a + 
unicode_names[i] = "English" # 'LATIN LETTER' + elif unicode_names[i].startswith("HIRAGANA LETTER"): # は こ め + unicode_names[i] = "Japanese" # 'GANA LETTER' + elif ( + unicode_names[i].startswith("AMPERSAND") + or unicode_names[i].startswith("APOSTROPHE") + or unicode_names[i].startswith("COMMERCIAL AT") + or unicode_names[i].startswith("DEGREE CELSIUS") + or unicode_names[i].startswith("EQUALS SIGN") + or unicode_names[i].startswith("FULL STOP") + or unicode_names[i].startswith("HYPHEN-MINUS") + or unicode_names[i].startswith("LOW LINE") + or unicode_names[i].startswith("NUMBER SIGN") + or unicode_names[i].startswith("PLUS SIGN") + or unicode_names[i].startswith("SEMICOLON") + ): + # & / ' / @ / ℃ / = / . / - / _ / # / + / ; + del unicode_names[i] + else: + return "Other" + if len(unicode_names) == 0: + return "Other" + if len(unicode_names) == 1: + return unicode_names[0] + for i in range(len(unicode_names) - 1): + if unicode_names[i] != unicode_names[i + 1]: + return "Other" + return unicode_names[0] + + +def get_args(): + parser = argparse.ArgumentParser(description="wer cal") + parser.add_argument("--ref", type=str, help="Text input path") + parser.add_argument("--ref_ocr", type=str, help="Text input path") + parser.add_argument("--rec_name", type=str, action="append", default=[]) + parser.add_argument("--rec_file", type=str, action="append", default=[]) + parser.add_argument("--verbose", type=int, default=1, help="show") + parser.add_argument("--char", type=bool, default=True, help="show") + args = parser.parse_args() + return args + + +def main(args): + cluster_file = "" + ignore_words = set() + tochar = args.char + verbose = args.verbose + padding_symbol = " " + case_sensitive = False + max_words_per_line = sys.maxsize + split = None + + if not case_sensitive: + ig = set([w.upper() for w in ignore_words]) + ignore_words = ig + + default_clusters = {} + default_words = {} + ref_file = args.ref + ref_ocr = args.ref_ocr + rec_files = args.rec_file + rec_names = args.rec_name + assert len(rec_files) == len(rec_names) + + # load ocr + ref_ocr_dict = {} + with codecs.open(ref_ocr, "r", "utf-8") as fh: + for line in fh: + if "$" in line: + line = line.replace("$", " ") + if tochar: + array = characterize(line) + else: + array = line.strip().split() + if len(array) == 0: + continue + fid = array[0] + ref_ocr_dict[fid] = normalize(array[1:], ignore_words, case_sensitive, split) + + if split and not case_sensitive: + newsplit = dict() + for w in split: + words = split[w] + for i in range(len(words)): + words[i] = words[i].upper() + newsplit[w.upper()] = words + split = newsplit + + rec_sets = {} + calculators_dict = dict() + ub_wer_dict = dict() + hotwords_related_dict = dict() # 记录recall相关的内容 + for i, hyp_file in enumerate(rec_files): + rec_sets[rec_names[i]] = dict() + with codecs.open(hyp_file, "r", "utf-8") as fh: + for line in fh: + if tochar: + array = characterize(line) + else: + array = line.strip().split() + if len(array) == 0: + continue + fid = array[0] + rec_sets[rec_names[i]][fid] = normalize(array[1:], ignore_words, case_sensitive, split) + + calculators_dict[rec_names[i]] = Calculator() + ub_wer_dict[rec_names[i]] = {"u_wer": WordError(), "b_wer": WordError(), "wer": WordError()} + hotwords_related_dict[rec_names[i]] = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0} + # tp: 热词在label里,同时在rec里 + # tn: 热词不在label里,同时不在rec里 + # fp: 热词不在label里,但是在rec里 + # fn: 热词在label里,但是不在rec里 + + # record wrong label but in ocr + wrong_rec_but_in_ocr_dict = {} + for rec_name in rec_names: + 
wrong_rec_but_in_ocr_dict[rec_name] = 0 + + _file_total_len = 0 + with os.popen("cat {} | wc -l".format(ref_file)) as pipe: + _file_total_len = int(pipe.read().strip()) + + # compute error rate on the interaction of reference file and hyp file + for line in tqdm(open(ref_file, 'r', encoding='utf-8'), total=_file_total_len): + if tochar: + array = characterize(line) + else: + array = line.rstrip('\n').split() + if len(array) == 0: continue + fid = array[0] + lab = normalize(array[1:], ignore_words, case_sensitive, split) + + if verbose: + print('\nutt: %s' % fid) + + ocr_text = ref_ocr_dict[fid] + ocr_set = set(ocr_text) + print('ocr: {}'.format(" ".join(ocr_text))) + list_match = [] # 指label里面在ocr里面的内容 + list_not_mathch = [] + tmp_error = 0 + tmp_match = 0 + for index in range(len(lab)): + # text_list.append(uttlist[index+1]) + if lab[index] not in ocr_set: + tmp_error += 1 + list_not_mathch.append(lab[index]) + else: + tmp_match += 1 + list_match.append(lab[index]) + print('label in ocr: {}'.format(" ".join(list_match))) + + # for each reco file + base_wrong_ocr_wer = None + ocr_wrong_ocr_wer = None + + for rec_name in rec_names: + rec_set = rec_sets[rec_name] + if fid not in rec_set: + continue + rec = rec_set[fid] + + # print(rec) + for word in rec + lab: + if word not in default_words: + default_cluster_name = default_cluster(word) + if default_cluster_name not in default_clusters: + default_clusters[default_cluster_name] = {} + if word not in default_clusters[default_cluster_name]: + default_clusters[default_cluster_name][word] = 1 + default_words[word] = default_cluster_name + + result = calculators_dict[rec_name].calculate(lab.copy(), rec.copy()) + if verbose: + if result['all'] != 0: + wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] + else: + wer = 0.0 + print('WER(%s): %4.2f %%' % (rec_name, wer), end=' ') + print('N=%d C=%d S=%d D=%d I=%d' % + (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) + + + # print(result['rec']) + wrong_rec_but_in_ocr = [] + for idx in range(len(result['lab'])): + if result['lab'][idx] != "": + if result['lab'][idx] != result['rec'][idx].replace("", ""): + if result['lab'][idx] in list_match: + wrong_rec_but_in_ocr.append(result['lab'][idx]) + wrong_rec_but_in_ocr_dict[rec_name] += 1 + print('wrong_rec_but_in_ocr: {}'.format(" ".join(wrong_rec_but_in_ocr))) + + if rec_name == "base": + base_wrong_ocr_wer = len(wrong_rec_but_in_ocr) + if "ocr" in rec_name or "hot" in rec_name: + ocr_wrong_ocr_wer = len(wrong_rec_but_in_ocr) + if ocr_wrong_ocr_wer < base_wrong_ocr_wer: + print("{} {} helps, {} -> {}".format(fid, rec_name, base_wrong_ocr_wer, ocr_wrong_ocr_wer)) + elif ocr_wrong_ocr_wer > base_wrong_ocr_wer: + print("{} {} hurts, {} -> {}".format(fid, rec_name, base_wrong_ocr_wer, ocr_wrong_ocr_wer)) + + # recall = 0 + # false_alarm = 0 + # for idx in range(len(result['lab'])): + # if "" in result['rec'][idx]: + # if result['rec'][idx].replace("", "") in list_match: + # recall += 1 + # else: + # false_alarm += 1 + # print("bias hotwords recall: {}, fa: {}, list_match {}, recall: {:.2f}, fa: {:.2f}".format( + # recall, false_alarm, len(list_match), recall / len(list_match) if len(list_match) != 0 else 0, false_alarm / len(list_match) if len(list_match) != 0 else 0 + # )) + # tp: 热词在label里,同时在rec里 + # tn: 热词不在label里,同时不在rec里 + # fp: 热词不在label里,但是在rec里 + # fn: 热词在label里,但是不在rec里 + _rec_list = [word.replace("", "") for word in rec] + _label_list = [word for word in lab] + _tp = _tn = _fp = _fn = 0 + 
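+            # Hotword bookkeeping for this utterance: for every OCR keyword,
+            #   tp = occurrences found in both reference and hypothesis,
+            #   tn = keyword absent from both,
+            #   fp = extra occurrences that only appear in the hypothesis,
+            #   fn = reference occurrences missing from the hypothesis.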
hot_true_list = [hotword for hotword in ocr_text if hotword in _label_list] + hot_bad_list = [hotword for hotword in ocr_text if hotword not in _label_list] + for badhotword in hot_bad_list: + count = len([word for word in _rec_list if word == badhotword]) + # print(f"bad {badhotword} count: {count}") + # for word in _rec_list: + # if badhotword == word: + # count += 1 + if count == 0: + hotwords_related_dict[rec_name]['tn'] += 1 + _tn += 1 + # fp: 0 + else: + hotwords_related_dict[rec_name]['fp'] += count + _fp += count + # tn: 0 + # if badhotword in _rec_list: + # hotwords_related_dict[rec_name]['fp'] += 1 + # else: + # hotwords_related_dict[rec_name]['tn'] += 1 + for hotword in hot_true_list: + true_count = len([word for word in _label_list if hotword == word]) + rec_count = len([word for word in _rec_list if hotword == word]) + # print(f"good {hotword} true_count: {true_count}, rec_count: {rec_count}") + if rec_count == true_count: + hotwords_related_dict[rec_name]['tp'] += true_count + _tp += true_count + elif rec_count > true_count: + hotwords_related_dict[rec_name]['tp'] += true_count + # fp: 不在label里,但是在rec里 + hotwords_related_dict[rec_name]['fp'] += rec_count - true_count + _tp += true_count + _fp += rec_count - true_count + else: + hotwords_related_dict[rec_name]['tp'] += rec_count + # fn: 热词在label里,但是不在rec里 + hotwords_related_dict[rec_name]['fn'] += true_count - rec_count + _tp += rec_count + _fn += true_count - rec_count + print("hotword: tp: {}, tn: {}, fp: {}, fn: {}, all: {}, recall: {:.2f}%".format( + _tp, _tn, _fp, _fn, sum([_tp, _tn, _fp, _fn]), _tp / (_tp + _fn) * 100 if (_tp + _fn) != 0 else 0 + )) + + # if hotword in _rec_list: + # hotwords_related_dict[rec_name]['tp'] += 1 + # else: + # hotwords_related_dict[rec_name]['fn'] += 1 + # 计算uwer, bwer, wer + for code, rec_word, lab_word in zip(result["code"], result["rec"], result["lab"]): + if code == Code.match: + ub_wer_dict[rec_name]["wer"].ref_words += 1 + if lab_word in hot_true_list: + # tmp_ref.append(ref_tokens[ref_idx]) + ub_wer_dict[rec_name]["b_wer"].ref_words += 1 + else: + ub_wer_dict[rec_name]["u_wer"].ref_words += 1 + elif code == Code.substitution: + ub_wer_dict[rec_name]["wer"].ref_words += 1 + ub_wer_dict[rec_name]["wer"].errors[Code.substitution] += 1 + if lab_word in hot_true_list: + # tmp_ref.append(ref_tokens[ref_idx]) + ub_wer_dict[rec_name]["b_wer"].ref_words += 1 + ub_wer_dict[rec_name]["b_wer"].errors[Code.substitution] += 1 + else: + ub_wer_dict[rec_name]["u_wer"].ref_words += 1 + ub_wer_dict[rec_name]["u_wer"].errors[Code.substitution] += 1 + elif code == Code.deletion: + ub_wer_dict[rec_name]["wer"].ref_words += 1 + ub_wer_dict[rec_name]["wer"].errors[Code.deletion] += 1 + if lab_word in hot_true_list: + # tmp_ref.append(ref_tokens[ref_idx]) + ub_wer_dict[rec_name]["b_wer"].ref_words += 1 + ub_wer_dict[rec_name]["b_wer"].errors[Code.deletion] += 1 + else: + ub_wer_dict[rec_name]["u_wer"].ref_words += 1 + ub_wer_dict[rec_name]["u_wer"].errors[Code.deletion] += 1 + elif code == Code.insertion: + ub_wer_dict[rec_name]["wer"].errors[Code.insertion] += 1 + if rec_word in hot_true_list: + ub_wer_dict[rec_name]["b_wer"].errors[Code.insertion] += 1 + else: + ub_wer_dict[rec_name]["u_wer"].errors[Code.insertion] += 1 + + space = {} + space['lab'] = [] + space['rec'] = [] + for idx in range(len(result['lab'])): + len_lab = width(result['lab'][idx]) + len_rec = width(result['rec'][idx]) + length = max(len_lab, len_rec) + space['lab'].append(length - len_lab) + space['rec'].append(length - len_rec) + 
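+            # 'space' holds per-token padding so reference and hypothesis tokens
+            # print in aligned columns below; width() counts full-width (CJK)
+            # characters as two cells.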
upper_lab = len(result['lab']) + upper_rec = len(result['rec']) + lab1, rec1 = 0, 0 + while lab1 < upper_lab or rec1 < upper_rec: + if verbose > 1: + print('lab(%s):' % fid.encode('utf-8'), end=' ') + else: + print('lab:', end=' ') + lab2 = min(upper_lab, lab1 + max_words_per_line) + for idx in range(lab1, lab2): + token = result['lab'][idx] + print('{token}'.format(token=token), end='') + for n in range(space['lab'][idx]): + print(padding_symbol, end='') + print(' ', end='') + print() + if verbose > 1: + print('rec(%s):' % fid.encode('utf-8'), end=' ') + else: + print('rec:', end=' ') + + rec2 = min(upper_rec, rec1 + max_words_per_line) + for idx in range(rec1, rec2): + token = result['rec'][idx] + print('{token}'.format(token=token), end='') + for n in range(space['rec'][idx]): + print(padding_symbol, end='') + print(' ', end='') + print() + # print('\n', end='\n') + lab1 = lab2 + rec1 = rec2 + print('\n', end='\n') + # break + if verbose: + print('===========================================================================') + print() + + print(wrong_rec_but_in_ocr_dict) + for rec_name in rec_names: + result = calculators_dict[rec_name].overall() + + if result['all'] != 0: + wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] + else: + wer = 0.0 + print('{} Overall -> {:4.2f} %'.format(rec_name, wer), end=' ') + print('N=%d C=%d S=%d D=%d I=%d' % + (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) + print(f"WER: {ub_wer_dict[rec_name]['wer'].get_result_string()}") + print(f"U-WER: {ub_wer_dict[rec_name]['u_wer'].get_result_string()}") + print(f"B-WER: {ub_wer_dict[rec_name]['b_wer'].get_result_string()}") + + print('hotword: tp: {}, tn: {}, fp: {}, fn: {}, all: {}, recall: {:.2f}%'.format( + hotwords_related_dict[rec_name]['tp'], + hotwords_related_dict[rec_name]['tn'], + hotwords_related_dict[rec_name]['fp'], + hotwords_related_dict[rec_name]['fn'], + sum([v for k, v in hotwords_related_dict[rec_name].items()]), + hotwords_related_dict[rec_name]['tp'] / ( + hotwords_related_dict[rec_name]['tp'] + hotwords_related_dict[rec_name]['fn'] + ) * 100 if hotwords_related_dict[rec_name]['tp'] + hotwords_related_dict[rec_name]['fn'] != 0 else 0 + )) + + # tp: 热词在label里,同时在rec里 + # tn: 热词不在label里,同时不在rec里 + # fp: 热词不在label里,但是在rec里 + # fn: 热词在label里,但是不在rec里 + if not verbose: + print() + print() + + +if __name__ == "__main__": + args = get_args() + + # print("") + print(args) + main(args) + diff --git a/examples/industrial_data_pretraining/lcbnet/demo.sh b/examples/industrial_data_pretraining/lcbnet/demo.sh index 9515f985d..f90b8e24b 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo.sh @@ -1,13 +1,71 @@ file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +inference_device="cuda" -#CUDA_VISIBLE_DEVICES="" \ -python -m funasr.bin.inference \ ---config-path=${file_dir} \ ---config-name="config.yaml" \ -++init_param=${file_dir}/model.pb \ -++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -++input=[${file_dir}/wav.scp,${file_dir}/ocr.txt] \ -+data_type='["kaldi_ark", "text"]' \ -++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ -++output_dir="./outputs/debug" \ -++device="cpu" \ +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +else + inference_batch_size=1 + 
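+  # CPU fallback: CUDA_VISIBLE_DEVICES is rebuilt as "-1,..." (one entry per job);
+  # nj is assumed to be set beforehand when inference_device is not "cuda".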
CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done +fi + +inference_dir="outputs/slidespeech_dev_beamsearch" +_logdir="${inference_dir}/logdir" +echo "inference_dir: ${inference_dir}" + +mkdir -p "${_logdir}" +key_file1=${file_dir}/dev/wav.scp +key_file2=${file_dir}/dev/ocr.txt +split_scps1= +split_scps2= +for JOB in $(seq "${nj}"); do + split_scps1+=" ${_logdir}/wav.${JOB}.scp" + split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +done +utils/split_scp.pl "${key_file1}" ${split_scps1} +utils/split_scp.pl "${key_file2}" ${split_scps2} + +gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} + + export CUDA_VISIBLE_DEVICES=${gpuid} + + python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pb \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + +data_type='["kaldi_ark", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true &> ${_logdir}/log.${JOB}.txt + + }& +done +wait + + +mkdir -p ${inference_dir}/1best_recog + +for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" +done + +echo "Computing WER ..." +sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc +cp ${file_dir}/dev/text ${inference_dir}/1best_recog/token.ref +cp ${file_dir}/dev/ocr.list ${inference_dir}/1best_recog/ocr.list +python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer +tail -n 3 ${inference_dir}/1best_recog/token.cer + +./run_bwer_recall.sh ${inference_dir}/1best_recog/ +tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh deleted file mode 100755 index 4aae9e5ed..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ /dev/null @@ -1,67 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -inference_device="cuda" - -if [ ${inference_device} == "cuda" ]; then - nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -else - inference_batch_size=1 - CUDA_VISIBLE_DEVICES="" - for JOB in $(seq ${nj}); do - CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," - done -fi - -inference_dir="outputs/test" -_logdir="${inference_dir}/logdir" -echo "inference_dir: ${inference_dir}" - -mkdir -p "${_logdir}" -key_file1=${file_dir}/wav.scp -key_file2=${file_dir}/ocr.txt -split_scps1= -split_scps2= -for JOB in $(seq "${nj}"); do - split_scps1+=" ${_logdir}/wav.${JOB}.scp" - split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -done -utils/split_scp.pl "${key_file1}" ${split_scps1} -utils/split_scp.pl "${key_file2}" ${split_scps2} - -gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -for JOB in $(seq ${nj}); do - { - id=$((JOB-1)) - gpuid=${gpuid_list_array[$id]} - - export CUDA_VISIBLE_DEVICES=${gpuid} - - python -m funasr.bin.inference \ - --config-path=${file_dir} \ - --config-name="config.yaml" \ - 
++init_param=${file_dir}/model.pb \ - ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ - +data_type='["kaldi_ark", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ - ++output_dir="${inference_dir}/${JOB}" \ - ++device="${inference_device}" \ - ++ncpu=1 \ - ++disable_log=true &> ${_logdir}/log.${JOB}.txt - - }& -done -wait - - -mkdir -p ${inference_dir}/1best_recog - -for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" -done - -echo "Computing WER ..." -sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc -cp ${file_dir}/text ${inference_dir}/1best_recog/token.ref -python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer -tail -n 3 ${inference_dir}/1best_recog/token.cer \ No newline at end of file diff --git a/examples/industrial_data_pretraining/lcbnet/run_bwer_recall.sh b/examples/industrial_data_pretraining/lcbnet/run_bwer_recall.sh new file mode 100755 index 000000000..7d6b6ff8b --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/run_bwer_recall.sh @@ -0,0 +1,11 @@ +#now_result_name=asr_conformer_acc1_lr002_warm20000/decode_asr_asr_model_valid.acc.ave +#hotword_type=ocr_1ngram_top10_hotwords_list +hot_exp_suf=$1 + + +python compute_wer_details.py --v 1 \ + --ref ${hot_exp_suf}/token.ref \ + --ref_ocr ${hot_exp_suf}/ocr.list \ + --rec_name base \ + --rec_file ${hot_exp_suf}/token.proc \ + > ${hot_exp_suf}/BWER-UWER.results From d12d18886cf5d1c7daaf74bd348cf3ca7b2c8b7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 29 Feb 2024 16:20:18 +0800 Subject: [PATCH 074/101] test --- .../industrial_data_pretraining/lcbnet/demo_pdb.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 examples/industrial_data_pretraining/lcbnet/demo_pdb.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh b/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh new file mode 100644 index 000000000..e435905bf --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh @@ -0,0 +1,13 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" + +#CUDA_VISIBLE_DEVICES="" \ +python -m funasr.bin.inference \ +--config-path=${file_dir} \ +--config-name="config.yaml" \ +++init_param=${file_dir}/model.pb \ +++tokenizer_conf.token_list=${file_dir}/tokens.txt \ +++input=[${file_dir}/dev/wav.scp,${file_dir}/dev/ocr.txt] \ ++data_type='["kaldi_ark", "text"]' \ +++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ +++output_dir="./outputs/debug" \ +++device="cpu" \ From 4477e27bf08f065dacd37c82fc88e69f43805328 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 29 Feb 2024 16:26:54 +0800 Subject: [PATCH 075/101] test --- examples/industrial_data_pretraining/lcbnet/demo.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.sh b/examples/industrial_data_pretraining/lcbnet/demo.sh index f90b8e24b..825289188 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo.sh @@ -12,7 +12,7 @@ else done fi -inference_dir="outputs/slidespeech_dev_beamsearch" 
+inference_dir="outputs/slidespeech_dev_beamsearch_new" _logdir="${inference_dir}/logdir" echo "inference_dir: ${inference_dir}" From 650c506cda3f6d38ad4805f02fe2700d2287400d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 29 Feb 2024 18:57:17 +0800 Subject: [PATCH 076/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo_pdb.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 examples/industrial_data_pretraining/lcbnet/demo_pdb.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh b/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh old mode 100644 new mode 100755 From 84ad3e48a0a1c29967a4cf9195ad202d434c7860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 29 Feb 2024 18:58:08 +0800 Subject: [PATCH 077/101] atsr --- funasr/utils/load_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 87412bd87..bd8de3e87 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -34,6 +34,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) + pdb.set_trace() if kwargs.get("reduce_channels", True): data_or_path_or_list = data_or_path_or_list.mean(0) elif data_type == "text" and tokenizer is not None: From 96eaabca5b2e9c93b40c9840e2ae0003a618bb6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 29 Feb 2024 19:02:43 +0800 Subject: [PATCH 078/101] atsr --- funasr/utils/load_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index bd8de3e87..87412bd87 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -34,7 +34,6 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) - pdb.set_trace() if kwargs.get("reduce_channels", True): data_or_path_or_list = data_or_path_or_list.mean(0) elif data_type == "text" and tokenizer is not None: From 8f63be3af7264f3b0831d91e2e54800fcd246120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 10:16:03 +0800 Subject: [PATCH 079/101] atsr --- funasr/auto/auto_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 3f99e4d17..89e38eafc 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -41,6 +41,7 @@ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): chars = string.ascii_letters + string.digits if isinstance(data_in, str) and data_in.startswith('http'): # url data_in = download_from_url(data_in) + pdb.set_trace() if isinstance(data_in, str) and os.path.exists(data_in): # wav_path; filelist: wav.scp, file.jsonl;text.txt; _, file_extension = os.path.splitext(data_in) file_extension = file_extension.lower() From ec98a8e13859e4c3f1e55fca5f09e91be1b3d810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 10:26:23 +0800 Subject: [PATCH 080/101] atsr --- funasr/auto/auto_model.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 89e38eafc..ef261dfcc 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -213,7 +213,7 @@ class AutoModel: batch_size = kwargs.get("batch_size", 1) # if kwargs.get("device", "cpu") == "cpu": # batch_size = 1 - + pdb.set_trace() key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key) speed_stats = {} From fae6fd6d16e0fb060aa063790893f7555c421c4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 10:29:15 +0800 Subject: [PATCH 081/101] atsr --- funasr/bin/inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/funasr/bin/inference.py b/funasr/bin/inference.py index d2f0c149d..3943d7ed4 100644 --- a/funasr/bin/inference.py +++ b/funasr/bin/inference.py @@ -1,7 +1,7 @@ import hydra import logging from omegaconf import DictConfig, OmegaConf, ListConfig - +import pdb from funasr.auto.auto_model import AutoModel @@ -23,6 +23,7 @@ def main_hydra(cfg: DictConfig): if kwargs.get("debug", False): import pdb; pdb.set_trace() model = AutoModel(**kwargs) + pdb.set_trace() res = model.generate(input=kwargs["input"]) print(res) From 21b49bd56f44ea5921bacd8c0a3c5e35680cb405 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 10:36:27 +0800 Subject: [PATCH 082/101] atsr --- funasr/auto/auto_model.py | 2 +- funasr/bin/inference.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index ef261dfcc..56dd5b5e4 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -213,7 +213,7 @@ class AutoModel: batch_size = kwargs.get("batch_size", 1) # if kwargs.get("device", "cpu") == "cpu": # batch_size = 1 - pdb.set_trace() + key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key) speed_stats = {} diff --git a/funasr/bin/inference.py b/funasr/bin/inference.py index 3943d7ed4..d2f0c149d 100644 --- a/funasr/bin/inference.py +++ b/funasr/bin/inference.py @@ -1,7 +1,7 @@ import hydra import logging from omegaconf import DictConfig, OmegaConf, ListConfig -import pdb + from funasr.auto.auto_model import AutoModel @@ -23,7 +23,6 @@ def main_hydra(cfg: DictConfig): if kwargs.get("debug", False): import pdb; pdb.set_trace() model = AutoModel(**kwargs) - pdb.set_trace() res = model.generate(input=kwargs["input"]) print(res) From d4955ba39594dfec455c3045807927809974507f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 10:42:37 +0800 Subject: [PATCH 083/101] atsr --- funasr/utils/load_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 87412bd87..ccb5670c2 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -31,6 +31,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) + pdb.set_trace() if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": 
data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) @@ -66,7 +67,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: else: pass # print(f"unsupport data type: {data_or_path_or_list}, return raw data") - + pdb.set_trace() if audio_fs != fs and data_type != "text": resampler = torchaudio.transforms.Resample(audio_fs, fs) data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :] From 6d7b9457103264b760f79918aa13ec1b89474670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 11:10:44 +0800 Subject: [PATCH 084/101] atsr --- funasr/auto/auto_model.py | 2 +- funasr/models/lcbnet/model.py | 3 ++- funasr/utils/load_utils.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 56dd5b5e4..9bb9ce07d 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -41,7 +41,7 @@ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): chars = string.ascii_letters + string.digits if isinstance(data_in, str) and data_in.startswith('http'): # url data_in = download_from_url(data_in) - pdb.set_trace() + if isinstance(data_in, str) and os.path.exists(data_in): # wav_path; filelist: wav.scp, file.jsonl;text.txt; _, file_extension = os.path.splitext(data_in) file_extension = file_extension.lower() diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index ab557e6d8..09e6dd137 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -426,6 +426,7 @@ class LCBNet(nn.Module): tokenizer=tokenizer) time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" + pdb.set_trace() audio_sample_list = sample_list[0] ocr_sample_list = sample_list[1] speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), @@ -441,7 +442,7 @@ class LCBNet(nn.Module): encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - + pdb.set_trace() ocr_list_new = [[x + 1 if x != 0 else x for x in sublist] for sublist in ocr_sample_list] ocr = torch.tensor(ocr_list_new).to(device=kwargs["device"]) ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)).to(device=kwargs["device"]) diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index ccb5670c2..644af2324 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -31,7 +31,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) - pdb.set_trace() + if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) @@ -67,7 +67,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: else: pass # print(f"unsupport data type: {data_or_path_or_list}, return raw data") - pdb.set_trace() + if audio_fs != fs and data_type != "text": resampler = torchaudio.transforms.Resample(audio_fs, fs) data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :] 
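
The hunk above leaves the resampling path of `funasr/utils/load_utils.py` unchanged: any non-text input whose sample rate differs from the 16 kHz expected by the frontend is passed through `torchaudio.transforms.Resample`. A minimal standalone sketch of that pattern, with a hypothetical input file name (`example_8k.wav`) used purely for illustration:

```python
import torchaudio

# Load an arbitrary-rate waveform and bring it to the 16 kHz the model expects.
waveform, audio_fs = torchaudio.load("example_8k.wav")  # hypothetical input file
target_fs = 16000
waveform = waveform.mean(0)  # collapse channels, as load_utils does by default
if audio_fs != target_fs:
    resampler = torchaudio.transforms.Resample(audio_fs, target_fs)
    waveform = resampler(waveform[None, :])[0, :]
```
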
From 583f918e0ec752518a0263b09ce9b9e55f047fa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 11:18:29 +0800 Subject: [PATCH 085/101] atsr --- funasr/models/lcbnet/model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 09e6dd137..e83f8d783 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -428,7 +428,10 @@ class LCBNet(nn.Module): meta_data["load_data"] = f"{time2 - time1:0.3f}" pdb.set_trace() audio_sample_list = sample_list[0] - ocr_sample_list = sample_list[1] + if len(sample_list) >1: + ocr_sample_list = sample_list[1] + else: + ocr_sample_list = [294, 0] speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() From de21c10a2e8a7f93719902708a5ca7970d9e3f00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 11:30:49 +0800 Subject: [PATCH 086/101] atsr --- funasr/models/lcbnet/model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index e83f8d783..15e2fa1c4 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -426,12 +426,11 @@ class LCBNet(nn.Module): tokenizer=tokenizer) time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" - pdb.set_trace() audio_sample_list = sample_list[0] if len(sample_list) >1: ocr_sample_list = sample_list[1] else: - ocr_sample_list = [294, 0] + ocr_sample_list = [[294, 0]] speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() From 52cec216726806c1a1223a305ae6c3f87bce8558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 11:33:31 +0800 Subject: [PATCH 087/101] atsr --- funasr/models/lcbnet/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 15e2fa1c4..3ac319c61 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -444,7 +444,7 @@ class LCBNet(nn.Module): encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - pdb.set_trace() + ocr_list_new = [[x + 1 if x != 0 else x for x in sublist] for sublist in ocr_sample_list] ocr = torch.tensor(ocr_list_new).to(device=kwargs["device"]) ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)).to(device=kwargs["device"]) From 1d7ba1be1ad824135698e8000386c1fd55268ae4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 13:52:21 +0800 Subject: [PATCH 088/101] atsr --- .gitignore | 1 + .../lcbnet/demo2.sh | 71 +++++++++++++++++++ .../lcbnet/demo2_tmp.sh | 71 +++++++++++++++++++ .../lcbnet/demo_pdb.sh | 9 ++- .../lcbnet/demo_pdb2.sh | 15 ++++ .../lcbnet/demo_tmp1.sh | 71 +++++++++++++++++++ 6 files changed, 236 insertions(+), 2 deletions(-) create mode 100755 examples/industrial_data_pretraining/lcbnet/demo2.sh create mode 100755 examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh create mode 100755 examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh create mode 100755 examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh diff --git a/.gitignore b/.gitignore index bdfe70f1a..d2b4c53b9 100644 --- a/.gitignore +++ b/.gitignore @@ -25,4 
+25,5 @@ outputs* emotion2vec* GPT-SoVITS* examples/*/*/outputs +examples/*/*/exp cmd_read diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh new file mode 100755 index 000000000..69df6d16c --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -0,0 +1,71 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +inference_device="cuda" +test_set="dev_wav" +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done +fi + +inference_dir="outputs/slidespeech_dev_beamsearch_wav" +_logdir="${inference_dir}/logdir" +echo "inference_dir: ${inference_dir}" + +mkdir -p "${_logdir}" +key_file1=${file_dir}/${test_set}/wav.scp +key_file2=${file_dir}/${test_set}/ocr.txt +split_scps1= +split_scps2= +for JOB in $(seq "${nj}"); do + split_scps1+=" ${_logdir}/wav.${JOB}.scp" + split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +done +utils/split_scp.pl "${key_file1}" ${split_scps1} +utils/split_scp.pl "${key_file2}" ${split_scps2} + +gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} + + export CUDA_VISIBLE_DEVICES=${gpuid} + + python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pb \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + +data_type='["sound", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true &> ${_logdir}/log.${JOB}.txt + + }& +done +wait + + +mkdir -p ${inference_dir}/1best_recog + +for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" +done + +echo "Computing WER ..." 
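+# Re-join BPE pieces for scoring: turn the first space into a tab (utt-id / text
+# separator), drop the remaining spaces, then map the "▁" marker back to a space.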
+sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc +cp ${file_dir}/${test_set}/text ${inference_dir}/1best_recog/token.ref +cp ${file_dir}/${test_set}/ocr.list ${inference_dir}/1best_recog/ocr.list +python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer +tail -n 3 ${inference_dir}/1best_recog/token.cer + +./run_bwer_recall.sh ${inference_dir}/1best_recog/ +tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 diff --git a/examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh b/examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh new file mode 100755 index 000000000..da6ad686d --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh @@ -0,0 +1,71 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +inference_device="cuda" +test_set="test_wav" +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done +fi + +inference_dir="outputs/slidespeech_test_beamsearch_wav" +_logdir="${inference_dir}/logdir" +echo "inference_dir: ${inference_dir}" + +mkdir -p "${_logdir}" +key_file1=${file_dir}/${test_set}/wav.scp +key_file2=${file_dir}/${test_set}/ocr.txt +split_scps1= +split_scps2= +for JOB in $(seq "${nj}"); do + split_scps1+=" ${_logdir}/wav.${JOB}.scp" + split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +done +utils/split_scp.pl "${key_file1}" ${split_scps1} +utils/split_scp.pl "${key_file2}" ${split_scps2} + +gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} + + export CUDA_VISIBLE_DEVICES=${gpuid} + + python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pb \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + +data_type='["sound", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true &> ${_logdir}/log.${JOB}.txt + + }& +done +wait + + +mkdir -p ${inference_dir}/1best_recog + +for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" +done + +echo "Computing WER ..." 
+sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc +cp ${file_dir}/${test_set}/text ${inference_dir}/1best_recog/token.ref +cp ${file_dir}/${test_set}/ocr.list ${inference_dir}/1best_recog/ocr.list +python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer +tail -n 3 ${inference_dir}/1best_recog/token.cer + +./run_bwer_recall.sh ${inference_dir}/1best_recog/ +tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 diff --git a/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh b/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh index e435905bf..0747a8d7b 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh @@ -6,8 +6,13 @@ python -m funasr.bin.inference \ --config-name="config.yaml" \ ++init_param=${file_dir}/model.pb \ ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -++input=[${file_dir}/dev/wav.scp,${file_dir}/dev/ocr.txt] \ -+data_type='["kaldi_ark", "text"]' \ ++input=["${file_dir}/example/asr_example.wav","${file_dir}/example/ocr.txt"] \ ++data_type='["sound","text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ ++output_dir="./outputs/debug" \ ++device="cpu" \ + +#++input=["/nfs/yufan.yf/workspace/espnet/egs2/youtube_ppt/asr/dump/raw/dev_oracle_v1_new/data/format.1/YTB+--tMoLpQI-w+00322.wav"] \ +#+data_type='["sound"]' \ +#++input=["/nfs/yufan.yf/workspace/espnet/egs2/youtube_ppt/asr/dump/raw/dev_oracle_v1_new/data/format.1/YTB+--tMoLpQI-w+00322.wav","/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch/example/ocr2.txt"] \ +#+data_type='["sound","text"]' \ diff --git a/examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh b/examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh new file mode 100755 index 000000000..557e9b2d8 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh @@ -0,0 +1,15 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" + +#CUDA_VISIBLE_DEVICES="" \ +python -m funasr.bin.inference \ +--config-path=${file_dir} \ +--config-name="config.yaml" \ +++init_param=${file_dir}/model.pb \ +++tokenizer_conf.token_list=${file_dir}/tokens.txt \ +++input=[${file_dir}/dev_wav/wav.scp,${file_dir}/dev_wav/ocr.txt] \ ++data_type='["sound", "text"]' \ +++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ +++output_dir="./outputs/debug" \ +++device="cpu" \ + +#++input=[${file_dir}/dev_wav/wav.scp,${file_dir}/dev_wav/ocr.txt] \ diff --git a/examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh b/examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh new file mode 100755 index 000000000..488f7d2a1 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh @@ -0,0 +1,71 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +inference_device="cuda" + +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done +fi + 
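+# One decoding job is launched per GPU id in CUDA_VISIBLE_DEVICES; the per-job
+# outputs are concatenated after 'wait' and then scored.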
+inference_dir="outputs/slidespeech_test_beamsearch_new" +_logdir="${inference_dir}/logdir" +echo "inference_dir: ${inference_dir}" + +mkdir -p "${_logdir}" +key_file1=${file_dir}/test/wav.scp +key_file2=${file_dir}/test/ocr.txt +split_scps1= +split_scps2= +for JOB in $(seq "${nj}"); do + split_scps1+=" ${_logdir}/wav.${JOB}.scp" + split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +done +utils/split_scp.pl "${key_file1}" ${split_scps1} +utils/split_scp.pl "${key_file2}" ${split_scps2} + +gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} + + export CUDA_VISIBLE_DEVICES=${gpuid} + + python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pb \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + +data_type='["kaldi_ark", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true &> ${_logdir}/log.${JOB}.txt + + }& +done +wait + + +mkdir -p ${inference_dir}/1best_recog + +for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" +done + +echo "Computing WER ..." +sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc +cp ${file_dir}/test/text ${inference_dir}/1best_recog/token.ref +cp ${file_dir}/test/ocr.list ${inference_dir}/1best_recog/ocr.list +python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer +tail -n 3 ${inference_dir}/1best_recog/token.cer + +./run_bwer_recall.sh ${inference_dir}/1best_recog/ +tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 From aa45aeeaa7e8abd11bc1be392b4547685645ca5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 14:29:52 +0800 Subject: [PATCH 089/101] atsr --- .../lcbnet/demp.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 examples/industrial_data_pretraining/lcbnet/demp.py diff --git a/examples/industrial_data_pretraining/lcbnet/demp.py b/examples/industrial_data_pretraining/lcbnet/demp.py new file mode 100644 index 000000000..cb08290b4 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demp.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. 
+# MIT License (https://opensource.org/licenses/MIT) + +from funasr import AutoModel + +model = AutoModel(model="iic/LCB-NET" + ) + + +# example1 +#res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", + ) +#print(res) + + +''' +# tensor or numpy as input +# example2 +import torchaudio +import os +wav_file = os.path.join(model.model_path, "example/asr_example.wav") +input_tensor, sample_rate = torchaudio.load(wav_file) +input_tensor = input_tensor.mean(0) +res = model.generate(input=[input_tensor], batch_size_s=300, is_final=True) + + +# example3 +import soundfile + +wav_file = os.path.join(model.model_path, "example/asr_example.wav") +speech, sample_rate = soundfile.read(wav_file) +res = model.generate(input=[speech], batch_size_s=300, is_final=True) +''' From 4e881fc2be339718b771b7469d9b83be89943fa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 14:32:10 +0800 Subject: [PATCH 090/101] atsr --- .../industrial_data_pretraining/lcbnet/{demp.py => demo.py} | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) rename examples/industrial_data_pretraining/lcbnet/{demp.py => demo.py} (91%) mode change 100644 => 100755 diff --git a/examples/industrial_data_pretraining/lcbnet/demp.py b/examples/industrial_data_pretraining/lcbnet/demo.py old mode 100644 new mode 100755 similarity index 91% rename from examples/industrial_data_pretraining/lcbnet/demp.py rename to examples/industrial_data_pretraining/lcbnet/demo.py index cb08290b4..b9d70c779 --- a/examples/industrial_data_pretraining/lcbnet/demp.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -5,13 +5,12 @@ from funasr import AutoModel -model = AutoModel(model="iic/LCB-NET" - ) +model = AutoModel(model="iic/LCB-NET") # example1 #res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", - ) +# ) #print(res) From 8cca996c7003a5756d09a6a278a9e7efd23b5701 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 14:45:03 +0800 Subject: [PATCH 091/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.py b/examples/industrial_data_pretraining/lcbnet/demo.py index b9d70c779..d8e6c9755 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -5,7 +5,8 @@ from funasr import AutoModel -model = AutoModel(model="iic/LCB-NET") +model = AutoModel(model="iic/LCB-NET", + model_revision="v1.0.0") # example1 From 9070774ab6c9a7149d31240fb0d686485f30f8e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 15:21:47 +0800 Subject: [PATCH 092/101] atsr --- .../lcbnet/README.md | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 examples/industrial_data_pretraining/lcbnet/README.md diff --git a/examples/industrial_data_pretraining/lcbnet/README.md b/examples/industrial_data_pretraining/lcbnet/README.md new file mode 100644 index 000000000..4273ec085 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/README.md @@ -0,0 +1,105 @@ +--- +tasks: +- audio-visual-speech-recognition +domain: +- audio, visual +model-type: +- Autoregressive +frameworks: +- pytorch +backbone: +- transformer/conformer +metrics: +- WER/B-WER +license: Apache License 2.0 +language: +- en +tags: +- FunASR +- Alibaba +- ICASSP 2024 +- 
Audio-Visual +- Hotword +- Long-Context Biasing +datasets: + train: + - SlideSpeech corpus + test: + - dev and test of SlideSpeech corpus +indexing: + results: + - task: + name: Audio-Visual Speech Recognition + dataset: + name: SlideSpeech corpus + type: audio # optional + args: 16k sampling rate, 5002 bpe units # optional + metrics: + - type: WER + value: 18.8% # float + description: beamsearch search, withou lm, avg. + args: default + +widgets: + - task: audio-visual-speech-recognition + inputs: + - type: audio + name: input + title: 音频 + - type: text + name: input + title: OCR识别文本 +finetune-support: True +--- + + +# Paraformer-large模型介绍 + +## Highlights +- 热词版本:[Paraformer-large热词版模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary)支持热词定制功能,基于提供的热词列表进行激励增强,提升热词的召回率和准确率。 +- 长音频版本:[Paraformer-large长音频模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),集成VAD、ASR、标点与时间戳功能,可直接对时长为数小时音频进行识别,并输出带标点文字与时间戳。 + +## [FunASR开源项目介绍](https://github.com/alibaba-damo-academy/FunASR) +[FunASR](https://github.com/alibaba-damo-academy/FunASR)希望在语音识别的学术研究和工业应用之间架起一座桥梁。通过发布工业级语音识别模型的训练和微调,研究人员和开发人员可以更方便地进行语音识别模型的研究和生产,并推动语音识别生态的发展。让语音识别更有趣! + +[**github仓库**](https://github.com/alibaba-damo-academy/FunASR) +| [**最新动态**](https://github.com/alibaba-damo-academy/FunASR#whats-new) +| [**环境安装**](https://github.com/alibaba-damo-academy/FunASR#installation) +| [**服务部署**](https://www.funasr.com) +| [**模型库**](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo) +| [**联系我们**](https://github.com/alibaba-damo-academy/FunASR#contact) + + +## 模型原理介绍 + +随着在线会议和课程越来越普遍,如何利用视频幻灯片中丰富的文本信息来改善语音识别(Automatic  Speech Recognition, ASR)面临着新的挑战。视频中的幻灯片与语音实时同步,相比于统一的稀有词列表,能够提供更长的上下文相关信息。因此,我们提出了一种创新的长上下文偏置网络(LCB-net),用于音频-视觉语音识别(Audio-Visual Speech Recognition,AVSR),以更好地利用视频中的长时上下文信息。 + +

+AVSR整体流程框架 +

+LCB-NET模型结构 + + +具体来说,我们首先使用OCR技术来检测和识别幻灯片中的文本内容,其次我们采用关键词提取技术来获取文本内容中的关键词短语。最后,我们将关键词拼接成长上下文文本和音频同时输入到我们的LCB-net模型中进行识别。而LCB-net模型采用了双编码器结构,同时建模音频和长上下文文本信息。此外,我们还引入了一个显式的偏置词预测模块,通过使用二元交叉熵(BCE)损失函数显式预测长上下文文本中在音频中出现的关键偏置词。此外,为增强LCB-net的泛化能力和稳健性,我们还采用了动态的关键词模拟策略。实验证明,我们提出的LCB-net热词模型,不仅能够提升关键词的识别效果,同时也能够提升非关键词的识别效果。具体实验结果如下所示: + +

+实验结果 + + +更详细的细节见: +- 论文: [LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition](https://arxiv.org/abs/2401.06390) + + + + + +## 相关论文以及引用信息 + +```BibTeX +@inproceedings{yu2024lcbnet, + title={LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition}, + author={Fan Yu, Haoxu Wang, Xian Shi, Shiliang Zhang}, + booktitle={ICASSP}, + year={2024} +} +``` \ No newline at end of file From beb5db6c1c44664a1415dc85e7cf441504f1c2a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 15:29:03 +0800 Subject: [PATCH 093/101] atsr --- examples/industrial_data_pretraining/lcbnet/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/README.md b/examples/industrial_data_pretraining/lcbnet/README.md index 4273ec085..c0d4b1ecb 100644 --- a/examples/industrial_data_pretraining/lcbnet/README.md +++ b/examples/industrial_data_pretraining/lcbnet/README.md @@ -75,9 +75,9 @@ finetune-support: True 随着在线会议和课程越来越普遍,如何利用视频幻灯片中丰富的文本信息来改善语音识别(Automatic  Speech Recognition, ASR)面临着新的挑战。视频中的幻灯片与语音实时同步,相比于统一的稀有词列表,能够提供更长的上下文相关信息。因此,我们提出了一种创新的长上下文偏置网络(LCB-net),用于音频-视觉语音识别(Audio-Visual Speech Recognition,AVSR),以更好地利用视频中的长时上下文信息。

-AVSR整体流程框架 +AVSR整体流程框架

-LCB-NET模型结构 +LCB-NET模型结构 具体来说,我们首先使用OCR技术来检测和识别幻灯片中的文本内容,其次我们采用关键词提取技术来获取文本内容中的关键词短语。最后,我们将关键词拼接成长上下文文本和音频同时输入到我们的LCB-net模型中进行识别。而LCB-net模型采用了双编码器结构,同时建模音频和长上下文文本信息。此外,我们还引入了一个显式的偏置词预测模块,通过使用二元交叉熵(BCE)损失函数显式预测长上下文文本中在音频中出现的关键偏置词。此外,为增强LCB-net的泛化能力和稳健性,我们还采用了动态的关键词模拟策略。实验证明,我们提出的LCB-net热词模型,不仅能够提升关键词的识别效果,同时也能够提升非关键词的识别效果。具体实验结果如下所示: From 9876729a257b2ebb6c9289e7d442c8e086d96e7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 15:41:22 +0800 Subject: [PATCH 094/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.py b/examples/industrial_data_pretraining/lcbnet/demo.py index d8e6c9755..fe51f08e1 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -10,9 +10,8 @@ model = AutoModel(model="iic/LCB-NET", # example1 -#res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", -# ) -#print(res) +res = model.generate(input='["~/.cache/modelscope/hub/iic/LCB-NET/example/asr_example.wav","~/.cache/modelscope/hub/iic/LCB-NET/example/ocr.txt"]',data_type='["sound", "text"]') +print(res) ''' From fd365acc9452e0e8fdf2cb8da82fb8ccde0326b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 15:45:53 +0800 Subject: [PATCH 095/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.py b/examples/industrial_data_pretraining/lcbnet/demo.py index fe51f08e1..d0870bc8a 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -11,6 +11,7 @@ model = AutoModel(model="iic/LCB-NET", # example1 res = model.generate(input='["~/.cache/modelscope/hub/iic/LCB-NET/example/asr_example.wav","~/.cache/modelscope/hub/iic/LCB-NET/example/ocr.txt"]',data_type='["sound", "text"]') + print(res) From 1a6d9d5cc422dcd1e6dd5b9c67047d63bc6cd667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 16:32:28 +0800 Subject: [PATCH 096/101] atsr --- .gitignore | 1 + .../lcbnet/demo.py | 2 +- .../lcbnet/demo.sh | 10 +-- .../lcbnet/demo2.sh | 71 ------------------- .../lcbnet/demo2_tmp.sh | 71 ------------------- .../lcbnet/demo_pdb.sh | 18 ----- .../lcbnet/demo_pdb2.sh | 15 ---- .../lcbnet/demo_tmp1.sh | 71 ------------------- funasr/utils/load_utils.py | 2 - 9 files changed, 7 insertions(+), 254 deletions(-) delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo2.sh delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo_pdb.sh delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh diff --git a/.gitignore b/.gitignore index d2b4c53b9..1f2a3d1a7 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,5 @@ emotion2vec* GPT-SoVITS* examples/*/*/outputs examples/*/*/exp +examples/*/*/tmp cmd_read diff --git a/examples/industrial_data_pretraining/lcbnet/demo.py b/examples/industrial_data_pretraining/lcbnet/demo.py index d0870bc8a..602a986d5 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -10,7 +10,7 @@ model = AutoModel(model="iic/LCB-NET", # 
example1 -res = model.generate(input='["~/.cache/modelscope/hub/iic/LCB-NET/example/asr_example.wav","~/.cache/modelscope/hub/iic/LCB-NET/example/ocr.txt"]',data_type='["sound", "text"]') +res = model.generate(input=("https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav","https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt"),data_type=("sound", "text")) print(res) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.sh b/examples/industrial_data_pretraining/lcbnet/demo.sh index 825289188..3e04ccd1e 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo.sh @@ -1,5 +1,5 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +file_dir="/home/yf352572/.cache/modelscope/hub/iic/LCB-NET/" +CUDA_VISIBLE_DEVICES="0,1" inference_device="cuda" if [ ${inference_device} == "cuda" ]; then @@ -12,7 +12,7 @@ else done fi -inference_dir="outputs/slidespeech_dev_beamsearch_new" +inference_dir="outputs/slidespeech_dev" _logdir="${inference_dir}/logdir" echo "inference_dir: ${inference_dir}" @@ -39,11 +39,11 @@ for JOB in $(seq ${nj}); do python -m funasr.bin.inference \ --config-path=${file_dir} \ --config-name="config.yaml" \ - ++init_param=${file_dir}/model.pb \ + ++init_param=${file_dir}/model.pt \ ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ +data_type='["kaldi_ark", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.pt \ ++output_dir="${inference_dir}/${JOB}" \ ++device="${inference_device}" \ ++ncpu=1 \ diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh deleted file mode 100755 index 69df6d16c..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ /dev/null @@ -1,71 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -inference_device="cuda" -test_set="dev_wav" -if [ ${inference_device} == "cuda" ]; then - nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -else - inference_batch_size=1 - CUDA_VISIBLE_DEVICES="" - for JOB in $(seq ${nj}); do - CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," - done -fi - -inference_dir="outputs/slidespeech_dev_beamsearch_wav" -_logdir="${inference_dir}/logdir" -echo "inference_dir: ${inference_dir}" - -mkdir -p "${_logdir}" -key_file1=${file_dir}/${test_set}/wav.scp -key_file2=${file_dir}/${test_set}/ocr.txt -split_scps1= -split_scps2= -for JOB in $(seq "${nj}"); do - split_scps1+=" ${_logdir}/wav.${JOB}.scp" - split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -done -utils/split_scp.pl "${key_file1}" ${split_scps1} -utils/split_scp.pl "${key_file2}" ${split_scps2} - -gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -for JOB in $(seq ${nj}); do - { - id=$((JOB-1)) - gpuid=${gpuid_list_array[$id]} - - export CUDA_VISIBLE_DEVICES=${gpuid} - - python -m funasr.bin.inference \ - --config-path=${file_dir} \ - --config-name="config.yaml" \ - ++init_param=${file_dir}/model.pb \ - ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ - 
+data_type='["sound", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ - ++output_dir="${inference_dir}/${JOB}" \ - ++device="${inference_device}" \ - ++ncpu=1 \ - ++disable_log=true &> ${_logdir}/log.${JOB}.txt - - }& -done -wait - - -mkdir -p ${inference_dir}/1best_recog - -for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" -done - -echo "Computing WER ..." -sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc -cp ${file_dir}/${test_set}/text ${inference_dir}/1best_recog/token.ref -cp ${file_dir}/${test_set}/ocr.list ${inference_dir}/1best_recog/ocr.list -python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer -tail -n 3 ${inference_dir}/1best_recog/token.cer - -./run_bwer_recall.sh ${inference_dir}/1best_recog/ -tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 diff --git a/examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh b/examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh deleted file mode 100755 index da6ad686d..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh +++ /dev/null @@ -1,71 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -inference_device="cuda" -test_set="test_wav" -if [ ${inference_device} == "cuda" ]; then - nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -else - inference_batch_size=1 - CUDA_VISIBLE_DEVICES="" - for JOB in $(seq ${nj}); do - CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," - done -fi - -inference_dir="outputs/slidespeech_test_beamsearch_wav" -_logdir="${inference_dir}/logdir" -echo "inference_dir: ${inference_dir}" - -mkdir -p "${_logdir}" -key_file1=${file_dir}/${test_set}/wav.scp -key_file2=${file_dir}/${test_set}/ocr.txt -split_scps1= -split_scps2= -for JOB in $(seq "${nj}"); do - split_scps1+=" ${_logdir}/wav.${JOB}.scp" - split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -done -utils/split_scp.pl "${key_file1}" ${split_scps1} -utils/split_scp.pl "${key_file2}" ${split_scps2} - -gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -for JOB in $(seq ${nj}); do - { - id=$((JOB-1)) - gpuid=${gpuid_list_array[$id]} - - export CUDA_VISIBLE_DEVICES=${gpuid} - - python -m funasr.bin.inference \ - --config-path=${file_dir} \ - --config-name="config.yaml" \ - ++init_param=${file_dir}/model.pb \ - ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ - +data_type='["sound", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ - ++output_dir="${inference_dir}/${JOB}" \ - ++device="${inference_device}" \ - ++ncpu=1 \ - ++disable_log=true &> ${_logdir}/log.${JOB}.txt - - }& -done -wait - - -mkdir -p ${inference_dir}/1best_recog - -for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" -done - -echo "Computing WER ..." 
-sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc -cp ${file_dir}/${test_set}/text ${inference_dir}/1best_recog/token.ref -cp ${file_dir}/${test_set}/ocr.list ${inference_dir}/1best_recog/ocr.list -python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer -tail -n 3 ${inference_dir}/1best_recog/token.cer - -./run_bwer_recall.sh ${inference_dir}/1best_recog/ -tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 diff --git a/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh b/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh deleted file mode 100755 index 0747a8d7b..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh +++ /dev/null @@ -1,18 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" - -#CUDA_VISIBLE_DEVICES="" \ -python -m funasr.bin.inference \ ---config-path=${file_dir} \ ---config-name="config.yaml" \ -++init_param=${file_dir}/model.pb \ -++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -+input=["${file_dir}/example/asr_example.wav","${file_dir}/example/ocr.txt"] \ -+data_type='["sound","text"]' \ -++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ -++output_dir="./outputs/debug" \ -++device="cpu" \ - -#++input=["/nfs/yufan.yf/workspace/espnet/egs2/youtube_ppt/asr/dump/raw/dev_oracle_v1_new/data/format.1/YTB+--tMoLpQI-w+00322.wav"] \ -#+data_type='["sound"]' \ -#++input=["/nfs/yufan.yf/workspace/espnet/egs2/youtube_ppt/asr/dump/raw/dev_oracle_v1_new/data/format.1/YTB+--tMoLpQI-w+00322.wav","/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch/example/ocr2.txt"] \ -#+data_type='["sound","text"]' \ diff --git a/examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh b/examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh deleted file mode 100755 index 557e9b2d8..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh +++ /dev/null @@ -1,15 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" - -#CUDA_VISIBLE_DEVICES="" \ -python -m funasr.bin.inference \ ---config-path=${file_dir} \ ---config-name="config.yaml" \ -++init_param=${file_dir}/model.pb \ -++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -++input=[${file_dir}/dev_wav/wav.scp,${file_dir}/dev_wav/ocr.txt] \ -+data_type='["sound", "text"]' \ -++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ -++output_dir="./outputs/debug" \ -++device="cpu" \ - -#++input=[${file_dir}/dev_wav/wav.scp,${file_dir}/dev_wav/ocr.txt] \ diff --git a/examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh b/examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh deleted file mode 100755 index 488f7d2a1..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh +++ /dev/null @@ -1,71 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -inference_device="cuda" - -if [ ${inference_device} == "cuda" ]; then - nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -else - inference_batch_size=1 - CUDA_VISIBLE_DEVICES="" - for JOB in 
$(seq ${nj}); do - CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," - done -fi - -inference_dir="outputs/slidespeech_test_beamsearch_new" -_logdir="${inference_dir}/logdir" -echo "inference_dir: ${inference_dir}" - -mkdir -p "${_logdir}" -key_file1=${file_dir}/test/wav.scp -key_file2=${file_dir}/test/ocr.txt -split_scps1= -split_scps2= -for JOB in $(seq "${nj}"); do - split_scps1+=" ${_logdir}/wav.${JOB}.scp" - split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -done -utils/split_scp.pl "${key_file1}" ${split_scps1} -utils/split_scp.pl "${key_file2}" ${split_scps2} - -gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -for JOB in $(seq ${nj}); do - { - id=$((JOB-1)) - gpuid=${gpuid_list_array[$id]} - - export CUDA_VISIBLE_DEVICES=${gpuid} - - python -m funasr.bin.inference \ - --config-path=${file_dir} \ - --config-name="config.yaml" \ - ++init_param=${file_dir}/model.pb \ - ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ - +data_type='["kaldi_ark", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ - ++output_dir="${inference_dir}/${JOB}" \ - ++device="${inference_device}" \ - ++ncpu=1 \ - ++disable_log=true &> ${_logdir}/log.${JOB}.txt - - }& -done -wait - - -mkdir -p ${inference_dir}/1best_recog - -for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" -done - -echo "Computing WER ..." -sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc -cp ${file_dir}/test/text ${inference_dir}/1best_recog/token.ref -cp ${file_dir}/test/ocr.list ${inference_dir}/1best_recog/ocr.list -python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer -tail -n 3 ${inference_dir}/1best_recog/token.cer - -./run_bwer_recall.sh ${inference_dir}/1best_recog/ -tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 644af2324..84c38f9b9 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -89,8 +89,6 @@ def load_bytes(input): return array def extract_fbank(data, data_len = None, data_type: str="sound", frontend=None, **kwargs): - # import pdb; - # pdb.set_trace() if isinstance(data, np.ndarray): data = torch.from_numpy(data) if len(data.shape) < 2: From d9e60d9ddc92ab5746842b5a2b6f7a423de2a795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 16:44:31 +0800 Subject: [PATCH 097/101] atsr --- .../lcbnet/README.md | 73 +++++++++++++++++++ .../lcbnet/demo.py | 26 +------ 2 files changed, 75 insertions(+), 24 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/README.md b/examples/industrial_data_pretraining/lcbnet/README.md index c0d4b1ecb..ff75b40b8 100644 --- a/examples/industrial_data_pretraining/lcbnet/README.md +++ b/examples/industrial_data_pretraining/lcbnet/README.md @@ -91,6 +91,79 @@ finetune-support: True +## 基于ModelScope进行推理 + +- 推理支持音频格式如下: + - wav文件路径,例如:data/test/asr_example.wav + - pcm文件路径,例如:data/test/asr_example.pcm + - ark文件路径,例如:data/test/data.ark + - wav文件url,例如:https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav + - wav二进制数据,格式bytes,例如:用户直接从文件里读出bytes数据或者是麦克风录出bytes数据。 + - 已解析的audio音频,例如:audio, rate = soundfile.read("asr_example_zh.wav"),类型为numpy.ndarray或者torch.Tensor。 + - 
wav.scp文件,需符合如下要求(以下分别为sound和kaldi_ark格式): + +```sh +cat wav.scp +asr_example1 data/test/asr_example1.wav +asr_example2 data/test/asr_example2.wav + +cat wav.scp +asr_example1 data/test/data_wav.ark:22 +asr_example2 data/test/data_wav.ark:90445 +... +``` + +- 推理支持OCR预测文本格式如下: + - ocr.txt文件,需符合如下要求: +```sh +cat ocr.txt +asr_example1 ANIMAL RIGHTS MANAGER PLOEG +asr_example2 UNIVERSITY CAMPUS DEANO +... +``` + +- 若输入格式wav文件和ocr文件均为url,api调用方式可参考如下范例: + +```python +from funasr import AutoModel + +model = AutoModel(model="iic/LCB-NET", + model_revision="v2.0.0") +res = model.generate(input=("https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav","https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt"),data_type=("sound", "text")) +``` + + +## 复现论文中的结果 +```python +python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pt \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.scp,${_logdir}/ocr.txt] \ + +data_type='["kaldi_ark", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.pt \ + ++output_dir="${inference_dir}/results" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true + +``` + + +识别结果输出路径结构如下: + +```sh +tree output_dir/ +output_dir/ +└── 1best_recog + ├── text + └── token +``` + +token:语音识别结果文件 + +可以使用funasr里面提供的run_bwer_recall.sh计算WER、BWER、UWER和Recall。 ## 相关论文以及引用信息 diff --git a/examples/industrial_data_pretraining/lcbnet/demo.py b/examples/industrial_data_pretraining/lcbnet/demo.py index 602a986d5..ac679cec8 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -6,30 +6,8 @@ from funasr import AutoModel model = AutoModel(model="iic/LCB-NET", - model_revision="v1.0.0") + model_revision="v2.0.0") - -# example1 res = model.generate(input=("https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav","https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt"),data_type=("sound", "text")) -print(res) - - -''' -# tensor or numpy as input -# example2 -import torchaudio -import os -wav_file = os.path.join(model.model_path, "example/asr_example.wav") -input_tensor, sample_rate = torchaudio.load(wav_file) -input_tensor = input_tensor.mean(0) -res = model.generate(input=[input_tensor], batch_size_s=300, is_final=True) - - -# example3 -import soundfile - -wav_file = os.path.join(model.model_path, "example/asr_example.wav") -speech, sample_rate = soundfile.read(wav_file) -res = model.generate(input=[speech], batch_size_s=300, is_final=True) -''' +print(res) \ No newline at end of file From 9c884c566ff1a7c26f0f28e8d8ad4deb281a954d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 16:48:01 +0800 Subject: [PATCH 098/101] atsr --- examples/industrial_data_pretraining/lcbnet/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/README.md b/examples/industrial_data_pretraining/lcbnet/README.md index ff75b40b8..923588668 100644 --- a/examples/industrial_data_pretraining/lcbnet/README.md +++ b/examples/industrial_data_pretraining/lcbnet/README.md @@ -164,7 +164,7 @@ output_dir/ token:语音识别结果文件 可以使用funasr里面提供的run_bwer_recall.sh计算WER、BWER、UWER和Recall。 - 
+详细脚本可以参考funasr里面的demo.sh脚本,需要注意的是你需要修改一下iic/LCB-NET/conf.yaml中CMVN(stats_file)的路径和iic/LCB-NET/dev/wav.scp里面ark的路径,修改为你自己本地的路径,然后跑解码。 ## 相关论文以及引用信息 From 015b1e424e9ee96c770f3221deef4c20a544e883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 17:06:21 +0800 Subject: [PATCH 099/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.sh b/examples/industrial_data_pretraining/lcbnet/demo.sh index 3e04ccd1e..2f226bc03 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo.sh @@ -44,6 +44,7 @@ for JOB in $(seq ${nj}); do ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ +data_type='["kaldi_ark", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.pt \ + ++normalize_conf.stats_file=${file_dir}/am.mvn \ ++output_dir="${inference_dir}/${JOB}" \ ++device="${inference_device}" \ ++ncpu=1 \ From a2f263bd05498cf4f35d78ee0ee8755ba84d09ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 17:09:05 +0800 Subject: [PATCH 100/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.py b/examples/industrial_data_pretraining/lcbnet/demo.py index ac679cec8..4ca52553f 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -6,7 +6,7 @@ from funasr import AutoModel model = AutoModel(model="iic/LCB-NET", - model_revision="v2.0.0") + model_revision="v1.0.0") res = model.generate(input=("https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav","https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt"),data_type=("sound", "text")) From 1162dee2dd2971243607bb58a766987acda6e9ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 17:11:46 +0800 Subject: [PATCH 101/101] atsr --- .../lcbnet/README.md | 178 ------------------ 1 file changed, 178 deletions(-) delete mode 100644 examples/industrial_data_pretraining/lcbnet/README.md diff --git a/examples/industrial_data_pretraining/lcbnet/README.md b/examples/industrial_data_pretraining/lcbnet/README.md deleted file mode 100644 index 923588668..000000000 --- a/examples/industrial_data_pretraining/lcbnet/README.md +++ /dev/null @@ -1,178 +0,0 @@ ---- -tasks: -- audio-visual-speech-recognition -domain: -- audio, visual -model-type: -- Autoregressive -frameworks: -- pytorch -backbone: -- transformer/conformer -metrics: -- WER/B-WER -license: Apache License 2.0 -language: -- en -tags: -- FunASR -- Alibaba -- ICASSP 2024 -- Audio-Visual -- Hotword -- Long-Context Biasing -datasets: - train: - - SlideSpeech corpus - test: - - dev and test of SlideSpeech corpus -indexing: - results: - - task: - name: Audio-Visual Speech Recognition - dataset: - name: SlideSpeech corpus - type: audio # optional - args: 16k sampling rate, 5002 bpe units # optional - metrics: - - type: WER - value: 18.8% # float - description: beamsearch search, withou lm, avg. 
- args: default - -widgets: - - task: audio-visual-speech-recognition - inputs: - - type: audio - name: input - title: 音频 - - type: text - name: input - title: OCR识别文本 -finetune-support: True ---- - - -# Paraformer-large模型介绍 - -## Highlights -- 热词版本:[Paraformer-large热词版模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary)支持热词定制功能,基于提供的热词列表进行激励增强,提升热词的召回率和准确率。 -- 长音频版本:[Paraformer-large长音频模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),集成VAD、ASR、标点与时间戳功能,可直接对时长为数小时音频进行识别,并输出带标点文字与时间戳。 - -## [FunASR开源项目介绍](https://github.com/alibaba-damo-academy/FunASR) -[FunASR](https://github.com/alibaba-damo-academy/FunASR)希望在语音识别的学术研究和工业应用之间架起一座桥梁。通过发布工业级语音识别模型的训练和微调,研究人员和开发人员可以更方便地进行语音识别模型的研究和生产,并推动语音识别生态的发展。让语音识别更有趣! - -[**github仓库**](https://github.com/alibaba-damo-academy/FunASR) -| [**最新动态**](https://github.com/alibaba-damo-academy/FunASR#whats-new) -| [**环境安装**](https://github.com/alibaba-damo-academy/FunASR#installation) -| [**服务部署**](https://www.funasr.com) -| [**模型库**](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo) -| [**联系我们**](https://github.com/alibaba-damo-academy/FunASR#contact) - - -## 模型原理介绍 - -随着在线会议和课程越来越普遍,如何利用视频幻灯片中丰富的文本信息来改善语音识别(Automatic  Speech Recognition, ASR)面临着新的挑战。视频中的幻灯片与语音实时同步,相比于统一的稀有词列表,能够提供更长的上下文相关信息。因此,我们提出了一种创新的长上下文偏置网络(LCB-net),用于音频-视觉语音识别(Audio-Visual Speech Recognition,AVSR),以更好地利用视频中的长时上下文信息。 - -
-AVSR整体流程框架 -
-LCB-NET模型结构 - - -具体来说,我们首先使用OCR技术来检测和识别幻灯片中的文本内容,其次我们采用关键词提取技术来获取文本内容中的关键词短语。最后,我们将关键词拼接成长上下文文本和音频同时输入到我们的LCB-net模型中进行识别。而LCB-net模型采用了双编码器结构,同时建模音频和长上下文文本信息。此外,我们还引入了一个显式的偏置词预测模块,通过使用二元交叉熵(BCE)损失函数显式预测长上下文文本中在音频中出现的关键偏置词。此外,为增强LCB-net的泛化能力和稳健性,我们还采用了动态的关键词模拟策略。实验证明,我们提出的LCB-net热词模型,不仅能够提升关键词的识别效果,同时也能够提升非关键词的识别效果。具体实验结果如下所示: - -
-实验结果 - - -更详细的细节见: -- 论文: [LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition](https://arxiv.org/abs/2401.06390) - - - -## 基于ModelScope进行推理 - -- 推理支持音频格式如下: - - wav文件路径,例如:data/test/asr_example.wav - - pcm文件路径,例如:data/test/asr_example.pcm - - ark文件路径,例如:data/test/data.ark - - wav文件url,例如:https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav - - wav二进制数据,格式bytes,例如:用户直接从文件里读出bytes数据或者是麦克风录出bytes数据。 - - 已解析的audio音频,例如:audio, rate = soundfile.read("asr_example_zh.wav"),类型为numpy.ndarray或者torch.Tensor。 - - wav.scp文件,需符合如下要求(以下分别为sound和kaldi_ark格式): - -```sh -cat wav.scp -asr_example1 data/test/asr_example1.wav -asr_example2 data/test/asr_example2.wav - -cat wav.scp -asr_example1 data/test/data_wav.ark:22 -asr_example2 data/test/data_wav.ark:90445 -... -``` - -- 推理支持OCR预测文本格式如下: - - ocr.txt文件,需符合如下要求: -```sh -cat ocr.txt -asr_example1 ANIMAL RIGHTS MANAGER PLOEG -asr_example2 UNIVERSITY CAMPUS DEANO -... -``` - -- 若输入格式wav文件和ocr文件均为url,api调用方式可参考如下范例: - -```python -from funasr import AutoModel - -model = AutoModel(model="iic/LCB-NET", - model_revision="v2.0.0") -res = model.generate(input=("https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav","https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt"),data_type=("sound", "text")) -``` - - -## 复现论文中的结果 -```python -python -m funasr.bin.inference \ - --config-path=${file_dir} \ - --config-name="config.yaml" \ - ++init_param=${file_dir}/model.pt \ - ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - ++input=[${_logdir}/wav.scp,${_logdir}/ocr.txt] \ - +data_type='["kaldi_ark", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.pt \ - ++output_dir="${inference_dir}/results" \ - ++device="${inference_device}" \ - ++ncpu=1 \ - ++disable_log=true - -``` - - -识别结果输出路径结构如下: - -```sh -tree output_dir/ -output_dir/ -└── 1best_recog - ├── text - └── token -``` - -token:语音识别结果文件 - -可以使用funasr里面提供的run_bwer_recall.sh计算WER、BWER、UWER和Recall。 -详细脚本可以参考funasr里面的demo.sh脚本,需要注意的是你需要修改一下iic/LCB-NET/conf.yaml中CMVN(stats_file)的路径和iic/LCB-NET/dev/wav.scp里面ark的路径,修改为你自己本地的路径,然后跑解码。 - -## 相关论文以及引用信息 - -```BibTeX -@inproceedings{yu2024lcbnet, - title={LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition}, - author={Fan Yu, Haoxu Wang, Xian Shi, Shiliang Zhang}, - booktitle={ICASSP}, - year={2024} -} -``` \ No newline at end of file
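
For reference, the state of `examples/industrial_data_pretraining/lcbnet/demo.py` that this series converges on (PATCH 094 through PATCH 100) reduces to the short sketch below. It is a minimal restatement of what the diffs above already apply — the `iic/LCB-NET` model id, the `v1.0.0` revision, and the example wav/OCR URLs — and it assumes a working `funasr` install with network access to ModelScope; nothing here goes beyond the API calls the patches themselves use.

```python
# Minimal sketch of the final lcbnet/demo.py in this series (PATCH 094-100):
# the audio file and the OCR text are passed together as a ("sound", "text") pair.
from funasr import AutoModel

# LCB-NET audio-visual (long-context biasing) model from ModelScope; revision per PATCH 100.
model = AutoModel(model="iic/LCB-NET", model_revision="v1.0.0")

res = model.generate(
    input=(
        "https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav",
        "https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt",
    ),
    data_type=("sound", "text"),
)
print(res)
```

For batch evaluation, `demo.sh` in the same directory (patched in 096 and 099) runs the equivalent `funasr.bin.inference` command over split `wav.scp`/`ocr.txt` lists, pointing `init_param`, `bpemodel`, and `normalize_conf.stats_file` at the `model.pt`, `bpe.pt`, and `am.mvn` files in the downloaded model directory.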