From ea71c0f891ab2307dd71322f83e470e216af81fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?=
Date: Wed, 21 Feb 2024 14:41:39 +0800
Subject: [PATCH 001/101] add test

---
 .gitignore | 1 +
 .../contextual_paraformer/demo.py | 0
 .../contextual_paraformer/demo2.py | 9 +++++++++
 .../contextual_paraformer/infer.sh | 2 +-
 .../contextual_paraformer/path.sh | 6 ++++++
 5 files changed, 17 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 examples/industrial_data_pretraining/contextual_paraformer/demo.py
 create mode 100644 examples/industrial_data_pretraining/contextual_paraformer/demo2.py
 mode change 100644 => 100755 examples/industrial_data_pretraining/contextual_paraformer/infer.sh
 create mode 100755 examples/industrial_data_pretraining/contextual_paraformer/path.sh

diff --git a/.gitignore b/.gitignore
index 6bdfd5d06..23864c3e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,4 @@ samples
 outputs*
 emotion2vec*
 GPT-SoVITS*
+examples/*/*/outputs
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo.py b/examples/industrial_data_pretraining/contextual_paraformer/demo.py
old mode 100644
new mode 100755
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo2.py b/examples/industrial_data_pretraining/contextual_paraformer/demo2.py
new file mode 100644
index 000000000..30bb76fbc
--- /dev/null
+++ b/examples/industrial_data_pretraining/contextual_paraformer/demo2.py
@@ -0,0 +1,9 @@
+python -m funasr.bin.inference \
+--config-path="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3" \
+--config-name="config.yaml" \
+++init_param="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3/model.pt.ep38" \
+++tokenizer_conf.token_list="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/zh_token_list/char/tokens.txt" \
+++frontend_conf.cmvn_file="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/train/am.mvn" \
+++input="/mnt/nfs/zhifu.gzf/data/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav" \
+++output_dir="./outputs/debug" \
+++device="cuda:0" \
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/infer.sh b/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
old mode 100644
new mode 100755
index 8fc66f34f..1bd4f7f5b
--- a/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
+++ b/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
@@ -2,7 +2,7 @@
 model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
 model_revision="v2.0.4"
 
-python funasr/bin/inference.py \
+python ../../../funasr/bin/inference.py \
 +model=${model} \
 +model_revision=${model_revision} \
 +input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/path.sh b/examples/industrial_data_pretraining/contextual_paraformer/path.sh
new file mode 100755
index 000000000..1a6d67e08
--- /dev/null
+++ b/examples/industrial_data_pretraining/contextual_paraformer/path.sh
@@ -0,0 +1,6 @@
+export FUNASR_DIR=$PWD/../../../
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
+export PYTHONPATH=$FUNASR_DIR/funasr/bin:$FUNASR_DIR/funasr:$FUNASR_DIR:$PYTHONPATH

From 691f3235cdfaea38bc92f52b5f9e14cc7ea98dcf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?=
Date: Wed, 21 Feb 2024 15:10:00 +0800
Subject: [PATCH 002/101] add test

---
 .../contextual_paraformer/demo2.py | 9 ---------
 .../contextual_paraformer/demo2.sh | 9 +++++++++
 2 files changed, 9 insertions(+), 9 deletions(-)
 delete mode 100644 examples/industrial_data_pretraining/contextual_paraformer/demo2.py
 create mode 100644 examples/industrial_data_pretraining/contextual_paraformer/demo2.sh

diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo2.py b/examples/industrial_data_pretraining/contextual_paraformer/demo2.py
deleted file mode 100644
index 30bb76fbc..000000000
--- a/examples/industrial_data_pretraining/contextual_paraformer/demo2.py
+++ /dev/null
@@ -1,9 +0,0 @@
-python -m funasr.bin.inference \
---config-path="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3" \
---config-name="config.yaml" \
-++init_param="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3/model.pt.ep38" \
-++tokenizer_conf.token_list="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/zh_token_list/char/tokens.txt" \
-++frontend_conf.cmvn_file="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/train/am.mvn" \
-++input="/mnt/nfs/zhifu.gzf/data/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav" \
-++output_dir="./outputs/debug" \
-++device="cuda:0" \
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo2.sh b/examples/industrial_data_pretraining/contextual_paraformer/demo2.sh
new file mode 100644
index 000000000..282f4f1f2
--- /dev/null
+++ b/examples/industrial_data_pretraining/contextual_paraformer/demo2.sh
@@ -0,0 +1,9 @@
+python -m funasr.bin.inference \
+--config-path="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404" \
+--config-name="config.yaml" \
+++init_param="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/model.pb" \
+++tokenizer_conf.token_list="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/tokens.txt" \
+++frontend_conf.cmvn_file="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/am.mvn" \
+++input="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/asr_example_zh.wav" \
+++output_dir="./outputs/debug2" \
+++device="" \

From 50ee4bafdcf6f0fca6b31ddf208f9575821a5455 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?=
Date: Wed, 21 Feb 2024 15:33:27 +0800
Subject: [PATCH 003/101] new

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 23864c3e0..bdfe70f1a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,4 @@ outputs*
 emotion2vec*
 GPT-SoVITS*
 examples/*/*/outputs
+cmd_read

From 49bec4052b766dd57580ef83aababaab02b64f5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?=
Date: Wed, 21 Feb 2024 15:43:52 +0800
Subject: [PATCH 004/101] add test

---
 new | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 new

diff --git a/new b/new
new file mode 100644
index 000000000..9daeafb98
--- /dev/null
+++ b/new
@@ -0,0 +1 @@
+test

From eeccdc9a5d72f496f5e7b2a0e3dd381bebcc6ff9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?=
Date: 
Wed, 21 Feb 2024 15:50:16 +0800 Subject: [PATCH 005/101] new --- new | 1 + 1 file changed, 1 insertion(+) diff --git a/new b/new index 9daeafb98..ae69a4cd4 100644 --- a/new +++ b/new @@ -1 +1,2 @@ test +sda From a4df88c96fb6e44bc4f684a1289aca99dd8d7eb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 21 Feb 2024 16:12:05 +0800 Subject: [PATCH 006/101] test --- new | 2 -- new2 | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 new create mode 100644 new2 diff --git a/new b/new deleted file mode 100644 index ae69a4cd4..000000000 --- a/new +++ /dev/null @@ -1,2 +0,0 @@ -test -sda diff --git a/new2 b/new2 new file mode 100644 index 000000000..fbb91086d --- /dev/null +++ b/new2 @@ -0,0 +1,2 @@ +sdsd + From 0a6eacc54c6b2564aaa048076c2b2a1202b9c6a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 21 Feb 2024 16:20:57 +0800 Subject: [PATCH 007/101] test --- funasr/models/contextual_paraformer/model.py | 27 ++++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/funasr/models/contextual_paraformer/model.py b/funasr/models/contextual_paraformer/model.py index 3f79eedf2..598c074e2 100644 --- a/funasr/models/contextual_paraformer/model.py +++ b/funasr/models/contextual_paraformer/model.py @@ -29,7 +29,7 @@ from funasr.train_utils.device_funcs import force_gatherable from funasr.models.transformer.utils.add_sos_eos import add_sos_eos from funasr.models.transformer.utils.nets_utils import make_pad_mask, pad_list from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank - +import pdb if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): from torch.cuda.amp import autocast @@ -63,7 +63,7 @@ class ContextualParaformer(Paraformer): crit_attn_smooth = kwargs.get("crit_attn_smooth", 0.0) bias_encoder_dropout_rate = kwargs.get("bias_encoder_dropout_rate", 0.0) - + pdb.set_trace() if bias_encoder_type == 'lstm': self.bias_encoder = torch.nn.LSTM(inner_dim, inner_dim, 1, batch_first=True, dropout=bias_encoder_dropout_rate) self.bias_embed = torch.nn.Embedding(self.vocab_size, inner_dim) @@ -81,6 +81,7 @@ class ContextualParaformer(Paraformer): if self.crit_attn_weight > 0: self.attn_loss = torch.nn.L1Loss() self.crit_attn_smooth = crit_attn_smooth + pdb.set_trace() def forward( @@ -103,17 +104,17 @@ class ContextualParaformer(Paraformer): text_lengths = text_lengths[:, 0] if len(speech_lengths.size()) > 1: speech_lengths = speech_lengths[:, 0] - + pdb.set_trace() batch_size = speech.shape[0] hotword_pad = kwargs.get("hotword_pad") hotword_lengths = kwargs.get("hotword_lengths") dha_pad = kwargs.get("dha_pad") - + pdb.set_trace() # 1. Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) - + pdb.set_trace() loss_ctc, cer_ctc = None, None stats = dict() @@ -128,12 +129,12 @@ class ContextualParaformer(Paraformer): stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None stats["cer_ctc"] = cer_ctc - + pdb.set_trace() # 2b. Attention decoder branch loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss( encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths ) - + pdb.set_trace() # 3. 
CTC-Att loss definition if self.ctc_weight == 0.0: loss = loss_att + loss_pre * self.predictor_weight @@ -171,22 +172,26 @@ class ContextualParaformer(Paraformer): ): encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to( encoder_out.device) + pdb.set_trace() if self.predictor_bias == 1: _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id) ys_pad_lens = ys_pad_lens + self.predictor_bias + pdb.set_trace() pre_acoustic_embeds, pre_token_length, _, _ = self.predictor(encoder_out, ys_pad, encoder_out_mask, ignore_id=self.ignore_id) - + pdb.set_trace() # -1. bias encoder if self.use_decoder_embedding: hw_embed = self.decoder.embed(hotword_pad) else: hw_embed = self.bias_embed(hotword_pad) + pdb.set_trace() hw_embed, (_, _) = self.bias_encoder(hw_embed) + pdb.set_trace() _ind = np.arange(0, hotword_pad.shape[0]).tolist() selected = hw_embed[_ind, [i - 1 for i in hotword_lengths.detach().cpu().tolist()]] contextual_info = selected.squeeze(0).repeat(ys_pad.shape[0], 1, 1).to(ys_pad.device) - + pdb.set_trace() # 0. sampler decoder_out_1st = None if self.sampling_ratio > 0.0: @@ -198,7 +203,7 @@ class ContextualParaformer(Paraformer): if self.step_cur < 2: logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio)) sematic_embeds = pre_acoustic_embeds - + pdb.set_trace() # 1. Forward decoder decoder_outs = self.decoder( encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info @@ -214,7 +219,7 @@ class ContextualParaformer(Paraformer): loss_ideal = None ''' loss_ideal = None - + pdb.set_trace() if decoder_out_1st is None: decoder_out_1st = decoder_out # 2. Compute attention loss From a0ffe57b05679d91e56227ce1109a5d725d93192 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 21 Feb 2024 16:48:33 +0800 Subject: [PATCH 008/101] test --- funasr/auto/auto_model.py | 11 +++++++-- funasr/models/contextual_paraformer/model.py | 24 ++++++++++++++++---- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index e95cfd8d1..4cc52a50c 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -24,7 +24,7 @@ try: from funasr.models.campplus.cluster_backend import ClusterBackend except: print("If you want to use the speaker diarization, please `pip install hdbscan`") - +import pdb def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): """ @@ -210,13 +210,15 @@ class AutoModel: kwargs.update(cfg) model = self.model if model is None else model model.eval() + pdb.set_trace() batch_size = kwargs.get("batch_size", 1) # if kwargs.get("device", "cpu") == "cpu": # batch_size = 1 key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key) - + pdb.set_trace() + speed_stats = {} asr_result_list = [] num_samples = len(data_list) @@ -224,20 +226,25 @@ class AutoModel: pbar = tqdm(colour="blue", total=num_samples, dynamic_ncols=True) if not disable_pbar else None time_speech_total = 0.0 time_escape_total = 0.0 + pdb.set_trace() for beg_idx in range(0, num_samples, batch_size): + pdb.set_trace() end_idx = min(num_samples, beg_idx + batch_size) data_batch = data_list[beg_idx:end_idx] key_batch = key_list[beg_idx:end_idx] batch = {"data_in": data_batch, "key": key_batch} + pdb.set_trace() if (end_idx - beg_idx) == 1 and kwargs.get("data_type", None) == "fbank": # fbank batch["data_in"] = data_batch[0] batch["data_lengths"] = 
input_len time1 = time.perf_counter() with torch.no_grad(): + pdb.set_trace() results, meta_data = model.inference(**batch, **kwargs) time2 = time.perf_counter() + pdb.set_trace() asr_result_list.extend(results) # batch_data_time = time_per_frame_s * data_batch_i["speech_lengths"].sum().item() diff --git a/funasr/models/contextual_paraformer/model.py b/funasr/models/contextual_paraformer/model.py index 598c074e2..655ca6f58 100644 --- a/funasr/models/contextual_paraformer/model.py +++ b/funasr/models/contextual_paraformer/model.py @@ -63,7 +63,6 @@ class ContextualParaformer(Paraformer): crit_attn_smooth = kwargs.get("crit_attn_smooth", 0.0) bias_encoder_dropout_rate = kwargs.get("bias_encoder_dropout_rate", 0.0) - pdb.set_trace() if bias_encoder_type == 'lstm': self.bias_encoder = torch.nn.LSTM(inner_dim, inner_dim, 1, batch_first=True, dropout=bias_encoder_dropout_rate) self.bias_embed = torch.nn.Embedding(self.vocab_size, inner_dim) @@ -81,7 +80,6 @@ class ContextualParaformer(Paraformer): if self.crit_attn_weight > 0: self.attn_loss = torch.nn.L1Loss() self.crit_attn_smooth = crit_attn_smooth - pdb.set_trace() def forward( @@ -313,20 +311,24 @@ class ContextualParaformer(Paraformer): **kwargs, ): # init beamsearch + pdb.set_trace() is_use_ctc = kwargs.get("decoding_ctc_weight", 0.0) > 0.00001 and self.ctc != None is_use_lm = kwargs.get("lm_weight", 0.0) > 0.00001 and kwargs.get("lm_file", None) is not None if self.beam_search is None and (is_use_lm or is_use_ctc): logging.info("enable beam_search") self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) - + pdb.set_trace() meta_data = {} # extract fbank feats time1 = time.perf_counter() + pdb.set_trace() audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000)) + pdb.set_trace() time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" + pdb.set_trace() speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() @@ -334,38 +336,50 @@ class ContextualParaformer(Paraformer): meta_data[ "batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 + pdb.set_trace() speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) # hotword + pdb.set_trace() self.hotword_list = self.generate_hotwords_list(kwargs.get("hotword", None), tokenizer=tokenizer, frontend=frontend) + pdb.set_trace() + # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - + pdb.set_trace() + + # predictor predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens) pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \ predictor_outs[2], predictor_outs[3] + pdb.set_trace() pre_token_length = pre_token_length.round().long() if torch.max(pre_token_length) < 1: return [] - + pdb.set_trace() + decoder_outs = self.cal_decoder_with_predictor(encoder_out, encoder_out_lens, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list, clas_scale=kwargs.get("clas_scale", 1.0)) + pdb.set_trace() decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] + pdb.set_trace() results = [] b, n, d = decoder_out.size() + pdb.set_trace() for i in range(b): x = encoder_out[i, :encoder_out_lens[i], :] am_scores = decoder_out[i, :pre_token_length[i], :] + pdb.set_trace() if self.beam_search is not None: 
nbest_hyps = self.beam_search( x=x, am_scores=am_scores, maxlenratio=kwargs.get("maxlenratio", 0.0), From d2f1cf39f8fedc19d0e14fac269a413d62375359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 21 Feb 2024 17:03:29 +0800 Subject: [PATCH 009/101] test --- .../industrial_data_pretraining/contextual_paraformer/demo2.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 examples/industrial_data_pretraining/contextual_paraformer/demo2.sh diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo2.sh b/examples/industrial_data_pretraining/contextual_paraformer/demo2.sh old mode 100644 new mode 100755 From 0a7384a1ec540c38b2b584e373fd516f61e2e86d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 21 Feb 2024 19:07:25 +0800 Subject: [PATCH 010/101] test --- funasr/models/contextual_paraformer/model.py | 25 +++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/funasr/models/contextual_paraformer/model.py b/funasr/models/contextual_paraformer/model.py index 5ccc611d6..10bbf9d00 100644 --- a/funasr/models/contextual_paraformer/model.py +++ b/funasr/models/contextual_paraformer/model.py @@ -294,10 +294,11 @@ class ContextualParaformer(Paraformer): enforce_sorted=False) _, (h_n, _) = self.bias_encoder(hw_embed) hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1) - + pdb.set_trace() decoder_outs = self.decoder( encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=hw_embed, clas_scale=clas_scale ) + pdb.set_trace() decoder_out = decoder_outs[0] decoder_out = torch.log_softmax(decoder_out, dim=-1) return decoder_out, ys_pad_lens @@ -311,65 +312,55 @@ class ContextualParaformer(Paraformer): **kwargs, ): # init beamsearch - pdb.set_trace() + is_use_ctc = kwargs.get("decoding_ctc_weight", 0.0) > 0.00001 and self.ctc != None is_use_lm = kwargs.get("lm_weight", 0.0) > 0.00001 and kwargs.get("lm_file", None) is not None if self.beam_search is None and (is_use_lm or is_use_ctc): logging.info("enable beam_search") self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) - pdb.set_trace() + meta_data = {} # extract fbank feats time1 = time.perf_counter() - pdb.set_trace() + audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000)) - pdb.set_trace() + time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" - pdb.set_trace() + speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() meta_data["extract_feat"] = f"{time3 - time2:0.3f}" meta_data[ "batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 - - pdb.set_trace() + speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) # hotword - pdb.set_trace() self.hotword_list = self.generate_hotwords_list(kwargs.get("hotword", None), tokenizer=tokenizer, frontend=frontend) - pdb.set_trace() - # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - pdb.set_trace() - # predictor predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens) pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \ predictor_outs[2], predictor_outs[3] - pdb.set_trace() pre_token_length = pre_token_length.round().long() if torch.max(pre_token_length) < 1: 
return [] - - pdb.set_trace() decoder_outs = self.cal_decoder_with_predictor(encoder_out, encoder_out_lens, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list, clas_scale=kwargs.get("clas_scale", 1.0)) - pdb.set_trace() decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] pdb.set_trace() From 62178770dccdbf5da42e831898ea32adeeacba45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 21 Feb 2024 20:04:01 +0800 Subject: [PATCH 011/101] test --- funasr/auto/auto_model.py | 6 +-- funasr/models/contextual_paraformer/model.py | 29 +++++------ funasr/models/seaco_paraformer/model.py | 51 +++++++++++++++++--- 3 files changed, 55 insertions(+), 31 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 60aeb1600..a3202fdb4 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -209,14 +209,12 @@ class AutoModel: kwargs.update(cfg) model = self.model if model is None else model model.eval() - pdb.set_trace() batch_size = kwargs.get("batch_size", 1) # if kwargs.get("device", "cpu") == "cpu": # batch_size = 1 key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key) - pdb.set_trace() speed_stats = {} asr_result_list = [] @@ -225,14 +223,12 @@ class AutoModel: pbar = tqdm(colour="blue", total=num_samples, dynamic_ncols=True) if not disable_pbar else None time_speech_total = 0.0 time_escape_total = 0.0 - pdb.set_trace() for beg_idx in range(0, num_samples, batch_size): - pdb.set_trace() end_idx = min(num_samples, beg_idx + batch_size) data_batch = data_list[beg_idx:end_idx] key_batch = key_list[beg_idx:end_idx] batch = {"data_in": data_batch, "key": key_batch} - pdb.set_trace() + if (end_idx - beg_idx) == 1 and kwargs.get("data_type", None) == "fbank": # fbank batch["data_in"] = data_batch[0] batch["data_lengths"] = input_len diff --git a/funasr/models/contextual_paraformer/model.py b/funasr/models/contextual_paraformer/model.py index 10bbf9d00..1c0805ab0 100644 --- a/funasr/models/contextual_paraformer/model.py +++ b/funasr/models/contextual_paraformer/model.py @@ -102,17 +102,16 @@ class ContextualParaformer(Paraformer): text_lengths = text_lengths[:, 0] if len(speech_lengths.size()) > 1: speech_lengths = speech_lengths[:, 0] - pdb.set_trace() + batch_size = speech.shape[0] hotword_pad = kwargs.get("hotword_pad") hotword_lengths = kwargs.get("hotword_lengths") dha_pad = kwargs.get("dha_pad") - pdb.set_trace() + # 1. Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) - pdb.set_trace() loss_ctc, cer_ctc = None, None stats = dict() @@ -127,12 +126,11 @@ class ContextualParaformer(Paraformer): stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None stats["cer_ctc"] = cer_ctc - pdb.set_trace() # 2b. Attention decoder branch loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss( encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths ) - pdb.set_trace() + # 3. 
CTC-Att loss definition if self.ctc_weight == 0.0: loss = loss_att + loss_pre * self.predictor_weight @@ -170,26 +168,24 @@ class ContextualParaformer(Paraformer): ): encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to( encoder_out.device) - pdb.set_trace() + if self.predictor_bias == 1: _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id) ys_pad_lens = ys_pad_lens + self.predictor_bias - pdb.set_trace() + pre_acoustic_embeds, pre_token_length, _, _ = self.predictor(encoder_out, ys_pad, encoder_out_mask, ignore_id=self.ignore_id) - pdb.set_trace() # -1. bias encoder if self.use_decoder_embedding: hw_embed = self.decoder.embed(hotword_pad) else: hw_embed = self.bias_embed(hotword_pad) - pdb.set_trace() + hw_embed, (_, _) = self.bias_encoder(hw_embed) - pdb.set_trace() _ind = np.arange(0, hotword_pad.shape[0]).tolist() selected = hw_embed[_ind, [i - 1 for i in hotword_lengths.detach().cpu().tolist()]] contextual_info = selected.squeeze(0).repeat(ys_pad.shape[0], 1, 1).to(ys_pad.device) - pdb.set_trace() + # 0. sampler decoder_out_1st = None if self.sampling_ratio > 0.0: @@ -201,7 +197,7 @@ class ContextualParaformer(Paraformer): if self.step_cur < 2: logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio)) sematic_embeds = pre_acoustic_embeds - pdb.set_trace() + # 1. Forward decoder decoder_outs = self.decoder( encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info @@ -217,7 +213,7 @@ class ContextualParaformer(Paraformer): loss_ideal = None ''' loss_ideal = None - pdb.set_trace() + if decoder_out_1st is None: decoder_out_1st = decoder_out # 2. Compute attention loss @@ -294,11 +290,11 @@ class ContextualParaformer(Paraformer): enforce_sorted=False) _, (h_n, _) = self.bias_encoder(hw_embed) hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1) - pdb.set_trace() + decoder_outs = self.decoder( encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=hw_embed, clas_scale=clas_scale ) - pdb.set_trace() + decoder_out = decoder_outs[0] decoder_out = torch.log_softmax(decoder_out, dim=-1) return decoder_out, ys_pad_lens @@ -363,14 +359,11 @@ class ContextualParaformer(Paraformer): clas_scale=kwargs.get("clas_scale", 1.0)) decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] - pdb.set_trace() results = [] b, n, d = decoder_out.size() - pdb.set_trace() for i in range(b): x = encoder_out[i, :encoder_out_lens[i], :] am_scores = decoder_out[i, :pre_token_length[i], :] - pdb.set_trace() if self.beam_search is not None: nbest_hyps = self.beam_search( x=x, am_scores=am_scores, maxlenratio=kwargs.get("maxlenratio", 0.0), diff --git a/funasr/models/seaco_paraformer/model.py b/funasr/models/seaco_paraformer/model.py index caf2b15c7..b3b913344 100644 --- a/funasr/models/seaco_paraformer/model.py +++ b/funasr/models/seaco_paraformer/model.py @@ -32,7 +32,7 @@ from funasr.models.transformer.utils.add_sos_eos import add_sos_eos from funasr.models.transformer.utils.nets_utils import make_pad_mask, pad_list from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank - +import pdb if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): from torch.cuda.amp import autocast else: @@ -130,7 +130,7 @@ class SeacoParaformer(BiCifParaformer, Paraformer): hotword_pad = kwargs.get("hotword_pad") hotword_lengths = kwargs.get("hotword_lengths") dha_pad = kwargs.get("dha_pad") - + batch_size = speech.shape[0] self.step_cur += 1 # for data-parallel @@ 
-212,58 +212,87 @@ class SeacoParaformer(BiCifParaformer, Paraformer): nfilter=50, seaco_weight=1.0): # decoder forward + pdb.set_trace() decoder_out, decoder_hidden, _ = self.decoder(encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, return_hidden=True, return_both=True) + pdb.set_trace() decoder_pred = torch.log_softmax(decoder_out, dim=-1) if hw_list is not None: + pdb.set_trace() hw_lengths = [len(i) for i in hw_list] hw_list_ = [torch.Tensor(i).long() for i in hw_list] hw_list_pad = pad_list(hw_list_, 0).to(encoder_out.device) + pdb.set_trace() selected = self._hotword_representation(hw_list_pad, torch.Tensor(hw_lengths).int().to(encoder_out.device)) + pdb.set_trace() contextual_info = selected.squeeze(0).repeat(encoder_out.shape[0], 1, 1).to(encoder_out.device) + pdb.set_trace() num_hot_word = contextual_info.shape[1] _contextual_length = torch.Tensor([num_hot_word]).int().repeat(encoder_out.shape[0]).to(encoder_out.device) - + pdb.set_trace() # ASF Core if nfilter > 0 and nfilter < num_hot_word: for dec in self.seaco_decoder.decoders: dec.reserve_attn = True + pdb.set_trace() # cif_attended, _ = self.decoder2(contextual_info, _contextual_length, sematic_embeds, ys_pad_lens) dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens) # cif_filter = torch.topk(self.decoder2.decoders[-1].attn_mat[0][0].sum(0).sum(0)[:-1], min(nfilter, num_hot_word-1))[1].tolist() + pdb.set_trace() hotword_scores = self.seaco_decoder.decoders[-1].attn_mat[0][0].sum(0).sum(0)[:-1] # hotword_scores /= torch.sqrt(torch.tensor(hw_lengths)[:-1].float()).to(hotword_scores.device) + pdb.set_trace() dec_filter = torch.topk(hotword_scores, min(nfilter, num_hot_word-1))[1].tolist() + pdb.set_trace() add_filter = dec_filter + pdb.set_trace() add_filter.append(len(hw_list_pad)-1) # filter hotword embedding + pdb.set_trace() selected = selected[add_filter] # again + pdb.set_trace() contextual_info = selected.squeeze(0).repeat(encoder_out.shape[0], 1, 1).to(encoder_out.device) + pdb.set_trace() num_hot_word = contextual_info.shape[1] _contextual_length = torch.Tensor([num_hot_word]).int().repeat(encoder_out.shape[0]).to(encoder_out.device) + pdb.set_trace() for dec in self.seaco_decoder.decoders: dec.attn_mat = [] dec.reserve_attn = False - + pdb.set_trace() # SeACo Core cif_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, sematic_embeds, ys_pad_lens) + pdb.set_trace() dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens) + pdb.set_trace() merged = self._merge(cif_attended, dec_attended) - + pdb.set_trace() + dha_output = self.hotword_output_layer(merged) # remove the last token in loss calculation + pdb.set_trace() dha_pred = torch.log_softmax(dha_output, dim=-1) + pdb.set_trace() def _merge_res(dec_output, dha_output): + pdb.set_trace() lmbd = torch.Tensor([seaco_weight] * dha_output.shape[0]) + pdb.set_trace() dha_ids = dha_output.max(-1)[-1]# [0] + pdb.set_trace() dha_mask = (dha_ids == 8377).int().unsqueeze(-1) + pdb.set_trace() a = (1 - lmbd) / lmbd b = 1 / lmbd + pdb.set_trace() a, b = a.to(dec_output.device), b.to(dec_output.device) + pdb.set_trace() dha_mask = (dha_mask + a.reshape(-1, 1, 1)) / b.reshape(-1, 1, 1) # logits = dec_output * dha_mask + dha_output[:,:,:-1] * (1-dha_mask) + pdb.set_trace() logits = dec_output * dha_mask + dha_output[:,:,:] * (1-dha_mask) return logits + merged_pred = _merge_res(decoder_pred, dha_pred) + pdb.set_trace() # import pdb; pdb.set_trace() return 
merged_pred else: @@ -318,7 +347,7 @@ class SeacoParaformer(BiCifParaformer, Paraformer): logging.info("enable beam_search") self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) - + pdb.set_trace() meta_data = {} # extract fbank feats @@ -326,6 +355,7 @@ class SeacoParaformer(BiCifParaformer, Paraformer): audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000)) time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" + pdb.set_trace() speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() @@ -336,14 +366,18 @@ class SeacoParaformer(BiCifParaformer, Paraformer): speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) + pdb.set_trace() # hotword self.hotword_list = self.generate_hotwords_list(kwargs.get("hotword", None), tokenizer=tokenizer, frontend=frontend) + pdb.set_trace() # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] + + pdb.set_trace() # predictor predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens) pre_acoustic_embeds, pre_token_length, _, _ = predictor_outs[0], predictor_outs[1], \ @@ -352,15 +386,16 @@ class SeacoParaformer(BiCifParaformer, Paraformer): if torch.max(pre_token_length) < 1: return [] - + pdb.set_trace() decoder_out = self._seaco_decode_with_ASF(encoder_out, encoder_out_lens, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list) + pdb.set_trace() # decoder_out, _ = decoder_outs[0], decoder_outs[1] _, _, us_alphas, us_peaks = self.calc_predictor_timestamp(encoder_out, encoder_out_lens, pre_token_length) - + pdb.set_trace() results = [] b, n, d = decoder_out.size() for i in range(b): From e943de2cb128074ca71bcee69fc262ac43420860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 10:13:29 +0800 Subject: [PATCH 012/101] test --- .../seaco_paraformer/demo.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/industrial_data_pretraining/seaco_paraformer/demo.py b/examples/industrial_data_pretraining/seaco_paraformer/demo.py index a44c649ae..551dd8bf8 100644 --- a/examples/industrial_data_pretraining/seaco_paraformer/demo.py +++ b/examples/industrial_data_pretraining/seaco_paraformer/demo.py @@ -7,10 +7,10 @@ from funasr import AutoModel model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.4", - vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", - vad_model_revision="v2.0.4", - punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", - punc_model_revision="v2.0.4", + # vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", + # vad_model_revision="v2.0.4", + # punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", + # punc_model_revision="v2.0.4", # spk_model="damo/speech_campplus_sv_zh-cn_16k-common", # spk_model_revision="v2.0.2", ) @@ -43,4 +43,4 @@ import soundfile wav_file = os.path.join(model.model_path, "example/asr_example.wav") speech, sample_rate = soundfile.read(wav_file) res = model.generate(input=[speech], batch_size_s=300, is_final=True) -''' \ No newline at end of file +''' From eba89467c819857f16f1883ff87c4d2e79e4a17b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 11:49:34 +0800 Subject: [PATCH 
013/101] test --- funasr/models/seaco_paraformer/model.py | 46 ++++--------------------- 1 file changed, 7 insertions(+), 39 deletions(-) diff --git a/funasr/models/seaco_paraformer/model.py b/funasr/models/seaco_paraformer/model.py index b3b913344..e0467b3c4 100644 --- a/funasr/models/seaco_paraformer/model.py +++ b/funasr/models/seaco_paraformer/model.py @@ -212,88 +212,63 @@ class SeacoParaformer(BiCifParaformer, Paraformer): nfilter=50, seaco_weight=1.0): # decoder forward - pdb.set_trace() + decoder_out, decoder_hidden, _ = self.decoder(encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, return_hidden=True, return_both=True) - pdb.set_trace() + decoder_pred = torch.log_softmax(decoder_out, dim=-1) if hw_list is not None: - pdb.set_trace() hw_lengths = [len(i) for i in hw_list] hw_list_ = [torch.Tensor(i).long() for i in hw_list] hw_list_pad = pad_list(hw_list_, 0).to(encoder_out.device) - pdb.set_trace() selected = self._hotword_representation(hw_list_pad, torch.Tensor(hw_lengths).int().to(encoder_out.device)) - pdb.set_trace() + contextual_info = selected.squeeze(0).repeat(encoder_out.shape[0], 1, 1).to(encoder_out.device) - pdb.set_trace() num_hot_word = contextual_info.shape[1] _contextual_length = torch.Tensor([num_hot_word]).int().repeat(encoder_out.shape[0]).to(encoder_out.device) - pdb.set_trace() + # ASF Core if nfilter > 0 and nfilter < num_hot_word: for dec in self.seaco_decoder.decoders: dec.reserve_attn = True - pdb.set_trace() + # cif_attended, _ = self.decoder2(contextual_info, _contextual_length, sematic_embeds, ys_pad_lens) dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens) # cif_filter = torch.topk(self.decoder2.decoders[-1].attn_mat[0][0].sum(0).sum(0)[:-1], min(nfilter, num_hot_word-1))[1].tolist() - pdb.set_trace() + hotword_scores = self.seaco_decoder.decoders[-1].attn_mat[0][0].sum(0).sum(0)[:-1] # hotword_scores /= torch.sqrt(torch.tensor(hw_lengths)[:-1].float()).to(hotword_scores.device) - pdb.set_trace() dec_filter = torch.topk(hotword_scores, min(nfilter, num_hot_word-1))[1].tolist() - pdb.set_trace() add_filter = dec_filter - pdb.set_trace() add_filter.append(len(hw_list_pad)-1) # filter hotword embedding - pdb.set_trace() selected = selected[add_filter] # again - pdb.set_trace() contextual_info = selected.squeeze(0).repeat(encoder_out.shape[0], 1, 1).to(encoder_out.device) - pdb.set_trace() num_hot_word = contextual_info.shape[1] _contextual_length = torch.Tensor([num_hot_word]).int().repeat(encoder_out.shape[0]).to(encoder_out.device) - pdb.set_trace() for dec in self.seaco_decoder.decoders: dec.attn_mat = [] dec.reserve_attn = False - pdb.set_trace() # SeACo Core cif_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, sematic_embeds, ys_pad_lens) - pdb.set_trace() dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens) - pdb.set_trace() merged = self._merge(cif_attended, dec_attended) - pdb.set_trace() dha_output = self.hotword_output_layer(merged) # remove the last token in loss calculation - pdb.set_trace() dha_pred = torch.log_softmax(dha_output, dim=-1) - pdb.set_trace() def _merge_res(dec_output, dha_output): - pdb.set_trace() lmbd = torch.Tensor([seaco_weight] * dha_output.shape[0]) - pdb.set_trace() dha_ids = dha_output.max(-1)[-1]# [0] - pdb.set_trace() dha_mask = (dha_ids == 8377).int().unsqueeze(-1) - pdb.set_trace() a = (1 - lmbd) / lmbd b = 1 / lmbd - pdb.set_trace() a, b = a.to(dec_output.device), 
b.to(dec_output.device) - pdb.set_trace() dha_mask = (dha_mask + a.reshape(-1, 1, 1)) / b.reshape(-1, 1, 1) # logits = dec_output * dha_mask + dha_output[:,:,:-1] * (1-dha_mask) - pdb.set_trace() logits = dec_output * dha_mask + dha_output[:,:,:] * (1-dha_mask) return logits merged_pred = _merge_res(decoder_pred, dha_pred) - pdb.set_trace() - # import pdb; pdb.set_trace() return merged_pred else: return decoder_pred @@ -347,7 +322,6 @@ class SeacoParaformer(BiCifParaformer, Paraformer): logging.info("enable beam_search") self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) - pdb.set_trace() meta_data = {} # extract fbank feats @@ -355,7 +329,6 @@ class SeacoParaformer(BiCifParaformer, Paraformer): audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000)) time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" - pdb.set_trace() speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() @@ -366,18 +339,15 @@ class SeacoParaformer(BiCifParaformer, Paraformer): speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) - pdb.set_trace() # hotword self.hotword_list = self.generate_hotwords_list(kwargs.get("hotword", None), tokenizer=tokenizer, frontend=frontend) - pdb.set_trace() # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - pdb.set_trace() # predictor predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens) pre_acoustic_embeds, pre_token_length, _, _ = predictor_outs[0], predictor_outs[1], \ @@ -386,16 +356,14 @@ class SeacoParaformer(BiCifParaformer, Paraformer): if torch.max(pre_token_length) < 1: return [] - pdb.set_trace() decoder_out = self._seaco_decode_with_ASF(encoder_out, encoder_out_lens, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list) - pdb.set_trace() + # decoder_out, _ = decoder_outs[0], decoder_outs[1] _, _, us_alphas, us_peaks = self.calc_predictor_timestamp(encoder_out, encoder_out_lens, pre_token_length) - pdb.set_trace() results = [] b, n, d = decoder_out.size() for i in range(b): From 0871fa6e0d986115e3056878d2eec9dcac2ba43d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 14:15:35 +0800 Subject: [PATCH 014/101] atsr --- funasr/models/lcbnet/model.py | 454 ++++++++++++++++++++++++++++++++++ 1 file changed, 454 insertions(+) create mode 100644 funasr/models/lcbnet/model.py diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py new file mode 100644 index 000000000..c68ccd7ba --- /dev/null +++ b/funasr/models/lcbnet/model.py @@ -0,0 +1,454 @@ +import logging +from typing import Union, Dict, List, Tuple, Optional + +import time +import torch +import torch.nn as nn +from torch.cuda.amp import autocast + +from funasr.losses.label_smoothing_loss import LabelSmoothingLoss +from funasr.models.ctc.ctc import CTC +from funasr.models.transformer.utils.add_sos_eos import add_sos_eos +from funasr.metrics.compute_acc import th_accuracy +# from funasr.models.e2e_asr_common import ErrorCalculator +from funasr.train_utils.device_funcs import force_gatherable +from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank +from funasr.utils import postprocess_utils +from funasr.utils.datadir_writer import DatadirWriter +from funasr.register import tables + 
+@tables.register("model_classes", "Transformer") +class Transformer(nn.Module): + """CTC-attention hybrid Encoder-Decoder model""" + + + def __init__( + self, + specaug: str = None, + specaug_conf: dict = None, + normalize: str = None, + normalize_conf: dict = None, + encoder: str = None, + encoder_conf: dict = None, + decoder: str = None, + decoder_conf: dict = None, + ctc: str = None, + ctc_conf: dict = None, + ctc_weight: float = 0.5, + interctc_weight: float = 0.0, + input_size: int = 80, + vocab_size: int = -1, + ignore_id: int = -1, + blank_id: int = 0, + sos: int = 1, + eos: int = 2, + lsm_weight: float = 0.0, + length_normalized_loss: bool = False, + report_cer: bool = True, + report_wer: bool = True, + sym_space: str = "", + sym_blank: str = "", + # extract_feats_in_collect_stats: bool = True, + share_embedding: bool = False, + # preencoder: Optional[AbsPreEncoder] = None, + # postencoder: Optional[AbsPostEncoder] = None, + **kwargs, + ): + + super().__init__() + + if specaug is not None: + specaug_class = tables.specaug_classes.get(specaug) + specaug = specaug_class(**specaug_conf) + if normalize is not None: + normalize_class = tables.normalize_classes.get(normalize) + normalize = normalize_class(**normalize_conf) + encoder_class = tables.encoder_classes.get(encoder) + encoder = encoder_class(input_size=input_size, **encoder_conf) + encoder_output_size = encoder.output_size() + if decoder is not None: + decoder_class = tables.decoder_classes.get(decoder) + decoder = decoder_class( + vocab_size=vocab_size, + encoder_output_size=encoder_output_size, + **decoder_conf, + ) + if ctc_weight > 0.0: + + if ctc_conf is None: + ctc_conf = {} + + ctc = CTC( + odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf + ) + + self.blank_id = blank_id + self.sos = sos if sos is not None else vocab_size - 1 + self.eos = eos if eos is not None else vocab_size - 1 + self.vocab_size = vocab_size + self.ignore_id = ignore_id + self.ctc_weight = ctc_weight + self.specaug = specaug + self.normalize = normalize + self.encoder = encoder + + if not hasattr(self.encoder, "interctc_use_conditioning"): + self.encoder.interctc_use_conditioning = False + if self.encoder.interctc_use_conditioning: + self.encoder.conditioning_layer = torch.nn.Linear( + vocab_size, self.encoder.output_size() + ) + self.interctc_weight = interctc_weight + + # self.error_calculator = None + if ctc_weight == 1.0: + self.decoder = None + else: + self.decoder = decoder + + self.criterion_att = LabelSmoothingLoss( + size=vocab_size, + padding_idx=ignore_id, + smoothing=lsm_weight, + normalize_length=length_normalized_loss, + ) + # + # if report_cer or report_wer: + # self.error_calculator = ErrorCalculator( + # token_list, sym_space, sym_blank, report_cer, report_wer + # ) + # + self.error_calculator = None + if ctc_weight == 0.0: + self.ctc = None + else: + self.ctc = ctc + + self.share_embedding = share_embedding + if self.share_embedding: + self.decoder.embed = None + + self.length_normalized_loss = length_normalized_loss + self.beam_search = None + + def forward( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + text: torch.Tensor, + text_lengths: torch.Tensor, + **kwargs, + ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: + """Encoder + Decoder + Calc loss + Args: + speech: (Batch, Length, ...) 
+ speech_lengths: (Batch, ) + text: (Batch, Length) + text_lengths: (Batch,) + """ + # import pdb; + # pdb.set_trace() + if len(text_lengths.size()) > 1: + text_lengths = text_lengths[:, 0] + if len(speech_lengths.size()) > 1: + speech_lengths = speech_lengths[:, 0] + + batch_size = speech.shape[0] + + # 1. Encoder + encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) + intermediate_outs = None + if isinstance(encoder_out, tuple): + intermediate_outs = encoder_out[1] + encoder_out = encoder_out[0] + + loss_att, acc_att, cer_att, wer_att = None, None, None, None + loss_ctc, cer_ctc = None, None + stats = dict() + + # decoder: CTC branch + if self.ctc_weight != 0.0: + loss_ctc, cer_ctc = self._calc_ctc_loss( + encoder_out, encoder_out_lens, text, text_lengths + ) + + # Collect CTC branch stats + stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None + stats["cer_ctc"] = cer_ctc + + # Intermediate CTC (optional) + loss_interctc = 0.0 + if self.interctc_weight != 0.0 and intermediate_outs is not None: + for layer_idx, intermediate_out in intermediate_outs: + # we assume intermediate_out has the same length & padding + # as those of encoder_out + loss_ic, cer_ic = self._calc_ctc_loss( + intermediate_out, encoder_out_lens, text, text_lengths + ) + loss_interctc = loss_interctc + loss_ic + + # Collect Intermedaite CTC stats + stats["loss_interctc_layer{}".format(layer_idx)] = ( + loss_ic.detach() if loss_ic is not None else None + ) + stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic + + loss_interctc = loss_interctc / len(intermediate_outs) + + # calculate whole encoder loss + loss_ctc = ( + 1 - self.interctc_weight + ) * loss_ctc + self.interctc_weight * loss_interctc + + # decoder: Attention decoder branch + loss_att, acc_att, cer_att, wer_att = self._calc_att_loss( + encoder_out, encoder_out_lens, text, text_lengths + ) + + # 3. CTC-Att loss definition + if self.ctc_weight == 0.0: + loss = loss_att + elif self.ctc_weight == 1.0: + loss = loss_ctc + else: + loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + + # Collect Attn branch stats + stats["loss_att"] = loss_att.detach() if loss_att is not None else None + stats["acc"] = acc_att + stats["cer"] = cer_att + stats["wer"] = wer_att + + # Collect total loss stats + stats["loss"] = torch.clone(loss.detach()) + + # force_gatherable: to-device and to-tensor if scalar for DataParallel + if self.length_normalized_loss: + batch_size = int((text_lengths + 1).sum()) + loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device) + return loss, stats, weight + + + def encode( + self, speech: torch.Tensor, speech_lengths: torch.Tensor, **kwargs, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Frontend + Encoder. Note that this method is used by asr_inference.py + Args: + speech: (Batch, Length, ...) + speech_lengths: (Batch, ) + ind: int + """ + with autocast(False): + + # Data augmentation + if self.specaug is not None and self.training: + speech, speech_lengths = self.specaug(speech, speech_lengths) + + # Normalization for feature: e.g. 
Global-CMVN, Utterance-CMVN + if self.normalize is not None: + speech, speech_lengths = self.normalize(speech, speech_lengths) + + # Forward encoder + # feats: (Batch, Length, Dim) + # -> encoder_out: (Batch, Length2, Dim2) + if self.encoder.interctc_use_conditioning: + encoder_out, encoder_out_lens, _ = self.encoder( + speech, speech_lengths, ctc=self.ctc + ) + else: + encoder_out, encoder_out_lens, _ = self.encoder(speech, speech_lengths) + intermediate_outs = None + if isinstance(encoder_out, tuple): + intermediate_outs = encoder_out[1] + encoder_out = encoder_out[0] + + if intermediate_outs is not None: + return (encoder_out, intermediate_outs), encoder_out_lens + + return encoder_out, encoder_out_lens + + def _calc_att_loss( + self, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + ys_pad: torch.Tensor, + ys_pad_lens: torch.Tensor, + ): + ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id) + ys_in_lens = ys_pad_lens + 1 + + # 1. Forward decoder + decoder_out, _ = self.decoder( + encoder_out, encoder_out_lens, ys_in_pad, ys_in_lens + ) + + # 2. Compute attention loss + loss_att = self.criterion_att(decoder_out, ys_out_pad) + acc_att = th_accuracy( + decoder_out.view(-1, self.vocab_size), + ys_out_pad, + ignore_label=self.ignore_id, + ) + + # Compute cer/wer using attention-decoder + if self.training or self.error_calculator is None: + cer_att, wer_att = None, None + else: + ys_hat = decoder_out.argmax(dim=-1) + cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu()) + + return loss_att, acc_att, cer_att, wer_att + + def _calc_ctc_loss( + self, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + ys_pad: torch.Tensor, + ys_pad_lens: torch.Tensor, + ): + # Calc CTC loss + loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens) + + # Calc CER using CTC + cer_ctc = None + if not self.training and self.error_calculator is not None: + ys_hat = self.ctc.argmax(encoder_out).data + cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True) + return loss_ctc, cer_ctc + + def init_beam_search(self, + **kwargs, + ): + from funasr.models.transformer.search import BeamSearch + from funasr.models.transformer.scorers.ctc import CTCPrefixScorer + from funasr.models.transformer.scorers.length_bonus import LengthBonus + + # 1. Build ASR model + scorers = {} + + if self.ctc != None: + ctc = CTCPrefixScorer(ctc=self.ctc, eos=self.eos) + scorers.update( + ctc=ctc + ) + token_list = kwargs.get("token_list") + scorers.update( + decoder=self.decoder, + length_bonus=LengthBonus(len(token_list)), + ) + + + # 3. 
Build ngram model + # ngram is not supported now + ngram = None + scorers["ngram"] = ngram + + weights = dict( + decoder=1.0 - kwargs.get("decoding_ctc_weight", 0.5), + ctc=kwargs.get("decoding_ctc_weight", 0.5), + lm=kwargs.get("lm_weight", 0.0), + ngram=kwargs.get("ngram_weight", 0.0), + length_bonus=kwargs.get("penalty", 0.0), + ) + beam_search = BeamSearch( + beam_size=kwargs.get("beam_size", 10), + weights=weights, + scorers=scorers, + sos=self.sos, + eos=self.eos, + vocab_size=len(token_list), + token_list=token_list, + pre_beam_score_key=None if self.ctc_weight == 1.0 else "full", + ) + + self.beam_search = beam_search + + def inference(self, + data_in, + data_lengths=None, + key: list=None, + tokenizer=None, + frontend=None, + **kwargs, + ): + + if kwargs.get("batch_size", 1) > 1: + raise NotImplementedError("batch decoding is not implemented") + + # init beamsearch + if self.beam_search is None: + logging.info("enable beam_search") + self.init_beam_search(**kwargs) + self.nbest = kwargs.get("nbest", 1) + + meta_data = {} + if isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank": # fbank + speech, speech_lengths = data_in, data_lengths + if len(speech.shape) < 3: + speech = speech[None, :, :] + if speech_lengths is None: + speech_lengths = speech.shape[1] + else: + # extract fbank feats + time1 = time.perf_counter() + audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), + data_type=kwargs.get("data_type", "sound"), + tokenizer=tokenizer) + time2 = time.perf_counter() + meta_data["load_data"] = f"{time2 - time1:0.3f}" + speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), + frontend=frontend) + time3 = time.perf_counter() + meta_data["extract_feat"] = f"{time3 - time2:0.3f}" + meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 + + speech = speech.to(device=kwargs["device"]) + speech_lengths = speech_lengths.to(device=kwargs["device"]) + # Encoder + encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) + if isinstance(encoder_out, tuple): + encoder_out = encoder_out[0] + + # c. 
Passed the encoder result and the beam search + nbest_hyps = self.beam_search( + x=encoder_out[0], maxlenratio=kwargs.get("maxlenratio", 0.0), minlenratio=kwargs.get("minlenratio", 0.0) + ) + + nbest_hyps = nbest_hyps[: self.nbest] + + + results = [] + b, n, d = encoder_out.size() + for i in range(b): + + for nbest_idx, hyp in enumerate(nbest_hyps): + ibest_writer = None + if kwargs.get("output_dir") is not None: + if not hasattr(self, "writer"): + self.writer = DatadirWriter(kwargs.get("output_dir")) + ibest_writer = self.writer[f"{nbest_idx + 1}best_recog"] + + # remove sos/eos and get results + last_pos = -1 + if isinstance(hyp.yseq, list): + token_int = hyp.yseq[1:last_pos] + else: + token_int = hyp.yseq[1:last_pos].tolist() + + # remove blank symbol id, which is assumed to be 0 + token_int = list(filter(lambda x: x != self.eos and x != self.sos and x != self.blank_id, token_int)) + + # Change integer-ids to tokens + token = tokenizer.ids2tokens(token_int) + text = tokenizer.tokens2text(token) + + text_postprocessed, _ = postprocess_utils.sentence_postprocess(token) + result_i = {"key": key[i], "token": token, "text": text_postprocessed} + results.append(result_i) + + if ibest_writer is not None: + ibest_writer["token"][key[i]] = " ".join(token) + ibest_writer["text"][key[i]] = text_postprocessed + + return results, meta_data + From 0e416eacbfea112a76860223ca99937cb4a909c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 14:24:54 +0800 Subject: [PATCH 015/101] test --- funasr/models/lcbnet/model.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index c68ccd7ba..6a028b2f7 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. 
+# MIT License (https://opensource.org/licenses/MIT) + import logging from typing import Union, Dict, List, Tuple, Optional @@ -17,10 +22,13 @@ from funasr.utils import postprocess_utils from funasr.utils.datadir_writer import DatadirWriter from funasr.register import tables -@tables.register("model_classes", "Transformer") -class Transformer(nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - +@tables.register("model_classes", "LCBNet") +class LCBNet(nn.Module): + """ + Author: Speech Lab of DAMO Academy, Alibaba Group + LCB-NET: LONG-CONTEXT BIASING FOR AUDIO-VISUAL SPEECH RECOGNITION + https://arxiv.org/abs/2401.06390 + """ def __init__( self, @@ -32,10 +40,19 @@ class Transformer(nn.Module): encoder_conf: dict = None, decoder: str = None, decoder_conf: dict = None, + text_encoder: str = None, + text_encoder_conf: dict = None, + bias_predictor: str = None, + bias_predictor_conf: dict = None, + fusion_encoder: str = None, + fusion_encoder_conf: dict = None, ctc: str = None, ctc_conf: dict = None, ctc_weight: float = 0.5, interctc_weight: float = 0.0, + select_num: int = 2, + select_length: int = 3, + insert_blank: bool = True, input_size: int = 80, vocab_size: int = -1, ignore_id: int = -1, @@ -66,6 +83,15 @@ class Transformer(nn.Module): encoder_class = tables.encoder_classes.get(encoder) encoder = encoder_class(input_size=input_size, **encoder_conf) encoder_output_size = encoder.output_size() + + # lcbnet modules: text encoder, fusion encoder and bias predictor + text_encoder_class = tables.encoder_classes.get(text_encoder) + text_encoder = text_encoder_class(input_size=vocab_size, **text_encoder_conf) + fusion_encoder_class = tables.encoder_classes.get(fusion_encoder) + fusion_encoder = fusion_encoder_class(**fusion_encoder_conf) + bias_predictor_class = tables.encoder_classes.get_class(bias_predictor) + bias_predictor = bias_predictor_class(args.bias_predictor_conf) + if decoder is not None: decoder_class = tables.decoder_classes.get(decoder) decoder = decoder_class( From 6d4a5c19310be72e4dc12dc9471670868451dda6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 14:30:57 +0800 Subject: [PATCH 016/101] test --- funasr/models/lcbnet/model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 6a028b2f7..9646e1e0d 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -117,6 +117,9 @@ class LCBNet(nn.Module): self.specaug = specaug self.normalize = normalize self.encoder = encoder + self.text_encoder = text_encoder + self.fusion_encoder = fusion_encoder + self.bias_predictor = bias_predictor if not hasattr(self.encoder, "interctc_use_conditioning"): self.encoder.interctc_use_conditioning = False From 044199f80279825baba0831380c5fc0369abd298 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 14:33:17 +0800 Subject: [PATCH 017/101] test --- funasr/models/lcbnet/model.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 9646e1e0d..563ff26e9 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -21,7 +21,7 @@ from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank from funasr.utils import postprocess_utils from funasr.utils.datadir_writer import DatadirWriter from funasr.register import tables - +import pdb @tables.register("model_classes", "LCBNet") class 
LCBNet(nn.Module): """ @@ -90,7 +90,7 @@ class LCBNet(nn.Module): fusion_encoder_class = tables.encoder_classes.get(fusion_encoder) fusion_encoder = fusion_encoder_class(**fusion_encoder_conf) bias_predictor_class = tables.encoder_classes.get_class(bias_predictor) - bias_predictor = bias_predictor_class(args.bias_predictor_conf) + bias_predictor = bias_predictor_class(bias_predictor_conf) if decoder is not None: decoder_class = tables.decoder_classes.get(decoder) @@ -117,9 +117,13 @@ class LCBNet(nn.Module): self.specaug = specaug self.normalize = normalize self.encoder = encoder + # lcbnet self.text_encoder = text_encoder self.fusion_encoder = fusion_encoder self.bias_predictor = bias_predictor + self.select_num = select_num + self.select_length = select_length + self.insert_blank = insert_blank if not hasattr(self.encoder, "interctc_use_conditioning"): self.encoder.interctc_use_conditioning = False @@ -409,7 +413,8 @@ class LCBNet(nn.Module): logging.info("enable beam_search") self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) - + pdb.set_trace() + meta_data = {} if isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank": # fbank speech, speech_lengths = data_in, data_lengths From a70bd4b9ff593648de6b939a908caaaf18df5719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 14:39:11 +0800 Subject: [PATCH 018/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100755 examples/industrial_data_pretraining/lcbnet/demo2.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh new file mode 100755 index 000000000..5fd2eccdc --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -0,0 +1,12 @@ +file_dir=./exp/ + + +python -m funasr.bin.inference \ +--config-path=$file_dir \ +--config-name="config.yaml" \ +++init_param="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/model.pb" \ +++tokenizer_conf.token_list="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/tokens.txt" \ +++frontend_conf.cmvn_file="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/am.mvn" \ +++input="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/asr_example_zh.wav" \ +++output_dir="./outputs/debug2" \ +++device="" \ From dce85a25d3c0f444b7e7825f186e483af4646760 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 15:38:39 +0800 Subject: [PATCH 019/101] test --- .../industrial_data_pretraining/lcbnet/demo2.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 5fd2eccdc..20b003b33 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -1,12 +1,14 @@ -file_dir=./exp/ +file_dir=./exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch python -m funasr.bin.inference \ ---config-path=$file_dir \ +--config-path=${file_dir} \ --config-name="config.yaml" \ 
-++init_param="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/model.pb" \ -++tokenizer_conf.token_list="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/tokens.txt" \ -++frontend_conf.cmvn_file="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/am.mvn" \ -++input="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/asr_example_zh.wav" \ -++output_dir="./outputs/debug2" \ +++init_param=${file_dir}/model.pb \ +++tokenizer_conf.token_list=${file_dir}/tokens.txt \ +++frontend_conf.cmvn_file=${file_dir}/am.mvn \ +++input=${file_dir}/wav.scp \ +++input=${file_dir}/ocr_text \ +++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ +++output_dir="./outputs/debug" \ ++device="" \ From 7f0a06946f9accf61264ba8befe84a5cadb9f6a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 15:43:12 +0800 Subject: [PATCH 020/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 20b003b33..10ba5aed8 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -9,6 +9,7 @@ python -m funasr.bin.inference \ ++frontend_conf.cmvn_file=${file_dir}/am.mvn \ ++input=${file_dir}/wav.scp \ ++input=${file_dir}/ocr_text \ ++data_type='["sound", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ ++output_dir="./outputs/debug" \ ++device="" \ From 733073d2693de593cef2eacc902c49990e067cef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:08:51 +0800 Subject: [PATCH 021/101] test --- funasr/auto/auto_model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index a3202fdb4..9db8c015d 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -153,15 +153,18 @@ class AutoModel: # build tokenizer tokenizer = kwargs.get("tokenizer", None) + pdb.set_trace() if tokenizer is not None: tokenizer_class = tables.tokenizer_classes.get(tokenizer) + pdb.set_trace() tokenizer = tokenizer_class(**kwargs["tokenizer_conf"]) + pdb.set_trace() kwargs["tokenizer"] = tokenizer kwargs["token_list"] = tokenizer.token_list vocab_size = len(tokenizer.token_list) else: vocab_size = -1 - + pdb.set_trace() # build frontend frontend = kwargs.get("frontend", None) if frontend is not None: From c4fa4c5efd4965b4514194179cfed6e1faa76c42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:10:12 +0800 Subject: [PATCH 022/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 10ba5aed8..a5afa9c7d 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -1,4 +1,4 @@ -file_dir=./exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch +file_dir="./exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" python -m funasr.bin.inference \ From 
0b317f6d8f11de02c1348f0828e01f63bfad3626 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:13:18 +0800 Subject: [PATCH 023/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index a5afa9c7d..36a692856 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -1,4 +1,4 @@ -file_dir="./exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" python -m funasr.bin.inference \ From 491b2af1ecdf26f1513ac6a83f3490bf1b265449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:21:37 +0800 Subject: [PATCH 024/101] test --- funasr/auto/auto_model.py | 3 --- funasr/frontends/default.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 9db8c015d..68559d121 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -153,12 +153,9 @@ class AutoModel: # build tokenizer tokenizer = kwargs.get("tokenizer", None) - pdb.set_trace() if tokenizer is not None: tokenizer_class = tables.tokenizer_classes.get(tokenizer) - pdb.set_trace() tokenizer = tokenizer_class(**kwargs["tokenizer_conf"]) - pdb.set_trace() kwargs["tokenizer"] = tokenizer kwargs["token_list"] = tokenizer.token_list vocab_size = len(tokenizer.token_list) diff --git a/funasr/frontends/default.py b/funasr/frontends/default.py index 8ac1ca853..15cc35a27 100644 --- a/funasr/frontends/default.py +++ b/funasr/frontends/default.py @@ -17,7 +17,7 @@ from funasr.frontends.utils.stft import Stft from funasr.frontends.utils.frontend import Frontend from funasr.models.transformer.utils.nets_utils import make_pad_mask - +@tables.register("frontend_classes", "DefaultFrontend") class DefaultFrontend(nn.Module): """Conventional frontend structure for ASR. 
Stft -> WPE -> MVDR-Beamformer -> Power-spec -> Mel-Fbank -> CMVN From 8cfee2db5cf7a32f8865f393184d8a48dd6bd38d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:22:46 +0800 Subject: [PATCH 025/101] test --- funasr/auto/auto_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 68559d121..e6e08b8cd 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -169,7 +169,7 @@ class AutoModel: frontend = frontend_class(**kwargs["frontend_conf"]) kwargs["frontend"] = frontend kwargs["input_size"] = frontend.output_size() - + pdb.set_trace() # build model model_class = tables.model_classes.get(kwargs["model"]) model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=vocab_size) From cfc18a90476675a04baa4edf62f756ff408f3551 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:24:17 +0800 Subject: [PATCH 026/101] test --- funasr/auto/auto_model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index e6e08b8cd..7c8630356 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -164,9 +164,12 @@ class AutoModel: pdb.set_trace() # build frontend frontend = kwargs.get("frontend", None) + pdb.set_trace() if frontend is not None: + pdb.set_trace() frontend_class = tables.frontend_classes.get(frontend) frontend = frontend_class(**kwargs["frontend_conf"]) + pdb.set_trace() kwargs["frontend"] = frontend kwargs["input_size"] = frontend.output_size() pdb.set_trace() From c1c337ef9a6916d9fb12898983f54b5f3630ff0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:28:59 +0800 Subject: [PATCH 027/101] test --- funasr/frontends/default.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/funasr/frontends/default.py b/funasr/frontends/default.py index 15cc35a27..70638e29c 100644 --- a/funasr/frontends/default.py +++ b/funasr/frontends/default.py @@ -16,6 +16,8 @@ from funasr.frontends.utils.log_mel import LogMel from funasr.frontends.utils.stft import Stft from funasr.frontends.utils.frontend import Frontend from funasr.models.transformer.utils.nets_utils import make_pad_mask +from funasr.register import tables + @tables.register("frontend_classes", "DefaultFrontend") class DefaultFrontend(nn.Module): @@ -40,6 +42,7 @@ class DefaultFrontend(nn.Module): frontend_conf: Optional[dict] = None, apply_stft: bool = True, use_channel: int = None, + **kwargs, ): super().__init__() if isinstance(fs, str): From 060d18ee4b86729e11f31ff16c822f3be33503ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:36:40 +0800 Subject: [PATCH 028/101] test --- funasr/frontends/default.py | 1 - 1 file changed, 1 deletion(-) diff --git a/funasr/frontends/default.py b/funasr/frontends/default.py index 70638e29c..ab5b73166 100644 --- a/funasr/frontends/default.py +++ b/funasr/frontends/default.py @@ -3,7 +3,6 @@ from typing import Optional from typing import Tuple from typing import Union import logging -import humanfriendly import numpy as np import torch import torch.nn as nn From c1e136f639a650cd40c2df9599935bb7f4c307ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:38:29 +0800 Subject: [PATCH 029/101] test --- funasr/models/lcbnet/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 funasr/models/lcbnet/__init__.py diff --git 
a/funasr/models/lcbnet/__init__.py b/funasr/models/lcbnet/__init__.py new file mode 100644 index 000000000..e69de29bb From b13b4c1bf5dbf5337539dbc017820ae20d0f2dc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 16:42:18 +0800 Subject: [PATCH 030/101] test --- funasr/frontends/default.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/funasr/frontends/default.py b/funasr/frontends/default.py index ab5b73166..66d42f71c 100644 --- a/funasr/frontends/default.py +++ b/funasr/frontends/default.py @@ -26,7 +26,7 @@ class DefaultFrontend(nn.Module): def __init__( self, - fs: Union[int, str] = 16000, + fs: int = 16000, n_fft: int = 512, win_length: int = None, hop_length: int = 128, @@ -44,8 +44,6 @@ class DefaultFrontend(nn.Module): **kwargs, ): super().__init__() - if isinstance(fs, str): - fs = humanfriendly.parse_size(fs) # Deepcopy (In general, dict shouldn't be used as default arg) frontend_conf = copy.deepcopy(frontend_conf) @@ -147,7 +145,7 @@ class MultiChannelFrontend(nn.Module): def __init__( self, - fs: Union[int, str] = 16000, + fs: int = 16000, n_fft: int = 512, win_length: int = None, hop_length: int = None, @@ -170,9 +168,6 @@ class MultiChannelFrontend(nn.Module): mc: bool = True ): super().__init__() - if isinstance(fs, str): - fs = humanfriendly.parse_size(fs) - # Deepcopy (In general, dict shouldn't be used as default arg) frontend_conf = copy.deepcopy(frontend_conf) if win_length is None and hop_length is None: From cdc70650084f9a69bacd842b7434a008354e2ea0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 17:23:20 +0800 Subject: [PATCH 031/101] test --- funasr/auto/auto_model.py | 7 +- funasr/models/lcbnet/attention.py | 112 +++++++++ funasr/models/lcbnet/encoder.py | 392 ++++++++++++++++++++++++++++++ 3 files changed, 506 insertions(+), 5 deletions(-) create mode 100644 funasr/models/lcbnet/attention.py create mode 100644 funasr/models/lcbnet/encoder.py diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 7c8630356..a5341eacf 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -161,18 +161,15 @@ class AutoModel: vocab_size = len(tokenizer.token_list) else: vocab_size = -1 - pdb.set_trace() # build frontend frontend = kwargs.get("frontend", None) - pdb.set_trace() + if frontend is not None: - pdb.set_trace() frontend_class = tables.frontend_classes.get(frontend) frontend = frontend_class(**kwargs["frontend_conf"]) - pdb.set_trace() kwargs["frontend"] = frontend kwargs["input_size"] = frontend.output_size() - pdb.set_trace() + # build model model_class = tables.model_classes.get(kwargs["model"]) model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=vocab_size) diff --git a/funasr/models/lcbnet/attention.py b/funasr/models/lcbnet/attention.py new file mode 100644 index 000000000..8e8c5943a --- /dev/null +++ b/funasr/models/lcbnet/attention.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2024 yufan +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Multi-Head Attention Return Weight layer definition.""" + +import math + +import torch +from torch import nn + +class MultiHeadedAttentionReturnWeight(nn.Module): + """Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. 
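+
+    Unlike the standard MultiHeadedAttention, ``forward`` returns both the
+    attention output and the attention weights. Illustrative usage (tensor
+    names and shapes below are examples only, not part of the API):
+
+        >>> attn = MultiHeadedAttentionReturnWeight(4, 256, 0.1)
+        >>> out, w = attn(q, k, v, mask)
+        >>> # out: (batch, time1, 256), w: (batch, 4, time1, time2)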
+ + """ + + def __init__(self, n_head, n_feat, dropout_rate): + """Construct an MultiHeadedAttentionReturnWeight object.""" + super(MultiHeadedAttentionReturnWeight, self).__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k = nn.Linear(n_feat, n_feat) + self.linear_v = nn.Linear(n_feat, n_feat) + self.linear_out = nn.Linear(n_feat, n_feat) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv(self, query, key, value): + """Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). + + """ + n_batch = query.size(0) + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) + k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + q = q.transpose(1, 2) # (batch, head, time1, d_k) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + + return q, k, v + + def forward_attention(self, value, scores, mask): + """Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). + + """ + n_batch = value.size(0) + if mask is not None: + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + min_value = torch.finfo(scores.dtype).min + scores = scores.masked_fill(mask, min_value) + self.attn = torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0 + ) # (batch, head, time1, time2) + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(self.attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + + return self.linear_out(x), self.attn # (batch, time1, d_model) + + def forward(self, query, key, value, mask): + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). 
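+            torch.Tensor: Attention weight tensor (#batch, n_head, time1, time2).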
+ + """ + q, k, v = self.forward_qkv(query, key, value) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, mask) + + diff --git a/funasr/models/lcbnet/encoder.py b/funasr/models/lcbnet/encoder.py new file mode 100644 index 000000000..d2464f1de --- /dev/null +++ b/funasr/models/lcbnet/encoder.py @@ -0,0 +1,392 @@ +# Copyright 2019 Shigeki Karita +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Transformer encoder definition.""" + +from typing import List +from typing import Optional +from typing import Tuple + +import torch +from torch import nn +import logging + +from funasr.models.transformer.attention import MultiHeadedAttention +from funasr.models.lcbnet.attention import MultiHeadedAttentionReturnWeight +from funasr.models.transformer.embedding import PositionalEncoding +from funasr.models.transformer.layer_norm import LayerNorm + +from funasr.models.transformer.utils.nets_utils import make_pad_mask +from funasr.models.transformer.positionwise_feed_forward import PositionwiseFeedForward +from funasr.models.transformer.utils.repeat import repeat +from funasr.register import tables + +class EncoderLayer(nn.Module): + """Encoder layer module. + + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance + can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + stochastic_depth_rate (float): Proability to skip this layer. + During training, the layer may skip residual computation and return input + as-is with given probability. + """ + + def __init__( + self, + size, + self_attn, + feed_forward, + dropout_rate, + normalize_before=True, + concat_after=False, + stochastic_depth_rate=0.0, + ): + """Construct an EncoderLayer object.""" + super(EncoderLayer, self).__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.norm1 = LayerNorm(size) + self.norm2 = LayerNorm(size) + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size) + self.stochastic_depth_rate = stochastic_depth_rate + + def forward(self, x, mask, cache=None): + """Compute encoded features. + + Args: + x_input (torch.Tensor): Input tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). + + """ + skip_layer = False + # with stochastic depth, residual connection `x + f(x)` becomes + # `x <- x + 1 / (1 - p) * f(x)` at training time. 
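+        # At inference time (self.training is False) the layer is never skipped
+        # and the scaling coefficient stays at 1.0, so the block reduces to a
+        # standard residual Transformer encoder layer.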
+ stoch_layer_coeff = 1.0 + if self.training and self.stochastic_depth_rate > 0: + skip_layer = torch.rand(1).item() < self.stochastic_depth_rate + stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) + + if skip_layer: + if cache is not None: + x = torch.cat([cache, x], dim=1) + return x, mask + + residual = x + if self.normalize_before: + x = self.norm1(x) + + if cache is None: + x_q = x + else: + assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) + x_q = x[:, -1:, :] + residual = residual[:, -1:, :] + mask = None if mask is None else mask[:, -1:, :] + + if self.concat_after: + x_concat = torch.cat((x, self.self_attn(x_q, x, x, mask)), dim=-1) + x = residual + stoch_layer_coeff * self.concat_linear(x_concat) + else: + x = residual + stoch_layer_coeff * self.dropout( + self.self_attn(x_q, x, x, mask) + ) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm2(x) + + if cache is not None: + x = torch.cat([cache, x], dim=1) + + return x, mask + +@tables.register("encoder_classes", "TransformerTextEncoder") +class TransformerTextEncoder(nn.Module): + """Transformer text encoder module. + + Args: + input_size: input dim + output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the number of units of position-wise feed forward + num_blocks: the number of decoder blocks + dropout_rate: dropout rate + attention_dropout_rate: dropout rate in attention + positional_dropout_rate: dropout rate after adding positional encoding + input_layer: input layer type + pos_enc_class: PositionalEncoding or ScaledPositionalEncoding + normalize_before: whether to use layer_norm before the first block + concat_after: whether to concat attention layer's input and output + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. + i.e. 
x -> x + att(x) + positionwise_layer_type: linear of conv1d + positionwise_conv_kernel_size: kernel size of positionwise conv1d layer + padding_idx: padding_idx for input_layer=embed + """ + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + pos_enc_class=PositionalEncoding, + normalize_before: bool = True, + concat_after: bool = False, + ): + super().__init__() + self._output_size = output_size + + self.embed = torch.nn.Sequential( + torch.nn.Embedding(input_size, output_size), + pos_enc_class(output_size, positional_dropout_rate), + ) + + self.normalize_before = normalize_before + + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + ) + self.encoders = repeat( + num_blocks, + lambda lnum: EncoderLayer( + output_size, + MultiHeadedAttention( + attention_heads, output_size, attention_dropout_rate + ), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + normalize_before, + concat_after, + ), + ) + if self.normalize_before: + self.after_norm = LayerNorm(output_size) + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs_pad: torch.Tensor, + ilens: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + """Embed positions in tensor. + + Args: + xs_pad: input tensor (B, L, D) + ilens: input length (B) + Returns: + position embedded tensor and mask + """ + masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) + xs_pad = self.embed(xs_pad) + + xs_pad, masks = self.encoders(xs_pad, masks) + + if self.normalize_before: + xs_pad = self.after_norm(xs_pad) + + olens = masks.squeeze(1).sum(1) + return xs_pad, olens, None + + + + +@tables.register("encoder_classes", "FusionSANEncoder") +class SelfSrcAttention(nn.Module): + """Single decoder layer module. + + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + src_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. 
x -> x + att(x) + + + """ + def __init__( + self, + size, + attention_heads, + attention_dim, + linear_units, + self_attention_dropout_rate, + src_attention_dropout_rate, + positional_dropout_rate, + dropout_rate, + normalize_before=True, + concat_after=False, + ): + """Construct an SelfSrcAttention object.""" + super(SelfSrcAttention, self).__init__() + self.size = size + self.self_attn = MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate) + self.src_attn = MultiHeadedAttentionReturnWeight(attention_heads, attention_dim, src_attention_dropout_rate) + self.feed_forward = PositionwiseFeedForward(attention_dim, linear_units, positional_dropout_rate) + self.norm1 = LayerNorm(size) + self.norm2 = LayerNorm(size) + self.norm3 = LayerNorm(size) + self.dropout = nn.Dropout(dropout_rate) + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear1 = nn.Linear(size + size, size) + self.concat_linear2 = nn.Linear(size + size, size) + + def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None): + """Compute decoded features. + + Args: + tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). + tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out). + memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size). + memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in). + cache (List[torch.Tensor]): List of cached tensors. + Each tensor shape should be (#batch, maxlen_out - 1, size). + + Returns: + torch.Tensor: Output tensor(#batch, maxlen_out, size). + torch.Tensor: Mask for output tensor (#batch, maxlen_out). + torch.Tensor: Encoded memory (#batch, maxlen_in, size). + torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
+ + """ + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + + if cache is None: + tgt_q = tgt + tgt_q_mask = tgt_mask + else: + # compute only the last frame query keeping dim: max_time_out -> 1 + assert cache.shape == ( + tgt.shape[0], + tgt.shape[1] - 1, + self.size, + ), f"{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" + tgt_q = tgt[:, -1:, :] + residual = residual[:, -1:, :] + tgt_q_mask = None + if tgt_mask is not None: + tgt_q_mask = tgt_mask[:, -1:, :] + + if self.concat_after: + tgt_concat = torch.cat( + (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1 + ) + x = residual + self.concat_linear1(tgt_concat) + else: + x = residual + self.dropout(self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + if self.concat_after: + x_concat = torch.cat( + (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1 + ) + x = residual + self.concat_linear2(x_concat) + else: + x, score = self.src_attn(x, memory, memory, memory_mask) + x = residual + self.dropout(x) + if not self.normalize_before: + x = self.norm2(x) + + residual = x + if self.normalize_before: + x = self.norm3(x) + x = residual + self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm3(x) + + if cache is not None: + x = torch.cat([cache, x], dim=1) + + return x, tgt_mask, memory, memory_mask + + + +class ConvPredictor(nn.Module): + def __init__(self, size=256, l_order=3, r_order=3, attention_heads=4, attention_dropout_rate=0.1, linear_units=2048): + super().__init__() + self.atten = MultiHeadedAttention(attention_heads, size, attention_dropout_rate) + self.norm1 = LayerNorm(size) + self.feed_forward = PositionwiseFeedForward(size, linear_units, attention_dropout_rate) + self.norm2 = LayerNorm(size) + self.pad = nn.ConstantPad1d((l_order, r_order), 0) + self.conv1d = nn.Conv1d(size, size, l_order + r_order + 1, groups=size) + self.output_linear = nn.Linear(size, 1) + + + def forward(self, text_enc, asr_enc): + # stage1 cross-attention + residual = text_enc + text_enc = residual + self.atten(text_enc, asr_enc, asr_enc, None) + + # stage2 FFN + residual = text_enc + text_enc = self.norm1(text_enc) + text_enc = residual + self.feed_forward(text_enc) + + # stage Conv predictor + text_enc = self.norm2(text_enc) + context = text_enc.transpose(1, 2) + queries = self.pad(context) + memory = self.conv1d(queries) + output = memory + context + output = output.transpose(1, 2) + output = torch.relu(output) + output = self.output_linear(output) + if output.dim()==3: + output = output.squeeze(2) + return output From b349739f5d6302048c179eeaadb4432acc541cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 17:27:02 +0800 Subject: [PATCH 032/101] test --- funasr/auto/auto_model.py | 4 +++- funasr/train_utils/load_pretrained_model.py | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index a5341eacf..23b80d72a 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -172,12 +172,14 @@ class AutoModel: # build model model_class = tables.model_classes.get(kwargs["model"]) + pdb.set_trace() model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=vocab_size) - + pdb.set_trace() model.to(device) # init_param init_param = kwargs.get("init_param", None) + pdb.set_trace() if init_param is not None: logging.info(f"Loading pretrained params 
from {init_param}") load_pretrained_model( diff --git a/funasr/train_utils/load_pretrained_model.py b/funasr/train_utils/load_pretrained_model.py index 5ba9bb7dc..aec31e3cc 100644 --- a/funasr/train_utils/load_pretrained_model.py +++ b/funasr/train_utils/load_pretrained_model.py @@ -7,7 +7,7 @@ import logging import torch import torch.nn import torch.optim - +import pdb def filter_state_dict( dst_state: Dict[str, Union[float, torch.Tensor]], @@ -99,14 +99,16 @@ def load_pretrained_model( # import pdb; # pdb.set_trace() print(f"ckpt: {path}") + pdb.set_trace() if oss_bucket is None: src_state = torch.load(path, map_location=map_location) else: buffer = BytesIO(oss_bucket.get_object(path).read()) src_state = torch.load(buffer, map_location=map_location) + pdb.set_trace() if "state_dict" in src_state: src_state = src_state["state_dict"] - + pdb.set_trace() for k in dst_state.keys(): if not k.startswith("module.") and "module." + k in src_state.keys(): k_ddp = "module." + k @@ -116,7 +118,7 @@ def load_pretrained_model( dst_state[k] = src_state[k_ddp] else: print(f"Miss key in ckpt: model: {k}, ckpt: {k_ddp}") - + pdb.set_trace() flag = obj.load_state_dict(dst_state, strict=True) # print(flag) From e615585fd3e40531fb714586d98c6a307a95c03d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 22 Feb 2024 17:31:57 +0800 Subject: [PATCH 033/101] test --- funasr/models/lcbnet/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 563ff26e9..bbc99fdba 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -89,7 +89,7 @@ class LCBNet(nn.Module): text_encoder = text_encoder_class(input_size=vocab_size, **text_encoder_conf) fusion_encoder_class = tables.encoder_classes.get(fusion_encoder) fusion_encoder = fusion_encoder_class(**fusion_encoder_conf) - bias_predictor_class = tables.encoder_classes.get_class(bias_predictor) + bias_predictor_class = tables.encoder_classes.get(bias_predictor) bias_predictor = bias_predictor_class(bias_predictor_conf) if decoder is not None: @@ -414,7 +414,7 @@ class LCBNet(nn.Module): self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) pdb.set_trace() - + meta_data = {} if isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank": # fbank speech, speech_lengths = data_in, data_lengths From 5c1308e3cf5dd63c1d1c0b5299bd79b3064bca7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 23 Feb 2024 16:13:23 +0800 Subject: [PATCH 034/101] test --- funasr/models/lcbnet/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index bbc99fdba..555d4e658 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -90,7 +90,7 @@ class LCBNet(nn.Module): fusion_encoder_class = tables.encoder_classes.get(fusion_encoder) fusion_encoder = fusion_encoder_class(**fusion_encoder_conf) bias_predictor_class = tables.encoder_classes.get(bias_predictor) - bias_predictor = bias_predictor_class(bias_predictor_conf) + bias_predictor = bias_predictor_class(**bias_predictor_conf) if decoder is not None: decoder_class = tables.decoder_classes.get(decoder) From 54bd357b0857b94f761e270dbed5f90ca4e77d51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 23 Feb 2024 16:17:18 +0800 Subject: [PATCH 035/101] test --- funasr/models/lcbnet/encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/funasr/models/lcbnet/encoder.py b/funasr/models/lcbnet/encoder.py index d2464f1de..c65823cb0 100644 --- a/funasr/models/lcbnet/encoder.py +++ b/funasr/models/lcbnet/encoder.py @@ -355,7 +355,7 @@ class SelfSrcAttention(nn.Module): return x, tgt_mask, memory, memory_mask - +@tables.register("encoder_classes", "ConvBiasPredictor") class ConvPredictor(nn.Module): def __init__(self, size=256, l_order=3, r_order=3, attention_heads=4, attention_dropout_rate=0.1, linear_units=2048): super().__init__() From d60306e7a435053a1ed626213f9fa6fe12af2b3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 23 Feb 2024 16:47:15 +0800 Subject: [PATCH 036/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 3 +-- funasr/auto/auto_model.py | 3 --- funasr/train_utils/load_pretrained_model.py | 9 +++------ 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 36a692856..3e4d22393 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -7,8 +7,7 @@ python -m funasr.bin.inference \ ++init_param=${file_dir}/model.pb \ ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ ++frontend_conf.cmvn_file=${file_dir}/am.mvn \ -++input=${file_dir}/wav.scp \ -++input=${file_dir}/ocr_text \ +++input=[${file_dir}/wav.scp,${file_dir}/ocr_text] \ +data_type='["sound", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ ++output_dir="./outputs/debug" \ diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 23b80d72a..87c7e2d03 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -172,14 +172,11 @@ class AutoModel: # build model model_class = tables.model_classes.get(kwargs["model"]) - pdb.set_trace() model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=vocab_size) - pdb.set_trace() model.to(device) # init_param init_param = kwargs.get("init_param", None) - pdb.set_trace() if init_param is not None: logging.info(f"Loading pretrained params from {init_param}") load_pretrained_model( diff --git a/funasr/train_utils/load_pretrained_model.py b/funasr/train_utils/load_pretrained_model.py index aec31e3cc..9127e2fe1 100644 --- a/funasr/train_utils/load_pretrained_model.py +++ b/funasr/train_utils/load_pretrained_model.py @@ -96,19 +96,17 @@ def load_pretrained_model( obj = model dst_state = obj.state_dict() - # import pdb; - # pdb.set_trace() print(f"ckpt: {path}") - pdb.set_trace() + if oss_bucket is None: src_state = torch.load(path, map_location=map_location) else: buffer = BytesIO(oss_bucket.get_object(path).read()) src_state = torch.load(buffer, map_location=map_location) - pdb.set_trace() + if "state_dict" in src_state: src_state = src_state["state_dict"] - pdb.set_trace() + for k in dst_state.keys(): if not k.startswith("module.") and "module." + k in src_state.keys(): k_ddp = "module." 
+ k @@ -118,7 +116,6 @@ def load_pretrained_model( dst_state[k] = src_state[k_ddp] else: print(f"Miss key in ckpt: model: {k}, ckpt: {k_ddp}") - pdb.set_trace() flag = obj.load_state_dict(dst_state, strict=True) # print(flag) From 6a8c943435edf25f252d9d4db0095d4a01c7a3cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 23 Feb 2024 16:56:02 +0800 Subject: [PATCH 037/101] test --- funasr/models/lcbnet/model.py | 1 + funasr/utils/load_utils.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 555d4e658..54fba1cb2 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -425,6 +425,7 @@ class LCBNet(nn.Module): else: # extract fbank feats time1 = time.perf_counter() + pdb.set_trace() audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer) diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 7748172f6..cdd378de6 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -13,30 +13,34 @@ try: from funasr.download.file import download_from_url except: print("urllib is not installed, if you infer from url, please install it first.") - +import pdb def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type="sound", tokenizer=None, **kwargs): + pdb.set_trace() if isinstance(data_or_path_or_list, (list, tuple)): if data_type is not None and isinstance(data_type, (list, tuple)): - + pdb.set_trace() data_types = [data_type] * len(data_or_path_or_list) data_or_path_or_list_ret = [[] for d in data_type] + pdb.set_trace() for i, (data_type_i, data_or_path_or_list_i) in enumerate(zip(data_types, data_or_path_or_list)): for j, (data_type_j, data_or_path_or_list_j) in enumerate(zip(data_type_i, data_or_path_or_list_i)): - + pdb.set_trace() data_or_path_or_list_j = load_audio_text_image_video(data_or_path_or_list_j, fs=fs, audio_fs=audio_fs, data_type=data_type_j, tokenizer=tokenizer, **kwargs) + pdb.set_trace() data_or_path_or_list_ret[j].append(data_or_path_or_list_j) return data_or_path_or_list_ret else: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] - + pdb.set_trace() if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) - + pdb.set_trace() if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file + pdb.set_trace() if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) if kwargs.get("reduce_channels", True): @@ -59,7 +63,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: else: pass # print(f"unsupport data type: {data_or_path_or_list}, return raw data") - + pdb.set_trace() if audio_fs != fs and data_type != "text": resampler = torchaudio.transforms.Resample(audio_fs, fs) data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :] From 5130d2406df1aa567d13eec49eea8f9e392c6790 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 23 Feb 2024 17:01:41 +0800 Subject: [PATCH 038/101] test --- funasr/frontends/default.py | 1 + 1 file changed, 1 insertion(+) diff --git a/funasr/frontends/default.py 
b/funasr/frontends/default.py index 66d42f71c..364c8bbb9 100644 --- a/funasr/frontends/default.py +++ b/funasr/frontends/default.py @@ -48,6 +48,7 @@ class DefaultFrontend(nn.Module): # Deepcopy (In general, dict shouldn't be used as default arg) frontend_conf = copy.deepcopy(frontend_conf) self.hop_length = hop_length + self.fs = fs if apply_stft: self.stft = Stft( From 70a236b652b3c2a4377bd551f4b7c9d4c49cb61c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 23 Feb 2024 17:38:54 +0800 Subject: [PATCH 039/101] test --- .../lcbnet/demo2.sh | 2 +- funasr/utils/load_utils.py | 22 ++++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 3e4d22393..cfb5b235e 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -8,7 +8,7 @@ python -m funasr.bin.inference \ ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ ++frontend_conf.cmvn_file=${file_dir}/am.mvn \ ++input=[${file_dir}/wav.scp,${file_dir}/ocr_text] \ -+data_type='["sound", "text"]' \ ++data_type='["kaldi_ark", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ ++output_dir="./outputs/debug" \ ++device="" \ diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index cdd378de6..b7d0200cc 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -17,35 +17,28 @@ import pdb def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type="sound", tokenizer=None, **kwargs): - pdb.set_trace() if isinstance(data_or_path_or_list, (list, tuple)): if data_type is not None and isinstance(data_type, (list, tuple)): - pdb.set_trace() data_types = [data_type] * len(data_or_path_or_list) data_or_path_or_list_ret = [[] for d in data_type] - pdb.set_trace() for i, (data_type_i, data_or_path_or_list_i) in enumerate(zip(data_types, data_or_path_or_list)): - for j, (data_type_j, data_or_path_or_list_j) in enumerate(zip(data_type_i, data_or_path_or_list_i)): - pdb.set_trace() data_or_path_or_list_j = load_audio_text_image_video(data_or_path_or_list_j, fs=fs, audio_fs=audio_fs, data_type=data_type_j, tokenizer=tokenizer, **kwargs) - pdb.set_trace() data_or_path_or_list_ret[j].append(data_or_path_or_list_j) return data_or_path_or_list_ret else: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] - pdb.set_trace() if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) pdb.set_trace() if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file - pdb.set_trace() if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) if kwargs.get("reduce_channels", True): data_or_path_or_list = data_or_path_or_list.mean(0) elif data_type == "text" and tokenizer is not None: + pdb.set_trace() data_or_path_or_list = tokenizer.encode(data_or_path_or_list) elif data_type == "image": # undo pass @@ -60,6 +53,19 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: data_or_path_or_list = tokenizer.encode(data_or_path_or_list) elif isinstance(data_or_path_or_list, np.ndarray): # audio sample point data_or_path_or_list = 
torch.from_numpy(data_or_path_or_list).squeeze() # [n_samples,] + elif isinstance(data_or_path_or_list, str) and data_type == "kaldi_ark": + data_mat = kaldiio.load_mat(data_or_path_or_list) + if isinstance(data_mat, tuple): + sampling_rate, mat = data_mat + assert sampling_rate == audio_fs + else: + mat = data_mat + if mat.dtype == 'int16' or mat.dtype == 'int32': + mat = mat.astype(np.float64) + mat = mat / 32768 + if mat.ndim ==2: + mat = mat[:,0] + data_or_path_or_list = mat else: pass # print(f"unsupport data type: {data_or_path_or_list}, return raw data") From 5ecd13bd0460c4317e9a585e4204731791e5e9db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 11:23:51 +0800 Subject: [PATCH 040/101] test --- funasr/utils/load_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index b7d0200cc..20fa0fd2e 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -56,8 +56,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: elif isinstance(data_or_path_or_list, str) and data_type == "kaldi_ark": data_mat = kaldiio.load_mat(data_or_path_or_list) if isinstance(data_mat, tuple): - sampling_rate, mat = data_mat - assert sampling_rate == audio_fs + audio_fs, mat = data_mat else: mat = data_mat if mat.dtype == 'int16' or mat.dtype == 'int32': From 343a281ca14809153e2ab1df49ca0c5ffdb01abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 13:56:32 +0800 Subject: [PATCH 041/101] test --- funasr/models/lcbnet/model.py | 2 +- funasr/utils/load_utils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 54fba1cb2..d1ebc5ce5 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -413,7 +413,6 @@ class LCBNet(nn.Module): logging.info("enable beam_search") self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) - pdb.set_trace() meta_data = {} if isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank": # fbank @@ -431,6 +430,7 @@ class LCBNet(nn.Module): tokenizer=tokenizer) time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" + pdb.set_trace() speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 20fa0fd2e..963f5c258 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -31,14 +31,13 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) - pdb.set_trace() + if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) if kwargs.get("reduce_channels", True): data_or_path_or_list = data_or_path_or_list.mean(0) elif data_type == "text" and tokenizer is not None: - pdb.set_trace() data_or_path_or_list = tokenizer.encode(data_or_path_or_list) elif data_type == "image": 
# undo pass @@ -68,7 +67,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: else: pass # print(f"unsupport data type: {data_or_path_or_list}, return raw data") - pdb.set_trace() + if audio_fs != fs and data_type != "text": resampler = torchaudio.transforms.Resample(audio_fs, fs) data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :] @@ -112,6 +111,7 @@ def extract_fbank(data, data_len = None, data_type: str="sound", frontend=None, # import pdb; # pdb.set_trace() # if data_type == "sound": + pdb.set_trace() data, data_len = frontend(data, data_len, **kwargs) if isinstance(data_len, (list, tuple)): From 0d32e02c79d751ae15af8fb767df32564e34cbf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 14:02:03 +0800 Subject: [PATCH 042/101] test --- funasr/models/lcbnet/model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index d1ebc5ce5..1acce785f 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -425,14 +425,17 @@ class LCBNet(nn.Module): # extract fbank feats time1 = time.perf_counter() pdb.set_trace() - audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), + sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer) time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" + audio_sample_list = sample_list[0] + ocr_sample_list = sample_list[1] pdb.set_trace() speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) + pdb.set_trace() time3 = time.perf_counter() meta_data["extract_feat"] = f"{time3 - time2:0.3f}" meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 From ab4a31201c218b212ac52cbd529024c5858a9f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 14:25:00 +0800 Subject: [PATCH 043/101] test --- funasr/frontends/default.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/funasr/frontends/default.py b/funasr/frontends/default.py index 364c8bbb9..c4bdbd774 100644 --- a/funasr/frontends/default.py +++ b/funasr/frontends/default.py @@ -85,8 +85,12 @@ class DefaultFrontend(nn.Module): return self.n_mels def forward( - self, input: torch.Tensor, input_lengths: torch.Tensor + self, input: torch.Tensor, input_lengths: Union[torch.Tensor, list] ) -> Tuple[torch.Tensor, torch.Tensor]: + if isinstance(input_lengths, list): + input_lengths = torch.tensor(input_lengths) + if input.dtype == torch.float64: + input = input.float() # 1. Domain-conversion: e.g. 
Stft: time -> time-freq if self.stft is not None: input_stft, feats_lens = self._compute_stft(input, input_lengths) From 2bffe1d5392b291c071cde0ffcc03860abdfc230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 14:52:05 +0800 Subject: [PATCH 044/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 1 - funasr/models/lcbnet/model.py | 11 +++++++---- funasr/utils/load_utils.py | 5 +---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index cfb5b235e..0d5a4f031 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -6,7 +6,6 @@ python -m funasr.bin.inference \ --config-name="config.yaml" \ ++init_param=${file_dir}/model.pb \ ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -++frontend_conf.cmvn_file=${file_dir}/am.mvn \ ++input=[${file_dir}/wav.scp,${file_dir}/ocr_text] \ +data_type='["kaldi_ark", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 1acce785f..f45e71d6f 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -21,6 +21,7 @@ from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank from funasr.utils import postprocess_utils from funasr.utils.datadir_writer import DatadirWriter from funasr.register import tables + import pdb @tables.register("model_classes", "LCBNet") class LCBNet(nn.Module): @@ -92,6 +93,7 @@ class LCBNet(nn.Module): bias_predictor_class = tables.encoder_classes.get(bias_predictor) bias_predictor = bias_predictor_class(**bias_predictor_conf) + if decoder is not None: decoder_class = tables.decoder_classes.get(decoder) decoder = decoder_class( @@ -272,15 +274,15 @@ class LCBNet(nn.Module): ind: int """ with autocast(False): - + pdb.set_trace() # Data augmentation if self.specaug is not None and self.training: speech, speech_lengths = self.specaug(speech, speech_lengths) - + pdb.set_trace() # Normalization for feature: e.g. 
Global-CMVN, Utterance-CMVN if self.normalize is not None: speech, speech_lengths = self.normalize(speech, speech_lengths) - + pdb.set_trace() # Forward encoder # feats: (Batch, Length, Dim) # -> encoder_out: (Batch, Length2, Dim2) @@ -297,7 +299,7 @@ class LCBNet(nn.Module): if intermediate_outs is not None: return (encoder_out, intermediate_outs), encoder_out_lens - + pdb.set_trace() return encoder_out, encoder_out_lens def _calc_att_loss( @@ -442,6 +444,7 @@ class LCBNet(nn.Module): speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) + pdb.set_trace() # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 963f5c258..644af2324 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -108,10 +108,7 @@ def extract_fbank(data, data_len = None, data_type: str="sound", frontend=None, data_list.append(data_i) data_len.append(data_i.shape[0]) data = pad_sequence(data_list, batch_first=True) # data: [batch, N] - # import pdb; - # pdb.set_trace() - # if data_type == "sound": - pdb.set_trace() + data, data_len = frontend(data, data_len, **kwargs) if isinstance(data_len, (list, tuple)): From 19103386dc4f52619aba21af4008a9d082ea4a67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:05:41 +0800 Subject: [PATCH 045/101] test --- funasr/models/lcbnet/model.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index f45e71d6f..45b1ee5d1 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -426,7 +426,6 @@ class LCBNet(nn.Module): else: # extract fbank feats time1 = time.perf_counter() - pdb.set_trace() sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer) @@ -434,13 +433,12 @@ class LCBNet(nn.Module): meta_data["load_data"] = f"{time2 - time1:0.3f}" audio_sample_list = sample_list[0] ocr_sample_list = sample_list[1] - pdb.set_trace() speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) - pdb.set_trace() time3 = time.perf_counter() meta_data["extract_feat"] = f"{time3 - time2:0.3f}" - meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 + frame_shift = 10 + meta_data["batch_data_time"] = speech_lengths.sum().item() * frame_shift / 1000 speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) From eb92e79fb94e7b3df8f27c8ce3e607a70dff2a2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:21:32 +0800 Subject: [PATCH 046/101] test --- funasr/models/conformer/encoder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/funasr/models/conformer/encoder.py b/funasr/models/conformer/encoder.py index 1d252c206..443d30912 100644 --- a/funasr/models/conformer/encoder.py +++ b/funasr/models/conformer/encoder.py @@ -47,7 +47,7 @@ from funasr.models.transformer.utils.subsampling import check_short_utt from funasr.models.transformer.utils.subsampling import Conv2dSubsamplingPad from funasr.models.transformer.utils.subsampling import StreamingConvInput from funasr.register import tables - +import pdb class ConvolutionModule(nn.Module): 
"""ConvolutionModule in Conformer model. @@ -573,7 +573,7 @@ class ConformerEncoder(nn.Module): xs_pad, masks = self.embed(xs_pad, masks) else: xs_pad = self.embed(xs_pad) - + pdb.set_trace() intermediate_outs = [] if len(self.interctc_layer_idx) == 0: xs_pad, masks = self.encoders(xs_pad, masks) @@ -601,12 +601,12 @@ class ConformerEncoder(nn.Module): xs_pad = (x, pos_emb) else: xs_pad = xs_pad + self.conditioning_layer(ctc_out) - + pdb.set_trace() if isinstance(xs_pad, tuple): xs_pad = xs_pad[0] if self.normalize_before: xs_pad = self.after_norm(xs_pad) - + pdb.set_trace() olens = masks.squeeze(1).sum(1) if len(intermediate_outs) > 0: return (xs_pad, intermediate_outs), olens, None From debafeac37c0259bc3cf7642700f05adea34e047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:23:07 +0800 Subject: [PATCH 047/101] test --- funasr/auto/auto_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 87c7e2d03..5cb2e6e48 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -141,7 +141,7 @@ class AutoModel: kwargs = download_model(**kwargs) set_all_random_seed(kwargs.get("seed", 0)) - + pdb.set_trace() device = kwargs.get("device", "cuda") if not torch.cuda.is_available() or kwargs.get("ngpu", 1) == 0: device = "cpu" From 52fee96d71ba96fd09ad453dbae1926a1d601a56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:31:14 +0800 Subject: [PATCH 048/101] test --- funasr/auto/auto_model.py | 2 +- funasr/models/lcbnet/model.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 5cb2e6e48..ba7dcabaa 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -141,7 +141,7 @@ class AutoModel: kwargs = download_model(**kwargs) set_all_random_seed(kwargs.get("seed", 0)) - pdb.set_trace() + device = kwargs.get("device", "cuda") if not torch.cuda.is_available() or kwargs.get("ngpu", 1) == 0: device = "cpu" diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 45b1ee5d1..f8bbf7af1 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -274,15 +274,12 @@ class LCBNet(nn.Module): ind: int """ with autocast(False): - pdb.set_trace() # Data augmentation if self.specaug is not None and self.training: speech, speech_lengths = self.specaug(speech, speech_lengths) - pdb.set_trace() # Normalization for feature: e.g. 
Global-CMVN, Utterance-CMVN if self.normalize is not None: speech, speech_lengths = self.normalize(speech, speech_lengths) - pdb.set_trace() # Forward encoder # feats: (Batch, Length, Dim) # -> encoder_out: (Batch, Length2, Dim2) From e2425cc0675cc6fd7685067a27eabd1d32ca7fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:44:07 +0800 Subject: [PATCH 049/101] test --- funasr/models/conformer/encoder.py | 6 +++--- funasr/models/lcbnet/model.py | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/funasr/models/conformer/encoder.py b/funasr/models/conformer/encoder.py index 443d30912..be973c641 100644 --- a/funasr/models/conformer/encoder.py +++ b/funasr/models/conformer/encoder.py @@ -573,7 +573,7 @@ class ConformerEncoder(nn.Module): xs_pad, masks = self.embed(xs_pad, masks) else: xs_pad = self.embed(xs_pad) - pdb.set_trace() + intermediate_outs = [] if len(self.interctc_layer_idx) == 0: xs_pad, masks = self.encoders(xs_pad, masks) @@ -601,12 +601,12 @@ class ConformerEncoder(nn.Module): xs_pad = (x, pos_emb) else: xs_pad = xs_pad + self.conditioning_layer(ctc_out) - pdb.set_trace() + if isinstance(xs_pad, tuple): xs_pad = xs_pad[0] if self.normalize_before: xs_pad = self.after_norm(xs_pad) - pdb.set_trace() + olens = masks.squeeze(1).sum(1) if len(intermediate_outs) > 0: return (xs_pad, intermediate_outs), olens, None diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index f8bbf7af1..8070aa378 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -296,7 +296,6 @@ class LCBNet(nn.Module): if intermediate_outs is not None: return (encoder_out, intermediate_outs), encoder_out_lens - pdb.set_trace() return encoder_out, encoder_out_lens def _calc_att_loss( @@ -444,7 +443,11 @@ class LCBNet(nn.Module): encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - + pdb.set_trace() + ocr = ocr_sample_list[0] + ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)) + ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths) + pdb.set_trace() # c. 
Passed the encoder result and the beam search nbest_hyps = self.beam_search( x=encoder_out[0], maxlenratio=kwargs.get("maxlenratio", 0.0), minlenratio=kwargs.get("minlenratio", 0.0) From 5b9c073f43dbecc3ae9d771af50a8f52f87931e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:49:41 +0800 Subject: [PATCH 050/101] test --- funasr/models/lcbnet/model.py | 1 - funasr/utils/load_utils.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 8070aa378..b4a206bed 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -438,7 +438,6 @@ class LCBNet(nn.Module): speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) - pdb.set_trace() # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 644af2324..8b75cbdb6 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -31,7 +31,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) - + pdb.set_trace() if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) From 6e47d42ea00e6d10746b59a86d6455465464ed83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 15:55:23 +0800 Subject: [PATCH 051/101] test --- funasr/models/lcbnet/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index b4a206bed..3b8f3c96e 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -422,6 +422,7 @@ class LCBNet(nn.Module): else: # extract fbank feats time1 = time.perf_counter() + pdb.set_trace() sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer) From ecd9e74b6e177e5dd584609c04570870f15af63b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:00:44 +0800 Subject: [PATCH 052/101] test --- funasr/auto/auto_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index ba7dcabaa..d5225dee8 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -39,11 +39,13 @@ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): filelist = [".scp", ".txt", ".json", ".jsonl"] chars = string.ascii_letters + string.digits + pdb.set_trace() if isinstance(data_in, str) and data_in.startswith('http'): # url data_in = download_from_url(data_in) if isinstance(data_in, str) and os.path.exists(data_in): # wav_path; filelist: wav.scp, file.jsonl;text.txt; _, file_extension = os.path.splitext(data_in) file_extension = file_extension.lower() + pdb.set_trace() if file_extension in filelist: #filelist: wav.scp, file.jsonl;text.txt; with open(data_in, encoding='utf-8') as fin: for line in fin: From 
a88b51c5442efba7bf1e8d91881f69279b27224d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:04:35 +0800 Subject: [PATCH 053/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 2 +- funasr/auto/auto_model.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 0d5a4f031..9ba176be6 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -6,7 +6,7 @@ python -m funasr.bin.inference \ --config-name="config.yaml" \ ++init_param=${file_dir}/model.pb \ ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -++input=[${file_dir}/wav.scp,${file_dir}/ocr_text] \ +++input=[${file_dir}/wav.scp,${file_dir}/ocr.txt] \ +data_type='["kaldi_ark", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ ++output_dir="./outputs/debug" \ diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index d5225dee8..ba7dcabaa 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -39,13 +39,11 @@ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): filelist = [".scp", ".txt", ".json", ".jsonl"] chars = string.ascii_letters + string.digits - pdb.set_trace() if isinstance(data_in, str) and data_in.startswith('http'): # url data_in = download_from_url(data_in) if isinstance(data_in, str) and os.path.exists(data_in): # wav_path; filelist: wav.scp, file.jsonl;text.txt; _, file_extension = os.path.splitext(data_in) file_extension = file_extension.lower() - pdb.set_trace() if file_extension in filelist: #filelist: wav.scp, file.jsonl;text.txt; with open(data_in, encoding='utf-8') as fin: for line in fin: From 31e2eb39ad3965931f9df22fce86c708f4d9da95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:14:57 +0800 Subject: [PATCH 054/101] test --- funasr/models/lcbnet/model.py | 7 ++++--- funasr/utils/load_utils.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 3b8f3c96e..f4caee8a4 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -422,7 +422,6 @@ class LCBNet(nn.Module): else: # extract fbank feats time1 = time.perf_counter() - pdb.set_trace() sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer) @@ -443,9 +442,11 @@ class LCBNet(nn.Module): encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - pdb.set_trace() - ocr = ocr_sample_list[0] + + ocr_list_new = [[x + 1 if x != 0 else x for x in sublist] for sublist in ocr_sample_list] + ocr = torch.tensor(ocr_list_new) ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)) + pdb.set_trace() ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths) pdb.set_trace() # c. 
Passed the encoder result and the beam search diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 8b75cbdb6..87412bd87 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -31,7 +31,6 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) - pdb.set_trace() if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) From 5b93a56a7b15ae236317f78c60b67a5e95488b38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:22:51 +0800 Subject: [PATCH 055/101] test --- funasr/models/lcbnet/model.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index f4caee8a4..422956f04 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -446,9 +446,10 @@ class LCBNet(nn.Module): ocr_list_new = [[x + 1 if x != 0 else x for x in sublist] for sublist in ocr_sample_list] ocr = torch.tensor(ocr_list_new) ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)) - pdb.set_trace() ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths) - pdb.set_trace() + fusion_out, _, _, _ = self.fusion_encoder(encoder_out,None, ocr, None) + encoder_out = encoder_out + fusion_out + # c. Passed the encoder result and the beam search nbest_hyps = self.beam_search( x=encoder_out[0], maxlenratio=kwargs.get("maxlenratio", 0.0), minlenratio=kwargs.get("minlenratio", 0.0) @@ -456,7 +457,7 @@ class LCBNet(nn.Module): nbest_hyps = nbest_hyps[: self.nbest] - + pdb.set_trace(0) results = [] b, n, d = encoder_out.size() for i in range(b): @@ -478,9 +479,12 @@ class LCBNet(nn.Module): # remove blank symbol id, which is assumed to be 0 token_int = list(filter(lambda x: x != self.eos and x != self.sos and x != self.blank_id, token_int)) + pdb.set_trace() # Change integer-ids to tokens token = tokenizer.ids2tokens(token_int) + pdb.set_trace() text = tokenizer.tokens2text(token) + pdb.set_trace() text_postprocessed, _ = postprocess_utils.sentence_postprocess(token) result_i = {"key": key[i], "token": token, "text": text_postprocessed} From 77c2c933a221c4b04f211eeacb7981abccee3c24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:30:10 +0800 Subject: [PATCH 056/101] test --- funasr/models/lcbnet/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 422956f04..6de69846b 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -449,7 +449,7 @@ class LCBNet(nn.Module): ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths) fusion_out, _, _, _ = self.fusion_encoder(encoder_out,None, ocr, None) encoder_out = encoder_out + fusion_out - + pdb.set_trace() # c. 
Passed the encoder result and the beam search nbest_hyps = self.beam_search( x=encoder_out[0], maxlenratio=kwargs.get("maxlenratio", 0.0), minlenratio=kwargs.get("minlenratio", 0.0) @@ -485,7 +485,7 @@ class LCBNet(nn.Module): pdb.set_trace() text = tokenizer.tokens2text(token) pdb.set_trace() - + text_postprocessed, _ = postprocess_utils.sentence_postprocess(token) result_i = {"key": key[i], "token": token, "text": text_postprocessed} results.append(result_i) From 8992750f02bdc37da40b2e56831b12cc7b3cf756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:30:55 +0800 Subject: [PATCH 057/101] test --- funasr/models/lcbnet/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 6de69846b..6ee5342d4 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -457,7 +457,7 @@ class LCBNet(nn.Module): nbest_hyps = nbest_hyps[: self.nbest] - pdb.set_trace(0) + pdb.set_trace() results = [] b, n, d = encoder_out.size() for i in range(b): From 39de3adfbc12bc491f6da9eb9ffdc5122a3f623d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:39:15 +0800 Subject: [PATCH 058/101] test --- examples/industrial_data_pretraining/lcbnet/demo2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh index 9ba176be6..20af1f57d 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -1,6 +1,6 @@ file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" - +CUDA_VISIBLE_DEVICES="" \ python -m funasr.bin.inference \ --config-path=${file_dir} \ --config-name="config.yaml" \ From 12d8bd77a6686e29f6840e8de3909f3aaf96afa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:48:11 +0800 Subject: [PATCH 059/101] test --- funasr/models/lcbnet/model.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 6ee5342d4..82a1b787d 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -111,8 +111,8 @@ class LCBNet(nn.Module): ) self.blank_id = blank_id - self.sos = sos if sos is not None else vocab_size - 1 - self.eos = eos if eos is not None else vocab_size - 1 + self.sos = vocab_size - 1 + self.eos = vocab_size - 1 self.vocab_size = vocab_size self.ignore_id = ignore_id self.ctc_weight = ctc_weight @@ -375,14 +375,14 @@ class LCBNet(nn.Module): scorers["ngram"] = ngram weights = dict( - decoder=1.0 - kwargs.get("decoding_ctc_weight", 0.5), - ctc=kwargs.get("decoding_ctc_weight", 0.5), + decoder=1.0 - kwargs.get("decoding_ctc_weight", 0.3), + ctc=kwargs.get("decoding_ctc_weight", 0.3), lm=kwargs.get("lm_weight", 0.0), ngram=kwargs.get("ngram_weight", 0.0), length_bonus=kwargs.get("penalty", 0.0), ) beam_search = BeamSearch( - beam_size=kwargs.get("beam_size", 10), + beam_size=kwargs.get("beam_size", 20), weights=weights, scorers=scorers, sos=self.sos, From e59ec16e6a1306d27056d48f7426b6c9a18ae669 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 16:56:58 +0800 Subject: [PATCH 060/101] test --- funasr/auto/auto_model.py | 2 -- funasr/models/lcbnet/model.py | 8 +------- 2 files 
changed, 1 insertion(+), 9 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index ba7dcabaa..3f99e4d17 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -234,11 +234,9 @@ class AutoModel: time1 = time.perf_counter() with torch.no_grad(): - pdb.set_trace() results, meta_data = model.inference(**batch, **kwargs) time2 = time.perf_counter() - pdb.set_trace() asr_result_list.extend(results) # batch_data_time = time_per_frame_s * data_batch_i["speech_lengths"].sum().item() diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 82a1b787d..deddf73df 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -181,8 +181,7 @@ class LCBNet(nn.Module): text: (Batch, Length) text_lengths: (Batch,) """ - # import pdb; - # pdb.set_trace() + if len(text_lengths.size()) > 1: text_lengths = text_lengths[:, 0] if len(speech_lengths.size()) > 1: @@ -449,7 +448,6 @@ class LCBNet(nn.Module): ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths) fusion_out, _, _, _ = self.fusion_encoder(encoder_out,None, ocr, None) encoder_out = encoder_out + fusion_out - pdb.set_trace() # c. Passed the encoder result and the beam search nbest_hyps = self.beam_search( x=encoder_out[0], maxlenratio=kwargs.get("maxlenratio", 0.0), minlenratio=kwargs.get("minlenratio", 0.0) @@ -457,7 +455,6 @@ class LCBNet(nn.Module): nbest_hyps = nbest_hyps[: self.nbest] - pdb.set_trace() results = [] b, n, d = encoder_out.size() for i in range(b): @@ -479,12 +476,9 @@ class LCBNet(nn.Module): # remove blank symbol id, which is assumed to be 0 token_int = list(filter(lambda x: x != self.eos and x != self.sos and x != self.blank_id, token_int)) - pdb.set_trace() # Change integer-ids to tokens token = tokenizer.ids2tokens(token_int) - pdb.set_trace() text = tokenizer.tokens2text(token) - pdb.set_trace() text_postprocessed, _ = postprocess_utils.sentence_postprocess(token) result_i = {"key": key[i], "token": token, "text": text_postprocessed} From e0fca115cbae19e8280eb0b31286195d5f5473f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 17:10:49 +0800 Subject: [PATCH 061/101] test --- funasr/models/lcbnet/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index deddf73df..ab557e6d8 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -443,8 +443,8 @@ class LCBNet(nn.Module): encoder_out = encoder_out[0] ocr_list_new = [[x + 1 if x != 0 else x for x in sublist] for sublist in ocr_sample_list] - ocr = torch.tensor(ocr_list_new) - ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)) + ocr = torch.tensor(ocr_list_new).to(device=kwargs["device"]) + ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)).to(device=kwargs["device"]) ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths) fusion_out, _, _, _ = self.fusion_encoder(encoder_out,None, ocr, None) encoder_out = encoder_out + fusion_out From 0a4e01bd7d789504cc5986fa848e5822bef4dfc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 17:18:23 +0800 Subject: [PATCH 062/101] atsr --- .../lcbnet/{demo2.sh => demo.sh} | 4 ++-- .../industrial_data_pretraining/lcbnet/demo_nj.sh | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) rename examples/industrial_data_pretraining/lcbnet/{demo2.sh => demo.sh} (92%) create mode 100755 
examples/industrial_data_pretraining/lcbnet/demo_nj.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo.sh similarity index 92% rename from examples/industrial_data_pretraining/lcbnet/demo2.sh rename to examples/industrial_data_pretraining/lcbnet/demo.sh index 20af1f57d..9515f985d 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo.sh @@ -1,6 +1,6 @@ file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="" \ +#CUDA_VISIBLE_DEVICES="" \ python -m funasr.bin.inference \ --config-path=${file_dir} \ --config-name="config.yaml" \ @@ -10,4 +10,4 @@ python -m funasr.bin.inference \ +data_type='["kaldi_ark", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ ++output_dir="./outputs/debug" \ -++device="" \ +++device="cpu" \ diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh new file mode 100755 index 000000000..9515f985d --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -0,0 +1,13 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" + +#CUDA_VISIBLE_DEVICES="" \ +python -m funasr.bin.inference \ +--config-path=${file_dir} \ +--config-name="config.yaml" \ +++init_param=${file_dir}/model.pb \ +++tokenizer_conf.token_list=${file_dir}/tokens.txt \ +++input=[${file_dir}/wav.scp,${file_dir}/ocr.txt] \ ++data_type='["kaldi_ark", "text"]' \ +++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ +++output_dir="./outputs/debug" \ +++device="cpu" \ From 2d71d8f679894ab49374b10784547db001bba7be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 17:30:27 +0800 Subject: [PATCH 063/101] test --- .../lcbnet/demo_nj.sh | 80 ++++++++++++++++--- 1 file changed, 69 insertions(+), 11 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh index 9515f985d..51ffad71c 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -1,13 +1,71 @@ file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1" +inference_device="cuda" -#CUDA_VISIBLE_DEVICES="" \ -python -m funasr.bin.inference \ ---config-path=${file_dir} \ ---config-name="config.yaml" \ -++init_param=${file_dir}/model.pb \ -++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -++input=[${file_dir}/wav.scp,${file_dir}/ocr.txt] \ -+data_type='["kaldi_ark", "text"]' \ -++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ -++output_dir="./outputs/debug" \ -++device="cpu" \ +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done +fi + +inference_dir="outputs/test" +_logdir="${inference_dir}/logdir" +echo "inference_dir: ${inference_dir}" + +mkdir -p "${_logdir}" +key_file1=${file_dir}/wav.scp +key_file2=${file_dir}/ocr.txt +split_scps1= +split_scps2= +for JOB in $(seq "${nj}"); do + split_scps1+=" 
${_logdir}/wav.${JOB}.scp" + split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +done +utils/split_scp.pl "${key_file1}" ${split_scps1} +utils/split_scp.pl "${key_file2}" ${split_scps2} + +gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} + + export CUDA_VISIBLE_DEVICES=${gpuid} + + python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pb \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + +data_type='["kaldi_ark", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true &> ${_logdir}/log.${JOB}.txt + + }& +done +wait + + +mkdir -p ${inference_dir}/1best_recog +for f in token score text; do + if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then + for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/${f}" + done | sort -k1 >"${inference_dir}/1best_recog/${f}" + fi +done + +echo "Computing WER ..." +echo "Computing WER ..." +python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc +python utils/postprocess_text_zh.py ${data_dir}/text ${inference_dir}/1best_recog/text.ref +python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer +tail -n 3 ${inference_dir}/1best_recog/text.cer \ No newline at end of file From 59ae516f6762077ed9933128e2d804f9a65066a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 17:30:41 +0800 Subject: [PATCH 064/101] test --- examples/industrial_data_pretraining/lcbnet/demo_nj.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh index 51ffad71c..5e634c315 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -1,5 +1,5 @@ file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" inference_device="cuda" if [ ${inference_device} == "cuda" ]; then From 179a3f99c45d21cec3ea17e3b9265bcf1e49c617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:30:15 +0800 Subject: [PATCH 065/101] test --- .../lcbnet/demo_nj.sh | 77 ++++++++++--------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh index 5e634c315..d9f42a033 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -16,46 +16,46 @@ inference_dir="outputs/test" _logdir="${inference_dir}/logdir" echo "inference_dir: ${inference_dir}" -mkdir -p "${_logdir}" -key_file1=${file_dir}/wav.scp -key_file2=${file_dir}/ocr.txt -split_scps1= -split_scps2= -for JOB in $(seq "${nj}"); do - split_scps1+=" ${_logdir}/wav.${JOB}.scp" - split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -done -utils/split_scp.pl "${key_file1}" ${split_scps1} -utils/split_scp.pl "${key_file2}" ${split_scps2} +# mkdir -p "${_logdir}" +# 
key_file1=${file_dir}/wav.scp +# key_file2=${file_dir}/ocr.txt +# split_scps1= +# split_scps2= +# for JOB in $(seq "${nj}"); do +# split_scps1+=" ${_logdir}/wav.${JOB}.scp" +# split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +# done +# utils/split_scp.pl "${key_file1}" ${split_scps1} +# utils/split_scp.pl "${key_file2}" ${split_scps2} -gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -for JOB in $(seq ${nj}); do - { - id=$((JOB-1)) - gpuid=${gpuid_list_array[$id]} +# gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +# for JOB in $(seq ${nj}); do +# { +# id=$((JOB-1)) +# gpuid=${gpuid_list_array[$id]} - export CUDA_VISIBLE_DEVICES=${gpuid} +# export CUDA_VISIBLE_DEVICES=${gpuid} - python -m funasr.bin.inference \ - --config-path=${file_dir} \ - --config-name="config.yaml" \ - ++init_param=${file_dir}/model.pb \ - ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ - +data_type='["kaldi_ark", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ - ++output_dir="${inference_dir}/${JOB}" \ - ++device="${inference_device}" \ - ++ncpu=1 \ - ++disable_log=true &> ${_logdir}/log.${JOB}.txt +# python -m funasr.bin.inference \ +# --config-path=${file_dir} \ +# --config-name="config.yaml" \ +# ++init_param=${file_dir}/model.pb \ +# ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ +# ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ +# +data_type='["kaldi_ark", "text"]' \ +# ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ +# ++output_dir="${inference_dir}/${JOB}" \ +# ++device="${inference_device}" \ +# ++ncpu=1 \ +# ++disable_log=true &> ${_logdir}/log.${JOB}.txt - }& -done -wait +# }& +# done +# wait -mkdir -p ${inference_dir}/1best_recog -for f in token score text; do +#mkdir -p ${inference_dir}/1best_recog +for f in token; do if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then for JOB in $(seq "${nj}"); do cat "${inference_dir}/${JOB}/1best_recog/${f}" @@ -65,7 +65,8 @@ done echo "Computing WER ..." echo "Computing WER ..." 
-python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc -python utils/postprocess_text_zh.py ${data_dir}/text ${inference_dir}/1best_recog/text.ref -python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer -tail -n 3 ${inference_dir}/1best_recog/text.cer \ No newline at end of file +#python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc + +#cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref +#python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer +#tail -n 3 ${inference_dir}/1best_recog/text.cer \ No newline at end of file From 5f91acae0d8be4b3223bcb4732bad2796d654547 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:35:32 +0800 Subject: [PATCH 066/101] test --- .../lcbnet/demo_nj.sh | 72 ------------------- .../lcbnet/demo_nj2.sh | 72 +++++++++++++++++++ .../industrial_data_pretraining/lcbnet/utils | 1 + 3 files changed, 73 insertions(+), 72 deletions(-) delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo_nj.sh create mode 100644 examples/industrial_data_pretraining/lcbnet/demo_nj2.sh create mode 120000 examples/industrial_data_pretraining/lcbnet/utils diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh deleted file mode 100755 index d9f42a033..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ /dev/null @@ -1,72 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -inference_device="cuda" - -if [ ${inference_device} == "cuda" ]; then - nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -else - inference_batch_size=1 - CUDA_VISIBLE_DEVICES="" - for JOB in $(seq ${nj}); do - CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," - done -fi - -inference_dir="outputs/test" -_logdir="${inference_dir}/logdir" -echo "inference_dir: ${inference_dir}" - -# mkdir -p "${_logdir}" -# key_file1=${file_dir}/wav.scp -# key_file2=${file_dir}/ocr.txt -# split_scps1= -# split_scps2= -# for JOB in $(seq "${nj}"); do -# split_scps1+=" ${_logdir}/wav.${JOB}.scp" -# split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -# done -# utils/split_scp.pl "${key_file1}" ${split_scps1} -# utils/split_scp.pl "${key_file2}" ${split_scps2} - -# gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -# for JOB in $(seq ${nj}); do -# { -# id=$((JOB-1)) -# gpuid=${gpuid_list_array[$id]} - -# export CUDA_VISIBLE_DEVICES=${gpuid} - -# python -m funasr.bin.inference \ -# --config-path=${file_dir} \ -# --config-name="config.yaml" \ -# ++init_param=${file_dir}/model.pb \ -# ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -# ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ -# +data_type='["kaldi_ark", "text"]' \ -# ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ -# ++output_dir="${inference_dir}/${JOB}" \ -# ++device="${inference_device}" \ -# ++ncpu=1 \ -# ++disable_log=true &> ${_logdir}/log.${JOB}.txt - -# }& -# done -# wait - - -#mkdir -p ${inference_dir}/1best_recog -for f in token; do - if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then - for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/${f}" - done | sort 
-k1 >"${inference_dir}/1best_recog/${f}" - fi -done - -echo "Computing WER ..." -echo "Computing WER ..." -#python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc - -#cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref -#python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer -#tail -n 3 ${inference_dir}/1best_recog/text.cer \ No newline at end of file diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj2.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj2.sh new file mode 100644 index 000000000..205c28fa3 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj2.sh @@ -0,0 +1,72 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +inference_device="cuda" + +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') + else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done + fi + + inference_dir="outputs/test" + _logdir="${inference_dir}/logdir" + echo "inference_dir: ${inference_dir}" + + # mkdir -p "${_logdir}" + # key_file1=${file_dir}/wav.scp + # key_file2=${file_dir}/ocr.txt + # split_scps1= + # split_scps2= + # for JOB in $(seq "${nj}"); do + # split_scps1+=" ${_logdir}/wav.${JOB}.scp" + # split_scps2+=" ${_logdir}/ocr.${JOB}.txt" + # done + # utils/split_scp.pl "${key_file1}" ${split_scps1} + # utils/split_scp.pl "${key_file2}" ${split_scps2} + + # gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) + # for JOB in $(seq ${nj}); do + # { + # id=$((JOB-1)) + # gpuid=${gpuid_list_array[$id]} + + # export CUDA_VISIBLE_DEVICES=${gpuid} + + # python -m funasr.bin.inference \ + # --config-path=${file_dir} \ + # --config-name="config.yaml" \ + # ++init_param=${file_dir}/model.pb \ + # ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + # ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + # +data_type='["kaldi_ark", "text"]' \ + # ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + # ++output_dir="${inference_dir}/${JOB}" \ + # ++device="${inference_device}" \ + # ++ncpu=1 \ + # ++disable_log=true &> ${_logdir}/log.${JOB}.txt + + # }& + # done + # wait + + + #mkdir -p ${inference_dir}/1best_recog + for f in token; do + if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then + for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/${f}" + done | sort -k1 >"${inference_dir}/1best_recog/${f}" + fi + done + + echo "Computing WER ..." + echo "Computing WER ..." 
+ #python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc + + #cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref + #python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer + #tail -n 3 ${inference_dir}/1best_recog/text.cer diff --git a/examples/industrial_data_pretraining/lcbnet/utils b/examples/industrial_data_pretraining/lcbnet/utils new file mode 120000 index 000000000..be5e5a322 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/utils @@ -0,0 +1 @@ +../../aishell/paraformer/utils \ No newline at end of file From e702cad2fb38d8458d57b8ee7639e35ef84f0967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:36:19 +0800 Subject: [PATCH 067/101] test --- .../lcbnet/demo_nj.sh | 72 +++++++++++++++++++ .../lcbnet/demo_nj2.sh | 72 ------------------- 2 files changed, 72 insertions(+), 72 deletions(-) create mode 100644 examples/industrial_data_pretraining/lcbnet/demo_nj.sh delete mode 100644 examples/industrial_data_pretraining/lcbnet/demo_nj2.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh new file mode 100644 index 000000000..d9f42a033 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -0,0 +1,72 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +inference_device="cuda" + +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done +fi + +inference_dir="outputs/test" +_logdir="${inference_dir}/logdir" +echo "inference_dir: ${inference_dir}" + +# mkdir -p "${_logdir}" +# key_file1=${file_dir}/wav.scp +# key_file2=${file_dir}/ocr.txt +# split_scps1= +# split_scps2= +# for JOB in $(seq "${nj}"); do +# split_scps1+=" ${_logdir}/wav.${JOB}.scp" +# split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +# done +# utils/split_scp.pl "${key_file1}" ${split_scps1} +# utils/split_scp.pl "${key_file2}" ${split_scps2} + +# gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +# for JOB in $(seq ${nj}); do +# { +# id=$((JOB-1)) +# gpuid=${gpuid_list_array[$id]} + +# export CUDA_VISIBLE_DEVICES=${gpuid} + +# python -m funasr.bin.inference \ +# --config-path=${file_dir} \ +# --config-name="config.yaml" \ +# ++init_param=${file_dir}/model.pb \ +# ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ +# ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ +# +data_type='["kaldi_ark", "text"]' \ +# ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ +# ++output_dir="${inference_dir}/${JOB}" \ +# ++device="${inference_device}" \ +# ++ncpu=1 \ +# ++disable_log=true &> ${_logdir}/log.${JOB}.txt + +# }& +# done +# wait + + +#mkdir -p ${inference_dir}/1best_recog +for f in token; do + if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then + for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/${f}" + done | sort -k1 >"${inference_dir}/1best_recog/${f}" + fi +done + +echo "Computing WER ..." +echo "Computing WER ..." 
+#python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc + +#cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref +#python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer +#tail -n 3 ${inference_dir}/1best_recog/text.cer \ No newline at end of file diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj2.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj2.sh deleted file mode 100644 index 205c28fa3..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj2.sh +++ /dev/null @@ -1,72 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -inference_device="cuda" - -if [ ${inference_device} == "cuda" ]; then - nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - else - inference_batch_size=1 - CUDA_VISIBLE_DEVICES="" - for JOB in $(seq ${nj}); do - CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," - done - fi - - inference_dir="outputs/test" - _logdir="${inference_dir}/logdir" - echo "inference_dir: ${inference_dir}" - - # mkdir -p "${_logdir}" - # key_file1=${file_dir}/wav.scp - # key_file2=${file_dir}/ocr.txt - # split_scps1= - # split_scps2= - # for JOB in $(seq "${nj}"); do - # split_scps1+=" ${_logdir}/wav.${JOB}.scp" - # split_scps2+=" ${_logdir}/ocr.${JOB}.txt" - # done - # utils/split_scp.pl "${key_file1}" ${split_scps1} - # utils/split_scp.pl "${key_file2}" ${split_scps2} - - # gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) - # for JOB in $(seq ${nj}); do - # { - # id=$((JOB-1)) - # gpuid=${gpuid_list_array[$id]} - - # export CUDA_VISIBLE_DEVICES=${gpuid} - - # python -m funasr.bin.inference \ - # --config-path=${file_dir} \ - # --config-name="config.yaml" \ - # ++init_param=${file_dir}/model.pb \ - # ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - # ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ - # +data_type='["kaldi_ark", "text"]' \ - # ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ - # ++output_dir="${inference_dir}/${JOB}" \ - # ++device="${inference_device}" \ - # ++ncpu=1 \ - # ++disable_log=true &> ${_logdir}/log.${JOB}.txt - - # }& - # done - # wait - - - #mkdir -p ${inference_dir}/1best_recog - for f in token; do - if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then - for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/${f}" - done | sort -k1 >"${inference_dir}/1best_recog/${f}" - fi - done - - echo "Computing WER ..." - echo "Computing WER ..." 
- #python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc - - #cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref - #python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer - #tail -n 3 ${inference_dir}/1best_recog/text.cer From 0fbebf114a84fd25170b4fdf997e6dc69556f299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:39:00 +0800 Subject: [PATCH 068/101] test --- examples/industrial_data_pretraining/lcbnet/demo_nj.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 examples/industrial_data_pretraining/lcbnet/demo_nj.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh old mode 100644 new mode 100755 From e66b05020b30ccb1df04b1383ae21098591fe827 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:39:07 +0800 Subject: [PATCH 069/101] test --- .../industrial_data_pretraining/lcbnet/demo_nj.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) mode change 100755 => 100644 examples/industrial_data_pretraining/lcbnet/demo_nj.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh old mode 100755 new mode 100644 index d9f42a033..9d7755f2e --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -55,13 +55,12 @@ echo "inference_dir: ${inference_dir}" #mkdir -p ${inference_dir}/1best_recog -for f in token; do - if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then - for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/${f}" - done | sort -k1 >"${inference_dir}/1best_recog/${f}" - fi -done + +if [ -f "${inference_dir}/${JOB}/1best_recog/token" ]; then + for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" + done +fi echo "Computing WER ..." echo "Computing WER ..." 
From 61597039b4e5f4b28ff1762f67d4a79f93f9c3b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:39:56 +0800 Subject: [PATCH 070/101] test --- examples/industrial_data_pretraining/lcbnet/demo_nj.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 examples/industrial_data_pretraining/lcbnet/demo_nj.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh old mode 100644 new mode 100755 From 47823c9007c7040dd05367f1a170a7be9fef188b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:42:16 +0800 Subject: [PATCH 071/101] test --- examples/industrial_data_pretraining/lcbnet/demo_nj.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh index 9d7755f2e..c7e17594c 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -56,11 +56,9 @@ echo "inference_dir: ${inference_dir}" #mkdir -p ${inference_dir}/1best_recog -if [ -f "${inference_dir}/${JOB}/1best_recog/token" ]; then - for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" - done -fi +for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" +done echo "Computing WER ..." echo "Computing WER ..." From 7904f2782697768e0d74b04ccf214c156b101696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Wed, 28 Feb 2024 19:46:38 +0800 Subject: [PATCH 072/101] test --- .../lcbnet/demo_nj.sh | 78 +++++++++---------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh index c7e17594c..4aae9e5ed 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh @@ -16,54 +16,52 @@ inference_dir="outputs/test" _logdir="${inference_dir}/logdir" echo "inference_dir: ${inference_dir}" -# mkdir -p "${_logdir}" -# key_file1=${file_dir}/wav.scp -# key_file2=${file_dir}/ocr.txt -# split_scps1= -# split_scps2= -# for JOB in $(seq "${nj}"); do -# split_scps1+=" ${_logdir}/wav.${JOB}.scp" -# split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -# done -# utils/split_scp.pl "${key_file1}" ${split_scps1} -# utils/split_scp.pl "${key_file2}" ${split_scps2} +mkdir -p "${_logdir}" +key_file1=${file_dir}/wav.scp +key_file2=${file_dir}/ocr.txt +split_scps1= +split_scps2= +for JOB in $(seq "${nj}"); do + split_scps1+=" ${_logdir}/wav.${JOB}.scp" + split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +done +utils/split_scp.pl "${key_file1}" ${split_scps1} +utils/split_scp.pl "${key_file2}" ${split_scps2} -# gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -# for JOB in $(seq ${nj}); do -# { -# id=$((JOB-1)) -# gpuid=${gpuid_list_array[$id]} +gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} -# export CUDA_VISIBLE_DEVICES=${gpuid} + export CUDA_VISIBLE_DEVICES=${gpuid} -# python -m funasr.bin.inference \ -# --config-path=${file_dir} \ -# --config-name="config.yaml" \ -# ++init_param=${file_dir}/model.pb \ -# ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -# 
++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ -# +data_type='["kaldi_ark", "text"]' \ -# ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ -# ++output_dir="${inference_dir}/${JOB}" \ -# ++device="${inference_device}" \ -# ++ncpu=1 \ -# ++disable_log=true &> ${_logdir}/log.${JOB}.txt + python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pb \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + +data_type='["kaldi_ark", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true &> ${_logdir}/log.${JOB}.txt -# }& -# done -# wait + }& +done +wait -#mkdir -p ${inference_dir}/1best_recog +mkdir -p ${inference_dir}/1best_recog for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" done echo "Computing WER ..." -echo "Computing WER ..." -#python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc - -#cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref -#python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer -#tail -n 3 ${inference_dir}/1best_recog/text.cer \ No newline at end of file +sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc +cp ${file_dir}/text ${inference_dir}/1best_recog/token.ref +python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer +tail -n 3 ${inference_dir}/1best_recog/token.cer \ No newline at end of file From 574155be137b7e0af4f874d4025d15c85b265e22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 29 Feb 2024 16:07:49 +0800 Subject: [PATCH 073/101] atsr --- .../lcbnet/compute_wer_details.py | 702 ++++++++++++++++++ .../lcbnet/demo.sh | 80 +- .../lcbnet/demo_nj.sh | 67 -- .../lcbnet/run_bwer_recall.sh | 11 + 4 files changed, 782 insertions(+), 78 deletions(-) create mode 100755 examples/industrial_data_pretraining/lcbnet/compute_wer_details.py delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo_nj.sh create mode 100755 examples/industrial_data_pretraining/lcbnet/run_bwer_recall.sh diff --git a/examples/industrial_data_pretraining/lcbnet/compute_wer_details.py b/examples/industrial_data_pretraining/lcbnet/compute_wer_details.py new file mode 100755 index 000000000..e72d87155 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/compute_wer_details.py @@ -0,0 +1,702 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +from enum import Enum +import re, sys, unicodedata +import codecs +import argparse +from tqdm import tqdm +import os +import pdb +remove_tag = False +spacelist = [" ", "\t", "\r", "\n"] +puncts = [ + "!", + ",", + "?", + "、", + "。", + "!", + ",", + ";", + "?", + ":", + "「", + "」", + "︰", + "『", + "』", + "《", + "》", +] + + +class Code(Enum): + match = 1 + substitution = 2 + insertion = 3 + deletion = 4 + + +class WordError(object): + def __init__(self): + self.errors = { + Code.substitution: 0, + Code.insertion: 0, + Code.deletion: 0, + } + self.ref_words = 0 + + def 
get_wer(self): + assert self.ref_words != 0 + errors = ( + self.errors[Code.substitution] + + self.errors[Code.insertion] + + self.errors[Code.deletion] + ) + return 100.0 * errors / self.ref_words + + def get_result_string(self): + return ( + f"error_rate={self.get_wer():.4f}, " + f"ref_words={self.ref_words}, " + f"subs={self.errors[Code.substitution]}, " + f"ins={self.errors[Code.insertion]}, " + f"dels={self.errors[Code.deletion]}" + ) + + +def characterize(string): + res = [] + i = 0 + while i < len(string): + char = string[i] + if char in puncts: + i += 1 + continue + cat1 = unicodedata.category(char) + # https://unicodebook.readthedocs.io/unicode.html#unicode-categories + if cat1 == "Zs" or cat1 == "Cn" or char in spacelist: # space or not assigned + i += 1 + continue + if cat1 == "Lo": # letter-other + res.append(char) + i += 1 + else: + # some input looks like: , we want to separate it to two words. + sep = " " + if char == "<": + sep = ">" + j = i + 1 + while j < len(string): + c = string[j] + if ord(c) >= 128 or (c in spacelist) or (c == sep): + break + j += 1 + if j < len(string) and string[j] == ">": + j += 1 + res.append(string[i:j]) + i = j + return res + + +def stripoff_tags(x): + if not x: + return "" + chars = [] + i = 0 + T = len(x) + while i < T: + if x[i] == "<": + while i < T and x[i] != ">": + i += 1 + i += 1 + else: + chars.append(x[i]) + i += 1 + return "".join(chars) + + +def normalize(sentence, ignore_words, cs, split=None): + """sentence, ignore_words are both in unicode""" + new_sentence = [] + for token in sentence: + x = token + if not cs: + x = x.upper() + if x in ignore_words: + continue + if remove_tag: + x = stripoff_tags(x) + if not x: + continue + if split and x in split: + new_sentence += split[x] + else: + new_sentence.append(x) + return new_sentence + + +class Calculator: + def __init__(self): + self.data = {} + self.space = [] + self.cost = {} + self.cost["cor"] = 0 + self.cost["sub"] = 1 + self.cost["del"] = 1 + self.cost["ins"] = 1 + + def calculate(self, lab, rec): + # Initialization + lab.insert(0, "") + rec.insert(0, "") + while len(self.space) < len(lab): + self.space.append([]) + for row in self.space: + for element in row: + element["dist"] = 0 + element["error"] = "non" + while len(row) < len(rec): + row.append({"dist": 0, "error": "non"}) + for i in range(len(lab)): + self.space[i][0]["dist"] = i + self.space[i][0]["error"] = "del" + for j in range(len(rec)): + self.space[0][j]["dist"] = j + self.space[0][j]["error"] = "ins" + self.space[0][0]["error"] = "non" + for token in lab: + if token not in self.data and len(token) > 0: + self.data[token] = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0} + for token in rec: + if token not in self.data and len(token) > 0: + self.data[token] = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0} + # Computing edit distance + for i, lab_token in enumerate(lab): + for j, rec_token in enumerate(rec): + if i == 0 or j == 0: + continue + min_dist = sys.maxsize + min_error = "none" + dist = self.space[i - 1][j]["dist"] + self.cost["del"] + error = "del" + if dist < min_dist: + min_dist = dist + min_error = error + dist = self.space[i][j - 1]["dist"] + self.cost["ins"] + error = "ins" + if dist < min_dist: + min_dist = dist + min_error = error + if lab_token == rec_token.replace("", ""): + dist = self.space[i - 1][j - 1]["dist"] + self.cost["cor"] + error = "cor" + else: + dist = self.space[i - 1][j - 1]["dist"] + self.cost["sub"] + error = "sub" + if dist < min_dist: + min_dist = dist + min_error = error + 
self.space[i][j]["dist"] = min_dist + self.space[i][j]["error"] = min_error + # Tracing back + result = { + "lab": [], + "rec": [], + "code": [], + "all": 0, + "cor": 0, + "sub": 0, + "ins": 0, + "del": 0, + } + i = len(lab) - 1 + j = len(rec) - 1 + while True: + if self.space[i][j]["error"] == "cor": # correct + if len(lab[i]) > 0: + self.data[lab[i]]["all"] = self.data[lab[i]]["all"] + 1 + self.data[lab[i]]["cor"] = self.data[lab[i]]["cor"] + 1 + result["all"] = result["all"] + 1 + result["cor"] = result["cor"] + 1 + result["lab"].insert(0, lab[i]) + result["rec"].insert(0, rec[j]) + result["code"].insert(0, Code.match) + i = i - 1 + j = j - 1 + elif self.space[i][j]["error"] == "sub": # substitution + if len(lab[i]) > 0: + self.data[lab[i]]["all"] = self.data[lab[i]]["all"] + 1 + self.data[lab[i]]["sub"] = self.data[lab[i]]["sub"] + 1 + result["all"] = result["all"] + 1 + result["sub"] = result["sub"] + 1 + result["lab"].insert(0, lab[i]) + result["rec"].insert(0, rec[j]) + result["code"].insert(0, Code.substitution) + i = i - 1 + j = j - 1 + elif self.space[i][j]["error"] == "del": # deletion + if len(lab[i]) > 0: + self.data[lab[i]]["all"] = self.data[lab[i]]["all"] + 1 + self.data[lab[i]]["del"] = self.data[lab[i]]["del"] + 1 + result["all"] = result["all"] + 1 + result["del"] = result["del"] + 1 + result["lab"].insert(0, lab[i]) + result["rec"].insert(0, "") + result["code"].insert(0, Code.deletion) + i = i - 1 + elif self.space[i][j]["error"] == "ins": # insertion + if len(rec[j]) > 0: + self.data[rec[j]]["ins"] = self.data[rec[j]]["ins"] + 1 + result["ins"] = result["ins"] + 1 + result["lab"].insert(0, "") + result["rec"].insert(0, rec[j]) + result["code"].insert(0, Code.insertion) + j = j - 1 + elif self.space[i][j]["error"] == "non": # starting point + break + else: # shouldn't reach here + print( + "this should not happen , i = {i} , j = {j} , error = {error}".format( + i=i, j=j, error=self.space[i][j]["error"] + ) + ) + return result + + def overall(self): + result = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0} + for token in self.data: + result["all"] = result["all"] + self.data[token]["all"] + result["cor"] = result["cor"] + self.data[token]["cor"] + result["sub"] = result["sub"] + self.data[token]["sub"] + result["ins"] = result["ins"] + self.data[token]["ins"] + result["del"] = result["del"] + self.data[token]["del"] + return result + + def cluster(self, data): + result = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0} + for token in data: + if token in self.data: + result["all"] = result["all"] + self.data[token]["all"] + result["cor"] = result["cor"] + self.data[token]["cor"] + result["sub"] = result["sub"] + self.data[token]["sub"] + result["ins"] = result["ins"] + self.data[token]["ins"] + result["del"] = result["del"] + self.data[token]["del"] + return result + + def keys(self): + return list(self.data.keys()) + + +def width(string): + return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) + + +def default_cluster(word): + unicode_names = [unicodedata.name(char) for char in word] + for i in reversed(range(len(unicode_names))): + if unicode_names[i].startswith("DIGIT"): # 1 + unicode_names[i] = "Number" # 'DIGIT' + elif unicode_names[i].startswith("CJK UNIFIED IDEOGRAPH") or unicode_names[ + i + ].startswith("CJK COMPATIBILITY IDEOGRAPH"): + # 明 / 郎 + unicode_names[i] = "Mandarin" # 'CJK IDEOGRAPH' + elif unicode_names[i].startswith("LATIN CAPITAL LETTER") or unicode_names[ + i + ].startswith("LATIN SMALL LETTER"): + # A / a + 
unicode_names[i] = "English" # 'LATIN LETTER' + elif unicode_names[i].startswith("HIRAGANA LETTER"): # は こ め + unicode_names[i] = "Japanese" # 'GANA LETTER' + elif ( + unicode_names[i].startswith("AMPERSAND") + or unicode_names[i].startswith("APOSTROPHE") + or unicode_names[i].startswith("COMMERCIAL AT") + or unicode_names[i].startswith("DEGREE CELSIUS") + or unicode_names[i].startswith("EQUALS SIGN") + or unicode_names[i].startswith("FULL STOP") + or unicode_names[i].startswith("HYPHEN-MINUS") + or unicode_names[i].startswith("LOW LINE") + or unicode_names[i].startswith("NUMBER SIGN") + or unicode_names[i].startswith("PLUS SIGN") + or unicode_names[i].startswith("SEMICOLON") + ): + # & / ' / @ / ℃ / = / . / - / _ / # / + / ; + del unicode_names[i] + else: + return "Other" + if len(unicode_names) == 0: + return "Other" + if len(unicode_names) == 1: + return unicode_names[0] + for i in range(len(unicode_names) - 1): + if unicode_names[i] != unicode_names[i + 1]: + return "Other" + return unicode_names[0] + + +def get_args(): + parser = argparse.ArgumentParser(description="wer cal") + parser.add_argument("--ref", type=str, help="Text input path") + parser.add_argument("--ref_ocr", type=str, help="Text input path") + parser.add_argument("--rec_name", type=str, action="append", default=[]) + parser.add_argument("--rec_file", type=str, action="append", default=[]) + parser.add_argument("--verbose", type=int, default=1, help="show") + parser.add_argument("--char", type=bool, default=True, help="show") + args = parser.parse_args() + return args + + +def main(args): + cluster_file = "" + ignore_words = set() + tochar = args.char + verbose = args.verbose + padding_symbol = " " + case_sensitive = False + max_words_per_line = sys.maxsize + split = None + + if not case_sensitive: + ig = set([w.upper() for w in ignore_words]) + ignore_words = ig + + default_clusters = {} + default_words = {} + ref_file = args.ref + ref_ocr = args.ref_ocr + rec_files = args.rec_file + rec_names = args.rec_name + assert len(rec_files) == len(rec_names) + + # load ocr + ref_ocr_dict = {} + with codecs.open(ref_ocr, "r", "utf-8") as fh: + for line in fh: + if "$" in line: + line = line.replace("$", " ") + if tochar: + array = characterize(line) + else: + array = line.strip().split() + if len(array) == 0: + continue + fid = array[0] + ref_ocr_dict[fid] = normalize(array[1:], ignore_words, case_sensitive, split) + + if split and not case_sensitive: + newsplit = dict() + for w in split: + words = split[w] + for i in range(len(words)): + words[i] = words[i].upper() + newsplit[w.upper()] = words + split = newsplit + + rec_sets = {} + calculators_dict = dict() + ub_wer_dict = dict() + hotwords_related_dict = dict() # 记录recall相关的内容 + for i, hyp_file in enumerate(rec_files): + rec_sets[rec_names[i]] = dict() + with codecs.open(hyp_file, "r", "utf-8") as fh: + for line in fh: + if tochar: + array = characterize(line) + else: + array = line.strip().split() + if len(array) == 0: + continue + fid = array[0] + rec_sets[rec_names[i]][fid] = normalize(array[1:], ignore_words, case_sensitive, split) + + calculators_dict[rec_names[i]] = Calculator() + ub_wer_dict[rec_names[i]] = {"u_wer": WordError(), "b_wer": WordError(), "wer": WordError()} + hotwords_related_dict[rec_names[i]] = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0} + # tp: 热词在label里,同时在rec里 + # tn: 热词不在label里,同时不在rec里 + # fp: 热词不在label里,但是在rec里 + # fn: 热词在label里,但是不在rec里 + + # record wrong label but in ocr + wrong_rec_but_in_ocr_dict = {} + for rec_name in rec_names: + 
wrong_rec_but_in_ocr_dict[rec_name] = 0 + + _file_total_len = 0 + with os.popen("cat {} | wc -l".format(ref_file)) as pipe: + _file_total_len = int(pipe.read().strip()) + + # compute error rate on the interaction of reference file and hyp file + for line in tqdm(open(ref_file, 'r', encoding='utf-8'), total=_file_total_len): + if tochar: + array = characterize(line) + else: + array = line.rstrip('\n').split() + if len(array) == 0: continue + fid = array[0] + lab = normalize(array[1:], ignore_words, case_sensitive, split) + + if verbose: + print('\nutt: %s' % fid) + + ocr_text = ref_ocr_dict[fid] + ocr_set = set(ocr_text) + print('ocr: {}'.format(" ".join(ocr_text))) + list_match = [] # 指label里面在ocr里面的内容 + list_not_mathch = [] + tmp_error = 0 + tmp_match = 0 + for index in range(len(lab)): + # text_list.append(uttlist[index+1]) + if lab[index] not in ocr_set: + tmp_error += 1 + list_not_mathch.append(lab[index]) + else: + tmp_match += 1 + list_match.append(lab[index]) + print('label in ocr: {}'.format(" ".join(list_match))) + + # for each reco file + base_wrong_ocr_wer = None + ocr_wrong_ocr_wer = None + + for rec_name in rec_names: + rec_set = rec_sets[rec_name] + if fid not in rec_set: + continue + rec = rec_set[fid] + + # print(rec) + for word in rec + lab: + if word not in default_words: + default_cluster_name = default_cluster(word) + if default_cluster_name not in default_clusters: + default_clusters[default_cluster_name] = {} + if word not in default_clusters[default_cluster_name]: + default_clusters[default_cluster_name][word] = 1 + default_words[word] = default_cluster_name + + result = calculators_dict[rec_name].calculate(lab.copy(), rec.copy()) + if verbose: + if result['all'] != 0: + wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] + else: + wer = 0.0 + print('WER(%s): %4.2f %%' % (rec_name, wer), end=' ') + print('N=%d C=%d S=%d D=%d I=%d' % + (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) + + + # print(result['rec']) + wrong_rec_but_in_ocr = [] + for idx in range(len(result['lab'])): + if result['lab'][idx] != "": + if result['lab'][idx] != result['rec'][idx].replace("", ""): + if result['lab'][idx] in list_match: + wrong_rec_but_in_ocr.append(result['lab'][idx]) + wrong_rec_but_in_ocr_dict[rec_name] += 1 + print('wrong_rec_but_in_ocr: {}'.format(" ".join(wrong_rec_but_in_ocr))) + + if rec_name == "base": + base_wrong_ocr_wer = len(wrong_rec_but_in_ocr) + if "ocr" in rec_name or "hot" in rec_name: + ocr_wrong_ocr_wer = len(wrong_rec_but_in_ocr) + if ocr_wrong_ocr_wer < base_wrong_ocr_wer: + print("{} {} helps, {} -> {}".format(fid, rec_name, base_wrong_ocr_wer, ocr_wrong_ocr_wer)) + elif ocr_wrong_ocr_wer > base_wrong_ocr_wer: + print("{} {} hurts, {} -> {}".format(fid, rec_name, base_wrong_ocr_wer, ocr_wrong_ocr_wer)) + + # recall = 0 + # false_alarm = 0 + # for idx in range(len(result['lab'])): + # if "" in result['rec'][idx]: + # if result['rec'][idx].replace("", "") in list_match: + # recall += 1 + # else: + # false_alarm += 1 + # print("bias hotwords recall: {}, fa: {}, list_match {}, recall: {:.2f}, fa: {:.2f}".format( + # recall, false_alarm, len(list_match), recall / len(list_match) if len(list_match) != 0 else 0, false_alarm / len(list_match) if len(list_match) != 0 else 0 + # )) + # tp: 热词在label里,同时在rec里 + # tn: 热词不在label里,同时不在rec里 + # fp: 热词不在label里,但是在rec里 + # fn: 热词在label里,但是不在rec里 + _rec_list = [word.replace("", "") for word in rec] + _label_list = [word for word in lab] + _tp = _tn = _fp = _fn = 0 + 
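+            # Hotword bookkeeping for this utterance: for every OCR keyword,
+            #   tp = occurrences found in both reference and hypothesis,
+            #   tn = keyword absent from both,
+            #   fp = extra occurrences that only appear in the hypothesis,
+            #   fn = reference occurrences missing from the hypothesis.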
hot_true_list = [hotword for hotword in ocr_text if hotword in _label_list] + hot_bad_list = [hotword for hotword in ocr_text if hotword not in _label_list] + for badhotword in hot_bad_list: + count = len([word for word in _rec_list if word == badhotword]) + # print(f"bad {badhotword} count: {count}") + # for word in _rec_list: + # if badhotword == word: + # count += 1 + if count == 0: + hotwords_related_dict[rec_name]['tn'] += 1 + _tn += 1 + # fp: 0 + else: + hotwords_related_dict[rec_name]['fp'] += count + _fp += count + # tn: 0 + # if badhotword in _rec_list: + # hotwords_related_dict[rec_name]['fp'] += 1 + # else: + # hotwords_related_dict[rec_name]['tn'] += 1 + for hotword in hot_true_list: + true_count = len([word for word in _label_list if hotword == word]) + rec_count = len([word for word in _rec_list if hotword == word]) + # print(f"good {hotword} true_count: {true_count}, rec_count: {rec_count}") + if rec_count == true_count: + hotwords_related_dict[rec_name]['tp'] += true_count + _tp += true_count + elif rec_count > true_count: + hotwords_related_dict[rec_name]['tp'] += true_count + # fp: 不在label里,但是在rec里 + hotwords_related_dict[rec_name]['fp'] += rec_count - true_count + _tp += true_count + _fp += rec_count - true_count + else: + hotwords_related_dict[rec_name]['tp'] += rec_count + # fn: 热词在label里,但是不在rec里 + hotwords_related_dict[rec_name]['fn'] += true_count - rec_count + _tp += rec_count + _fn += true_count - rec_count + print("hotword: tp: {}, tn: {}, fp: {}, fn: {}, all: {}, recall: {:.2f}%".format( + _tp, _tn, _fp, _fn, sum([_tp, _tn, _fp, _fn]), _tp / (_tp + _fn) * 100 if (_tp + _fn) != 0 else 0 + )) + + # if hotword in _rec_list: + # hotwords_related_dict[rec_name]['tp'] += 1 + # else: + # hotwords_related_dict[rec_name]['fn'] += 1 + # 计算uwer, bwer, wer + for code, rec_word, lab_word in zip(result["code"], result["rec"], result["lab"]): + if code == Code.match: + ub_wer_dict[rec_name]["wer"].ref_words += 1 + if lab_word in hot_true_list: + # tmp_ref.append(ref_tokens[ref_idx]) + ub_wer_dict[rec_name]["b_wer"].ref_words += 1 + else: + ub_wer_dict[rec_name]["u_wer"].ref_words += 1 + elif code == Code.substitution: + ub_wer_dict[rec_name]["wer"].ref_words += 1 + ub_wer_dict[rec_name]["wer"].errors[Code.substitution] += 1 + if lab_word in hot_true_list: + # tmp_ref.append(ref_tokens[ref_idx]) + ub_wer_dict[rec_name]["b_wer"].ref_words += 1 + ub_wer_dict[rec_name]["b_wer"].errors[Code.substitution] += 1 + else: + ub_wer_dict[rec_name]["u_wer"].ref_words += 1 + ub_wer_dict[rec_name]["u_wer"].errors[Code.substitution] += 1 + elif code == Code.deletion: + ub_wer_dict[rec_name]["wer"].ref_words += 1 + ub_wer_dict[rec_name]["wer"].errors[Code.deletion] += 1 + if lab_word in hot_true_list: + # tmp_ref.append(ref_tokens[ref_idx]) + ub_wer_dict[rec_name]["b_wer"].ref_words += 1 + ub_wer_dict[rec_name]["b_wer"].errors[Code.deletion] += 1 + else: + ub_wer_dict[rec_name]["u_wer"].ref_words += 1 + ub_wer_dict[rec_name]["u_wer"].errors[Code.deletion] += 1 + elif code == Code.insertion: + ub_wer_dict[rec_name]["wer"].errors[Code.insertion] += 1 + if rec_word in hot_true_list: + ub_wer_dict[rec_name]["b_wer"].errors[Code.insertion] += 1 + else: + ub_wer_dict[rec_name]["u_wer"].errors[Code.insertion] += 1 + + space = {} + space['lab'] = [] + space['rec'] = [] + for idx in range(len(result['lab'])): + len_lab = width(result['lab'][idx]) + len_rec = width(result['rec'][idx]) + length = max(len_lab, len_rec) + space['lab'].append(length - len_lab) + space['rec'].append(length - len_rec) + 
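+            # 'space' holds per-token padding so reference and hypothesis tokens
+            # print in aligned columns below; width() counts full-width (CJK)
+            # characters as two cells.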
upper_lab = len(result['lab']) + upper_rec = len(result['rec']) + lab1, rec1 = 0, 0 + while lab1 < upper_lab or rec1 < upper_rec: + if verbose > 1: + print('lab(%s):' % fid.encode('utf-8'), end=' ') + else: + print('lab:', end=' ') + lab2 = min(upper_lab, lab1 + max_words_per_line) + for idx in range(lab1, lab2): + token = result['lab'][idx] + print('{token}'.format(token=token), end='') + for n in range(space['lab'][idx]): + print(padding_symbol, end='') + print(' ', end='') + print() + if verbose > 1: + print('rec(%s):' % fid.encode('utf-8'), end=' ') + else: + print('rec:', end=' ') + + rec2 = min(upper_rec, rec1 + max_words_per_line) + for idx in range(rec1, rec2): + token = result['rec'][idx] + print('{token}'.format(token=token), end='') + for n in range(space['rec'][idx]): + print(padding_symbol, end='') + print(' ', end='') + print() + # print('\n', end='\n') + lab1 = lab2 + rec1 = rec2 + print('\n', end='\n') + # break + if verbose: + print('===========================================================================') + print() + + print(wrong_rec_but_in_ocr_dict) + for rec_name in rec_names: + result = calculators_dict[rec_name].overall() + + if result['all'] != 0: + wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] + else: + wer = 0.0 + print('{} Overall -> {:4.2f} %'.format(rec_name, wer), end=' ') + print('N=%d C=%d S=%d D=%d I=%d' % + (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) + print(f"WER: {ub_wer_dict[rec_name]['wer'].get_result_string()}") + print(f"U-WER: {ub_wer_dict[rec_name]['u_wer'].get_result_string()}") + print(f"B-WER: {ub_wer_dict[rec_name]['b_wer'].get_result_string()}") + + print('hotword: tp: {}, tn: {}, fp: {}, fn: {}, all: {}, recall: {:.2f}%'.format( + hotwords_related_dict[rec_name]['tp'], + hotwords_related_dict[rec_name]['tn'], + hotwords_related_dict[rec_name]['fp'], + hotwords_related_dict[rec_name]['fn'], + sum([v for k, v in hotwords_related_dict[rec_name].items()]), + hotwords_related_dict[rec_name]['tp'] / ( + hotwords_related_dict[rec_name]['tp'] + hotwords_related_dict[rec_name]['fn'] + ) * 100 if hotwords_related_dict[rec_name]['tp'] + hotwords_related_dict[rec_name]['fn'] != 0 else 0 + )) + + # tp: 热词在label里,同时在rec里 + # tn: 热词不在label里,同时不在rec里 + # fp: 热词不在label里,但是在rec里 + # fn: 热词在label里,但是不在rec里 + if not verbose: + print() + print() + + +if __name__ == "__main__": + args = get_args() + + # print("") + print(args) + main(args) + diff --git a/examples/industrial_data_pretraining/lcbnet/demo.sh b/examples/industrial_data_pretraining/lcbnet/demo.sh index 9515f985d..f90b8e24b 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo.sh @@ -1,13 +1,71 @@ file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +inference_device="cuda" -#CUDA_VISIBLE_DEVICES="" \ -python -m funasr.bin.inference \ ---config-path=${file_dir} \ ---config-name="config.yaml" \ -++init_param=${file_dir}/model.pb \ -++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -++input=[${file_dir}/wav.scp,${file_dir}/ocr.txt] \ -+data_type='["kaldi_ark", "text"]' \ -++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ -++output_dir="./outputs/debug" \ -++device="cpu" \ +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +else + inference_batch_size=1 + 
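+  # CPU fallback: CUDA_VISIBLE_DEVICES is rebuilt as "-1,..." (one entry per job);
+  # nj is assumed to be set beforehand when inference_device is not "cuda".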
CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done +fi + +inference_dir="outputs/slidespeech_dev_beamsearch" +_logdir="${inference_dir}/logdir" +echo "inference_dir: ${inference_dir}" + +mkdir -p "${_logdir}" +key_file1=${file_dir}/dev/wav.scp +key_file2=${file_dir}/dev/ocr.txt +split_scps1= +split_scps2= +for JOB in $(seq "${nj}"); do + split_scps1+=" ${_logdir}/wav.${JOB}.scp" + split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +done +utils/split_scp.pl "${key_file1}" ${split_scps1} +utils/split_scp.pl "${key_file2}" ${split_scps2} + +gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} + + export CUDA_VISIBLE_DEVICES=${gpuid} + + python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pb \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + +data_type='["kaldi_ark", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true &> ${_logdir}/log.${JOB}.txt + + }& +done +wait + + +mkdir -p ${inference_dir}/1best_recog + +for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" +done + +echo "Computing WER ..." +sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc +cp ${file_dir}/dev/text ${inference_dir}/1best_recog/token.ref +cp ${file_dir}/dev/ocr.list ${inference_dir}/1best_recog/ocr.list +python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer +tail -n 3 ${inference_dir}/1best_recog/token.cer + +./run_bwer_recall.sh ${inference_dir}/1best_recog/ +tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 diff --git a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh b/examples/industrial_data_pretraining/lcbnet/demo_nj.sh deleted file mode 100755 index 4aae9e5ed..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo_nj.sh +++ /dev/null @@ -1,67 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -inference_device="cuda" - -if [ ${inference_device} == "cuda" ]; then - nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -else - inference_batch_size=1 - CUDA_VISIBLE_DEVICES="" - for JOB in $(seq ${nj}); do - CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," - done -fi - -inference_dir="outputs/test" -_logdir="${inference_dir}/logdir" -echo "inference_dir: ${inference_dir}" - -mkdir -p "${_logdir}" -key_file1=${file_dir}/wav.scp -key_file2=${file_dir}/ocr.txt -split_scps1= -split_scps2= -for JOB in $(seq "${nj}"); do - split_scps1+=" ${_logdir}/wav.${JOB}.scp" - split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -done -utils/split_scp.pl "${key_file1}" ${split_scps1} -utils/split_scp.pl "${key_file2}" ${split_scps2} - -gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -for JOB in $(seq ${nj}); do - { - id=$((JOB-1)) - gpuid=${gpuid_list_array[$id]} - - export CUDA_VISIBLE_DEVICES=${gpuid} - - python -m funasr.bin.inference \ - --config-path=${file_dir} \ - --config-name="config.yaml" \ - 
++init_param=${file_dir}/model.pb \ - ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ - +data_type='["kaldi_ark", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ - ++output_dir="${inference_dir}/${JOB}" \ - ++device="${inference_device}" \ - ++ncpu=1 \ - ++disable_log=true &> ${_logdir}/log.${JOB}.txt - - }& -done -wait - - -mkdir -p ${inference_dir}/1best_recog - -for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" -done - -echo "Computing WER ..." -sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc -cp ${file_dir}/text ${inference_dir}/1best_recog/token.ref -python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer -tail -n 3 ${inference_dir}/1best_recog/token.cer \ No newline at end of file diff --git a/examples/industrial_data_pretraining/lcbnet/run_bwer_recall.sh b/examples/industrial_data_pretraining/lcbnet/run_bwer_recall.sh new file mode 100755 index 000000000..7d6b6ff8b --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/run_bwer_recall.sh @@ -0,0 +1,11 @@ +#now_result_name=asr_conformer_acc1_lr002_warm20000/decode_asr_asr_model_valid.acc.ave +#hotword_type=ocr_1ngram_top10_hotwords_list +hot_exp_suf=$1 + + +python compute_wer_details.py --v 1 \ + --ref ${hot_exp_suf}/token.ref \ + --ref_ocr ${hot_exp_suf}/ocr.list \ + --rec_name base \ + --rec_file ${hot_exp_suf}/token.proc \ + > ${hot_exp_suf}/BWER-UWER.results From d12d18886cf5d1c7daaf74bd348cf3ca7b2c8b7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 29 Feb 2024 16:20:18 +0800 Subject: [PATCH 074/101] test --- .../industrial_data_pretraining/lcbnet/demo_pdb.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 examples/industrial_data_pretraining/lcbnet/demo_pdb.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh b/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh new file mode 100644 index 000000000..e435905bf --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh @@ -0,0 +1,13 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" + +#CUDA_VISIBLE_DEVICES="" \ +python -m funasr.bin.inference \ +--config-path=${file_dir} \ +--config-name="config.yaml" \ +++init_param=${file_dir}/model.pb \ +++tokenizer_conf.token_list=${file_dir}/tokens.txt \ +++input=[${file_dir}/dev/wav.scp,${file_dir}/dev/ocr.txt] \ ++data_type='["kaldi_ark", "text"]' \ +++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ +++output_dir="./outputs/debug" \ +++device="cpu" \ From 4477e27bf08f065dacd37c82fc88e69f43805328 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 29 Feb 2024 16:26:54 +0800 Subject: [PATCH 075/101] test --- examples/industrial_data_pretraining/lcbnet/demo.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.sh b/examples/industrial_data_pretraining/lcbnet/demo.sh index f90b8e24b..825289188 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo.sh @@ -12,7 +12,7 @@ else done fi -inference_dir="outputs/slidespeech_dev_beamsearch" 
+inference_dir="outputs/slidespeech_dev_beamsearch_new" _logdir="${inference_dir}/logdir" echo "inference_dir: ${inference_dir}" From 650c506cda3f6d38ad4805f02fe2700d2287400d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 29 Feb 2024 18:57:17 +0800 Subject: [PATCH 076/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo_pdb.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 examples/industrial_data_pretraining/lcbnet/demo_pdb.sh diff --git a/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh b/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh old mode 100644 new mode 100755 From 84ad3e48a0a1c29967a4cf9195ad202d434c7860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 29 Feb 2024 18:58:08 +0800 Subject: [PATCH 077/101] atsr --- funasr/utils/load_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 87412bd87..bd8de3e87 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -34,6 +34,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) + pdb.set_trace() if kwargs.get("reduce_channels", True): data_or_path_or_list = data_or_path_or_list.mean(0) elif data_type == "text" and tokenizer is not None: From 96eaabca5b2e9c93b40c9840e2ae0003a618bb6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Thu, 29 Feb 2024 19:02:43 +0800 Subject: [PATCH 078/101] atsr --- funasr/utils/load_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index bd8de3e87..87412bd87 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -34,7 +34,6 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) - pdb.set_trace() if kwargs.get("reduce_channels", True): data_or_path_or_list = data_or_path_or_list.mean(0) elif data_type == "text" and tokenizer is not None: From 8f63be3af7264f3b0831d91e2e54800fcd246120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 10:16:03 +0800 Subject: [PATCH 079/101] atsr --- funasr/auto/auto_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 3f99e4d17..89e38eafc 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -41,6 +41,7 @@ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): chars = string.ascii_letters + string.digits if isinstance(data_in, str) and data_in.startswith('http'): # url data_in = download_from_url(data_in) + pdb.set_trace() if isinstance(data_in, str) and os.path.exists(data_in): # wav_path; filelist: wav.scp, file.jsonl;text.txt; _, file_extension = os.path.splitext(data_in) file_extension = file_extension.lower() From ec98a8e13859e4c3f1e55fca5f09e91be1b3d810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 10:26:23 +0800 Subject: [PATCH 080/101] atsr --- funasr/auto/auto_model.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 89e38eafc..ef261dfcc 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -213,7 +213,7 @@ class AutoModel: batch_size = kwargs.get("batch_size", 1) # if kwargs.get("device", "cpu") == "cpu": # batch_size = 1 - + pdb.set_trace() key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key) speed_stats = {} From fae6fd6d16e0fb060aa063790893f7555c421c4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 10:29:15 +0800 Subject: [PATCH 081/101] atsr --- funasr/bin/inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/funasr/bin/inference.py b/funasr/bin/inference.py index d2f0c149d..3943d7ed4 100644 --- a/funasr/bin/inference.py +++ b/funasr/bin/inference.py @@ -1,7 +1,7 @@ import hydra import logging from omegaconf import DictConfig, OmegaConf, ListConfig - +import pdb from funasr.auto.auto_model import AutoModel @@ -23,6 +23,7 @@ def main_hydra(cfg: DictConfig): if kwargs.get("debug", False): import pdb; pdb.set_trace() model = AutoModel(**kwargs) + pdb.set_trace() res = model.generate(input=kwargs["input"]) print(res) From 21b49bd56f44ea5921bacd8c0a3c5e35680cb405 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 10:36:27 +0800 Subject: [PATCH 082/101] atsr --- funasr/auto/auto_model.py | 2 +- funasr/bin/inference.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index ef261dfcc..56dd5b5e4 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -213,7 +213,7 @@ class AutoModel: batch_size = kwargs.get("batch_size", 1) # if kwargs.get("device", "cpu") == "cpu": # batch_size = 1 - pdb.set_trace() + key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key) speed_stats = {} diff --git a/funasr/bin/inference.py b/funasr/bin/inference.py index 3943d7ed4..d2f0c149d 100644 --- a/funasr/bin/inference.py +++ b/funasr/bin/inference.py @@ -1,7 +1,7 @@ import hydra import logging from omegaconf import DictConfig, OmegaConf, ListConfig -import pdb + from funasr.auto.auto_model import AutoModel @@ -23,7 +23,6 @@ def main_hydra(cfg: DictConfig): if kwargs.get("debug", False): import pdb; pdb.set_trace() model = AutoModel(**kwargs) - pdb.set_trace() res = model.generate(input=kwargs["input"]) print(res) From d4955ba39594dfec455c3045807927809974507f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 10:42:37 +0800 Subject: [PATCH 083/101] atsr --- funasr/utils/load_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 87412bd87..ccb5670c2 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -31,6 +31,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) + pdb.set_trace() if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": 
data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) @@ -66,7 +67,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: else: pass # print(f"unsupport data type: {data_or_path_or_list}, return raw data") - + pdb.set_trace() if audio_fs != fs and data_type != "text": resampler = torchaudio.transforms.Resample(audio_fs, fs) data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :] From 6d7b9457103264b760f79918aa13ec1b89474670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 11:10:44 +0800 Subject: [PATCH 084/101] atsr --- funasr/auto/auto_model.py | 2 +- funasr/models/lcbnet/model.py | 3 ++- funasr/utils/load_utils.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 56dd5b5e4..9bb9ce07d 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -41,7 +41,7 @@ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): chars = string.ascii_letters + string.digits if isinstance(data_in, str) and data_in.startswith('http'): # url data_in = download_from_url(data_in) - pdb.set_trace() + if isinstance(data_in, str) and os.path.exists(data_in): # wav_path; filelist: wav.scp, file.jsonl;text.txt; _, file_extension = os.path.splitext(data_in) file_extension = file_extension.lower() diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index ab557e6d8..09e6dd137 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -426,6 +426,7 @@ class LCBNet(nn.Module): tokenizer=tokenizer) time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" + pdb.set_trace() audio_sample_list = sample_list[0] ocr_sample_list = sample_list[1] speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), @@ -441,7 +442,7 @@ class LCBNet(nn.Module): encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - + pdb.set_trace() ocr_list_new = [[x + 1 if x != 0 else x for x in sublist] for sublist in ocr_sample_list] ocr = torch.tensor(ocr_list_new).to(device=kwargs["device"]) ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)).to(device=kwargs["device"]) diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index ccb5670c2..644af2324 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -31,7 +31,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list] if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file data_or_path_or_list = download_from_url(data_or_path_or_list) - pdb.set_trace() + if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file if data_type is None or data_type == "sound": data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) @@ -67,7 +67,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: else: pass # print(f"unsupport data type: {data_or_path_or_list}, return raw data") - pdb.set_trace() + if audio_fs != fs and data_type != "text": resampler = torchaudio.transforms.Resample(audio_fs, fs) data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :] 
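
The hunk above leaves the resampling path of `funasr/utils/load_utils.py` unchanged: any non-text input whose sample rate differs from the 16 kHz expected by the frontend is passed through `torchaudio.transforms.Resample`. A minimal standalone sketch of that pattern, with a hypothetical input file name (`example_8k.wav`) used purely for illustration:

```python
import torchaudio

# Load an arbitrary-rate waveform and bring it to the 16 kHz the model expects.
waveform, audio_fs = torchaudio.load("example_8k.wav")  # hypothetical input file
target_fs = 16000
waveform = waveform.mean(0)  # collapse channels, as load_utils does by default
if audio_fs != target_fs:
    resampler = torchaudio.transforms.Resample(audio_fs, target_fs)
    waveform = resampler(waveform[None, :])[0, :]
```
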
From 583f918e0ec752518a0263b09ce9b9e55f047fa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 11:18:29 +0800 Subject: [PATCH 085/101] atsr --- funasr/models/lcbnet/model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 09e6dd137..e83f8d783 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -428,7 +428,10 @@ class LCBNet(nn.Module): meta_data["load_data"] = f"{time2 - time1:0.3f}" pdb.set_trace() audio_sample_list = sample_list[0] - ocr_sample_list = sample_list[1] + if len(sample_list) >1: + ocr_sample_list = sample_list[1] + else: + ocr_sample_list = [294, 0] speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() From de21c10a2e8a7f93719902708a5ca7970d9e3f00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 11:30:49 +0800 Subject: [PATCH 086/101] atsr --- funasr/models/lcbnet/model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index e83f8d783..15e2fa1c4 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -426,12 +426,11 @@ class LCBNet(nn.Module): tokenizer=tokenizer) time2 = time.perf_counter() meta_data["load_data"] = f"{time2 - time1:0.3f}" - pdb.set_trace() audio_sample_list = sample_list[0] if len(sample_list) >1: ocr_sample_list = sample_list[1] else: - ocr_sample_list = [294, 0] + ocr_sample_list = [[294, 0]] speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend) time3 = time.perf_counter() From 52cec216726806c1a1223a305ae6c3f87bce8558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Fri, 1 Mar 2024 11:33:31 +0800 Subject: [PATCH 087/101] atsr --- funasr/models/lcbnet/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funasr/models/lcbnet/model.py b/funasr/models/lcbnet/model.py index 15e2fa1c4..3ac319c61 100644 --- a/funasr/models/lcbnet/model.py +++ b/funasr/models/lcbnet/model.py @@ -444,7 +444,7 @@ class LCBNet(nn.Module): encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple): encoder_out = encoder_out[0] - pdb.set_trace() + ocr_list_new = [[x + 1 if x != 0 else x for x in sublist] for sublist in ocr_sample_list] ocr = torch.tensor(ocr_list_new).to(device=kwargs["device"]) ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)).to(device=kwargs["device"]) From 1d7ba1be1ad824135698e8000386c1fd55268ae4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 13:52:21 +0800 Subject: [PATCH 088/101] atsr --- .gitignore | 1 + .../lcbnet/demo2.sh | 71 +++++++++++++++++++ .../lcbnet/demo2_tmp.sh | 71 +++++++++++++++++++ .../lcbnet/demo_pdb.sh | 9 ++- .../lcbnet/demo_pdb2.sh | 15 ++++ .../lcbnet/demo_tmp1.sh | 71 +++++++++++++++++++ 6 files changed, 236 insertions(+), 2 deletions(-) create mode 100755 examples/industrial_data_pretraining/lcbnet/demo2.sh create mode 100755 examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh create mode 100755 examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh create mode 100755 examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh diff --git a/.gitignore b/.gitignore index bdfe70f1a..d2b4c53b9 100644 --- a/.gitignore +++ b/.gitignore @@ -25,4 
+25,5 @@ outputs* emotion2vec* GPT-SoVITS* examples/*/*/outputs +examples/*/*/exp cmd_read diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh new file mode 100755 index 000000000..69df6d16c --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo2.sh @@ -0,0 +1,71 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +inference_device="cuda" +test_set="dev_wav" +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done +fi + +inference_dir="outputs/slidespeech_dev_beamsearch_wav" +_logdir="${inference_dir}/logdir" +echo "inference_dir: ${inference_dir}" + +mkdir -p "${_logdir}" +key_file1=${file_dir}/${test_set}/wav.scp +key_file2=${file_dir}/${test_set}/ocr.txt +split_scps1= +split_scps2= +for JOB in $(seq "${nj}"); do + split_scps1+=" ${_logdir}/wav.${JOB}.scp" + split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +done +utils/split_scp.pl "${key_file1}" ${split_scps1} +utils/split_scp.pl "${key_file2}" ${split_scps2} + +gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} + + export CUDA_VISIBLE_DEVICES=${gpuid} + + python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pb \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + +data_type='["sound", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true &> ${_logdir}/log.${JOB}.txt + + }& +done +wait + + +mkdir -p ${inference_dir}/1best_recog + +for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" +done + +echo "Computing WER ..." 
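+# Re-join BPE pieces for scoring: turn the first space into a tab (utt-id / text
+# separator), drop the remaining spaces, then map the "▁" marker back to a space.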
+sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc +cp ${file_dir}/${test_set}/text ${inference_dir}/1best_recog/token.ref +cp ${file_dir}/${test_set}/ocr.list ${inference_dir}/1best_recog/ocr.list +python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer +tail -n 3 ${inference_dir}/1best_recog/token.cer + +./run_bwer_recall.sh ${inference_dir}/1best_recog/ +tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 diff --git a/examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh b/examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh new file mode 100755 index 000000000..da6ad686d --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh @@ -0,0 +1,71 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +inference_device="cuda" +test_set="test_wav" +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done +fi + +inference_dir="outputs/slidespeech_test_beamsearch_wav" +_logdir="${inference_dir}/logdir" +echo "inference_dir: ${inference_dir}" + +mkdir -p "${_logdir}" +key_file1=${file_dir}/${test_set}/wav.scp +key_file2=${file_dir}/${test_set}/ocr.txt +split_scps1= +split_scps2= +for JOB in $(seq "${nj}"); do + split_scps1+=" ${_logdir}/wav.${JOB}.scp" + split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +done +utils/split_scp.pl "${key_file1}" ${split_scps1} +utils/split_scp.pl "${key_file2}" ${split_scps2} + +gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} + + export CUDA_VISIBLE_DEVICES=${gpuid} + + python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pb \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + +data_type='["sound", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true &> ${_logdir}/log.${JOB}.txt + + }& +done +wait + + +mkdir -p ${inference_dir}/1best_recog + +for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" +done + +echo "Computing WER ..." 
+sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc +cp ${file_dir}/${test_set}/text ${inference_dir}/1best_recog/token.ref +cp ${file_dir}/${test_set}/ocr.list ${inference_dir}/1best_recog/ocr.list +python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer +tail -n 3 ${inference_dir}/1best_recog/token.cer + +./run_bwer_recall.sh ${inference_dir}/1best_recog/ +tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 diff --git a/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh b/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh index e435905bf..0747a8d7b 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh @@ -6,8 +6,13 @@ python -m funasr.bin.inference \ --config-name="config.yaml" \ ++init_param=${file_dir}/model.pb \ ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -++input=[${file_dir}/dev/wav.scp,${file_dir}/dev/ocr.txt] \ -+data_type='["kaldi_ark", "text"]' \ ++input=["${file_dir}/example/asr_example.wav","${file_dir}/example/ocr.txt"] \ ++data_type='["sound","text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ ++output_dir="./outputs/debug" \ ++device="cpu" \ + +#++input=["/nfs/yufan.yf/workspace/espnet/egs2/youtube_ppt/asr/dump/raw/dev_oracle_v1_new/data/format.1/YTB+--tMoLpQI-w+00322.wav"] \ +#+data_type='["sound"]' \ +#++input=["/nfs/yufan.yf/workspace/espnet/egs2/youtube_ppt/asr/dump/raw/dev_oracle_v1_new/data/format.1/YTB+--tMoLpQI-w+00322.wav","/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch/example/ocr2.txt"] \ +#+data_type='["sound","text"]' \ diff --git a/examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh b/examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh new file mode 100755 index 000000000..557e9b2d8 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh @@ -0,0 +1,15 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" + +#CUDA_VISIBLE_DEVICES="" \ +python -m funasr.bin.inference \ +--config-path=${file_dir} \ +--config-name="config.yaml" \ +++init_param=${file_dir}/model.pb \ +++tokenizer_conf.token_list=${file_dir}/tokens.txt \ +++input=[${file_dir}/dev_wav/wav.scp,${file_dir}/dev_wav/ocr.txt] \ ++data_type='["sound", "text"]' \ +++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ +++output_dir="./outputs/debug" \ +++device="cpu" \ + +#++input=[${file_dir}/dev_wav/wav.scp,${file_dir}/dev_wav/ocr.txt] \ diff --git a/examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh b/examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh new file mode 100755 index 000000000..488f7d2a1 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh @@ -0,0 +1,71 @@ +file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +inference_device="cuda" + +if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done +fi + 
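+# One decoding job is launched per GPU id in CUDA_VISIBLE_DEVICES; the per-job
+# outputs are concatenated after 'wait' and then scored.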
+inference_dir="outputs/slidespeech_test_beamsearch_new" +_logdir="${inference_dir}/logdir" +echo "inference_dir: ${inference_dir}" + +mkdir -p "${_logdir}" +key_file1=${file_dir}/test/wav.scp +key_file2=${file_dir}/test/ocr.txt +split_scps1= +split_scps2= +for JOB in $(seq "${nj}"); do + split_scps1+=" ${_logdir}/wav.${JOB}.scp" + split_scps2+=" ${_logdir}/ocr.${JOB}.txt" +done +utils/split_scp.pl "${key_file1}" ${split_scps1} +utils/split_scp.pl "${key_file2}" ${split_scps2} + +gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) +for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} + + export CUDA_VISIBLE_DEVICES=${gpuid} + + python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pb \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ + +data_type='["kaldi_ark", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true &> ${_logdir}/log.${JOB}.txt + + }& +done +wait + + +mkdir -p ${inference_dir}/1best_recog + +for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" +done + +echo "Computing WER ..." +sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc +cp ${file_dir}/test/text ${inference_dir}/1best_recog/token.ref +cp ${file_dir}/test/ocr.list ${inference_dir}/1best_recog/ocr.list +python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer +tail -n 3 ${inference_dir}/1best_recog/token.cer + +./run_bwer_recall.sh ${inference_dir}/1best_recog/ +tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 From aa45aeeaa7e8abd11bc1be392b4547685645ca5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 14:29:52 +0800 Subject: [PATCH 089/101] atsr --- .../lcbnet/demp.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 examples/industrial_data_pretraining/lcbnet/demp.py diff --git a/examples/industrial_data_pretraining/lcbnet/demp.py b/examples/industrial_data_pretraining/lcbnet/demp.py new file mode 100644 index 000000000..cb08290b4 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/demp.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. 
+# MIT License (https://opensource.org/licenses/MIT) + +from funasr import AutoModel + +model = AutoModel(model="iic/LCB-NET" + ) + + +# example1 +#res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", + ) +#print(res) + + +''' +# tensor or numpy as input +# example2 +import torchaudio +import os +wav_file = os.path.join(model.model_path, "example/asr_example.wav") +input_tensor, sample_rate = torchaudio.load(wav_file) +input_tensor = input_tensor.mean(0) +res = model.generate(input=[input_tensor], batch_size_s=300, is_final=True) + + +# example3 +import soundfile + +wav_file = os.path.join(model.model_path, "example/asr_example.wav") +speech, sample_rate = soundfile.read(wav_file) +res = model.generate(input=[speech], batch_size_s=300, is_final=True) +''' From 4e881fc2be339718b771b7469d9b83be89943fa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 14:32:10 +0800 Subject: [PATCH 090/101] atsr --- .../industrial_data_pretraining/lcbnet/{demp.py => demo.py} | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) rename examples/industrial_data_pretraining/lcbnet/{demp.py => demo.py} (91%) mode change 100644 => 100755 diff --git a/examples/industrial_data_pretraining/lcbnet/demp.py b/examples/industrial_data_pretraining/lcbnet/demo.py old mode 100644 new mode 100755 similarity index 91% rename from examples/industrial_data_pretraining/lcbnet/demp.py rename to examples/industrial_data_pretraining/lcbnet/demo.py index cb08290b4..b9d70c779 --- a/examples/industrial_data_pretraining/lcbnet/demp.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -5,13 +5,12 @@ from funasr import AutoModel -model = AutoModel(model="iic/LCB-NET" - ) +model = AutoModel(model="iic/LCB-NET") # example1 #res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", - ) +# ) #print(res) From 8cca996c7003a5756d09a6a278a9e7efd23b5701 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 14:45:03 +0800 Subject: [PATCH 091/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.py b/examples/industrial_data_pretraining/lcbnet/demo.py index b9d70c779..d8e6c9755 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -5,7 +5,8 @@ from funasr import AutoModel -model = AutoModel(model="iic/LCB-NET") +model = AutoModel(model="iic/LCB-NET", + model_revision="v1.0.0") # example1 From 9070774ab6c9a7149d31240fb0d686485f30f8e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 15:21:47 +0800 Subject: [PATCH 092/101] atsr --- .../lcbnet/README.md | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 examples/industrial_data_pretraining/lcbnet/README.md diff --git a/examples/industrial_data_pretraining/lcbnet/README.md b/examples/industrial_data_pretraining/lcbnet/README.md new file mode 100644 index 000000000..4273ec085 --- /dev/null +++ b/examples/industrial_data_pretraining/lcbnet/README.md @@ -0,0 +1,105 @@ +--- +tasks: +- audio-visual-speech-recognition +domain: +- audio, visual +model-type: +- Autoregressive +frameworks: +- pytorch +backbone: +- transformer/conformer +metrics: +- WER/B-WER +license: Apache License 2.0 +language: +- en +tags: +- FunASR +- Alibaba +- ICASSP 2024 +- 
Audio-Visual +- Hotword +- Long-Context Biasing +datasets: + train: + - SlideSpeech corpus + test: + - dev and test of SlideSpeech corpus +indexing: + results: + - task: + name: Audio-Visual Speech Recognition + dataset: + name: SlideSpeech corpus + type: audio # optional + args: 16k sampling rate, 5002 bpe units # optional + metrics: + - type: WER + value: 18.8% # float + description: beamsearch search, withou lm, avg. + args: default + +widgets: + - task: audio-visual-speech-recognition + inputs: + - type: audio + name: input + title: 音频 + - type: text + name: input + title: OCR识别文本 +finetune-support: True +--- + + +# Paraformer-large模型介绍 + +## Highlights +- 热词版本:[Paraformer-large热词版模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary)支持热词定制功能,基于提供的热词列表进行激励增强,提升热词的召回率和准确率。 +- 长音频版本:[Paraformer-large长音频模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),集成VAD、ASR、标点与时间戳功能,可直接对时长为数小时音频进行识别,并输出带标点文字与时间戳。 + +## [FunASR开源项目介绍](https://github.com/alibaba-damo-academy/FunASR) +[FunASR](https://github.com/alibaba-damo-academy/FunASR)希望在语音识别的学术研究和工业应用之间架起一座桥梁。通过发布工业级语音识别模型的训练和微调,研究人员和开发人员可以更方便地进行语音识别模型的研究和生产,并推动语音识别生态的发展。让语音识别更有趣! + +[**github仓库**](https://github.com/alibaba-damo-academy/FunASR) +| [**最新动态**](https://github.com/alibaba-damo-academy/FunASR#whats-new) +| [**环境安装**](https://github.com/alibaba-damo-academy/FunASR#installation) +| [**服务部署**](https://www.funasr.com) +| [**模型库**](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo) +| [**联系我们**](https://github.com/alibaba-damo-academy/FunASR#contact) + + +## 模型原理介绍 + +随着在线会议和课程越来越普遍,如何利用视频幻灯片中丰富的文本信息来改善语音识别(Automatic  Speech Recognition, ASR)面临着新的挑战。视频中的幻灯片与语音实时同步,相比于统一的稀有词列表,能够提供更长的上下文相关信息。因此,我们提出了一种创新的长上下文偏置网络(LCB-net),用于音频-视觉语音识别(Audio-Visual Speech Recognition,AVSR),以更好地利用视频中的长时上下文信息。 + +

+AVSR整体流程框架 +

+LCB-NET模型结构 + + +具体来说,我们首先使用OCR技术来检测和识别幻灯片中的文本内容,其次我们采用关键词提取技术来获取文本内容中的关键词短语。最后,我们将关键词拼接成长上下文文本和音频同时输入到我们的LCB-net模型中进行识别。而LCB-net模型采用了双编码器结构,同时建模音频和长上下文文本信息。此外,我们还引入了一个显式的偏置词预测模块,通过使用二元交叉熵(BCE)损失函数显式预测长上下文文本中在音频中出现的关键偏置词。此外,为增强LCB-net的泛化能力和稳健性,我们还采用了动态的关键词模拟策略。实验证明,我们提出的LCB-net热词模型,不仅能够提升关键词的识别效果,同时也能够提升非关键词的识别效果。具体实验结果如下所示: + +

+实验结果 + + +更详细的细节见: +- 论文: [LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition](https://arxiv.org/abs/2401.06390) + + + + + +## 相关论文以及引用信息 + +```BibTeX +@inproceedings{yu2024lcbnet, + title={LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition}, + author={Fan Yu, Haoxu Wang, Xian Shi, Shiliang Zhang}, + booktitle={ICASSP}, + year={2024} +} +``` \ No newline at end of file From beb5db6c1c44664a1415dc85e7cf441504f1c2a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 15:29:03 +0800 Subject: [PATCH 093/101] atsr --- examples/industrial_data_pretraining/lcbnet/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/README.md b/examples/industrial_data_pretraining/lcbnet/README.md index 4273ec085..c0d4b1ecb 100644 --- a/examples/industrial_data_pretraining/lcbnet/README.md +++ b/examples/industrial_data_pretraining/lcbnet/README.md @@ -75,9 +75,9 @@ finetune-support: True 随着在线会议和课程越来越普遍,如何利用视频幻灯片中丰富的文本信息来改善语音识别(Automatic  Speech Recognition, ASR)面临着新的挑战。视频中的幻灯片与语音实时同步,相比于统一的稀有词列表,能够提供更长的上下文相关信息。因此,我们提出了一种创新的长上下文偏置网络(LCB-net),用于音频-视觉语音识别(Audio-Visual Speech Recognition,AVSR),以更好地利用视频中的长时上下文信息。

-AVSR整体流程框架 +AVSR整体流程框架

-LCB-NET模型结构 +LCB-NET模型结构 具体来说,我们首先使用OCR技术来检测和识别幻灯片中的文本内容,其次我们采用关键词提取技术来获取文本内容中的关键词短语。最后,我们将关键词拼接成长上下文文本和音频同时输入到我们的LCB-net模型中进行识别。而LCB-net模型采用了双编码器结构,同时建模音频和长上下文文本信息。此外,我们还引入了一个显式的偏置词预测模块,通过使用二元交叉熵(BCE)损失函数显式预测长上下文文本中在音频中出现的关键偏置词。此外,为增强LCB-net的泛化能力和稳健性,我们还采用了动态的关键词模拟策略。实验证明,我们提出的LCB-net热词模型,不仅能够提升关键词的识别效果,同时也能够提升非关键词的识别效果。具体实验结果如下所示: From 9876729a257b2ebb6c9289e7d442c8e086d96e7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 15:41:22 +0800 Subject: [PATCH 094/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.py b/examples/industrial_data_pretraining/lcbnet/demo.py index d8e6c9755..fe51f08e1 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -10,9 +10,8 @@ model = AutoModel(model="iic/LCB-NET", # example1 -#res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", -# ) -#print(res) +res = model.generate(input='["~/.cache/modelscope/hub/iic/LCB-NET/example/asr_example.wav","~/.cache/modelscope/hub/iic/LCB-NET/example/ocr.txt"]',data_type='["sound", "text"]') +print(res) ''' From fd365acc9452e0e8fdf2cb8da82fb8ccde0326b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 15:45:53 +0800 Subject: [PATCH 095/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.py b/examples/industrial_data_pretraining/lcbnet/demo.py index fe51f08e1..d0870bc8a 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -11,6 +11,7 @@ model = AutoModel(model="iic/LCB-NET", # example1 res = model.generate(input='["~/.cache/modelscope/hub/iic/LCB-NET/example/asr_example.wav","~/.cache/modelscope/hub/iic/LCB-NET/example/ocr.txt"]',data_type='["sound", "text"]') + print(res) From 1a6d9d5cc422dcd1e6dd5b9c67047d63bc6cd667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 16:32:28 +0800 Subject: [PATCH 096/101] atsr --- .gitignore | 1 + .../lcbnet/demo.py | 2 +- .../lcbnet/demo.sh | 10 +-- .../lcbnet/demo2.sh | 71 ------------------- .../lcbnet/demo2_tmp.sh | 71 ------------------- .../lcbnet/demo_pdb.sh | 18 ----- .../lcbnet/demo_pdb2.sh | 15 ---- .../lcbnet/demo_tmp1.sh | 71 ------------------- funasr/utils/load_utils.py | 2 - 9 files changed, 7 insertions(+), 254 deletions(-) delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo2.sh delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo_pdb.sh delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh delete mode 100755 examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh diff --git a/.gitignore b/.gitignore index d2b4c53b9..1f2a3d1a7 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,5 @@ emotion2vec* GPT-SoVITS* examples/*/*/outputs examples/*/*/exp +examples/*/*/tmp cmd_read diff --git a/examples/industrial_data_pretraining/lcbnet/demo.py b/examples/industrial_data_pretraining/lcbnet/demo.py index d0870bc8a..602a986d5 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -10,7 +10,7 @@ model = AutoModel(model="iic/LCB-NET", # 
example1 -res = model.generate(input='["~/.cache/modelscope/hub/iic/LCB-NET/example/asr_example.wav","~/.cache/modelscope/hub/iic/LCB-NET/example/ocr.txt"]',data_type='["sound", "text"]') +res = model.generate(input=("https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav","https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt"),data_type=("sound", "text")) print(res) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.sh b/examples/industrial_data_pretraining/lcbnet/demo.sh index 825289188..3e04ccd1e 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo.sh @@ -1,5 +1,5 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +file_dir="/home/yf352572/.cache/modelscope/hub/iic/LCB-NET/" +CUDA_VISIBLE_DEVICES="0,1" inference_device="cuda" if [ ${inference_device} == "cuda" ]; then @@ -12,7 +12,7 @@ else done fi -inference_dir="outputs/slidespeech_dev_beamsearch_new" +inference_dir="outputs/slidespeech_dev" _logdir="${inference_dir}/logdir" echo "inference_dir: ${inference_dir}" @@ -39,11 +39,11 @@ for JOB in $(seq ${nj}); do python -m funasr.bin.inference \ --config-path=${file_dir} \ --config-name="config.yaml" \ - ++init_param=${file_dir}/model.pb \ + ++init_param=${file_dir}/model.pt \ ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ +data_type='["kaldi_ark", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.pt \ ++output_dir="${inference_dir}/${JOB}" \ ++device="${inference_device}" \ ++ncpu=1 \ diff --git a/examples/industrial_data_pretraining/lcbnet/demo2.sh b/examples/industrial_data_pretraining/lcbnet/demo2.sh deleted file mode 100755 index 69df6d16c..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo2.sh +++ /dev/null @@ -1,71 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -inference_device="cuda" -test_set="dev_wav" -if [ ${inference_device} == "cuda" ]; then - nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -else - inference_batch_size=1 - CUDA_VISIBLE_DEVICES="" - for JOB in $(seq ${nj}); do - CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," - done -fi - -inference_dir="outputs/slidespeech_dev_beamsearch_wav" -_logdir="${inference_dir}/logdir" -echo "inference_dir: ${inference_dir}" - -mkdir -p "${_logdir}" -key_file1=${file_dir}/${test_set}/wav.scp -key_file2=${file_dir}/${test_set}/ocr.txt -split_scps1= -split_scps2= -for JOB in $(seq "${nj}"); do - split_scps1+=" ${_logdir}/wav.${JOB}.scp" - split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -done -utils/split_scp.pl "${key_file1}" ${split_scps1} -utils/split_scp.pl "${key_file2}" ${split_scps2} - -gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -for JOB in $(seq ${nj}); do - { - id=$((JOB-1)) - gpuid=${gpuid_list_array[$id]} - - export CUDA_VISIBLE_DEVICES=${gpuid} - - python -m funasr.bin.inference \ - --config-path=${file_dir} \ - --config-name="config.yaml" \ - ++init_param=${file_dir}/model.pb \ - ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ - 
+data_type='["sound", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ - ++output_dir="${inference_dir}/${JOB}" \ - ++device="${inference_device}" \ - ++ncpu=1 \ - ++disable_log=true &> ${_logdir}/log.${JOB}.txt - - }& -done -wait - - -mkdir -p ${inference_dir}/1best_recog - -for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" -done - -echo "Computing WER ..." -sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc -cp ${file_dir}/${test_set}/text ${inference_dir}/1best_recog/token.ref -cp ${file_dir}/${test_set}/ocr.list ${inference_dir}/1best_recog/ocr.list -python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer -tail -n 3 ${inference_dir}/1best_recog/token.cer - -./run_bwer_recall.sh ${inference_dir}/1best_recog/ -tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 diff --git a/examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh b/examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh deleted file mode 100755 index da6ad686d..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo2_tmp.sh +++ /dev/null @@ -1,71 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -inference_device="cuda" -test_set="test_wav" -if [ ${inference_device} == "cuda" ]; then - nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -else - inference_batch_size=1 - CUDA_VISIBLE_DEVICES="" - for JOB in $(seq ${nj}); do - CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," - done -fi - -inference_dir="outputs/slidespeech_test_beamsearch_wav" -_logdir="${inference_dir}/logdir" -echo "inference_dir: ${inference_dir}" - -mkdir -p "${_logdir}" -key_file1=${file_dir}/${test_set}/wav.scp -key_file2=${file_dir}/${test_set}/ocr.txt -split_scps1= -split_scps2= -for JOB in $(seq "${nj}"); do - split_scps1+=" ${_logdir}/wav.${JOB}.scp" - split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -done -utils/split_scp.pl "${key_file1}" ${split_scps1} -utils/split_scp.pl "${key_file2}" ${split_scps2} - -gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -for JOB in $(seq ${nj}); do - { - id=$((JOB-1)) - gpuid=${gpuid_list_array[$id]} - - export CUDA_VISIBLE_DEVICES=${gpuid} - - python -m funasr.bin.inference \ - --config-path=${file_dir} \ - --config-name="config.yaml" \ - ++init_param=${file_dir}/model.pb \ - ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ - +data_type='["sound", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ - ++output_dir="${inference_dir}/${JOB}" \ - ++device="${inference_device}" \ - ++ncpu=1 \ - ++disable_log=true &> ${_logdir}/log.${JOB}.txt - - }& -done -wait - - -mkdir -p ${inference_dir}/1best_recog - -for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" -done - -echo "Computing WER ..." 
-sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc -cp ${file_dir}/${test_set}/text ${inference_dir}/1best_recog/token.ref -cp ${file_dir}/${test_set}/ocr.list ${inference_dir}/1best_recog/ocr.list -python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer -tail -n 3 ${inference_dir}/1best_recog/token.cer - -./run_bwer_recall.sh ${inference_dir}/1best_recog/ -tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 diff --git a/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh b/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh deleted file mode 100755 index 0747a8d7b..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo_pdb.sh +++ /dev/null @@ -1,18 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" - -#CUDA_VISIBLE_DEVICES="" \ -python -m funasr.bin.inference \ ---config-path=${file_dir} \ ---config-name="config.yaml" \ -++init_param=${file_dir}/model.pb \ -++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -+input=["${file_dir}/example/asr_example.wav","${file_dir}/example/ocr.txt"] \ -+data_type='["sound","text"]' \ -++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ -++output_dir="./outputs/debug" \ -++device="cpu" \ - -#++input=["/nfs/yufan.yf/workspace/espnet/egs2/youtube_ppt/asr/dump/raw/dev_oracle_v1_new/data/format.1/YTB+--tMoLpQI-w+00322.wav"] \ -#+data_type='["sound"]' \ -#++input=["/nfs/yufan.yf/workspace/espnet/egs2/youtube_ppt/asr/dump/raw/dev_oracle_v1_new/data/format.1/YTB+--tMoLpQI-w+00322.wav","/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch/example/ocr2.txt"] \ -#+data_type='["sound","text"]' \ diff --git a/examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh b/examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh deleted file mode 100755 index 557e9b2d8..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo_pdb2.sh +++ /dev/null @@ -1,15 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" - -#CUDA_VISIBLE_DEVICES="" \ -python -m funasr.bin.inference \ ---config-path=${file_dir} \ ---config-name="config.yaml" \ -++init_param=${file_dir}/model.pb \ -++tokenizer_conf.token_list=${file_dir}/tokens.txt \ -++input=[${file_dir}/dev_wav/wav.scp,${file_dir}/dev_wav/ocr.txt] \ -+data_type='["sound", "text"]' \ -++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ -++output_dir="./outputs/debug" \ -++device="cpu" \ - -#++input=[${file_dir}/dev_wav/wav.scp,${file_dir}/dev_wav/ocr.txt] \ diff --git a/examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh b/examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh deleted file mode 100755 index 488f7d2a1..000000000 --- a/examples/industrial_data_pretraining/lcbnet/demo_tmp1.sh +++ /dev/null @@ -1,71 +0,0 @@ -file_dir="/nfs/yufan.yf/workspace/github/FunASR/examples/industrial_data_pretraining/lcbnet/exp/speech_lcbnet_contextual_asr-en-16k-bpe-vocab5002-pytorch" -CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -inference_device="cuda" - -if [ ${inference_device} == "cuda" ]; then - nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -else - inference_batch_size=1 - CUDA_VISIBLE_DEVICES="" - for JOB in 
$(seq ${nj}); do - CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," - done -fi - -inference_dir="outputs/slidespeech_test_beamsearch_new" -_logdir="${inference_dir}/logdir" -echo "inference_dir: ${inference_dir}" - -mkdir -p "${_logdir}" -key_file1=${file_dir}/test/wav.scp -key_file2=${file_dir}/test/ocr.txt -split_scps1= -split_scps2= -for JOB in $(seq "${nj}"); do - split_scps1+=" ${_logdir}/wav.${JOB}.scp" - split_scps2+=" ${_logdir}/ocr.${JOB}.txt" -done -utils/split_scp.pl "${key_file1}" ${split_scps1} -utils/split_scp.pl "${key_file2}" ${split_scps2} - -gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) -for JOB in $(seq ${nj}); do - { - id=$((JOB-1)) - gpuid=${gpuid_list_array[$id]} - - export CUDA_VISIBLE_DEVICES=${gpuid} - - python -m funasr.bin.inference \ - --config-path=${file_dir} \ - --config-name="config.yaml" \ - ++init_param=${file_dir}/model.pb \ - ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ - +data_type='["kaldi_ark", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.model \ - ++output_dir="${inference_dir}/${JOB}" \ - ++device="${inference_device}" \ - ++ncpu=1 \ - ++disable_log=true &> ${_logdir}/log.${JOB}.txt - - }& -done -wait - - -mkdir -p ${inference_dir}/1best_recog - -for JOB in $(seq "${nj}"); do - cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token" -done - -echo "Computing WER ..." -sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc -cp ${file_dir}/test/text ${inference_dir}/1best_recog/token.ref -cp ${file_dir}/test/ocr.list ${inference_dir}/1best_recog/ocr.list -python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer -tail -n 3 ${inference_dir}/1best_recog/token.cer - -./run_bwer_recall.sh ${inference_dir}/1best_recog/ -tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5 diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index 644af2324..84c38f9b9 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -89,8 +89,6 @@ def load_bytes(input): return array def extract_fbank(data, data_len = None, data_type: str="sound", frontend=None, **kwargs): - # import pdb; - # pdb.set_trace() if isinstance(data, np.ndarray): data = torch.from_numpy(data) if len(data.shape) < 2: From d9e60d9ddc92ab5746842b5a2b6f7a423de2a795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 16:44:31 +0800 Subject: [PATCH 097/101] atsr --- .../lcbnet/README.md | 73 +++++++++++++++++++ .../lcbnet/demo.py | 26 +------ 2 files changed, 75 insertions(+), 24 deletions(-) diff --git a/examples/industrial_data_pretraining/lcbnet/README.md b/examples/industrial_data_pretraining/lcbnet/README.md index c0d4b1ecb..ff75b40b8 100644 --- a/examples/industrial_data_pretraining/lcbnet/README.md +++ b/examples/industrial_data_pretraining/lcbnet/README.md @@ -91,6 +91,79 @@ finetune-support: True +## 基于ModelScope进行推理 + +- 推理支持音频格式如下: + - wav文件路径,例如:data/test/asr_example.wav + - pcm文件路径,例如:data/test/asr_example.pcm + - ark文件路径,例如:data/test/data.ark + - wav文件url,例如:https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav + - wav二进制数据,格式bytes,例如:用户直接从文件里读出bytes数据或者是麦克风录出bytes数据。 + - 已解析的audio音频,例如:audio, rate = soundfile.read("asr_example_zh.wav"),类型为numpy.ndarray或者torch.Tensor。 + - 
wav.scp文件,需符合如下要求(以下分别为sound和kaldi_ark格式): + +```sh +cat wav.scp +asr_example1 data/test/asr_example1.wav +asr_example2 data/test/asr_example2.wav + +cat wav.scp +asr_example1 data/test/data_wav.ark:22 +asr_example2 data/test/data_wav.ark:90445 +... +``` + +- 推理支持OCR预测文本格式如下: + - ocr.txt文件,需符合如下要求: +```sh +cat ocr.txt +asr_example1 ANIMAL RIGHTS MANAGER PLOEG +asr_example2 UNIVERSITY CAMPUS DEANO +... +``` + +- 若输入格式wav文件和ocr文件均为url,api调用方式可参考如下范例: + +```python +from funasr import AutoModel + +model = AutoModel(model="iic/LCB-NET", + model_revision="v2.0.0") +res = model.generate(input=("https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav","https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt"),data_type=("sound", "text")) +``` + + +## 复现论文中的结果 +```python +python -m funasr.bin.inference \ + --config-path=${file_dir} \ + --config-name="config.yaml" \ + ++init_param=${file_dir}/model.pt \ + ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ + ++input=[${_logdir}/wav.scp,${_logdir}/ocr.txt] \ + +data_type='["kaldi_ark", "text"]' \ + ++tokenizer_conf.bpemodel=${file_dir}/bpe.pt \ + ++output_dir="${inference_dir}/results" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true + +``` + + +识别结果输出路径结构如下: + +```sh +tree output_dir/ +output_dir/ +└── 1best_recog + ├── text + └── token +``` + +token:语音识别结果文件 + +可以使用funasr里面提供的run_bwer_recall.sh计算WER、BWER、UWER和Recall。 ## 相关论文以及引用信息 diff --git a/examples/industrial_data_pretraining/lcbnet/demo.py b/examples/industrial_data_pretraining/lcbnet/demo.py index 602a986d5..ac679cec8 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -6,30 +6,8 @@ from funasr import AutoModel model = AutoModel(model="iic/LCB-NET", - model_revision="v1.0.0") + model_revision="v2.0.0") - -# example1 res = model.generate(input=("https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav","https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt"),data_type=("sound", "text")) -print(res) - - -''' -# tensor or numpy as input -# example2 -import torchaudio -import os -wav_file = os.path.join(model.model_path, "example/asr_example.wav") -input_tensor, sample_rate = torchaudio.load(wav_file) -input_tensor = input_tensor.mean(0) -res = model.generate(input=[input_tensor], batch_size_s=300, is_final=True) - - -# example3 -import soundfile - -wav_file = os.path.join(model.model_path, "example/asr_example.wav") -speech, sample_rate = soundfile.read(wav_file) -res = model.generate(input=[speech], batch_size_s=300, is_final=True) -''' +print(res) \ No newline at end of file From 9c884c566ff1a7c26f0f28e8d8ad4deb281a954d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 16:48:01 +0800 Subject: [PATCH 098/101] atsr --- examples/industrial_data_pretraining/lcbnet/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/README.md b/examples/industrial_data_pretraining/lcbnet/README.md index ff75b40b8..923588668 100644 --- a/examples/industrial_data_pretraining/lcbnet/README.md +++ b/examples/industrial_data_pretraining/lcbnet/README.md @@ -164,7 +164,7 @@ output_dir/ token:语音识别结果文件 可以使用funasr里面提供的run_bwer_recall.sh计算WER、BWER、UWER和Recall。 - 
+详细脚本可以参考funasr里面的demo.sh脚本,需要注意的是你需要修改一下iic/LCB-NET/conf.yaml中CMVN(stats_file)的路径和iic/LCB-NET/dev/wav.scp里面ark的路径,修改为你自己本地的路径,然后跑解码。 ## 相关论文以及引用信息 From 015b1e424e9ee96c770f3221deef4c20a544e883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 17:06:21 +0800 Subject: [PATCH 099/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.sh b/examples/industrial_data_pretraining/lcbnet/demo.sh index 3e04ccd1e..2f226bc03 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.sh +++ b/examples/industrial_data_pretraining/lcbnet/demo.sh @@ -44,6 +44,7 @@ for JOB in $(seq ${nj}); do ++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \ +data_type='["kaldi_ark", "text"]' \ ++tokenizer_conf.bpemodel=${file_dir}/bpe.pt \ + ++normalize_conf.stats_file=${file_dir}/am.mvn \ ++output_dir="${inference_dir}/${JOB}" \ ++device="${inference_device}" \ ++ncpu=1 \ From a2f263bd05498cf4f35d78ee0ee8755ba84d09ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 17:09:05 +0800 Subject: [PATCH 100/101] atsr --- examples/industrial_data_pretraining/lcbnet/demo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/industrial_data_pretraining/lcbnet/demo.py b/examples/industrial_data_pretraining/lcbnet/demo.py index ac679cec8..4ca52553f 100755 --- a/examples/industrial_data_pretraining/lcbnet/demo.py +++ b/examples/industrial_data_pretraining/lcbnet/demo.py @@ -6,7 +6,7 @@ from funasr import AutoModel model = AutoModel(model="iic/LCB-NET", - model_revision="v2.0.0") + model_revision="v1.0.0") res = model.generate(input=("https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav","https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt"),data_type=("sound", "text")) From 1162dee2dd2971243607bb58a766987acda6e9ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AF=AD=E5=B8=86?= Date: Mon, 4 Mar 2024 17:11:46 +0800 Subject: [PATCH 101/101] atsr --- .../lcbnet/README.md | 178 ------------------ 1 file changed, 178 deletions(-) delete mode 100644 examples/industrial_data_pretraining/lcbnet/README.md diff --git a/examples/industrial_data_pretraining/lcbnet/README.md b/examples/industrial_data_pretraining/lcbnet/README.md deleted file mode 100644 index 923588668..000000000 --- a/examples/industrial_data_pretraining/lcbnet/README.md +++ /dev/null @@ -1,178 +0,0 @@ ---- -tasks: -- audio-visual-speech-recognition -domain: -- audio, visual -model-type: -- Autoregressive -frameworks: -- pytorch -backbone: -- transformer/conformer -metrics: -- WER/B-WER -license: Apache License 2.0 -language: -- en -tags: -- FunASR -- Alibaba -- ICASSP 2024 -- Audio-Visual -- Hotword -- Long-Context Biasing -datasets: - train: - - SlideSpeech corpus - test: - - dev and test of SlideSpeech corpus -indexing: - results: - - task: - name: Audio-Visual Speech Recognition - dataset: - name: SlideSpeech corpus - type: audio # optional - args: 16k sampling rate, 5002 bpe units # optional - metrics: - - type: WER - value: 18.8% # float - description: beamsearch search, withou lm, avg. 
- args: default - -widgets: - - task: audio-visual-speech-recognition - inputs: - - type: audio - name: input - title: 音频 - - type: text - name: input - title: OCR识别文本 -finetune-support: True ---- - - -# Paraformer-large模型介绍 - -## Highlights -- 热词版本:[Paraformer-large热词版模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary)支持热词定制功能,基于提供的热词列表进行激励增强,提升热词的召回率和准确率。 -- 长音频版本:[Paraformer-large长音频模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),集成VAD、ASR、标点与时间戳功能,可直接对时长为数小时音频进行识别,并输出带标点文字与时间戳。 - -## [FunASR开源项目介绍](https://github.com/alibaba-damo-academy/FunASR) -[FunASR](https://github.com/alibaba-damo-academy/FunASR)希望在语音识别的学术研究和工业应用之间架起一座桥梁。通过发布工业级语音识别模型的训练和微调,研究人员和开发人员可以更方便地进行语音识别模型的研究和生产,并推动语音识别生态的发展。让语音识别更有趣! - -[**github仓库**](https://github.com/alibaba-damo-academy/FunASR) -| [**最新动态**](https://github.com/alibaba-damo-academy/FunASR#whats-new) -| [**环境安装**](https://github.com/alibaba-damo-academy/FunASR#installation) -| [**服务部署**](https://www.funasr.com) -| [**模型库**](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo) -| [**联系我们**](https://github.com/alibaba-damo-academy/FunASR#contact) - - -## 模型原理介绍 - -随着在线会议和课程越来越普遍,如何利用视频幻灯片中丰富的文本信息来改善语音识别(Automatic  Speech Recognition, ASR)面临着新的挑战。视频中的幻灯片与语音实时同步,相比于统一的稀有词列表,能够提供更长的上下文相关信息。因此,我们提出了一种创新的长上下文偏置网络(LCB-net),用于音频-视觉语音识别(Audio-Visual Speech Recognition,AVSR),以更好地利用视频中的长时上下文信息。 - -
-AVSR整体流程框架 -
-LCB-NET模型结构 - - -具体来说,我们首先使用OCR技术来检测和识别幻灯片中的文本内容,其次我们采用关键词提取技术来获取文本内容中的关键词短语。最后,我们将关键词拼接成长上下文文本和音频同时输入到我们的LCB-net模型中进行识别。而LCB-net模型采用了双编码器结构,同时建模音频和长上下文文本信息。此外,我们还引入了一个显式的偏置词预测模块,通过使用二元交叉熵(BCE)损失函数显式预测长上下文文本中在音频中出现的关键偏置词。此外,为增强LCB-net的泛化能力和稳健性,我们还采用了动态的关键词模拟策略。实验证明,我们提出的LCB-net热词模型,不仅能够提升关键词的识别效果,同时也能够提升非关键词的识别效果。具体实验结果如下所示: - -
-实验结果 - - -更详细的细节见: -- 论文: [LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition](https://arxiv.org/abs/2401.06390) - - - -## 基于ModelScope进行推理 - -- 推理支持音频格式如下: - - wav文件路径,例如:data/test/asr_example.wav - - pcm文件路径,例如:data/test/asr_example.pcm - - ark文件路径,例如:data/test/data.ark - - wav文件url,例如:https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav - - wav二进制数据,格式bytes,例如:用户直接从文件里读出bytes数据或者是麦克风录出bytes数据。 - - 已解析的audio音频,例如:audio, rate = soundfile.read("asr_example_zh.wav"),类型为numpy.ndarray或者torch.Tensor。 - - wav.scp文件,需符合如下要求(以下分别为sound和kaldi_ark格式): - -```sh -cat wav.scp -asr_example1 data/test/asr_example1.wav -asr_example2 data/test/asr_example2.wav - -cat wav.scp -asr_example1 data/test/data_wav.ark:22 -asr_example2 data/test/data_wav.ark:90445 -... -``` - -- 推理支持OCR预测文本格式如下: - - ocr.txt文件,需符合如下要求: -```sh -cat ocr.txt -asr_example1 ANIMAL RIGHTS MANAGER PLOEG -asr_example2 UNIVERSITY CAMPUS DEANO -... -``` - -- 若输入格式wav文件和ocr文件均为url,api调用方式可参考如下范例: - -```python -from funasr import AutoModel - -model = AutoModel(model="iic/LCB-NET", - model_revision="v2.0.0") -res = model.generate(input=("https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav","https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt"),data_type=("sound", "text")) -``` - - -## 复现论文中的结果 -```python -python -m funasr.bin.inference \ - --config-path=${file_dir} \ - --config-name="config.yaml" \ - ++init_param=${file_dir}/model.pt \ - ++tokenizer_conf.token_list=${file_dir}/tokens.txt \ - ++input=[${_logdir}/wav.scp,${_logdir}/ocr.txt] \ - +data_type='["kaldi_ark", "text"]' \ - ++tokenizer_conf.bpemodel=${file_dir}/bpe.pt \ - ++output_dir="${inference_dir}/results" \ - ++device="${inference_device}" \ - ++ncpu=1 \ - ++disable_log=true - -``` - - -识别结果输出路径结构如下: - -```sh -tree output_dir/ -output_dir/ -└── 1best_recog - ├── text - └── token -``` - -token:语音识别结果文件 - -可以使用funasr里面提供的run_bwer_recall.sh计算WER、BWER、UWER和Recall。 -详细脚本可以参考funasr里面的demo.sh脚本,需要注意的是你需要修改一下iic/LCB-NET/conf.yaml中CMVN(stats_file)的路径和iic/LCB-NET/dev/wav.scp里面ark的路径,修改为你自己本地的路径,然后跑解码。 - -## 相关论文以及引用信息 - -```BibTeX -@inproceedings{yu2024lcbnet, - title={LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition}, - author={Fan Yu, Haoxu Wang, Xian Shi, Shiliang Zhang}, - booktitle={ICASSP}, - year={2024} -} -``` \ No newline at end of file
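
For reference, the state of `examples/industrial_data_pretraining/lcbnet/demo.py` that this series converges on (PATCH 094 through PATCH 100) reduces to the short sketch below. It is a minimal restatement of what the diffs above already apply — the `iic/LCB-NET` model id, the `v1.0.0` revision, and the example wav/OCR URLs — and it assumes a working `funasr` install with network access to ModelScope; nothing here goes beyond the API calls the patches themselves use.

```python
# Minimal sketch of the final lcbnet/demo.py in this series (PATCH 094-100):
# the audio file and the OCR text are passed together as a ("sound", "text") pair.
from funasr import AutoModel

# LCB-NET audio-visual (long-context biasing) model from ModelScope; revision per PATCH 100.
model = AutoModel(model="iic/LCB-NET", model_revision="v1.0.0")

res = model.generate(
    input=(
        "https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav",
        "https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt",
    ),
    data_type=("sound", "text"),
)
print(res)
```

For batch evaluation, `demo.sh` in the same directory (patched in 096 and 099) runs the equivalent `funasr.bin.inference` command over split `wav.scp`/`ocr.txt` lists, pointing `init_param`, `bpemodel`, and `normalize_conf.stats_file` at the `model.pt`, `bpe.pt`, and `am.mvn` files in the downloaded model directory.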