From 4adb76a6edbca93aae7caa83382e764d7b058f07 Mon Sep 17 00:00:00 2001
From: zhifu gao
Date: Wed, 8 May 2024 19:21:58 +0800
Subject: [PATCH] Dev gzf exp (#1707)

* resume from step

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* train_loss_avg train_acc_avg

* train_loss_avg train_acc_avg

* train_loss_avg train_acc_avg

* log step

* wav is not exist

* wav is not exist

* decoding

* decoding

* decoding

* wechat

* decoding key

* decoding key

* decoding key

* decoding key

* decoding key

* decoding key

* dynamic batch
---
 funasr/auto/auto_model.py            | 20 ++++++++++++--------
 funasr/models/sense_voice/model.py   | 30 ++++++++++++++++++++++--------
 funasr/models/sense_voice/search.py  |  9 ++++-----
 3 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py
index 577c328a6..97eb325da 100644
--- a/funasr/auto/auto_model.py
+++ b/funasr/auto/auto_model.py
@@ -364,7 +364,6 @@ class AutoModel:
             if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
                 batch_size = max(batch_size, sorted_data[0][0][1] - sorted_data[0][0][0])
 
-            batch_size_ms_cum = 0
             beg_idx = 0
             beg_asr_total = time.time()
             time_speech_total_per_sample = speech_lengths / 16000
@@ -373,19 +372,22 @@ class AutoModel:
             # pbar_sample = tqdm(colour="blue", total=n, dynamic_ncols=True)
 
             all_segments = []
+            max_len_in_batch = 0
+            end_idx = 1
             for j, _ in enumerate(range(0, n)):
                 # pbar_sample.update(1)
-                batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
+                sample_length = sorted_data[j][0][1] - sorted_data[j][0][0]
+                potential_batch_length = max(max_len_in_batch, sample_length) * (j + 1 - beg_idx)
+                # batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
                 if (
                     j < n - 1
-                    and (batch_size_ms_cum + sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
-                    < batch_size
-                    and (sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
-                    < batch_size_threshold_ms
+                    and sample_length < batch_size_threshold_ms
+                    and potential_batch_length < batch_size
                 ):
+                    max_len_in_batch = max(max_len_in_batch, sample_length)
+                    end_idx += 1
                     continue
-                batch_size_ms_cum = 0
-                end_idx = j + 1
+
                 speech_j, speech_lengths_j = slice_padding_audio_samples(
                     speech, speech_lengths, sorted_data[beg_idx:end_idx]
                 )
@@ -410,6 +412,8 @@ class AutoModel:
                         )
                         results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"]
                 beg_idx = end_idx
+                end_idx += 1
+                max_len_in_batch = sample_length
                 if len(results) < 1:
                     continue
                 results_sorted.extend(results)
diff --git a/funasr/models/sense_voice/model.py b/funasr/models/sense_voice/model.py
index 00bc85b5d..56e61e7ab 100644
--- a/funasr/models/sense_voice/model.py
+++ b/funasr/models/sense_voice/model.py
@@ -516,16 +516,23 @@ class SenseVoiceRWKV(nn.Module):
 
         # Paramterts for rich decoding
         self.beam_search.emo_unk = tokenizer.encode(
-            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all")[0]
+            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all"
+        )[0]
         self.beam_search.emo_unk_score = 1
         self.beam_search.emo_tokens = tokenizer.encode(
-            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"), allowed_special="all")
+            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"),
+            allowed_special="all",
+        )
         self.beam_search.emo_scores = DecodingOptions.get("emo_target_threshold", [0.1, 0.1, 0.1])
         self.beam_search.event_bg_token = tokenizer.encode(
-            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"), allowed_special="all")
+            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"),
+            allowed_special="all",
+        )
         self.beam_search.event_ed_token = tokenizer.encode(
-            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"), allowed_special="all")
+            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"),
+            allowed_special="all",
+        )
         self.beam_search.event_score_ga = DecodingOptions.get("gain_tokens_score", [1, 1, 1, 1])
 
         encoder_out, encoder_out_lens = self.encode(
@@ -859,16 +866,23 @@ class SenseVoiceFSMN(nn.Module):
 
         # Paramterts for rich decoding
         self.beam_search.emo_unk = tokenizer.encode(
-            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all")[0]
+            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all"
+        )[0]
         self.beam_search.emo_unk_score = 1
         self.beam_search.emo_tokens = tokenizer.encode(
-            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"), allowed_special="all")
+            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"),
+            allowed_special="all",
+        )
         self.beam_search.emo_scores = DecodingOptions.get("emo_target_threshold", [0.1, 0.1, 0.1])
         self.beam_search.event_bg_token = tokenizer.encode(
-            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"), allowed_special="all")
+            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"),
+            allowed_special="all",
+        )
         self.beam_search.event_ed_token = tokenizer.encode(
-            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"), allowed_special="all")
+            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"),
+            allowed_special="all",
+        )
         self.beam_search.event_score_ga = DecodingOptions.get("gain_tokens_score", [1, 1, 1, 1])
 
         encoder_out, encoder_out_lens = self.encode(
diff --git a/funasr/models/sense_voice/search.py b/funasr/models/sense_voice/search.py
index 4400ce75d..3a1a049d1 100644
--- a/funasr/models/sense_voice/search.py
+++ b/funasr/models/sense_voice/search.py
@@ -54,7 +54,6 @@ class BeamSearch(torch.nn.Module):
         event_bg_token: List[int] = field(default_factory=lambda: [58946, 58948, 58950, 58952]),
         event_ed_token: List[int] = field(default_factory=lambda: [58947, 58949, 58951, 58953]),
         event_score_ga: List[float] = field(default_factory=lambda: [1, 1, 5, 25]),
-
         token_list: List[str] = None,
         pre_beam_ratio: float = 1.5,
         pre_beam_score_key: str = None,
@@ -209,16 +208,17 @@ class BeamSearch(torch.nn.Module):
         last_token = yseq[-1]
 
         if last_token in self.emo_tokens + [self.emo_unk]:
-            # prevent output event after emotation token 
+            # prevent output event after emotation token
            score[self.event_bg_token] = -np.inf
 
-        for eve_bg, eve_ed, eve_ga in zip(self.event_bg_token, self.event_ed_token, self.event_score_ga):
+        for eve_bg, eve_ed, eve_ga in zip(
+            self.event_bg_token, self.event_ed_token, self.event_score_ga
+        ):
             score_offset = get_score(yseq, eve_bg, eve_ed)
             score[eve_bg] += score_offset[0]
             score[eve_ed] += score_offset[1]
             score[eve_bg] += math.log(eve_ga)
-
         score[self.emo_unk] += math.log(self.emo_unk_score)
         for emo, emo_th in zip(self.emo_tokens, self.emo_scores):
             if score.argmax() == emo and score[emo] < math.log(emo_th):
@@ -232,7 +232,6 @@ class BeamSearch(torch.nn.Module):
 
         return scores, states
 
-
     def score_partial(
         self, hyp: Hypothesis, ids: torch.Tensor, x: torch.Tensor
     ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
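Note on the auto_model.py hunks above: the old rule closed a batch when the cumulative segment duration (batch_size_ms_cum) reached the budget, while the new rule budgets the padded batch, max(max_len_in_batch, sample_length) * (j + 1 - beg_idx), because every segment in a batch is padded to the longest one before inference. Below is a minimal standalone sketch of that grouping rule, assuming segments are (start_ms, end_ms) pairs pre-sorted by length the way sorted_data is; the helper name group_segments is illustrative only and not part of FunASR.

from typing import List, Tuple


def group_segments(
    segments: List[Tuple[int, int]],  # (start_ms, end_ms), pre-sorted by length
    batch_size: int,                  # budget for the padded batch, in ms
    batch_size_threshold_ms: int,     # per-segment length cap, in ms
) -> List[Tuple[int, int]]:
    """Return (beg_idx, end_idx) slices following the patch's batching rule."""
    batches = []
    beg_idx = 0
    max_len_in_batch = 0
    n = len(segments)
    for j, (beg, end) in enumerate(segments):
        sample_length = end - beg
        # Padded cost if segment j joins the open batch: every member is
        # padded up to the longest segment in the batch.
        potential_batch_length = max(max_len_in_batch, sample_length) * (j + 1 - beg_idx)
        if (
            j < n - 1
            and sample_length < batch_size_threshold_ms
            and potential_batch_length < batch_size
        ):
            # Still within budget: keep accumulating.
            max_len_in_batch = max(max_len_in_batch, sample_length)
            continue
        # Budget reached, over-long segment, or last segment: emit segments
        # [beg_idx, j] and start the next batch after them.
        batches.append((beg_idx, j + 1))
        beg_idx = j + 1
        max_len_in_batch = sample_length
    return batches


if __name__ == "__main__":
    # With a 300 ms padded budget, the 90 ms segment closes the first batch.
    print(group_segments([(0, 200), (0, 90), (0, 80), (0, 60)], 300, 1000))
    # -> [(0, 2), (2, 4)]

As in the patch, the segment that fails the test still joins the batch being flushed, so a batch may exceed batch_size by its closing member; the upside is that the estimate tracks the real padded tensor size rather than the sum of raw durations.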
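The search.py context lines also show the rich-decoding score shaping: event-begin tokens are banned immediately after an emotion token and otherwise biased by a per-event log gain, and an emotion token scoring below its threshold falls back to emo_unk. The fragment below illustrates only the two steps visible in the hunk; the token ids, vocabulary size, and score tensor are stand-ins, and get_score plus the threshold fallback are not shown in the patch and are omitted here.

import math

import numpy as np
import torch

# Stand-in ids and scores for illustration; the real ids come from the
# tokenizer and the real scores from the decoder.
vocab_size = 60000
score = torch.zeros(vocab_size)          # next-token log-probabilities
emo_unk = 100                            # "unknown emotion" token id
emo_tokens = [101, 102, 103]             # emotion token ids
event_bg_tokens = [201, 202, 203, 204]   # event-begin token ids
event_score_ga = [1, 1, 5, 25]           # per-event gains, applied in log space

last_token = 102  # pretend the hypothesis just emitted an emotion token
if last_token in emo_tokens + [emo_unk]:
    # Prevent starting an event right after an emotion token.
    score[event_bg_tokens] = -np.inf

for eve_bg, eve_ga in zip(event_bg_tokens, event_score_ga):
    # Bias each event-begin token by its configured gain (log(1) == 0 is a no-op).
    score[eve_bg] += math.log(eve_ga)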