From 4adb76a6edbca93aae7caa83382e764d7b058f07 Mon Sep 17 00:00:00 2001
From: zhifu gao
Date: Wed, 8 May 2024 19:21:58 +0800
Subject: [PATCH] Dev gzf exp (#1707)

* resume from step

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* batch

* train_loss_avg train_acc_avg

* train_loss_avg train_acc_avg

* train_loss_avg train_acc_avg

* log step

* wav is not exist

* wav is not exist

* decoding

* decoding

* decoding

* wechat

* decoding key

* decoding key

* decoding key

* decoding key

* decoding key

* decoding key

* dynamic batch
---
 funasr/auto/auto_model.py            | 20 ++++++++++++--------
 funasr/models/sense_voice/model.py   | 30 ++++++++++++++++++++++--------
 funasr/models/sense_voice/search.py  |  9 ++++-----
 3 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py
index 577c328a6..97eb325da 100644
--- a/funasr/auto/auto_model.py
+++ b/funasr/auto/auto_model.py
@@ -364,7 +364,6 @@ class AutoModel:
             if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
                 batch_size = max(batch_size, sorted_data[0][0][1] - sorted_data[0][0][0])
 
-            batch_size_ms_cum = 0
             beg_idx = 0
             beg_asr_total = time.time()
             time_speech_total_per_sample = speech_lengths / 16000
@@ -373,19 +372,22 @@ class AutoModel:
             # pbar_sample = tqdm(colour="blue", total=n, dynamic_ncols=True)
 
             all_segments = []
+            max_len_in_batch = 0
+            end_idx = 1
             for j, _ in enumerate(range(0, n)):
                 # pbar_sample.update(1)
-                batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
+                sample_length = sorted_data[j][0][1] - sorted_data[j][0][0]
+                potential_batch_length = max(max_len_in_batch, sample_length) * (j + 1 - beg_idx)
+                # batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
                 if (
                     j < n - 1
-                    and (batch_size_ms_cum + sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
-                    < batch_size
-                    and (sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
-                    < batch_size_threshold_ms
+                    and sample_length < batch_size_threshold_ms
+                    and potential_batch_length < batch_size
                 ):
+                    max_len_in_batch = max(max_len_in_batch, sample_length)
+                    end_idx += 1
                     continue
-                batch_size_ms_cum = 0
-                end_idx = j + 1
+
                 speech_j, speech_lengths_j = slice_padding_audio_samples(
                     speech, speech_lengths, sorted_data[beg_idx:end_idx]
                 )
@@ -410,6 +412,8 @@ class AutoModel:
                         )
                         results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"]
                 beg_idx = end_idx
+                end_idx += 1
+                max_len_in_batch = sample_length
                 if len(results) < 1:
                     continue
                 results_sorted.extend(results)
diff --git a/funasr/models/sense_voice/model.py b/funasr/models/sense_voice/model.py
index 00bc85b5d..56e61e7ab 100644
--- a/funasr/models/sense_voice/model.py
+++ b/funasr/models/sense_voice/model.py
@@ -516,16 +516,23 @@ class SenseVoiceRWKV(nn.Module):
 
         # Paramterts for rich decoding
         self.beam_search.emo_unk = tokenizer.encode(
-            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all")[0]
+            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all"
+        )[0]
         self.beam_search.emo_unk_score = 1
         self.beam_search.emo_tokens = tokenizer.encode(
-            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"), allowed_special="all")
+            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"),
+            allowed_special="all",
+        )
         self.beam_search.emo_scores = DecodingOptions.get("emo_target_threshold", [0.1, 0.1, 0.1])
         self.beam_search.event_bg_token = tokenizer.encode(
-            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"), allowed_special="all")
+            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"),
+            allowed_special="all",
+        )
         self.beam_search.event_ed_token = tokenizer.encode(
-            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"), allowed_special="all")
+            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"),
+            allowed_special="all",
+        )
         self.beam_search.event_score_ga = DecodingOptions.get("gain_tokens_score", [1, 1, 1, 1])
 
         encoder_out, encoder_out_lens = self.encode(
@@ -859,16 +866,23 @@ class SenseVoiceFSMN(nn.Module):
 
         # Paramterts for rich decoding
         self.beam_search.emo_unk = tokenizer.encode(
-            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all")[0]
+            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all"
+        )[0]
         self.beam_search.emo_unk_score = 1
         self.beam_search.emo_tokens = tokenizer.encode(
-            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"), allowed_special="all")
+            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"),
+            allowed_special="all",
+        )
         self.beam_search.emo_scores = DecodingOptions.get("emo_target_threshold", [0.1, 0.1, 0.1])
         self.beam_search.event_bg_token = tokenizer.encode(
-            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"), allowed_special="all")
+            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"),
+            allowed_special="all",
+        )
         self.beam_search.event_ed_token = tokenizer.encode(
-            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"), allowed_special="all")
+            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"),
+            allowed_special="all",
+        )
         self.beam_search.event_score_ga = DecodingOptions.get("gain_tokens_score", [1, 1, 1, 1])
 
         encoder_out, encoder_out_lens = self.encode(
diff --git a/funasr/models/sense_voice/search.py b/funasr/models/sense_voice/search.py
index 4400ce75d..3a1a049d1 100644
--- a/funasr/models/sense_voice/search.py
+++ b/funasr/models/sense_voice/search.py
@@ -54,7 +54,6 @@ class BeamSearch(torch.nn.Module):
         event_bg_token: List[int] = field(default_factory=lambda: [58946, 58948, 58950, 58952]),
         event_ed_token: List[int] = field(default_factory=lambda: [58947, 58949, 58951, 58953]),
         event_score_ga: List[float] = field(default_factory=lambda: [1, 1, 5, 25]),
-
         token_list: List[str] = None,
         pre_beam_ratio: float = 1.5,
         pre_beam_score_key: str = None,
@@ -209,16 +208,17 @@ class BeamSearch(torch.nn.Module):
         last_token = yseq[-1]
 
         if last_token in self.emo_tokens + [self.emo_unk]:
-            # prevent output event after emotation token 
+            # prevent output event after emotation token
            score[self.event_bg_token] = -np.inf
 
-        for eve_bg, eve_ed, eve_ga in zip(self.event_bg_token, self.event_ed_token, self.event_score_ga):
+        for eve_bg, eve_ed, eve_ga in zip(
+            self.event_bg_token, self.event_ed_token, self.event_score_ga
+        ):
             score_offset = get_score(yseq, eve_bg, eve_ed)
             score[eve_bg] += score_offset[0]
             score[eve_ed] += score_offset[1]
             score[eve_bg] += math.log(eve_ga)
-
         score[self.emo_unk] += math.log(self.emo_unk_score)
         for emo, emo_th in zip(self.emo_tokens, self.emo_scores):
             if score.argmax() == emo and score[emo] < math.log(emo_th):
@@ -232,7 +232,6 @@ class BeamSearch(torch.nn.Module):
 
         return scores, states
 
-
     def score_partial(
         self, hyp: Hypothesis, ids: torch.Tensor, x: torch.Tensor
     ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
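Note on the auto_model.py hunks above: the old rule closed a batch when the cumulative segment duration (batch_size_ms_cum) reached the budget, while the new rule budgets the padded batch, max(max_len_in_batch, sample_length) * (j + 1 - beg_idx), because every segment in a batch is padded to the longest one before inference. Below is a minimal standalone sketch of that grouping rule, assuming segments are (start_ms, end_ms) pairs pre-sorted by length the way sorted_data is; the helper name group_segments is illustrative only and not part of FunASR.

from typing import List, Tuple


def group_segments(
    segments: List[Tuple[int, int]],  # (start_ms, end_ms), pre-sorted by length
    batch_size: int,                  # budget for the padded batch, in ms
    batch_size_threshold_ms: int,     # per-segment length cap, in ms
) -> List[Tuple[int, int]]:
    """Return (beg_idx, end_idx) slices following the patch's batching rule."""
    batches = []
    beg_idx = 0
    max_len_in_batch = 0
    n = len(segments)
    for j, (beg, end) in enumerate(segments):
        sample_length = end - beg
        # Padded cost if segment j joins the open batch: every member is
        # padded up to the longest segment in the batch.
        potential_batch_length = max(max_len_in_batch, sample_length) * (j + 1 - beg_idx)
        if (
            j < n - 1
            and sample_length < batch_size_threshold_ms
            and potential_batch_length < batch_size
        ):
            # Still within budget: keep accumulating.
            max_len_in_batch = max(max_len_in_batch, sample_length)
            continue
        # Budget reached, over-long segment, or last segment: emit segments
        # [beg_idx, j] and start the next batch after them.
        batches.append((beg_idx, j + 1))
        beg_idx = j + 1
        max_len_in_batch = sample_length
    return batches


if __name__ == "__main__":
    # With a 300 ms padded budget, the 90 ms segment closes the first batch.
    print(group_segments([(0, 200), (0, 90), (0, 80), (0, 60)], 300, 1000))
    # -> [(0, 2), (2, 4)]

As in the patch, the segment that fails the test still joins the batch being flushed, so a batch may exceed batch_size by its closing member; the upside is that the estimate tracks the real padded tensor size rather than the sum of raw durations.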
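The search.py context lines also show the rich-decoding score shaping: event-begin tokens are banned immediately after an emotion token and otherwise biased by a per-event log gain, and an emotion token scoring below its threshold falls back to emo_unk. The fragment below illustrates only the two steps visible in the hunk; the token ids, vocabulary size, and score tensor are stand-ins, and get_score plus the threshold fallback are not shown in the patch and are omitted here.

import math

import numpy as np
import torch

# Stand-in ids and scores for illustration; the real ids come from the
# tokenizer and the real scores from the decoder.
vocab_size = 60000
score = torch.zeros(vocab_size)          # next-token log-probabilities
emo_unk = 100                            # "unknown emotion" token id
emo_tokens = [101, 102, 103]             # emotion token ids
event_bg_tokens = [201, 202, 203, 204]   # event-begin token ids
event_score_ga = [1, 1, 5, 25]           # per-event gains, applied in log space

last_token = 102  # pretend the hypothesis just emitted an emotion token
if last_token in emo_tokens + [emo_unk]:
    # Prevent starting an event right after an emotion token.
    score[event_bg_tokens] = -np.inf

for eve_bg, eve_ga in zip(event_bg_tokens, event_score_ga):
    # Bias each event-begin token by its configured gain (log(1) == 0 is a no-op).
    score[eve_bg] += math.log(eve_ga)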