Mirror of https://github.com/modelscope/FunASR, synced 2025-09-15 14:48:36 +08:00
Dev gzf exp (#1707)
* resume from step
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* train_loss_avg train_acc_avg
* train_loss_avg train_acc_avg
* train_loss_avg train_acc_avg
* log step
* wav is not exist
* wav is not exist
* decoding
* decoding
* decoding
* wechat
* decoding key
* decoding key
* decoding key
* decoding key
* decoding key
* decoding key
* dynamic batch
This commit is contained in:
parent
28bb448050
commit
4adb76a6ed
@@ -364,7 +364,6 @@ class AutoModel:
         if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
             batch_size = max(batch_size, sorted_data[0][0][1] - sorted_data[0][0][0])

-        batch_size_ms_cum = 0
         beg_idx = 0
         beg_asr_total = time.time()
         time_speech_total_per_sample = speech_lengths / 16000
@@ -373,19 +372,22 @@ class AutoModel:
         # pbar_sample = tqdm(colour="blue", total=n, dynamic_ncols=True)

         all_segments = []
+        max_len_in_batch = 0
+        end_idx = 1
         for j, _ in enumerate(range(0, n)):
             # pbar_sample.update(1)
-            batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
+            sample_length = sorted_data[j][0][1] - sorted_data[j][0][0]
+            potential_batch_length = max(max_len_in_batch, sample_length) * (j + 1 - beg_idx)
+            # batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
             if (
                 j < n - 1
-                and (batch_size_ms_cum + sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
-                < batch_size
-                and (sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
-                < batch_size_threshold_ms
+                and sample_length < batch_size_threshold_ms
+                and potential_batch_length < batch_size
             ):
+                max_len_in_batch = max(max_len_in_batch, sample_length)
+                end_idx += 1
                 continue
-            batch_size_ms_cum = 0
-            end_idx = j + 1
             speech_j, speech_lengths_j = slice_padding_audio_samples(
                 speech, speech_lengths, sorted_data[beg_idx:end_idx]
             )
@@ -410,6 +412,8 @@ class AutoModel:
                 )
                 results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"]
             beg_idx = end_idx
+            end_idx += 1
+            max_len_in_batch = sample_length
             if len(results) < 1:
                 continue
             results_sorted.extend(results)
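The three AutoModel hunks above replace the old duration-sum batching rule (accumulating batch_size_ms_cum against batch_size) with a padding-aware one: since every sample in a batch is padded to the longest one, the effective cost of a batch is the longest segment times the number of segments, tracked as potential_batch_length. Below is a minimal standalone sketch of the new grouping rule, assuming segments already sorted by length; the function name, list-based inputs, and example numbers are illustrative, not FunASR's API.

import math

def group_segments(lengths_ms, batch_size, batch_size_threshold_ms):
    """Group sorted segment lengths so that the padded batch cost
    (longest segment in the batch * segment count) stays under budget."""
    batches = []
    beg_idx, max_len_in_batch = 0, 0
    n = len(lengths_ms)
    for j, sample_length in enumerate(lengths_ms):
        # padded cost if sample j joins the current batch
        potential_batch_length = max(max_len_in_batch, sample_length) * (j + 1 - beg_idx)
        if (
            j < n - 1
            and sample_length < batch_size_threshold_ms
            and potential_batch_length < batch_size
        ):
            max_len_in_batch = max(max_len_in_batch, sample_length)
            continue
        # as in the diff, the sample that breaks the condition is flushed
        # together with the current batch, and the last sample always flushes
        batches.append((beg_idx, j + 1))
        beg_idx = j + 1
        max_len_in_batch = sample_length
    return batches

print(group_segments([500, 800, 1200, 9000, 9500], 4000, 8000))
# -> [(0, 4), (4, 5)]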
@@ -516,16 +516,23 @@ class SenseVoiceRWKV(nn.Module):

         # Paramterts for rich decoding
         self.beam_search.emo_unk = tokenizer.encode(
-            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all")[0]
+            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all"
+        )[0]
         self.beam_search.emo_unk_score = 1
         self.beam_search.emo_tokens = tokenizer.encode(
-            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"), allowed_special="all")
+            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"),
+            allowed_special="all",
+        )
         self.beam_search.emo_scores = DecodingOptions.get("emo_target_threshold", [0.1, 0.1, 0.1])

         self.beam_search.event_bg_token = tokenizer.encode(
-            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"), allowed_special="all")
+            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"),
+            allowed_special="all",
+        )
         self.beam_search.event_ed_token = tokenizer.encode(
-            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"), allowed_special="all")
+            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"),
+            allowed_special="all",
+        )
         self.beam_search.event_score_ga = DecodingOptions.get("gain_tokens_score", [1, 1, 1, 1])

         encoder_out, encoder_out_lens = self.encode(
@@ -859,16 +866,23 @@ class SenseVoiceFSMN(nn.Module):

         # Paramterts for rich decoding
         self.beam_search.emo_unk = tokenizer.encode(
-            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all")[0]
+            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all"
+        )[0]
         self.beam_search.emo_unk_score = 1
         self.beam_search.emo_tokens = tokenizer.encode(
-            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"), allowed_special="all")
+            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"),
+            allowed_special="all",
+        )
         self.beam_search.emo_scores = DecodingOptions.get("emo_target_threshold", [0.1, 0.1, 0.1])

         self.beam_search.event_bg_token = tokenizer.encode(
-            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"), allowed_special="all")
+            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"),
+            allowed_special="all",
+        )
         self.beam_search.event_ed_token = tokenizer.encode(
-            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"), allowed_special="all")
+            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"),
+            allowed_special="all",
+        )
         self.beam_search.event_score_ga = DecodingOptions.get("gain_tokens_score", [1, 1, 1, 1])

         encoder_out, encoder_out_lens = self.encode(
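Both SenseVoice hunks are formatting-only: the same tokenizer.encode(...) calls are rewrapped to one argument per line in black's style, with no behavioral change. For quick reference, the rich-decoding options they read, with the defaults visible in the diff; the dict below only summarizes those defaults and is not a confirmed config schema.

RICH_DECODING_DEFAULTS = {
    "emo_unk_token": "<|SPECIAL_TOKEN_1|>",
    "emo_target_tokens": "<|HAPPY|><|SAD|><|ANGRY|>",
    "emo_target_threshold": [0.1, 0.1, 0.1],
    "gain_tokens_bg": "<|Speech|><|BGM|><|Applause|><|Laughter|>",
    "gain_tokens_ed": "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>",
    "gain_tokens_score": [1, 1, 1, 1],
}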
@@ -54,7 +54,6 @@ class BeamSearch(torch.nn.Module):
         event_bg_token: List[int] = field(default_factory=lambda: [58946, 58948, 58950, 58952]),
         event_ed_token: List[int] = field(default_factory=lambda: [58947, 58949, 58951, 58953]),
         event_score_ga: List[float] = field(default_factory=lambda: [1, 1, 5, 25]),
-
         token_list: List[str] = None,
         pre_beam_ratio: float = 1.5,
         pre_beam_score_key: str = None,
@@ -209,16 +208,17 @@ class BeamSearch(torch.nn.Module):

             last_token = yseq[-1]
             if last_token in self.emo_tokens + [self.emo_unk]:
                 # prevent output event after emotation token
                 score[self.event_bg_token] = -np.inf

-            for eve_bg, eve_ed, eve_ga in zip(self.event_bg_token, self.event_ed_token, self.event_score_ga):
+            for eve_bg, eve_ed, eve_ga in zip(
+                self.event_bg_token, self.event_ed_token, self.event_score_ga
+            ):
                 score_offset = get_score(yseq, eve_bg, eve_ed)
                 score[eve_bg] += score_offset[0]
                 score[eve_ed] += score_offset[1]
                 score[eve_bg] += math.log(eve_ga)

-
             score[self.emo_unk] += math.log(self.emo_unk_score)
             for emo, emo_th in zip(self.emo_tokens, self.emo_scores):
                 if score.argmax() == emo and score[emo] < math.log(emo_th):
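The scoring pass above boosts each event begin/end token pair by an offset from get_score plus a per-event log-gain, and demotes an emotion token that wins the argmax with too little confidence. Pairing the BeamSearch defaults ([1, 1, 5, 25]) with the SenseVoice token order suggests <|Applause|> and <|Laughter|> receive log 5 and log 25 boosts. The sketch below replays that arithmetic with made-up token ids and scores; get_score's pairing offsets are stubbed out, and since the diff truncates the body of the final if, the swap with the unk emotion token is a hypothetical fallback for illustration only.

import math
import numpy as np

# Illustrative ids and log-probs; none of these values come from FunASR.
emo_tokens, emo_scores = [10, 11, 12], [0.1, 0.1, 0.1]   # <|HAPPY|><|SAD|><|ANGRY|>
emo_unk, emo_unk_score = 9, 1
event_bg_token, event_score_ga = [20, 21, 22, 23], [1, 1, 5, 25]

score = np.full(32, math.log(1e-3))    # one decoding step of token log-probs
score[11] = math.log(0.05)             # <|SAD|> is currently the argmax

for eve_bg, eve_ga in zip(event_bg_token, event_score_ga):
    score[eve_bg] += math.log(eve_ga)  # boost event-begin tokens by their log-gain

score[emo_unk] += math.log(emo_unk_score)
for emo, emo_th in zip(emo_tokens, emo_scores):
    if score.argmax() == emo and score[emo] < math.log(emo_th):
        # hypothetical fallback (the diff cuts off here): hand the win
        # to the unk emotion token when confidence is below threshold
        score[emo], score[emo_unk] = score[emo_unk], score[emo]

print(int(score.argmax()))  # 9: decoding falls back to the unk emotion token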
@@ -232,7 +232,6 @@ class BeamSearch(torch.nn.Module):

         return scores, states

-
     def score_partial(
         self, hyp: Hypothesis, ids: torch.Tensor, x: torch.Tensor
     ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: