修复 当选用SenseVoice模型处理长音频(同时开启vad和output_timestamp)时的异常 (#2413)

This commit is contained in:
hohaiuhsx 2025-03-10 23:16:22 +08:00 committed by GitHub
parent 9c67d9b969
commit 6fe10a8dbf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -925,11 +925,11 @@ class SenseVoiceSmall(nn.Module):
align = ctc_forced_align(
logits_speech.unsqueeze(0).float(),
torch.Tensor(token_int[4:]).unsqueeze(0).long().to(logits_speech.device),
(encoder_out_lens - 4).long(),
(encoder_out_lens[i] - 4).long(),
torch.tensor(len(token_int) - 4).unsqueeze(0).long().to(logits_speech.device),
ignore_id=self.ignore_id,
)
pred = groupby(align[0, : encoder_out_lens[0]])
pred = groupby(align[0, : encoder_out_lens[i]])
_start = 0
token_id = 0
ts_max = encoder_out_lens[i] - 4
@@ -951,20 +951,26 @@ class SenseVoiceSmall(nn.Module):
def post(self, timestamp):
timestamp_new = []
prev_word = None
for i, t in enumerate(timestamp):
word, start, end = t
start = int(start * 1000)
end = int(end * 1000)
if word == "▁":
continue
if i == 0:
# timestamp_new.append([word, start, end])
timestamp_new.append([int(start * 1000), int(end * 1000)])
elif word.startswith("▁") or len(word) == 1 or not word[1].isalpha():
timestamp_new.append([start, end])
elif word.startswith("▁"):
word = word[1:]
# timestamp_new.append([word, start, end])
timestamp_new.append([int(start * 1000), int(end * 1000)])
timestamp_new.append([start, end])
elif prev_word.isalpha() and prev_word.isascii() and word.isalpha() and word.isascii():
prev_word += word
timestamp_new[-1][1] = end
else:
# timestamp_new[-1][0] += word
timestamp_new[-1][1] = int(end * 1000)
timestamp_new.append([start, end])
prev_word = word
return timestamp_new
def export(self, **kwargs):