mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
simple streaming
This commit is contained in:
parent
441b997f19
commit
4a8cb6f0c4
@ -3130,6 +3130,8 @@ class LLMASRXvecSlotTTS(nn.Module):
|
|||||||
_text = f"<|endofprompt|><|sil|>{text}" + ("<|sil|>" if is_last else "")
|
_text = f"<|endofprompt|><|sil|>{text}" + ("<|sil|>" if is_last else "")
|
||||||
text_token = self.tts_tokenizer_warpper(_text)
|
text_token = self.tts_tokenizer_warpper(_text)
|
||||||
|
|
||||||
|
cur_token, feat, wav = None, None, None
|
||||||
|
if len(text_token) > tts_text_chunk_size:
|
||||||
text_token = torch.tensor([text_token], dtype=torch.long, device=device)
|
text_token = torch.tensor([text_token], dtype=torch.long, device=device)
|
||||||
text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.long, device=device)
|
text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.long, device=device)
|
||||||
cur_token, feat = self.tts_model.streaming_one_step(
|
cur_token, feat = self.tts_model.streaming_one_step(
|
||||||
@ -3145,6 +3147,7 @@ class LLMASRXvecSlotTTS(nn.Module):
|
|||||||
outside_prompt_lengths=llm_cur_kv_cache_len,
|
outside_prompt_lengths=llm_cur_kv_cache_len,
|
||||||
sampling="threshold_1e-6",
|
sampling="threshold_1e-6",
|
||||||
chunk_idx=chunk_idx,
|
chunk_idx=chunk_idx,
|
||||||
|
diff_steps=5,
|
||||||
)
|
)
|
||||||
if cur_token is not None and cur_token.shape[1] > 0 and feat.shape[2] > 0:
|
if cur_token is not None and cur_token.shape[1] > 0 and feat.shape[2] > 0:
|
||||||
# process first package, token in B,T,D, feat in B,F,T
|
# process first package, token in B,T,D, feat in B,F,T
|
||||||
@ -3164,8 +3167,6 @@ class LLMASRXvecSlotTTS(nn.Module):
|
|||||||
prompt_audio[0] = torch.concat([prompt_audio[0], feat.transpose(1, 2)], dim=1)
|
prompt_audio[0] = torch.concat([prompt_audio[0], feat.transpose(1, 2)], dim=1)
|
||||||
wav = self.vocoder.inference(feat.transpose(1, 2))
|
wav = self.vocoder.inference(feat.transpose(1, 2))
|
||||||
chunk_idx += 1
|
chunk_idx += 1
|
||||||
else:
|
|
||||||
cur_token, feat, wav = None, None, None
|
|
||||||
|
|
||||||
return ((cur_token, feat, wav), (text, last_t_size, prompt_token, prompt_audio, chunk_idx))
|
return ((cur_token, feat, wav), (text, last_t_size, prompt_token, prompt_audio, chunk_idx))
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user