From 10e95b07a1a91d144998da720137a114f6314e04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= <zhifu.gzf@alibaba-inc.com>
Date: Thu, 11 Jul 2024 11:41:03 +0800
Subject: [PATCH] llm_asr audio chatbot demo: pass speaker embedding to generate, play 22050 Hz audio

---
 .../llm_asr/app_chatbot_audio_audio.py                   | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/examples/industrial_data_pretraining/llm_asr/app_chatbot_audio_audio.py b/examples/industrial_data_pretraining/llm_asr/app_chatbot_audio_audio.py
index b8d768e8c..92dc8f358 100644
--- a/examples/industrial_data_pretraining/llm_asr/app_chatbot_audio_audio.py
+++ b/examples/industrial_data_pretraining/llm_asr/app_chatbot_audio_audio.py
@@ -22,6 +22,8 @@ import re
 import os
 import sys
 
+import numpy as np
+
 if len(sys.argv) > 1:
     ckpt_dir = sys.argv[1]
     ckpt_id = sys.argv[2]
@@ -47,6 +49,9 @@ init_param_ckpt = f"{os.path.join(ckpt_dir, ckpt_id)}"
 flow_init = "/data/zhifu.gzf/init_model/cosyvoice_flow_matching_for_streaming_with_prompt_random_cut_sft_zh_0630_25hz_1/60epoch.pth.prefix"
 vocoder_init = "/data/zhifu.gzf/init_model/hiftnet_1400k_cvt/model.pth.prefix"
 init_param = f"{init_param},{init_param_ckpt},{flow_init},{vocoder_init}"
+spk_emb = np.load(
+    "/data/zhifu.gzf/init_model/cosyvoice_flow_matching_for_streaming_with_prompt_random_cut_sft_zh_0630_25hz_1/xvec/xiaoxia.npy"
+)
 
 model_llm = AutoModel(
     model=ckpt_dir,
@@ -119,12 +124,14 @@ def model_inference(input_wav, text_inputs, state, turn_num, history):
 
     res = model_llm.generate(
         input=[contents_i],
+        spk_emb=spk_emb,
         tearchforing=False,
         cache={},
         key="test_demo",
     )
+    print(res)
     res_text = res[0]["text"]
-    history[-1][1] = gr.Audio((16000, res[0]["wav"].flatten()), autoplay=True)
+    history[-1][1] = gr.Audio((22050, res[0]["wav"].cpu().flatten().numpy()), autoplay=True)
     out_his = state.get("out", "")
     out = f"{out_his}" f"<br><br>" f"Q: {asr_out}" f"<br>" f"A: {res_text}"
     # out = f"{res}"