diff --git a/egs_modelscope/asr_vad_spk/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/demo.py b/egs_modelscope/asr_vad_spk/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/demo.py
new file mode 100644
index 000000000..a3773b4d3
--- /dev/null
+++ b/egs_modelscope/asr_vad_spk/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/demo.py
@@ -0,0 +1,17 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == '__main__':
+    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav'
+    output_dir = "./results"
+    inference_pipeline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model='damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn',
+        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+        #punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
+        punc_model='damo/punc_ct-transformer_cn-en-common-vocab471067-large',
+        output_dir=output_dir,
+    )
+    rec_result = inference_pipeline(audio_in=audio_in, batch_size_token=5000, batch_size_token_threshold_s=40, max_single_segment_time=6000)
+    print(rec_result)
+
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 15dbdd499..f4140e195 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -787,7 +787,7 @@ def inference_paraformer_vad_speaker(
     time_stamp_writer: bool = True,
     punc_infer_config: Optional[str] = None,
     punc_model_file: Optional[str] = None,
-    sv_model_file: Optional[str] = None,
+    sv_model_file: Optional[str] = "~/.cache/modelscope/hub/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/campplus_cn_common.bin",
     streaming: bool = False,
     embedding_node: str = "resnet1_dense",
     sv_threshold: float = 0.9465,
@@ -933,7 +933,7 @@ def inference_paraformer_vad_speaker(
     ##### speaker_verification #####
     ##################################
     # load sv model
-    sv_model_dict = torch.load(sv_model_file, map_location=torch.device('cpu'))
+    sv_model_dict = torch.load(os.path.expanduser(sv_model_file), map_location=torch.device('cpu'))
     sv_model = CAMPPlus()
     sv_model.load_state_dict(sv_model_dict)
     sv_model.eval()
@@ -1084,7 +1084,6 @@ def inference_paraformer_vad_speaker(
             logging.info("decoding, utt: {}, predictions: {}".format(key, text_postprocessed_punc))
             torch.cuda.empty_cache()
         distribute_spk(asr_result_list[0]['sentences'], sv_output)
-        import pdb; pdb.set_trace()
         return asr_result_list

     return _forward
@@ -2030,7 +2029,7 @@ def inference_launch(**kwargs):
        return inference_paraformer(**kwargs)
    elif mode == "paraformer_streaming":
        return inference_paraformer_online(**kwargs)
-    elif mode == "paraformer_vad_speaker":
+    elif mode.startswith("paraformer_vad_speaker"):
        return inference_paraformer_vad_speaker(**kwargs)
    elif mode.startswith("paraformer_vad"):
        return inference_paraformer_vad_punc(**kwargs)