diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md
deleted file mode 100644
index 6d9cd3024..000000000
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained ModelScope Model
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
-
-
-Modify inference related parameters in vad.yaml.
-
-- max_end_silence_time: The end-point silence duration  to judge the end of sentence, the parameter range is 500ms~6000ms, and the default value is 800ms
-- speech_noise_thres:  The balance of speech and silence scores, the parameter range is (-1,1)
-    - The value tends to -1, the greater probability of noise being judged as speech
-    - The value tends to 1, the greater probability of speech being judged as noise
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/infer.py
deleted file mode 100755
index e11d5d21f..000000000
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/infer.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == '__main__':
-    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav'
-    output_dir = None
-    inference_pipline = pipeline(
-        task=Tasks.auto_speech_recognition,
-        model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-        model_revision=None,
-        output_dir=output_dir,
-        batch_size=1,
-    )
-    segments_result = inference_pipline(audio_in=audio_in)
-    print(segments_result)