## Inference with pipeline ### Speech Recognition #### Paraformer model ```python from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks inference_pipeline = pipeline( task=Tasks.auto_speech_recognition, model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', ) rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav') print(rec_result) ``` ### Voice Activity Detection #### FSMN-VAD ```python from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger import logging logger = get_logger(log_level=logging.CRITICAL) logger.setLevel(logging.CRITICAL) inference_pipeline = pipeline( task=Tasks.voice_activity_detection, model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch', ) segments_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav') print(segments_result) ``` ### Punctuation Restoration #### CT_Transformer ```python from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks inference_pipeline = pipeline( task=Tasks.punctuation, model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch', ) rec_result = inference_pipeline(text_in='我们都是木头人不会讲话不会动') print(rec_result) ``` ### Timestamp Prediction #### TP-Aligner ```python from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks inference_pipeline = pipeline( task=Tasks.speech_timestamp, model='damo/speech_timestamp_prediction-v1-16k-offline', output_dir='./tmp') rec_result = inference_pipeline( audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav', text_in='一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢',) print(rec_result) ``` ### Speaker Verification #### X-vector ```python from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks import numpy as np inference_sv_pipline = pipeline( task=Tasks.speaker_verification, model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch' ) # embedding extract spk_embedding = inference_sv_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav')["spk_embedding"] # speaker verification rec_result = inference_sv_pipline(audio_in=('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav','https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav')) print(rec_result["scores"][0]) ``` ## Finetune with pipeline ### Speech Recognition #### Paraformer model finetune.py ```python import os from modelscope.metainfo import Trainers from modelscope.trainers import build_trainer from modelscope.msdatasets.audio.asr_dataset import ASRDataset def modelscope_finetune(params): if not os.path.exists(params.output_dir): os.makedirs(params.output_dir, exist_ok=True) # dataset split ["train", "validation"] ds_dict = ASRDataset.load(params.data_path, namespace='speech_asr') kwargs = dict( model=params.model, data_dir=ds_dict, dataset_type=params.dataset_type, work_dir=params.output_dir, batch_bins=params.batch_bins, max_epoch=params.max_epoch, lr=params.lr) trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) trainer.train() if __name__ == '__main__': from funasr.utils.modelscope_param import modelscope_args params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") params.output_dir = "./checkpoint" # 模型保存路径 params.data_path = "speech_asr_aishell1_trainsets" # 数据路径,可以为modelscope中已上传数据,也可以是本地数据 params.dataset_type = "small" # 小数据量设置small,若数据量大于1000小时,请使用large params.batch_bins = 2000 # batch size,如果dataset_type="small",batch_bins单位为fbank特征帧数,如果dataset_type="large",batch_bins单位为毫秒, params.max_epoch = 50 # 最大训练轮数 params.lr = 0.00005 # 设置学习率 modelscope_finetune(params) ``` ```shell python finetune.py &> log.txt & ``` If you want finetune with multi-GPUs, you could: ```shell CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1 ```