mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
Merge branch 'dev_gzf' of github.com:alibaba-damo-academy/FunASR into dev_gzf
This commit is contained in:
commit
b0887f1767
@ -1,30 +0,0 @@
|
||||
# ModelScope Model
|
||||
|
||||
## How to finetune and infer using a pretrained Paraformer-large Model
|
||||
|
||||
### Finetune
|
||||
|
||||
- Modify finetune training related parameters in `finetune.py`
|
||||
- <strong>output_dir:</strong> # result dir
|
||||
- <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
|
||||
- <strong>batch_bins:</strong> # batch size
|
||||
- <strong>max_epoch:</strong> # number of training epoch
|
||||
- <strong>lr:</strong> # learning rate
|
||||
|
||||
- Then you can run the pipeline to finetune with:
|
||||
```python
|
||||
python finetune.py
|
||||
```
|
||||
|
||||
### Inference
|
||||
|
||||
Or you can use the finetuned model for inference directly.
|
||||
|
||||
- Setting parameters in `infer.py`
|
||||
- <strong>data_dir:</strong> # the dataset dir
|
||||
- <strong>output_dir:</strong> # result dir
|
||||
|
||||
- Then you can run the pipeline to infer with:
|
||||
```python
|
||||
python infer.py
|
||||
```
|
||||
@ -1,23 +0,0 @@
|
||||
# Paraformer-Large
|
||||
- Model link: <https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/summary>
|
||||
- Model size: 220M
|
||||
|
||||
# Environments
|
||||
- date: `Fri Feb 10 13:34:24 CST 2023`
|
||||
- python version: `3.7.12`
|
||||
- FunASR version: `0.1.6`
|
||||
- pytorch version: `pytorch 1.7.0`
|
||||
- Git hash: ``
|
||||
- Commit date: ``
|
||||
|
||||
# Benchmark Results
|
||||
|
||||
## AISHELL-1
|
||||
- Decode config:
|
||||
- Decode without CTC
|
||||
- Decode without LM
|
||||
|
||||
| testset CER(%) | base model|finetune model |
|
||||
|:--------------:|:---------:|:-------------:|
|
||||
| dev | 1.75 |1.62 |
|
||||
| test | 1.95 |1.78 |
|
||||
@ -1,36 +0,0 @@
|
||||
import os
|
||||
|
||||
from modelscope.metainfo import Trainers
|
||||
from modelscope.trainers import build_trainer
|
||||
|
||||
from funasr.datasets.ms_dataset import MsDataset
|
||||
from funasr.utils.modelscope_param import modelscope_args
|
||||
|
||||
|
||||
def modelscope_finetune(params):
    """Finetune a ModelScope ASR model with the settings carried by *params*.

    Args:
        params: namespace from ``modelscope_args`` providing ``model``,
            ``data_path``, ``output_dir``, ``dataset_type``, ``batch_bins``,
            ``max_epoch`` and ``lr``.
    """
    # makedirs(exist_ok=True) already tolerates an existing directory, so the
    # former os.path.exists() pre-check was redundant.
    os.makedirs(params.output_dir, exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params.data_path)
    kwargs = dict(
        model=params.model,
        data_dir=ds_dict,
        dataset_type=params.dataset_type,
        work_dir=params.output_dir,
        batch_bins=params.batch_bins,
        max_epoch=params.max_epoch,
        lr=params.lr)
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()


if __name__ == '__main__':
    params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch", data_path="./data")
    params.output_dir = "./checkpoint"    # checkpoint save dir
    params.data_path = "./example_data/"  # dataset dir
    params.dataset_type = "small"         # "small" for < 1000 h of data, otherwise "large"
    params.batch_bins = 2000              # batch size: feature frames for "small", milliseconds for "large"
    params.max_epoch = 50                 # maximum number of training epochs
    params.lr = 0.00005                   # learning rate

    modelscope_finetune(params)
|
||||
@ -1,101 +0,0 @@
|
||||
import os
|
||||
import shutil
|
||||
from multiprocessing import Pool
|
||||
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
from funasr.utils.compute_wer import compute_wer
|
||||
|
||||
|
||||
def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
    """Run ASR inference for one shard (job *idx*) of the test set.

    Args:
        output_dir: root result dir; this job writes into ``output.<idx>``.
        split_dir: dir holding the per-job ``wav.<idx>.scp`` lists.
        njob: total number of CPU jobs (kept for interface compatibility).
        idx: 1-based job index, passed as a string.
        batch_size: decoding batch size.
        ngpu: > 0 means GPU decoding, otherwise CPU decoding.
        model: ModelScope model name or local path.
    """
    output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
    if ngpu > 0:
        use_gpu = 1
        gpu_id = int(idx) - 1
        # Map the job index onto the caller-visible GPU list when one is set.
        if "CUDA_VISIBLE_DEVICES" in os.environ:
            gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
            os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
        else:
            os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    else:
        use_gpu = 0
        # CPU decoding: hide all GPUs. The original code indexed the
        # CUDA_VISIBLE_DEVICES list with gpu_id = -1 here, which selected the
        # LAST physical GPU instead of disabling GPU use.
        os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=model,
        output_dir=output_dir_job,
        batch_size=batch_size,
        ngpu=use_gpu,
    )
    audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
    inference_pipeline(audio_in=audio_in)
|
||||
|
||||
|
||||
def modelscope_infer(params):
    """Multi-job (GPU or CPU) decoding driver.

    Splits ``<data_dir>/wav.scp`` into ``nj`` shards, decodes every shard in a
    separate process, merges the per-job results under ``1best_recog`` and,
    when a reference ``text`` file exists, computes the CER.

    Args:
        params: dict with keys ``model``, ``data_dir``, ``output_dir``,
            ``ngpu``, ``njob`` and ``batch_size``.

    Raises:
        ValueError: if the resolved number of jobs is below 1 (the original
            code died later with ZeroDivisionError or NameError).
    """
    # prepare for multi-GPU decoding
    ngpu = params["ngpu"]
    njob = params["njob"]
    batch_size = params["batch_size"]
    output_dir = params["output_dir"]
    model = params["model"]
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    split_dir = os.path.join(output_dir, "split")
    os.mkdir(split_dir)
    if ngpu > 0:
        nj = ngpu
    else:
        # The original handled only ngpu == 0 and left nj unbound for
        # negative values; treat any non-positive ngpu as CPU decoding.
        nj = njob
    if nj < 1:
        raise ValueError("need at least one decoding job (ngpu or njob >= 1)")
    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
    with open(wav_scp_file) as f:
        lines = f.readlines()
    num_lines = len(lines)
    num_job_lines = num_lines // nj
    start = 0
    for i in range(nj):
        end = start + num_job_lines
        file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
        with open(file, "w") as f:
            # the last job takes the remainder lines
            if i == nj - 1:
                f.writelines(lines[start:])
            else:
                f.writelines(lines[start:end])
        start = end

    p = Pool(nj)
    for i in range(nj):
        p.apply_async(modelscope_infer_core,
                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
    p.close()
    p.join()

    # combine decoding results
    best_recog_path = os.path.join(output_dir, "1best_recog")
    os.mkdir(best_recog_path)
    files = ["text", "token", "score"]
    for file in files:
        with open(os.path.join(best_recog_path, file), "w") as f:
            for i in range(nj):
                job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
                with open(job_file) as f_job:
                    f.writelines(f_job.readlines())

    # If a reference text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(best_recog_path, "token")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Default decoding configuration; edit these values before running.
    params = {
        "model": "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch",
        "data_dir": "./data/test",
        "output_dir": "./results",
        "ngpu": 1,        # if ngpu > 0, will use gpu decoding
        "njob": 1,        # if ngpu = 0, will use cpu decoding
        "batch_size": 64,
    }
    modelscope_infer(params)
|
||||
@ -1,48 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
|
||||
from funasr.utils.compute_wer import compute_wer
|
||||
|
||||
def modelscope_infer_after_finetune(params):
    """Decode a test set with a locally finetuned checkpoint.

    Downloads the matching pretrained model from ModelScope, overwrites its
    ``model.pb`` with the finetuned checkpoint, runs inference on
    ``<data_dir>/wav.scp`` and computes the CER when ``<data_dir>/text``
    exists.

    Args:
        params: dict with keys ``modelscope_model_name``, ``output_dir``,
            ``data_dir``, ``decoding_model_name`` and ``batch_size``.

    Raises:
        RuntimeError: if the pretrained model cannot be downloaded.
    """
    # prepare for decoding
    try:
        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
    except Exception as err:
        # was: `except BaseException: raise BaseException(f"...")` — too broad
        # (also caught KeyboardInterrupt), discarded the original error, and
        # used an f-string with no placeholders
        raise RuntimeError("Please download pretrain model from ModelScope firstly.") from err
    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]),
                os.path.join(pretrained_model_path, "model.pb"))
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
    os.mkdir(decoding_path)

    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=pretrained_model_path,
        output_dir=decoding_path,
        batch_size=params["batch_size"]
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)

    # compute CER if ground-truth text is available
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Default configuration; edit these values before running.
    params = {
        "modelscope_model_name": "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch",
        "output_dir": "./checkpoint",
        "data_dir": "./data/test",
        "decoding_model_name": "valid.acc.ave_10best.pb",
        "batch_size": 64,
    }
    modelscope_infer_after_finetune(params)
|
||||
@ -1,30 +0,0 @@
|
||||
# ModelScope Model
|
||||
|
||||
## How to finetune and infer using a pretrained Paraformer-large Model
|
||||
|
||||
### Finetune
|
||||
|
||||
- Modify finetune training related parameters in `finetune.py`
|
||||
- <strong>output_dir:</strong> # result dir
|
||||
- <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
|
||||
- <strong>batch_bins:</strong> # batch size
|
||||
- <strong>max_epoch:</strong> # number of training epoch
|
||||
- <strong>lr:</strong> # learning rate
|
||||
|
||||
- Then you can run the pipeline to finetune with:
|
||||
```python
|
||||
python finetune.py
|
||||
```
|
||||
|
||||
### Inference
|
||||
|
||||
Or you can use the finetuned model for inference directly.
|
||||
|
||||
- Setting parameters in `infer.py`
|
||||
- <strong>data_dir:</strong> # the dataset dir
|
||||
- <strong>output_dir:</strong> # result dir
|
||||
|
||||
- Then you can run the pipeline to infer with:
|
||||
```python
|
||||
python infer.py
|
||||
```
|
||||
@ -1,25 +0,0 @@
|
||||
# Paraformer-Large
|
||||
- Model link: <https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/summary>
|
||||
- Model size: 220M
|
||||
|
||||
# Environments
|
||||
- date: `Fri Feb 10 13:34:24 CST 2023`
|
||||
- python version: `3.7.12`
|
||||
- FunASR version: `0.1.6`
|
||||
- pytorch version: `pytorch 1.7.0`
|
||||
- Git hash: ``
|
||||
- Commit date: ``
|
||||
|
||||
# Benchmark Results
|
||||
|
||||
## AISHELL-2
|
||||
- Decode config:
|
||||
- Decode without CTC
|
||||
- Decode without LM
|
||||
|
||||
| testset | base model|finetune model|
|
||||
|:------------:|:---------:|:------------:|
|
||||
| dev_ios | 2.80 |2.60 |
|
||||
| test_android | 3.13 |2.84 |
|
||||
| test_ios | 2.85 |2.82 |
|
||||
| test_mic | 3.06 |2.88 |
|
||||
@ -1,36 +0,0 @@
|
||||
import os
|
||||
|
||||
from modelscope.metainfo import Trainers
|
||||
from modelscope.trainers import build_trainer
|
||||
|
||||
from funasr.datasets.ms_dataset import MsDataset
|
||||
from funasr.utils.modelscope_param import modelscope_args
|
||||
|
||||
|
||||
def modelscope_finetune(params):
    """Finetune a ModelScope ASR model with the settings carried by *params*.

    Args:
        params: namespace from ``modelscope_args`` providing ``model``,
            ``data_path``, ``output_dir``, ``dataset_type``, ``batch_bins``,
            ``max_epoch`` and ``lr``.
    """
    # makedirs(exist_ok=True) already tolerates an existing directory, so the
    # former os.path.exists() pre-check was redundant.
    os.makedirs(params.output_dir, exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params.data_path)
    kwargs = dict(
        model=params.model,
        data_dir=ds_dict,
        dataset_type=params.dataset_type,
        work_dir=params.output_dir,
        batch_bins=params.batch_bins,
        max_epoch=params.max_epoch,
        lr=params.lr)
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()


if __name__ == '__main__':
    params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch", data_path="./data")
    params.output_dir = "./checkpoint"    # checkpoint save dir
    params.data_path = "./example_data/"  # dataset dir
    params.dataset_type = "small"         # "small" for < 1000 h of data, otherwise "large"
    params.batch_bins = 2000              # batch size: feature frames for "small", milliseconds for "large"
    params.max_epoch = 50                 # maximum number of training epochs
    params.lr = 0.00005                   # learning rate

    modelscope_finetune(params)
|
||||
@ -1,101 +0,0 @@
|
||||
import os
|
||||
import shutil
|
||||
from multiprocessing import Pool
|
||||
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
from funasr.utils.compute_wer import compute_wer
|
||||
|
||||
|
||||
def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
    """Run ASR inference for one shard (job *idx*) of the test set.

    Args:
        output_dir: root result dir; this job writes into ``output.<idx>``.
        split_dir: dir holding the per-job ``wav.<idx>.scp`` lists.
        njob: total number of CPU jobs (kept for interface compatibility).
        idx: 1-based job index, passed as a string.
        batch_size: decoding batch size.
        ngpu: > 0 means GPU decoding, otherwise CPU decoding.
        model: ModelScope model name or local path.
    """
    output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
    if ngpu > 0:
        use_gpu = 1
        gpu_id = int(idx) - 1
        # Map the job index onto the caller-visible GPU list when one is set.
        if "CUDA_VISIBLE_DEVICES" in os.environ:
            gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
            os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
        else:
            os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    else:
        use_gpu = 0
        # CPU decoding: hide all GPUs. The original code indexed the
        # CUDA_VISIBLE_DEVICES list with gpu_id = -1 here, which selected the
        # LAST physical GPU instead of disabling GPU use.
        os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=model,
        output_dir=output_dir_job,
        batch_size=batch_size,
        ngpu=use_gpu,
    )
    audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
    inference_pipeline(audio_in=audio_in)
|
||||
|
||||
|
||||
def modelscope_infer(params):
    """Multi-job (GPU or CPU) decoding driver.

    Splits ``<data_dir>/wav.scp`` into ``nj`` shards, decodes every shard in a
    separate process, merges the per-job results under ``1best_recog`` and,
    when a reference ``text`` file exists, computes the CER.

    Args:
        params: dict with keys ``model``, ``data_dir``, ``output_dir``,
            ``ngpu``, ``njob`` and ``batch_size``.

    Raises:
        ValueError: if the resolved number of jobs is below 1 (the original
            code died later with ZeroDivisionError or NameError).
    """
    # prepare for multi-GPU decoding
    ngpu = params["ngpu"]
    njob = params["njob"]
    batch_size = params["batch_size"]
    output_dir = params["output_dir"]
    model = params["model"]
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    split_dir = os.path.join(output_dir, "split")
    os.mkdir(split_dir)
    if ngpu > 0:
        nj = ngpu
    else:
        # The original handled only ngpu == 0 and left nj unbound for
        # negative values; treat any non-positive ngpu as CPU decoding.
        nj = njob
    if nj < 1:
        raise ValueError("need at least one decoding job (ngpu or njob >= 1)")
    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
    with open(wav_scp_file) as f:
        lines = f.readlines()
    num_lines = len(lines)
    num_job_lines = num_lines // nj
    start = 0
    for i in range(nj):
        end = start + num_job_lines
        file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
        with open(file, "w") as f:
            # the last job takes the remainder lines
            if i == nj - 1:
                f.writelines(lines[start:])
            else:
                f.writelines(lines[start:end])
        start = end

    p = Pool(nj)
    for i in range(nj):
        p.apply_async(modelscope_infer_core,
                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
    p.close()
    p.join()

    # combine decoding results
    best_recog_path = os.path.join(output_dir, "1best_recog")
    os.mkdir(best_recog_path)
    files = ["text", "token", "score"]
    for file in files:
        with open(os.path.join(best_recog_path, file), "w") as f:
            for i in range(nj):
                job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
                with open(job_file) as f_job:
                    f.writelines(f_job.readlines())

    # If a reference text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(best_recog_path, "token")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Default decoding configuration; edit these values before running.
    params = {
        "model": "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch",
        "data_dir": "./data/test",
        "output_dir": "./results",
        "ngpu": 1,        # if ngpu > 0, will use gpu decoding
        "njob": 1,        # if ngpu = 0, will use cpu decoding
        "batch_size": 64,
    }
    modelscope_infer(params)
|
||||
@ -1,48 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
|
||||
from funasr.utils.compute_wer import compute_wer
|
||||
|
||||
def modelscope_infer_after_finetune(params):
    """Decode a test set with a locally finetuned checkpoint.

    Downloads the matching pretrained model from ModelScope, overwrites its
    ``model.pb`` with the finetuned checkpoint, runs inference on
    ``<data_dir>/wav.scp`` and computes the CER when ``<data_dir>/text``
    exists.

    Args:
        params: dict with keys ``modelscope_model_name``, ``output_dir``,
            ``data_dir``, ``decoding_model_name`` and ``batch_size``.

    Raises:
        RuntimeError: if the pretrained model cannot be downloaded.
    """
    # prepare for decoding
    try:
        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
    except Exception as err:
        # was: `except BaseException: raise BaseException(f"...")` — too broad
        # (also caught KeyboardInterrupt), discarded the original error, and
        # used an f-string with no placeholders
        raise RuntimeError("Please download pretrain model from ModelScope firstly.") from err
    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]),
                os.path.join(pretrained_model_path, "model.pb"))
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
    os.mkdir(decoding_path)

    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=pretrained_model_path,
        output_dir=decoding_path,
        batch_size=params["batch_size"]
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)

    # compute CER if ground-truth text is available
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Default configuration; edit these values before running.
    params = {
        "modelscope_model_name": "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch",
        "output_dir": "./checkpoint",
        "data_dir": "./data/test",
        "decoding_model_name": "valid.acc.ave_10best.pb",
        "batch_size": 64,
    }
    modelscope_infer_after_finetune(params)
|
||||
@ -21,23 +21,26 @@
|
||||
|
||||
Or you can use the finetuned model for inference directly.
|
||||
|
||||
- Setting parameters in `infer.py`
|
||||
- Setting parameters in `infer.sh`
|
||||
- <strong>model:</strong> # model name on ModelScope
|
||||
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
|
||||
- <strong>output_dir:</strong> # result dir
|
||||
- <strong>ngpu:</strong> # the number of GPUs for decoding, if `ngpu` > 0, use GPU decoding
|
||||
- <strong>njob:</strong> # the number of jobs for CPU decoding, if `ngpu` = 0, use CPU decoding, please set `njob`
|
||||
- <strong>batch_size:</strong> # batchsize of inference
|
||||
- <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
|
||||
- <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
|
||||
- <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
|
||||
|
||||
- Then you can run the pipeline to infer with:
|
||||
```python
|
||||
python infer.py
|
||||
sh infer.sh
|
||||
```
|
||||
|
||||
- Results
|
||||
|
||||
The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
|
||||
|
||||
If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
|
||||
|
||||
### Inference using local finetuned model
|
||||
|
||||
- Modify inference related parameters in `infer_after_finetune.py`
|
||||
|
||||
@ -17,22 +17,22 @@
|
||||
- Decode without CTC
|
||||
- Decode without LM
|
||||
|
||||
| testset | CER(%)|
|
||||
|:---------:|:-----:|
|
||||
| dev | 1.75 |
|
||||
| test | 1.95 |
|
||||
| CER(%) | Pretrain model|[Finetune model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/summary) |
|
||||
|:---------:|:-------------:|:-------------:|
|
||||
| dev | 1.75 |1.62 |
|
||||
| test | 1.95 |1.78 |
|
||||
|
||||
## AISHELL-2
|
||||
- Decode config:
|
||||
- Decode without CTC
|
||||
- Decode without LM
|
||||
|
||||
| testset | CER(%)|
|
||||
|:------------:|:-----:|
|
||||
| dev_ios | 2.80 |
|
||||
| test_android | 3.13 |
|
||||
| test_ios | 2.85 |
|
||||
| test_mic | 3.06 |
|
||||
| CER(%) | Pretrain model|[Finetune model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/summary)|
|
||||
|:------------:|:-------------:|:------------:|
|
||||
| dev_ios | 2.80 |2.60 |
|
||||
| test_android | 3.13 |2.84 |
|
||||
| test_ios | 2.85 |2.82 |
|
||||
| test_mic | 3.06 |2.88 |
|
||||
|
||||
## Wenetspeech
|
||||
- Decode config:
|
||||
|
||||
@ -1,101 +1,25 @@
|
||||
import os
|
||||
import shutil
|
||||
from multiprocessing import Pool
|
||||
|
||||
import argparse
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
from funasr.utils.compute_wer import compute_wer
|
||||
|
||||
|
||||
def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
|
||||
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
|
||||
if ngpu > 0:
|
||||
use_gpu = 1
|
||||
gpu_id = int(idx) - 1
|
||||
else:
|
||||
use_gpu = 0
|
||||
gpu_id = -1
|
||||
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
|
||||
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
|
||||
else:
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
|
||||
inference_pipline = pipeline(
|
||||
def modelscope_infer(args):
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
|
||||
inference_pipeline = pipeline(
|
||||
task=Tasks.auto_speech_recognition,
|
||||
model=model,
|
||||
output_dir=output_dir_job,
|
||||
batch_size=batch_size,
|
||||
ngpu=use_gpu,
|
||||
model=args.model,
|
||||
output_dir=args.output_dir,
|
||||
batch_size=args.batch_size,
|
||||
)
|
||||
audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
|
||||
inference_pipline(audio_in=audio_in)
|
||||
|
||||
|
||||
def modelscope_infer(params):
|
||||
# prepare for multi-GPU decoding
|
||||
ngpu = params["ngpu"]
|
||||
njob = params["njob"]
|
||||
batch_size = params["batch_size"]
|
||||
output_dir = params["output_dir"]
|
||||
model = params["model"]
|
||||
if os.path.exists(output_dir):
|
||||
shutil.rmtree(output_dir)
|
||||
os.mkdir(output_dir)
|
||||
split_dir = os.path.join(output_dir, "split")
|
||||
os.mkdir(split_dir)
|
||||
if ngpu > 0:
|
||||
nj = ngpu
|
||||
elif ngpu == 0:
|
||||
nj = njob
|
||||
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
|
||||
with open(wav_scp_file) as f:
|
||||
lines = f.readlines()
|
||||
num_lines = len(lines)
|
||||
num_job_lines = num_lines // nj
|
||||
start = 0
|
||||
for i in range(nj):
|
||||
end = start + num_job_lines
|
||||
file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
|
||||
with open(file, "w") as f:
|
||||
if i == nj - 1:
|
||||
f.writelines(lines[start:])
|
||||
else:
|
||||
f.writelines(lines[start:end])
|
||||
start = end
|
||||
|
||||
p = Pool(nj)
|
||||
for i in range(nj):
|
||||
p.apply_async(modelscope_infer_core,
|
||||
args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
|
||||
p.close()
|
||||
p.join()
|
||||
|
||||
# combine decoding results
|
||||
best_recog_path = os.path.join(output_dir, "1best_recog")
|
||||
os.mkdir(best_recog_path)
|
||||
files = ["text", "token", "score"]
|
||||
for file in files:
|
||||
with open(os.path.join(best_recog_path, file), "w") as f:
|
||||
for i in range(nj):
|
||||
job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
|
||||
with open(job_file) as f_job:
|
||||
lines = f_job.readlines()
|
||||
f.writelines(lines)
|
||||
|
||||
# If text exists, compute CER
|
||||
text_in = os.path.join(params["data_dir"], "text")
|
||||
if os.path.exists(text_in):
|
||||
text_proc_file = os.path.join(best_recog_path, "token")
|
||||
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
|
||||
|
||||
inference_pipeline(audio_in=args.audio_in)
|
||||
|
||||
if __name__ == "__main__":
|
||||
params = {}
|
||||
params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
|
||||
params["data_dir"] = "./data/test"
|
||||
params["output_dir"] = "./results"
|
||||
params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
|
||||
params["njob"] = 1 # if ngpu = 0, will use cpu decoding
|
||||
params["batch_size"] = 64
|
||||
modelscope_infer(params)
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
|
||||
parser.add_argument('--audio_in', type=str, default="./data/test")
|
||||
parser.add_argument('--output_dir', type=str, default="./results/")
|
||||
parser.add_argument('--batch_size', type=int, default=64)
|
||||
parser.add_argument('--gpuid', type=str, default="0")
|
||||
args = parser.parse_args()
|
||||
modelscope_infer(args)
|
||||
@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env bash
# Multi-job decoding driver: shard wav.scp, decode shards in parallel via
# infer.py (one GPU or CPU job each), merge results, then score.

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true    # whether to perform gpu decoding
gpuid_list="0,1"      # set gpus, e.g., gpuid_list="0,1"
njob=4                # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob

# One job per listed GPU; for CPU decoding use njob jobs, batch_size 1, and a
# gpuid list of "-1" entries so every job disables GPU use.
if ${gpu_inference}; then
    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
    nj=$njob
    batch_size=1
    gpuid_list=""
    for JOB in $(seq ${nj}); do
        gpuid_list=$gpuid_list"-1,"
    done
fi

# Shard wav.scp into nj per-job lists under $output_dir/split.
mkdir -p $output_dir/split
split_scps=""
for JOB in $(seq ${nj}); do
    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
    echo "Decoding ..."
    gpuid_list_array=(${gpuid_list//,/ })
    # Launch one backgrounded infer.py per shard, pinned to its GPU id.
    for JOB in $(seq ${nj}); do
        {
            id=$((JOB-1))
            gpuid=${gpuid_list_array[$id]}
            mkdir -p ${output_dir}/output.$JOB
            python infer.py \
                --model ${model} \
                --audio_in ${output_dir}/split/wav.$JOB.scp \
                --output_dir ${output_dir}/output.$JOB \
                --batch_size ${batch_size} \
                --gpuid ${gpuid}
        }&
    done
    wait

    # Merge per-job outputs, sorted by utterance id.
    mkdir -p ${output_dir}/1best_recog
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
            for i in $(seq "${nj}"); do
                cat "${output_dir}/output.${i}/1best_recog/${f}"
            done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
    python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
    python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi

# Optional SpeechIO TIOBE text normalization and re-scoring.
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
    echo "SpeechIO TIOBE textnorm"
    echo "$0 --> Normalizing REF text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        ${data_dir}/text \
        ${output_dir}/1best_recog/ref.txt

    echo "$0 --> Normalizing HYP text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        ${output_dir}/1best_recog/text.proc \
        ${output_dir}/1best_recog/rec.txt
    # Drop hypotheses whose normalized text came out empty.
    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt

    echo "$0 --> computing WER/CER and alignment ..."
    ./utils/error_rate_zh \
        --tokenizer char \
        --ref ${output_dir}/1best_recog/ref.txt \
        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
fi
|
||||
|
||||
@ -0,0 +1 @@
|
||||
../../../../egs/aishell/transformer/utils
|
||||
@ -6,8 +6,9 @@
|
||||
|
||||
- Modify finetune training related parameters in `finetune.py`
|
||||
- <strong>output_dir:</strong> # result dir
|
||||
- <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
|
||||
- <strong>batch_bins:</strong> # batch size
|
||||
- <strong>data_dir:</strong> # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
|
||||
- <strong>dataset_type:</strong> # for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
|
||||
- <strong>batch_bins:</strong> # batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
|
||||
- <strong>max_epoch:</strong> # number of training epoch
|
||||
- <strong>lr:</strong> # learning rate
|
||||
|
||||
@ -20,11 +21,38 @@
|
||||
|
||||
Or you can use the finetuned model for inference directly.
|
||||
|
||||
- Setting parameters in `infer.py`
|
||||
- <strong>data_dir:</strong> # the dataset dir
|
||||
- Setting parameters in `infer.sh`
|
||||
- <strong>model:</strong> # model name on ModelScope
|
||||
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
|
||||
- <strong>output_dir:</strong> # result dir
|
||||
- <strong>batch_size:</strong> # batchsize of inference
|
||||
- <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
|
||||
- <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
|
||||
- <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
|
||||
|
||||
- Then you can run the pipeline to infer with:
|
||||
```python
|
||||
python infer.py
|
||||
sh infer.sh
|
||||
```
|
||||
|
||||
- Results
|
||||
|
||||
The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
|
||||
|
||||
### Inference using local finetuned model
|
||||
|
||||
- Modify inference related parameters in `infer_after_finetune.py`
|
||||
- <strong>modelscope_model_name: </strong> # model name on ModelScope
|
||||
- <strong>output_dir:</strong> # result dir
|
||||
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
|
||||
- <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
|
||||
- <strong>batch_size:</strong> # batchsize of inference
|
||||
|
||||
- Then you can run the pipeline to infer with:
|
||||
```python
|
||||
python infer_after_finetune.py
|
||||
```
|
||||
|
||||
- Results
|
||||
|
||||
The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
|
||||
|
||||
@ -1,101 +1,25 @@
|
||||
import os
|
||||
import shutil
|
||||
from multiprocessing import Pool
|
||||
|
||||
import argparse
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
from funasr.utils.compute_wer import compute_wer
|
||||
|
||||
|
||||
def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
|
||||
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
|
||||
if ngpu > 0:
|
||||
use_gpu = 1
|
||||
gpu_id = int(idx) - 1
|
||||
else:
|
||||
use_gpu = 0
|
||||
gpu_id = -1
|
||||
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
|
||||
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
|
||||
else:
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
|
||||
inference_pipline = pipeline(
|
||||
def modelscope_infer(args):
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
|
||||
inference_pipeline = pipeline(
|
||||
task=Tasks.auto_speech_recognition,
|
||||
model=model,
|
||||
output_dir=output_dir_job,
|
||||
batch_size=batch_size,
|
||||
ngpu=use_gpu,
|
||||
model=args.model,
|
||||
output_dir=args.output_dir,
|
||||
batch_size=args.batch_size,
|
||||
)
|
||||
audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
|
||||
inference_pipline(audio_in=audio_in)
|
||||
|
||||
|
||||
def modelscope_infer(params):
|
||||
# prepare for multi-GPU decoding
|
||||
ngpu = params["ngpu"]
|
||||
njob = params["njob"]
|
||||
batch_size = params["batch_size"]
|
||||
output_dir = params["output_dir"]
|
||||
model = params["model"]
|
||||
if os.path.exists(output_dir):
|
||||
shutil.rmtree(output_dir)
|
||||
os.mkdir(output_dir)
|
||||
split_dir = os.path.join(output_dir, "split")
|
||||
os.mkdir(split_dir)
|
||||
if ngpu > 0:
|
||||
nj = ngpu
|
||||
elif ngpu == 0:
|
||||
nj = njob
|
||||
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
|
||||
with open(wav_scp_file) as f:
|
||||
lines = f.readlines()
|
||||
num_lines = len(lines)
|
||||
num_job_lines = num_lines // nj
|
||||
start = 0
|
||||
for i in range(nj):
|
||||
end = start + num_job_lines
|
||||
file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
|
||||
with open(file, "w") as f:
|
||||
if i == nj - 1:
|
||||
f.writelines(lines[start:])
|
||||
else:
|
||||
f.writelines(lines[start:end])
|
||||
start = end
|
||||
|
||||
p = Pool(nj)
|
||||
for i in range(nj):
|
||||
p.apply_async(modelscope_infer_core,
|
||||
args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
|
||||
p.close()
|
||||
p.join()
|
||||
|
||||
# combine decoding results
|
||||
best_recog_path = os.path.join(output_dir, "1best_recog")
|
||||
os.mkdir(best_recog_path)
|
||||
files = ["text", "token", "score"]
|
||||
for file in files:
|
||||
with open(os.path.join(best_recog_path, file), "w") as f:
|
||||
for i in range(nj):
|
||||
job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
|
||||
with open(job_file) as f_job:
|
||||
lines = f_job.readlines()
|
||||
f.writelines(lines)
|
||||
|
||||
# If text exists, compute CER
|
||||
text_in = os.path.join(params["data_dir"], "text")
|
||||
if os.path.exists(text_in):
|
||||
text_proc_file = os.path.join(best_recog_path, "token")
|
||||
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
|
||||
|
||||
inference_pipeline(audio_in=args.audio_in)
|
||||
|
||||
if __name__ == "__main__":
|
||||
params = {}
|
||||
params["model"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
|
||||
params["data_dir"] = "./data/test"
|
||||
params["output_dir"] = "./results"
|
||||
params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
|
||||
params["njob"] = 1 # if ngpu = 0, will use cpu decoding
|
||||
params["batch_size"] = 64
|
||||
modelscope_infer(params)
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--model', type=str, default="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1")
|
||||
parser.add_argument('--audio_in', type=str, default="./data/test")
|
||||
parser.add_argument('--output_dir', type=str, default="./results/")
|
||||
parser.add_argument('--batch_size', type=int, default=64)
|
||||
parser.add_argument('--gpuid', type=str, default="0")
|
||||
args = parser.parse_args()
|
||||
modelscope_infer(args)
|
||||
@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env bash
# Multi-job ModelScope inference: shard wav.scp, decode in parallel
# (one job per GPU, or njob CPU jobs), merge the outputs, then score CER.

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true # whether to perform gpu decoding
gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
njob=4 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob

if ${gpu_inference}; then
    # one decoding job per listed GPU
    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
    # CPU decoding: njob single-sample jobs, each marked with gpuid -1
    nj=$njob
    batch_size=1
    gpuid_list=""
    for job in $(seq ${nj}); do
        gpuid_list=$gpuid_list"-1,"
    done
fi

# shard wav.scp into one list per decoding job
mkdir -p $output_dir/split
split_scps=""
for job in $(seq ${nj}); do
    split_scps="$split_scps $output_dir/split/wav.$job.scp"
done
perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
    echo "Decoding ..."
    gpuid_list_array=(${gpuid_list//,/ })
    for job in $(seq ${nj}); do
    {
        gpuid=${gpuid_list_array[$((job-1))]}
        mkdir -p ${output_dir}/output.$job
        python infer.py \
            --model ${model} \
            --audio_in ${output_dir}/split/wav.$job.scp \
            --output_dir ${output_dir}/output.$job \
            --batch_size ${batch_size} \
            --gpuid ${gpuid}
    }&
    done
    wait

    # merge per-job outputs, sorted by utterance id
    mkdir -p ${output_dir}/1best_recog
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
            for job in $(seq "${nj}"); do
                cat "${output_dir}/output.${job}/1best_recog/${f}"
            done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
    python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
    python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi
|
||||
@ -0,0 +1 @@
|
||||
../../../../egs/aishell/transformer/utils
|
||||
@ -226,7 +226,7 @@ def inference_modelscope(
|
||||
):
|
||||
results = []
|
||||
split_size = 10
|
||||
|
||||
cache_in = param_dict["cache"]
|
||||
if raw_inputs != None:
|
||||
line = raw_inputs.strip()
|
||||
key = "demo"
|
||||
@ -234,34 +234,12 @@ def inference_modelscope(
|
||||
item = {'key': key, 'value': ""}
|
||||
results.append(item)
|
||||
return results
|
||||
result, _, cache = text2punc(line, cache)
|
||||
item = {'key': key, 'value': result, 'cache': cache}
|
||||
result, _, cache = text2punc(line, cache_in)
|
||||
param_dict["cache"] = cache
|
||||
item = {'key': key, 'value': result}
|
||||
results.append(item)
|
||||
return results
|
||||
|
||||
for inference_text, _, _ in data_path_and_name_and_type:
|
||||
with open(inference_text, "r", encoding="utf-8") as fin:
|
||||
for line in fin:
|
||||
line = line.strip()
|
||||
segs = line.split("\t")
|
||||
if len(segs) != 2:
|
||||
continue
|
||||
key = segs[0]
|
||||
if len(segs[1]) == 0:
|
||||
continue
|
||||
result, _ = text2punc(segs[1])
|
||||
item = {'key': key, 'value': result}
|
||||
results.append(item)
|
||||
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
|
||||
if output_path != None:
|
||||
output_file_name = "infer.out"
|
||||
Path(output_path).mkdir(parents=True, exist_ok=True)
|
||||
output_file_path = (Path(output_path) / output_file_name).absolute()
|
||||
with open(output_file_path, "w", encoding="utf-8") as fout:
|
||||
for item_i in results:
|
||||
key_out = item_i["key"]
|
||||
value_out = item_i["value"]
|
||||
fout.write(f"{key_out}\t{value_out}\n")
|
||||
return results
|
||||
|
||||
return _forward
|
||||
|
||||
@ -30,14 +30,7 @@ from funasr.models.frontend.wav_frontend import WavFrontendOnline
|
||||
from funasr.models.frontend.wav_frontend import WavFrontend
|
||||
from funasr.bin.vad_inference import Speech2VadSegment
|
||||
|
||||
header_colors = '\033[95m'
|
||||
end_colors = '\033[0m'
|
||||
|
||||
global_asr_language: str = 'zh-cn'
|
||||
global_sample_rate: Union[int, Dict[Any, int]] = {
|
||||
'audio_fs': 16000,
|
||||
'model_fs': 16000
|
||||
}
|
||||
|
||||
|
||||
class Speech2VadSegmentOnline(Speech2VadSegment):
|
||||
|
||||
@ -14,7 +14,7 @@ from funasr.utils.types import str2bool
|
||||
# torch_version = float(".".join(torch.__version__.split(".")[:2]))
|
||||
# assert torch_version > 1.9
|
||||
|
||||
class ASRModelExportParaformer:
|
||||
class ModelExport:
|
||||
def __init__(
|
||||
self,
|
||||
cache_dir: Union[Path, str] = None,
|
||||
@ -240,7 +240,7 @@ if __name__ == '__main__':
|
||||
parser.add_argument('--calib_num', type=int, default=200, help='calib max num')
|
||||
args = parser.parse_args()
|
||||
|
||||
export_model = ASRModelExportParaformer(
|
||||
export_model = ModelExport(
|
||||
cache_dir=args.export_dir,
|
||||
onnx=args.type == 'onnx',
|
||||
quant=args.quantize,
|
||||
|
||||
@ -42,7 +42,7 @@ pip install --editable ./
|
||||
导出onnx模型,[详见](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export),参考示例,从modelscope中模型导出:
|
||||
|
||||
```shell
|
||||
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
|
||||
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True
|
||||
```
|
||||
|
||||
## Building Guidance for Linux/Unix
|
||||
|
||||
@ -25,6 +25,16 @@ Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 3
|
||||
|
||||
## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
|
||||
|
||||
Number of Parameter: 220M
|
||||
|
||||
Storage size: 880MB
|
||||
|
||||
Storage size after int8-quant: 237MB
|
||||
|
||||
CER: 1.95%
|
||||
|
||||
CER after int8-quant: 1.95%
|
||||
|
||||
### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni
|
||||
|
||||
| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
|
||||
@ -73,6 +83,16 @@ Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 3
|
||||
|
||||
## [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)
|
||||
|
||||
Number of Parameter: 68M
|
||||
|
||||
Storage size: 275MB
|
||||
|
||||
Storage size after int8-quant: 81MB
|
||||
|
||||
CER: 3.73%
|
||||
|
||||
CER after int8-quant: 3.78%
|
||||
|
||||
### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni
|
||||
|
||||
| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
|
||||
|
||||
@ -26,23 +26,28 @@ cd FunASR/funasr/runtime/python/grpc/
|
||||
|
||||
Step 1-2) Optional, Prepare server onnxruntime environment (on server).
|
||||
|
||||
Install [`rapid_paraformer`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime).
|
||||
Install [`onnx_paraformer`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime).
|
||||
|
||||
- Build the rapid_paraformer `whl`
|
||||
- Build the onnx_paraformer `whl`
|
||||
```
|
||||
git clone https://github.com/alibaba/FunASR.git && cd FunASR
|
||||
cd funasr/runtime/python/onnxruntime/rapid_paraformer
|
||||
python setup.py bdist_wheel
|
||||
python setup.py build
|
||||
python setup.py install
|
||||
```
|
||||
|
||||
- Install the build `whl`
|
||||
```
|
||||
pip install dist/rapid_paraformer-0.0.1-py3-none-any.whl
|
||||
```
|
||||
[//]: # ()
|
||||
[//]: # (- Install the build `whl`)
|
||||
|
||||
[//]: # (```)
|
||||
|
||||
[//]: # (pip install dist/rapid_paraformer-0.0.1-py3-none-any.whl)
|
||||
|
||||
[//]: # (```)
|
||||
|
||||
Export the model, more details ref to [export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime).
|
||||
```
|
||||
python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
|
||||
```shell
|
||||
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True
|
||||
```
|
||||
|
||||
Step 2) Optional, generate protobuf file (run on server, the two generated pb files are both used for server and client).
|
||||
|
||||
157
funasr/runtime/python/utils/compute_wer.py
Executable file
157
funasr/runtime/python/utils/compute_wer.py
Executable file
@ -0,0 +1,157 @@
|
||||
import os
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
def compute_wer(ref_file,
                hyp_file,
                cer_detail_file):
    """Score *hyp_file* against *ref_file* and write a per-utterance report.

    Both inputs are Kaldi-style transcript files (``<utt-id> <tok> <tok> ...``).
    The detail file receives one alignment summary per scored utterance
    (via ``print_cer_detail``) followed by corpus-level %WER / %SER totals.
    """
    rst = {
        'Wrd': 0,               # total reference words
        'Corr': 0,              # correctly recognised words
        'Ins': 0,
        'Del': 0,
        'Sub': 0,
        'Snt': 0,               # number of scored sentences
        'Err': 0.0,             # word error rate (%)
        'S.Err': 0.0,           # sentence error rate (%)
        'wrong_words': 0,
        'wrong_sentences': 0
    }

    def _read_trans(path):
        # First whitespace token is the utterance id, the rest is the
        # token sequence. Blank lines used to raise IndexError; skip them.
        trans = {}
        with open(path, 'r', encoding='utf-8') as reader:
            for line in reader:
                parts = line.strip().split()
                if not parts:
                    continue
                trans[parts[0]] = parts[1:]
        return trans

    hyp_dict = _read_trans(hyp_file)
    ref_dict = _read_trans(ref_file)

    # "with" guarantees the report is flushed and closed even on error
    # (the previous version leaked the file handle).
    with open(cer_detail_file, 'w', encoding='utf-8') as cer_detail_writer:
        for hyp_key in hyp_dict:
            if hyp_key in ref_dict:
                out_item = compute_wer_by_line(hyp_dict[hyp_key], ref_dict[hyp_key])
                rst['Wrd'] += out_item['nwords']
                rst['Corr'] += out_item['cor']
                rst['wrong_words'] += out_item['wrong']
                rst['Ins'] += out_item['ins']
                rst['Del'] += out_item['del']
                rst['Sub'] += out_item['sub']
                rst['Snt'] += 1
                if out_item['wrong'] > 0:
                    rst['wrong_sentences'] += 1
                cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
                cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
                cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')

        if rst['Wrd'] > 0:
            rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
        if rst['Snt'] > 0:
            rst['S.Err'] = round(rst['wrong_sentences'] * 100 / rst['Snt'], 2)

        cer_detail_writer.write('\n')
        cer_detail_writer.write("%WER " + str(rst['Err']) + " [ " + str(rst['wrong_words']) + " / " + str(rst['Wrd']) +
                                ", " + str(rst['Ins']) + " ins, " + str(rst['Del']) + " del, " + str(rst['Sub']) + " sub ]" + '\n')
        cer_detail_writer.write("%SER " + str(rst['S.Err']) + " [ " + str(rst['wrong_sentences']) + " / " + str(rst['Snt']) + " ]" + '\n')
        # The count below is the number of hyp utterances with no matching
        # reference entry, so the message must say "ref" (was wrongly "hyp").
        cer_detail_writer.write("Scored " + str(len(hyp_dict)) + " sentences, " + str(len(hyp_dict) - rst['Snt']) + " not present in ref." + '\n')
|
||||
|
||||
|
||||
def compute_wer_by_line(hyp,
                        ref):
    """Align one hypothesis against its reference via edit distance.

    Returns a dict with ``nwords`` (reference length), ``cor``, ``ins``,
    ``del``, ``sub`` counts and ``wrong`` (total edit operations).
    Comparison is case-insensitive.
    """
    hyp = [token.lower() for token in hyp]
    ref = [token.lower() for token in ref]

    n_hyp = len(hyp)
    n_ref = len(ref)

    # dist[i][j]: edit distance between hyp[:i] and ref[:j]
    dist = np.zeros((n_hyp + 1, n_ref + 1), dtype=np.int16)
    # ops[i][j]: 0 = match, 1 = substitution, 2 = insertion, 3 = deletion
    ops = np.zeros((n_hyp + 1, n_ref + 1), dtype=np.int8)

    # borders: transforming to/from the empty sequence
    dist[:, 0] = np.arange(n_hyp + 1)
    dist[0, :] = np.arange(n_ref + 1)

    for i in range(1, n_hyp + 1):
        for j in range(1, n_ref + 1):
            if hyp[i - 1] == ref[j - 1]:
                dist[i][j] = dist[i - 1][j - 1]
            else:
                candidates = [dist[i - 1][j - 1] + 1,   # substitution
                              dist[i - 1][j] + 1,       # insertion
                              dist[i][j - 1] + 1]       # deletion
                best = min(candidates)
                dist[i][j] = best
                ops[i][j] = candidates.index(best) + 1

    rst = {
        'nwords': n_ref,
        'cor': 0,
        'wrong': 0,
        'ins': 0,
        'del': 0,
        'sub': 0
    }

    # Trace the optimal path back from the bottom-right corner, counting
    # each operation type along the way.
    match_idx = []  # (ref_pos, hyp_pos) pairs of matched tokens
    i, j = n_hyp, n_ref
    while i >= 0 or j >= 0:
        op = ops[max(0, i)][max(0, j)]
        if op == 0:        # match (or a border cell)
            if i - 1 >= 0 and j - 1 >= 0:
                match_idx.append((j - 1, i - 1))
                rst['cor'] += 1
            i -= 1
            j -= 1
        elif op == 2:      # insertion
            i -= 1
            rst['ins'] += 1
        elif op == 3:      # deletion
            j -= 1
            rst['del'] += 1
        elif op == 1:      # substitution
            i -= 1
            j -= 1
            rst['sub'] += 1

        if i < 0 and j >= 0:
            rst['del'] += 1
        elif j < 0 and i >= 0:
            rst['ins'] += 1

    match_idx.reverse()
    rst['wrong'] = dist[n_hyp][n_ref]

    return rst
|
||||
|
||||
def print_cer_detail(rst):
    """Format one utterance's alignment counts as a single summary string.

    ``corr`` is cor/nwords and ``cer`` is wrong/nwords, both as percentages
    with two decimals.
    """
    return (f"(nwords={rst['nwords']},cor={rst['cor']}"
            f",ins={rst['ins']},del={rst['del']},sub="
            f"{rst['sub']}) corr:{rst['cor'] / rst['nwords']:.2%}"
            f",cer:{rst['wrong'] / rst['nwords']:.2%}")
|
||||
|
||||
if __name__ == '__main__':
    # CLI entry point: compute-wer <ref-file> <hyp-file> <detail-out>
    if len(sys.argv) != 4:
        print("usage : python compute-wer.py test.ref test.hyp test.wer")
        # A usage error must exit non-zero so calling shell scripts
        # (e.g. infer.sh with `set -e`) can detect the failure.
        sys.exit(1)

    ref_file = sys.argv[1]
    hyp_file = sys.argv[2]
    cer_detail_file = sys.argv[3]
    compute_wer(ref_file, hyp_file, cer_detail_file)
|
||||
48
funasr/runtime/python/utils/infer.py
Normal file
48
funasr/runtime/python/utils/infer.py
Normal file
@ -0,0 +1,48 @@
|
||||
"""Offline decoding driver for exported Paraformer runtimes.

Reads a Kaldi-style wav.scp, runs the selected runtime backend
(onnxruntime or libtorch) on each file, and writes `text` / `token`
result files into --output_dir.
"""
import os
import time
import sys
import librosa
from funasr.utils.types import str2bool

import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--model_dir', type=str, required=True)
parser.add_argument('--backend', type=str, default='onnx', help='["onnx", "torch"]')
parser.add_argument('--wav_file', type=str, default=None, help='kaldi-style wav.scp listing the audio to decode')
parser.add_argument('--quantize', type=str2bool, default=False, help='quantized model')
parser.add_argument('--intra_op_num_threads', type=int, default=1, help='intra_op_num_threads for onnx')
parser.add_argument('--output_dir', type=str, default=None, help='directory for the text/token output files')
args = parser.parse_args()


# Import the runtime lazily for the requested backend only. Previously the
# libtorch backend was imported unconditionally (then shadowed for onnx),
# which broke onnx-only installations.
if args.backend == "onnx":
    from funasr.runtime.python.onnxruntime.rapid_paraformer import Paraformer
else:
    from funasr.runtime.python.libtorch.torch_paraformer import Paraformer

model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize,
                   intra_op_num_threads=args.intra_op_num_threads)

with open(args.wav_file, 'r') as wav_file_f:
    wav_files = wav_file_f.readlines()

output_dir = args.output_dir
os.makedirs(output_dir, exist_ok=True)
# Force the platform's native line ending in the result files.
newline = '\r\n' if os.name == 'nt' else '\n'

with open(os.path.join(output_dir, "text"), "w", newline=newline) as text_f, \
        open(os.path.join(output_dir, "token"), "w", newline=newline) as token_f:
    for wav_path_i in wav_files:
        wav_name, wav_path = wav_path_i.strip().split()
        result = model(wav_path)
        # result[0]['preds'] appears to be (text, tokens) — TODO confirm
        # against the runtime's return format.
        text_f.write("{} {}\n".format(wav_name, result[0]['preds'][0]))
        text_f.flush()  # flush per utterance so partial runs are inspectable
        token_f.write("{} {}\n".format(wav_name, result[0]['preds'][1]))
        token_f.flush()
|
||||
|
||||
74
funasr/runtime/python/utils/infer.sh
Normal file
74
funasr/runtime/python/utils/infer.sh
Normal file
@ -0,0 +1,74 @@
|
||||
|
||||
#!/usr/bin/env bash
# Multi-job CPU decoding + WER scoring for exported (onnx / libtorch)
# Paraformer models. Stage 0 exports, stage 1 decodes, stage 2 scores.

split_scps_tool=split_scp.pl
inference_tool=infer.py
proce_text_tool=proce_text.py
compute_wer_tool=compute_wer.py

nj=32
stage=0
stop_stage=2

scp="/nfs/haoneng.lhn/funasr_data/aishell-1/data/test/wav.scp"
label_text="/nfs/haoneng.lhn/funasr_data/aishell-1/data/test/text"
export_root="/nfs/zhifu.gzf/export"

# NOTE: a previous revision wrapped this block in a ':<<!' ... '!' here-doc
# comment; the opener was commented out but the stray '!' terminator was left
# behind. The variables are required below, so the block must always run.
model_name="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
backend="onnx" # "torch"
quantize='true' # 'False'
fallback_op_num_torch=20
tag=${model_name}/${backend}_quantize_${quantize}_${fallback_op_num_torch}

output_dir=${export_root}/logs/${tag}/split$nj
mkdir -p ${output_dir}
echo ${output_dir}


if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    # export the model from ModelScope into ${export_root}
    python -m funasr.export.export_model --model-name ${model_name} --export-dir ${export_root} --type ${backend} --quantize ${quantize} --audio_in ${scp} --fallback-num ${fallback_op_num_torch}
fi


if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
    model_dir=${export_root}/${model_name}
    # shard wav.scp into one list per job
    split_scps=""
    for JOB in $(seq ${nj}); do
        split_scps="$split_scps $output_dir/wav.$JOB.scp"
    done
    perl ${split_scps_tool} $scp ${split_scps}

    # pin each decoding job to its own CPU core; logs go to log.$JOB.txt
    for JOB in $(seq ${nj}); do
    {
        core_id=$((JOB - 1))
        taskset -c ${core_id} python ${inference_tool} --backend ${backend} --model_dir ${model_dir} --wav_file ${output_dir}/wav.$JOB.scp --quantize ${quantize} --output_dir ${output_dir}/${JOB} &> ${output_dir}/log.$JOB.txt
    }&
    done
    wait

    # merge per-job outputs, sorted by utterance id
    mkdir -p ${output_dir}/1best_recog
    for f in token text; do
        if [ -f "${output_dir}/1/${f}" ]; then
            for JOB in $(seq "${nj}"); do
                cat "${output_dir}/${JOB}/${f}"
            done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
    python ${proce_text_tool} ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
    python ${proce_text_tool} ${label_text} ${output_dir}/1best_recog/text.ref
    python ${compute_wer_tool} ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi
|
||||
|
||||
31
funasr/runtime/python/utils/proce_text.py
Executable file
31
funasr/runtime/python/utils/proce_text.py
Executable file
@ -0,0 +1,31 @@
|
||||
|
||||
"""Text normalisation for CER scoring.

Usage: proce_text.py <in_file> <out_file>

Strips model markup tokens from each transcript line, lower-cases it, and
re-emits it with one space between every character (character-level CER).
"""
import sys
import re

in_f = sys.argv[1]
out_f = sys.argv[2]


with open(in_f, "r", encoding="utf-8") as f:
    lines = f.readlines()

with open(out_f, "w", encoding="utf-8") as f:
    for line in lines:
        outs = line.strip().split(" ", 1)
        if len(outs) == 2:
            idx, text = outs
            # All patterns are literal, so str.replace is sufficient
            # (no regex needed). Removing "@" also covers the former
            # redundant "@@" pass.
            for markup in ("</s>", "<s>", "@", "<unk>", " "):
                text = text.replace(markup, "")
            text = text.lower()
        else:
            # empty transcription: keep the utterance id with a blank hypothesis
            idx = outs[0]
            text = " "

        # space-separate every character for character-level scoring
        out = "{} {}\n".format(idx, " ".join(text))
        f.write(out)
|
||||
95
funasr/runtime/python/websocket/ASR_client.py
Normal file
95
funasr/runtime/python/websocket/ASR_client.py
Normal file
@ -0,0 +1,95 @@
|
||||
import pyaudio
|
||||
# import websocket #区别服务端这里是 websocket-client库
|
||||
import time
|
||||
import websockets
|
||||
import asyncio
|
||||
from queue import Queue
|
||||
# import threading
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--host",
|
||||
type=str,
|
||||
default="localhost",
|
||||
required=False,
|
||||
help="host ip, localhost, 0.0.0.0")
|
||||
parser.add_argument("--port",
|
||||
type=int,
|
||||
default=10095,
|
||||
required=False,
|
||||
help="grpc server port")
|
||||
parser.add_argument("--chunk_size",
|
||||
type=int,
|
||||
default=300,
|
||||
help="ms")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
voices = Queue()
|
||||
async def ws_client():
    """Open the long-lived websocket connection and cache it in a global.

    The connection object is shared (module global ``ws``) so that send()
    can reuse it for every audio chunk.
    """
    global ws
    uri = "ws://{}:{}".format(args.host, args.port)
    ws = await websockets.connect(uri, subprotocols=["binary"])
    ws.max_size = 1024 * 1024 * 20  # allow messages up to 20 MB
    print("connected ws server")
|
||||
|
||||
async def send(data):
    """Forward one binary chunk over the cached websocket.

    Failures are logged rather than raised so the capture loop keeps running.
    """
    global ws
    try:
        await ws.send(data)
    except Exception as e:
        print('Exception occurred:', e)
|
||||
|
||||
|
||||
|
||||
asyncio.get_event_loop().run_until_complete(ws_client()) # 启动协程
|
||||
|
||||
|
||||
# 其他函数可以通过调用send(data)来发送数据,例如:
|
||||
async def test():
    """Capture microphone audio and push fixed-size chunks onto the queue."""
    global voices
    sample_format = pyaudio.paInt16
    n_channels = 1
    sample_rate = 16000
    # chunk length in frames for args.chunk_size milliseconds of audio
    frames_per_chunk = int(sample_rate / 1000 * args.chunk_size)

    audio = pyaudio.PyAudio()
    stream = audio.open(format=sample_format,
                        channels=n_channels,
                        rate=sample_rate,
                        input=True,
                        frames_per_buffer=frames_per_chunk)

    while True:
        chunk = stream.read(frames_per_chunk)
        voices.put(chunk)
        await asyncio.sleep(0.01)  # yield so the sender coroutine can run
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
async def ws_send():
    """Drain the audio queue and ship every queued chunk to the server."""
    global voices
    print("started to sending data!")
    while True:
        while not voices.empty():
            chunk = voices.get()
            voices.task_done()
            await send(chunk)
            await asyncio.sleep(0.01)
        await asyncio.sleep(0.01)
|
||||
|
||||
async def main():
    """Run the microphone recorder and the websocket sender concurrently."""
    recorder = asyncio.create_task(test())
    sender = asyncio.create_task(ws_send())
    await asyncio.gather(recorder, sender)
|
||||
|
||||
asyncio.run(main())
|
||||
191
funasr/runtime/python/websocket/ASR_server.py
Normal file
191
funasr/runtime/python/websocket/ASR_server.py
Normal file
@ -0,0 +1,191 @@
|
||||
import asyncio
|
||||
import websockets
|
||||
import time
|
||||
from queue import Queue
|
||||
import threading
|
||||
import argparse
|
||||
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
import logging
|
||||
|
||||
logger = get_logger(log_level=logging.CRITICAL)
|
||||
logger.setLevel(logging.CRITICAL)
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--host",
|
||||
type=str,
|
||||
default="0.0.0.0",
|
||||
required=False,
|
||||
help="host ip, localhost, 0.0.0.0")
|
||||
parser.add_argument("--port",
|
||||
type=int,
|
||||
default=10095,
|
||||
required=False,
|
||||
help="grpc server port")
|
||||
parser.add_argument("--asr_model",
|
||||
type=str,
|
||||
default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
|
||||
help="model from modelscope")
|
||||
parser.add_argument("--vad_model",
|
||||
type=str,
|
||||
default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
||||
help="model from modelscope")
|
||||
|
||||
parser.add_argument("--punc_model",
|
||||
type=str,
|
||||
default="damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727",
|
||||
help="model from modelscope")
|
||||
parser.add_argument("--ngpu",
|
||||
type=int,
|
||||
default=1,
|
||||
help="0 for cpu, 1 for gpu")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print("model loading")
|
||||
voices = Queue()
|
||||
speek = Queue()
|
||||
|
||||
# vad
|
||||
inference_pipeline_vad = pipeline(
|
||||
task=Tasks.voice_activity_detection,
|
||||
model=args.vad_model,
|
||||
model_revision=None,
|
||||
output_dir=None,
|
||||
batch_size=1,
|
||||
mode='online',
|
||||
ngpu=args.ngpu,
|
||||
)
|
||||
param_dict_vad = {'in_cache': dict(), "is_final": False}
|
||||
|
||||
# asr
|
||||
param_dict_asr = {}
|
||||
# param_dict["hotword"] = "小五 小五月" # 设置热词,用空格隔开
|
||||
inference_pipeline_asr = pipeline(
|
||||
task=Tasks.auto_speech_recognition,
|
||||
model=args.asr_model,
|
||||
param_dict=param_dict_asr,
|
||||
ngpu=args.ngpu,
|
||||
)
|
||||
|
||||
param_dict_punc = {'cache': list()}
|
||||
inference_pipeline_punc = pipeline(
|
||||
task=Tasks.punctuation,
|
||||
model=args.punc_model,
|
||||
model_revision=None,
|
||||
ngpu=args.ngpu,
|
||||
)
|
||||
|
||||
print("model loaded")
|
||||
|
||||
|
||||
|
||||
async def ws_serve(websocket, path):
    """Websocket handler: enqueue every incoming binary message for the VAD loop."""
    global voices
    try:
        async for message in websocket:
            voices.put(message)
    except websockets.exceptions.ConnectionClosedError as e:
        print('Connection closed with exception:', e)
    except Exception as e:
        print('Exception occurred:', e)
|
||||
|
||||
start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
|
||||
|
||||
|
||||
def vad(data):
    """Run streaming VAD on one audio chunk.

    Returns ``(speech_start, speech_end)`` booleans. The online VAD keeps
    its streaming state in the module-level ``param_dict_vad`` cache, which
    the pipeline mutates in place between calls.

    Fix: the old ``global vad_pipline, param_dict_vad`` statement named an
    identifier (``vad_pipline``) that exists nowhere in the module; neither
    global declaration is needed for read-only access.
    """
    segments_result = inference_pipeline_vad(audio_in=data, param_dict=param_dict_vad)
    speech_start = False
    speech_end = False

    # No decision yet, or more than one segment in a single chunk:
    # report neither start nor end for this chunk.
    if len(segments_result) == 0 or len(segments_result["text"]) > 1:
        return speech_start, speech_end
    if segments_result["text"][0][0] != -1:  # a segment start timestamp is present
        speech_start = True
    if segments_result["text"][0][1] != -1:  # a segment end timestamp is present
        speech_end = True
    return speech_start, speech_end
|
||||
|
||||
def asr():
    """Worker thread: pull finished utterances off the queue, run ASR then
    punctuation restoration, and print the punctuated result.

    Fix: the old ``global inference_pipeline2`` statement referenced an
    undefined name (the pipeline is actually ``inference_pipeline_asr``).
    Also guard against decode results without a 'text' field, which would
    otherwise raise and silently kill this worker thread for good.
    """
    global speek
    while True:
        while not speek.empty():
            audio_in = speek.get()
            speek.task_done()
            rec_result = inference_pipeline_asr(audio_in=audio_in)
            if 'text' not in rec_result:
                # empty decode (e.g. pure silence) — nothing to punctuate
                continue
            rec_result_punc = inference_pipeline_punc(text_in=rec_result['text'],
                                                      param_dict=param_dict_punc)
            print(rec_result_punc)
            time.sleep(0.1)
        time.sleep(0.1)
|
||||
|
||||
|
||||
def main():
    """VAD dispatch loop: group incoming audio chunks into utterances.

    Chunks arrive on the ``voices`` queue. A two-chunk rolling pre-buffer is
    kept so that when the VAD fires, a little audio from before the trigger
    is included. Completed utterances (VAD end, or a hard cap of 300 chunks)
    are joined and handed to the ASR worker via the ``speek`` queue.
    """
    frames = []       # chunks of the utterance currently being captured
    pre_buffer = []   # rolling buffer of the last two chunks (pre-roll)
    record_num = 0    # chunks captured for the current utterance
    global voices
    global speek
    in_speech = False
    while True:
        while not voices.empty():
            chunk = voices.get()
            voices.task_done()

            pre_buffer.append(chunk)
            if len(pre_buffer) > 2:
                pre_buffer.pop(0)  # keep only the two most recent chunks

            if in_speech:
                frames.append(chunk)
                record_num += 1

            started, ended = vad(chunk)
            if started:
                in_speech = True
                # restart capture, seeded with the pre-roll buffer
                frames = []
                frames.extend(pre_buffer)
            if ended or record_num > 300:
                # utterance finished (or exceeded the maximum length cap)
                in_speech = False
                speek.put(b"".join(frames))
                frames = []
                pre_buffer = []
                record_num = 0
            time.sleep(0.01)
        time.sleep(0.01)
|
||||
|
||||
|
||||
|
||||
# Launch the VAD segmentation and ASR workers in background threads, then
# serve the websocket endpoint on the asyncio event loop forever.
# BUGFIX: the original bound both threads to the same variable ``s``,
# discarding the handle to the first thread; use distinct names.
vad_thread = threading.Thread(target=main)
vad_thread.start()

asr_thread = threading.Thread(target=asr)
asr_thread.start()

asyncio.get_event_loop().run_until_complete(start_server)
asyncio.get_event_loop().run_forever()
|
||||
46
funasr/runtime/python/websocket/README.md
Normal file
46
funasr/runtime/python/websocket/README.md
Normal file
@ -0,0 +1,46 @@
|
||||
# Using funasr with websocket
|
||||
We can send streaming audio data to the server in real time with the websocket client (e.g., every 300 ms), and receive the transcribed text back once the speaker stops speaking.
|
||||
The audio data is streamed to the server, while the ASR inference itself runs in offline (non-streaming) mode.
|
||||
|
||||
# Steps
|
||||
|
||||
## For the Server
|
||||
|
||||
Install the modelscope and funasr
|
||||
|
||||
```shell
|
||||
pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
|
||||
git clone https://github.com/alibaba/FunASR.git && cd FunASR
|
||||
pip install --editable ./
|
||||
```
|
||||
|
||||
Install the requirements for server
|
||||
|
||||
```shell
|
||||
cd funasr/runtime/python/websocket
|
||||
pip install -r requirements_server.txt
|
||||
```
|
||||
|
||||
Start server
|
||||
|
||||
```shell
|
||||
python ASR_server.py --host "0.0.0.0" --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
|
||||
```
|
||||
|
||||
## For the client
|
||||
|
||||
Install the requirements for client
|
||||
```shell
|
||||
git clone https://github.com/alibaba/FunASR.git && cd FunASR
|
||||
cd funasr/runtime/python/websocket
|
||||
pip install -r requirements_client.txt
|
||||
```
|
||||
|
||||
Start client
|
||||
|
||||
```shell
|
||||
python ASR_client.py --host "127.0.0.1" --port 10095 --chunk_size 300
|
||||
```
|
||||
|
||||
## Acknowledgements
|
||||
1. We acknowledge [cgisky1980](https://github.com/cgisky1980/FunASR) for contributing the websocket service.
|
||||
2
funasr/runtime/python/websocket/requirements_client.txt
Normal file
2
funasr/runtime/python/websocket/requirements_client.txt
Normal file
@ -0,0 +1,2 @@
|
||||
websockets
|
||||
pyaudio
|
||||
1
funasr/runtime/python/websocket/requirements_server.txt
Normal file
1
funasr/runtime/python/websocket/requirements_server.txt
Normal file
@ -0,0 +1 @@
|
||||
websockets
|
||||
@ -1193,12 +1193,18 @@ class AbsTask(ABC):
|
||||
# logging.basicConfig() is invoked in main_worker() instead of main()
|
||||
# because it can be invoked only once in a process.
|
||||
# FIXME(kamo): Should we use logging.getLogger()?
|
||||
# BUGFIX: Remove previous handlers and reset log level
|
||||
for handler in logging.root.handlers[:]:
|
||||
logging.root.removeHandler(handler)
|
||||
logging.basicConfig(
|
||||
level=args.log_level,
|
||||
format=f"[{os.uname()[1].split('.')[0]}]"
|
||||
f" %(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
||||
)
|
||||
else:
|
||||
# BUGFIX: Remove previous handlers and reset log level
|
||||
for handler in logging.root.handlers[:]:
|
||||
logging.root.removeHandler(handler)
|
||||
# Suppress logging if RANK != 0
|
||||
logging.basicConfig(
|
||||
level="ERROR",
|
||||
|
||||
Loading…
Reference in New Issue
Block a user