Merge branch 'dev_gzf' of github.com:alibaba-damo-academy/FunASR into dev_gzf

This commit is contained in:
mengzhe.cmz 2023-03-23 19:59:28 +08:00
commit b0887f1767
35 changed files with 941 additions and 726 deletions

View File

@ -1,30 +0,0 @@
# ModelScope Model
## How to finetune and infer using a pretrained Paraformer-large Model
### Finetune
- Modify finetune training related parameters in `finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
- <strong>batch_bins:</strong> # batch size
- <strong>max_epoch:</strong> # number of training epoch
- <strong>lr:</strong> # learning rate
- Then you can run the pipeline to finetune with:
```python
python finetune.py
```
### Inference
Or you can use the finetuned model for inference directly.
- Setting parameters in `infer.py`
- <strong>data_dir:</strong> # the dataset dir
- <strong>output_dir:</strong> # result dir
- Then you can run the pipeline to infer with:
```python
python infer.py
```

View File

@ -1,23 +0,0 @@
# Paraformer-Large
- Model link: <https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/summary>
- Model size: 220M
# Environments
- date: `Fri Feb 10 13:34:24 CST 2023`
- python version: `3.7.12`
- FunASR version: `0.1.6`
- pytorch version: `pytorch 1.7.0`
- Git hash: ``
- Commit date: ``
# Benchmark Results
## AISHELL-1
- Decode config:
- Decode without CTC
- Decode without LM
| testset CER(%) | base model|finetune model |
|:--------------:|:---------:|:-------------:|
| dev | 1.75 |1.62 |
| test | 1.95 |1.78 |

View File

@ -1,36 +0,0 @@
import os
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from funasr.datasets.ms_dataset import MsDataset
from funasr.utils.modelscope_param import modelscope_args
def modelscope_finetune(params):
    """Fine-tune a ModelScope ASR model with the settings carried by *params*.

    Args:
        params: namespace produced by ``modelscope_args`` providing at least
            ``model``, ``data_path``, ``output_dir``, ``dataset_type``,
            ``batch_bins``, ``max_epoch`` and ``lr``.
    """
    # makedirs(exist_ok=True) already tolerates an existing directory,
    # so no os.path.exists() pre-check is needed.
    os.makedirs(params.output_dir, exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params.data_path)
    kwargs = dict(
        model=params.model,
        data_dir=ds_dict,
        dataset_type=params.dataset_type,
        work_dir=params.output_dir,
        batch_bins=params.batch_bins,
        max_epoch=params.max_epoch,
        lr=params.lr)
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()


if __name__ == '__main__':
    params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch", data_path="./data")
    params.output_dir = "./checkpoint"      # directory where checkpoints are saved
    params.data_path = "./example_data/"    # dataset directory
    params.dataset_type = "small"           # use "small"; for more than 1000 hours of data use "large"
    params.batch_bins = 2000                # batch size: feature frames if dataset_type="small", milliseconds if "large"
    params.max_epoch = 50                   # maximum number of training epochs
    params.lr = 0.00005                     # learning rate
    modelscope_finetune(params)

View File

@ -1,101 +0,0 @@
import os
import shutil
from multiprocessing import Pool
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from funasr.utils.compute_wer import compute_wer
def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
    """Decode one shard of the wav list inside a worker process.

    Args:
        output_dir: root result directory; this job writes to ``output.<idx>``.
        split_dir: directory holding the per-job ``wav.<idx>.scp`` shards.
        njob: number of CPU jobs (kept for interface compatibility; unused here).
        idx: 1-based job index as a string.
        batch_size: decoding batch size.
        ngpu: > 0 enables GPU decoding.
        model: ModelScope model name or path.
    """
    output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
    if ngpu > 0:
        use_gpu = 1
        gpu_id = int(idx) - 1
    else:
        use_gpu = 0
        gpu_id = -1
    # Pin this worker to a single device. For CPU decoding (gpu_id == -1)
    # hide all GPUs explicitly: the original code indexed the
    # CUDA_VISIBLE_DEVICES list with -1, which wrongly picked the last GPU.
    if gpu_id < 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
    elif "CUDA_VISIBLE_DEVICES" in os.environ:
        gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=model,
        output_dir=output_dir_job,
        batch_size=batch_size,
        ngpu=use_gpu,
    )
    audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
    inference_pipeline(audio_in=audio_in)
def modelscope_infer(params):
    """Decode a dataset in parallel and merge the per-job results.

    Splits ``<data_dir>/wav.scp`` into ``nj`` shards, runs one decoding job
    per shard (one per GPU, or ``njob`` CPU workers), concatenates the
    per-job outputs under ``<output_dir>/1best_recog`` and, when a reference
    ``text`` file exists, computes the CER.

    Args:
        params: dict with keys ``model``, ``data_dir``, ``output_dir``,
            ``ngpu``, ``njob`` and ``batch_size``.
    """
    # prepare for multi-GPU decoding
    ngpu = params["ngpu"]
    njob = params["njob"]
    batch_size = params["batch_size"]
    output_dir = params["output_dir"]
    model = params["model"]
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    split_dir = os.path.join(output_dir, "split")
    os.mkdir(split_dir)
    # One job per GPU for GPU decoding, otherwise njob CPU jobs.
    # (Was `elif ngpu == 0:`, which left nj unbound for negative ngpu.)
    if ngpu > 0:
        nj = ngpu
    else:
        nj = njob
    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
    with open(wav_scp_file) as f:
        lines = f.readlines()
    num_lines = len(lines)
    num_job_lines = num_lines // nj
    start = 0
    for i in range(nj):
        end = start + num_job_lines
        scp_file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
        with open(scp_file, "w") as f:
            if i == nj - 1:
                # The last shard also takes the remainder lines.
                f.writelines(lines[start:])
            else:
                f.writelines(lines[start:end])
        start = end
    p = Pool(nj)
    for i in range(nj):
        p.apply_async(modelscope_infer_core,
                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
    p.close()
    p.join()
    # combine decoding results
    best_recog_path = os.path.join(output_dir, "1best_recog")
    os.mkdir(best_recog_path)
    for name in ["text", "token", "score"]:
        with open(os.path.join(best_recog_path, name), "w") as fout:
            for i in range(nj):
                job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), name)
                with open(job_file) as f_job:
                    fout.writelines(f_job.readlines())
    # If a reference text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(best_recog_path, "token")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
if __name__ == "__main__":
    # Entry point: decode ./data/test with the AISHELL-1 Paraformer-large model.
    params = {
        "model": "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch",
        "data_dir": "./data/test",
        "output_dir": "./results",
        "ngpu": 1,        # if ngpu > 0, will use gpu decoding
        "njob": 1,        # if ngpu = 0, will use cpu decoding
        "batch_size": 64,
    }
    modelscope_infer(params)

View File

@ -1,48 +0,0 @@
import json
import os
import shutil
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.hub.snapshot_download import snapshot_download
from funasr.utils.compute_wer import compute_wer
def modelscope_infer_after_finetune(params):
    """Decode a test set with a locally fine-tuned checkpoint.

    Downloads the pretrained model layout from ModelScope, overwrites its
    ``model.pb`` with the fine-tuned checkpoint named by
    ``params["decoding_model_name"]``, decodes ``<data_dir>/wav.scp`` and,
    if ``<data_dir>/text`` exists, computes the CER.

    Args:
        params: dict with keys ``modelscope_model_name``, ``output_dir``,
            ``data_dir``, ``decoding_model_name`` and ``batch_size``.

    Raises:
        RuntimeError: if the pretrained model cannot be downloaded.
    """
    # prepare for decoding
    try:
        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
    except Exception as e:
        # Narrowed from BaseException and chained so the download failure
        # stays visible; the message had a pointless f-string prefix.
        raise RuntimeError("Please download the pretrained model from ModelScope first.") from e
    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]),
                os.path.join(pretrained_model_path, "model.pb"))
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
    os.mkdir(decoding_path)
    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=pretrained_model_path,
        output_dir=decoding_path,
        batch_size=params["batch_size"]
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)
    # compute CER if ground-truth text is available
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))


if __name__ == '__main__':
    params = {}
    params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch"
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    params["batch_size"] = 64
    modelscope_infer_after_finetune(params)

View File

@ -1,30 +0,0 @@
# ModelScope Model
## How to finetune and infer using a pretrained Paraformer-large Model
### Finetune
- Modify finetune training related parameters in `finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
- <strong>batch_bins:</strong> # batch size
- <strong>max_epoch:</strong> # number of training epoch
- <strong>lr:</strong> # learning rate
- Then you can run the pipeline to finetune with:
```python
python finetune.py
```
### Inference
Or you can use the finetuned model for inference directly.
- Setting parameters in `infer.py`
- <strong>data_dir:</strong> # the dataset dir
- <strong>output_dir:</strong> # result dir
- Then you can run the pipeline to infer with:
```python
python infer.py
```

View File

@ -1,25 +0,0 @@
# Paraformer-Large
- Model link: <https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/summary>
- Model size: 220M
# Environments
- date: `Fri Feb 10 13:34:24 CST 2023`
- python version: `3.7.12`
- FunASR version: `0.1.6`
- pytorch version: `pytorch 1.7.0`
- Git hash: ``
- Commit date: ``
# Benchmark Results
## AISHELL-2
- Decode config:
- Decode without CTC
- Decode without LM
| testset | base model|finetune model|
|:------------:|:---------:|:------------:|
| dev_ios | 2.80 |2.60 |
| test_android | 3.13 |2.84 |
| test_ios | 2.85 |2.82 |
| test_mic | 3.06 |2.88 |

View File

@ -1,36 +0,0 @@
import os
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from funasr.datasets.ms_dataset import MsDataset
from funasr.utils.modelscope_param import modelscope_args
def modelscope_finetune(params):
    """Fine-tune a ModelScope ASR model with the settings carried by *params*.

    Args:
        params: namespace produced by ``modelscope_args`` providing at least
            ``model``, ``data_path``, ``output_dir``, ``dataset_type``,
            ``batch_bins``, ``max_epoch`` and ``lr``.
    """
    # makedirs(exist_ok=True) already tolerates an existing directory,
    # so no os.path.exists() pre-check is needed.
    os.makedirs(params.output_dir, exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params.data_path)
    kwargs = dict(
        model=params.model,
        data_dir=ds_dict,
        dataset_type=params.dataset_type,
        work_dir=params.output_dir,
        batch_bins=params.batch_bins,
        max_epoch=params.max_epoch,
        lr=params.lr)
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()


if __name__ == '__main__':
    params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch", data_path="./data")
    params.output_dir = "./checkpoint"      # directory where checkpoints are saved
    params.data_path = "./example_data/"    # dataset directory
    params.dataset_type = "small"           # use "small"; for more than 1000 hours of data use "large"
    params.batch_bins = 2000                # batch size: feature frames if dataset_type="small", milliseconds if "large"
    params.max_epoch = 50                   # maximum number of training epochs
    params.lr = 0.00005                     # learning rate
    modelscope_finetune(params)

View File

@ -1,101 +0,0 @@
import os
import shutil
from multiprocessing import Pool
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from funasr.utils.compute_wer import compute_wer
def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
    """Decode one shard of the wav list inside a worker process.

    Args:
        output_dir: root result directory; this job writes to ``output.<idx>``.
        split_dir: directory holding the per-job ``wav.<idx>.scp`` shards.
        njob: number of CPU jobs (kept for interface compatibility; unused here).
        idx: 1-based job index as a string.
        batch_size: decoding batch size.
        ngpu: > 0 enables GPU decoding.
        model: ModelScope model name or path.
    """
    output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
    if ngpu > 0:
        use_gpu = 1
        gpu_id = int(idx) - 1
    else:
        use_gpu = 0
        gpu_id = -1
    # Pin this worker to a single device. For CPU decoding (gpu_id == -1)
    # hide all GPUs explicitly: the original code indexed the
    # CUDA_VISIBLE_DEVICES list with -1, which wrongly picked the last GPU.
    if gpu_id < 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
    elif "CUDA_VISIBLE_DEVICES" in os.environ:
        gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=model,
        output_dir=output_dir_job,
        batch_size=batch_size,
        ngpu=use_gpu,
    )
    audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
    inference_pipeline(audio_in=audio_in)
def modelscope_infer(params):
    """Decode a dataset in parallel and merge the per-job results.

    Splits ``<data_dir>/wav.scp`` into ``nj`` shards, runs one decoding job
    per shard (one per GPU, or ``njob`` CPU workers), concatenates the
    per-job outputs under ``<output_dir>/1best_recog`` and, when a reference
    ``text`` file exists, computes the CER.

    Args:
        params: dict with keys ``model``, ``data_dir``, ``output_dir``,
            ``ngpu``, ``njob`` and ``batch_size``.
    """
    # prepare for multi-GPU decoding
    ngpu = params["ngpu"]
    njob = params["njob"]
    batch_size = params["batch_size"]
    output_dir = params["output_dir"]
    model = params["model"]
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    split_dir = os.path.join(output_dir, "split")
    os.mkdir(split_dir)
    # One job per GPU for GPU decoding, otherwise njob CPU jobs.
    # (Was `elif ngpu == 0:`, which left nj unbound for negative ngpu.)
    if ngpu > 0:
        nj = ngpu
    else:
        nj = njob
    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
    with open(wav_scp_file) as f:
        lines = f.readlines()
    num_lines = len(lines)
    num_job_lines = num_lines // nj
    start = 0
    for i in range(nj):
        end = start + num_job_lines
        scp_file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
        with open(scp_file, "w") as f:
            if i == nj - 1:
                # The last shard also takes the remainder lines.
                f.writelines(lines[start:])
            else:
                f.writelines(lines[start:end])
        start = end
    p = Pool(nj)
    for i in range(nj):
        p.apply_async(modelscope_infer_core,
                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
    p.close()
    p.join()
    # combine decoding results
    best_recog_path = os.path.join(output_dir, "1best_recog")
    os.mkdir(best_recog_path)
    for name in ["text", "token", "score"]:
        with open(os.path.join(best_recog_path, name), "w") as fout:
            for i in range(nj):
                job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), name)
                with open(job_file) as f_job:
                    fout.writelines(f_job.readlines())
    # If a reference text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(best_recog_path, "token")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
if __name__ == "__main__":
    # Entry point: decode ./data/test with the AISHELL-2 Paraformer-large model.
    params = {
        "model": "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch",
        "data_dir": "./data/test",
        "output_dir": "./results",
        "ngpu": 1,        # if ngpu > 0, will use gpu decoding
        "njob": 1,        # if ngpu = 0, will use cpu decoding
        "batch_size": 64,
    }
    modelscope_infer(params)

View File

@ -1,48 +0,0 @@
import json
import os
import shutil
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.hub.snapshot_download import snapshot_download
from funasr.utils.compute_wer import compute_wer
def modelscope_infer_after_finetune(params):
    """Decode a test set with a locally fine-tuned checkpoint.

    Downloads the pretrained model layout from ModelScope, overwrites its
    ``model.pb`` with the fine-tuned checkpoint named by
    ``params["decoding_model_name"]``, decodes ``<data_dir>/wav.scp`` and,
    if ``<data_dir>/text`` exists, computes the CER.

    Args:
        params: dict with keys ``modelscope_model_name``, ``output_dir``,
            ``data_dir``, ``decoding_model_name`` and ``batch_size``.

    Raises:
        RuntimeError: if the pretrained model cannot be downloaded.
    """
    # prepare for decoding
    try:
        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
    except Exception as e:
        # Narrowed from BaseException and chained so the download failure
        # stays visible; the message had a pointless f-string prefix.
        raise RuntimeError("Please download the pretrained model from ModelScope first.") from e
    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]),
                os.path.join(pretrained_model_path, "model.pb"))
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
    os.mkdir(decoding_path)
    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=pretrained_model_path,
        output_dir=decoding_path,
        batch_size=params["batch_size"]
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)
    # compute CER if ground-truth text is available
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))


if __name__ == '__main__':
    params = {}
    params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch"
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    params["batch_size"] = 64
    modelscope_infer_after_finetune(params)

View File

@ -21,23 +21,26 @@
Or you can use the finetuned model for inference directly.
- Setting parameters in `infer.py`
- Setting parameters in `infer.sh`
- <strong>model:</strong> # model name on ModelScope
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
- <strong>output_dir:</strong> # result dir
- <strong>ngpu:</strong> # the number of GPUs for decoding, if `ngpu` > 0, use GPU decoding
- <strong>njob:</strong> # the number of jobs for CPU decoding, if `ngpu` = 0, use CPU decoding, please set `njob`
- <strong>batch_size:</strong> # batchsize of inference
- <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
- <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
- <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
- Then you can run the pipeline to infer with:
```python
python infer.py
sh infer.sh
```
- Results
The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
### Inference using local finetuned model
- Modify inference related parameters in `infer_after_finetune.py`

View File

@ -17,22 +17,22 @@
- Decode without CTC
- Decode without LM
| testset | CER(%)|
|:---------:|:-----:|
| dev | 1.75 |
| test | 1.95 |
| CER(%) | Pretrain model|[Finetune model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/summary) |
|:---------:|:-------------:|:-------------:|
| dev | 1.75 |1.62 |
| test | 1.95 |1.78 |
## AISHELL-2
- Decode config:
- Decode without CTC
- Decode without LM
| testset | CER(%)|
|:------------:|:-----:|
| dev_ios | 2.80 |
| test_android | 3.13 |
| test_ios | 2.85 |
| test_mic | 3.06 |
| CER(%) | Pretrain model|[Finetune model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/summary)|
|:------------:|:-------------:|:------------:|
| dev_ios | 2.80 |2.60 |
| test_android | 3.13 |2.84 |
| test_ios | 2.85 |2.82 |
| test_mic | 3.06 |2.88 |
## Wenetspeech
- Decode config:

View File

@ -1,101 +1,25 @@
import os
import shutil
from multiprocessing import Pool
import argparse
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from funasr.utils.compute_wer import compute_wer
def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
if ngpu > 0:
use_gpu = 1
gpu_id = int(idx) - 1
else:
use_gpu = 0
gpu_id = -1
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
else:
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
inference_pipline = pipeline(
def modelscope_infer(args):
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
model=model,
output_dir=output_dir_job,
batch_size=batch_size,
ngpu=use_gpu,
model=args.model,
output_dir=args.output_dir,
batch_size=args.batch_size,
)
audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
inference_pipline(audio_in=audio_in)
def modelscope_infer(params):
# prepare for multi-GPU decoding
ngpu = params["ngpu"]
njob = params["njob"]
batch_size = params["batch_size"]
output_dir = params["output_dir"]
model = params["model"]
if os.path.exists(output_dir):
shutil.rmtree(output_dir)
os.mkdir(output_dir)
split_dir = os.path.join(output_dir, "split")
os.mkdir(split_dir)
if ngpu > 0:
nj = ngpu
elif ngpu == 0:
nj = njob
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
with open(wav_scp_file) as f:
lines = f.readlines()
num_lines = len(lines)
num_job_lines = num_lines // nj
start = 0
for i in range(nj):
end = start + num_job_lines
file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
with open(file, "w") as f:
if i == nj - 1:
f.writelines(lines[start:])
else:
f.writelines(lines[start:end])
start = end
p = Pool(nj)
for i in range(nj):
p.apply_async(modelscope_infer_core,
args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
p.close()
p.join()
# combine decoding results
best_recog_path = os.path.join(output_dir, "1best_recog")
os.mkdir(best_recog_path)
files = ["text", "token", "score"]
for file in files:
with open(os.path.join(best_recog_path, file), "w") as f:
for i in range(nj):
job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
with open(job_file) as f_job:
lines = f_job.readlines()
f.writelines(lines)
# If text exists, compute CER
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(best_recog_path, "token")
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
inference_pipeline(audio_in=args.audio_in)
if __name__ == "__main__":
params = {}
params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
params["data_dir"] = "./data/test"
params["output_dir"] = "./results"
params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
params["njob"] = 1 # if ngpu = 0, will use cpu decoding
params["batch_size"] = 64
modelscope_infer(params)
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
parser.add_argument('--audio_in', type=str, default="./data/test")
parser.add_argument('--output_dir', type=str, default="./results/")
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--gpuid', type=str, default="0")
args = parser.parse_args()
modelscope_infer(args)

View File

@ -0,0 +1,95 @@
#!/usr/bin/env bash
# Multi-job ModelScope ASR decoding script (16k common model).
# stage 1: split wav.scp and decode each shard in parallel (GPU or CPU jobs)
# stage 2: compute WER/CER against ${data_dir}/text
# stage 3: SpeechIO TIOBE text normalization + scoring (enable with stage=3)
set -e
set -u
set -o pipefail
# ---- configuration ----
stage=1
stop_stage=2
model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true # whether to perform gpu decoding
gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
njob=4 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
# Derive the number of parallel jobs: one per GPU id, or njob CPU jobs.
# CPU decoding forces batch_size=1 and a "-1" device id for every job.
if ${gpu_inference}; then
nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
nj=$njob
batch_size=1
gpuid_list=""
for JOB in $(seq ${nj}); do
gpuid_list=$gpuid_list"-1,"
done
fi
# Split the input wav.scp into nj shards under ${output_dir}/split.
mkdir -p $output_dir/split
split_scps=""
for JOB in $(seq ${nj}); do
split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
echo "Decoding ..."
gpuid_list_array=(${gpuid_list//,/ })
# Launch one background decoding job per shard, then wait for all of them.
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
mkdir -p ${output_dir}/output.$JOB
python infer.py \
--model ${model} \
--audio_in ${output_dir}/split/wav.$JOB.scp \
--output_dir ${output_dir}/output.$JOB \
--batch_size ${batch_size} \
--gpuid ${gpuid}
}&
done
wait
# Merge the per-job outputs, sorted by utterance id.
mkdir -p ${output_dir}/1best_recog
for f in token score text; do
if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
for i in $(seq "${nj}"); do
cat "${output_dir}/output.${i}/1best_recog/${f}"
done | sort -k1 >"${output_dir}/1best_recog/${f}"
fi
done
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
echo "Computing WER ..."
# Normalize hypothesis and reference text, then score with compute_wer.py.
python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
tail -n 3 ${output_dir}/1best_recog/text.cer
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
echo "SpeechIO TIOBE textnorm"
echo "$0 --> Normalizing REF text ..."
./utils/textnorm_zh.py \
--has_key --to_upper \
${data_dir}/text \
${output_dir}/1best_recog/ref.txt
echo "$0 --> Normalizing HYP text ..."
./utils/textnorm_zh.py \
--has_key --to_upper \
${output_dir}/1best_recog/text.proc \
${output_dir}/1best_recog/rec.txt
# Drop hypotheses that normalized to empty before alignment.
grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
echo "$0 --> computing WER/CER and alignment ..."
./utils/error_rate_zh \
--tokenizer char \
--ref ${output_dir}/1best_recog/ref.txt \
--hyp ${output_dir}/1best_recog/rec_non_empty.txt \
${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
fi

View File

@ -0,0 +1 @@
../../../../egs/aishell/transformer/utils

View File

@ -6,8 +6,9 @@
- Modify finetune training related parameters in `finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
- <strong>batch_bins:</strong> # batch size
- <strong>data_dir:</strong> # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
- <strong>dataset_type:</strong> # for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
- <strong>batch_bins:</strong> # batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
- <strong>max_epoch:</strong> # number of training epoch
- <strong>lr:</strong> # learning rate
@ -20,11 +21,38 @@
Or you can use the finetuned model for inference directly.
- Setting parameters in `infer.py`
- <strong>data_dir:</strong> # the dataset dir
- Setting parameters in `infer.sh`
- <strong>model:</strong> # model name on ModelScope
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
- <strong>output_dir:</strong> # result dir
- <strong>batch_size:</strong> # batchsize of inference
- <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
- <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
- <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
- Then you can run the pipeline to infer with:
```python
python infer.py
sh infer.sh
```
- Results
The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
### Inference using local finetuned model
- Modify inference related parameters in `infer_after_finetune.py`
- <strong>modelscope_model_name: </strong> # model name on ModelScope
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
- <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- <strong>batch_size:</strong> # batchsize of inference
- Then you can run the pipeline to finetune with:
```python
python infer_after_finetune.py
```
- Results
The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.

View File

@ -1,101 +1,25 @@
import os
import shutil
from multiprocessing import Pool
import argparse
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from funasr.utils.compute_wer import compute_wer
def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
if ngpu > 0:
use_gpu = 1
gpu_id = int(idx) - 1
else:
use_gpu = 0
gpu_id = -1
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
else:
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
inference_pipline = pipeline(
def modelscope_infer(args):
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
model=model,
output_dir=output_dir_job,
batch_size=batch_size,
ngpu=use_gpu,
model=args.model,
output_dir=args.output_dir,
batch_size=args.batch_size,
)
audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
inference_pipline(audio_in=audio_in)
def modelscope_infer(params):
# prepare for multi-GPU decoding
ngpu = params["ngpu"]
njob = params["njob"]
batch_size = params["batch_size"]
output_dir = params["output_dir"]
model = params["model"]
if os.path.exists(output_dir):
shutil.rmtree(output_dir)
os.mkdir(output_dir)
split_dir = os.path.join(output_dir, "split")
os.mkdir(split_dir)
if ngpu > 0:
nj = ngpu
elif ngpu == 0:
nj = njob
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
with open(wav_scp_file) as f:
lines = f.readlines()
num_lines = len(lines)
num_job_lines = num_lines // nj
start = 0
for i in range(nj):
end = start + num_job_lines
file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
with open(file, "w") as f:
if i == nj - 1:
f.writelines(lines[start:])
else:
f.writelines(lines[start:end])
start = end
p = Pool(nj)
for i in range(nj):
p.apply_async(modelscope_infer_core,
args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
p.close()
p.join()
# combine decoding results
best_recog_path = os.path.join(output_dir, "1best_recog")
os.mkdir(best_recog_path)
files = ["text", "token", "score"]
for file in files:
with open(os.path.join(best_recog_path, file), "w") as f:
for i in range(nj):
job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
with open(job_file) as f_job:
lines = f_job.readlines()
f.writelines(lines)
# If text exists, compute CER
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(best_recog_path, "token")
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
inference_pipeline(audio_in=args.audio_in)
if __name__ == "__main__":
params = {}
params["model"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
params["data_dir"] = "./data/test"
params["output_dir"] = "./results"
params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
params["njob"] = 1 # if ngpu = 0, will use cpu decoding
params["batch_size"] = 64
modelscope_infer(params)
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1")
parser.add_argument('--audio_in', type=str, default="./data/test")
parser.add_argument('--output_dir', type=str, default="./results/")
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--gpuid', type=str, default="0")
args = parser.parse_args()
modelscope_infer(args)

View File

@ -0,0 +1,70 @@
#!/usr/bin/env bash
# Multi-job ModelScope ASR decoding script (8k common model).
# stage 1: split wav.scp and decode each shard in parallel (GPU or CPU jobs)
# stage 2: compute WER/CER against ${data_dir}/text
set -e
set -u
set -o pipefail
# ---- configuration ----
stage=1
stop_stage=2
model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true # whether to perform gpu decoding
gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
njob=4 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
# Derive the number of parallel jobs: one per GPU id, or njob CPU jobs.
# CPU decoding forces batch_size=1 and a "-1" device id for every job.
if ${gpu_inference}; then
nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
nj=$njob
batch_size=1
gpuid_list=""
for JOB in $(seq ${nj}); do
gpuid_list=$gpuid_list"-1,"
done
fi
# Split the input wav.scp into nj shards under ${output_dir}/split.
mkdir -p $output_dir/split
split_scps=""
for JOB in $(seq ${nj}); do
split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
echo "Decoding ..."
gpuid_list_array=(${gpuid_list//,/ })
# Launch one background decoding job per shard, then wait for all of them.
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
mkdir -p ${output_dir}/output.$JOB
python infer.py \
--model ${model} \
--audio_in ${output_dir}/split/wav.$JOB.scp \
--output_dir ${output_dir}/output.$JOB \
--batch_size ${batch_size} \
--gpuid ${gpuid}
}&
done
wait
# Merge the per-job outputs, sorted by utterance id.
mkdir -p ${output_dir}/1best_recog
for f in token score text; do
if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
for i in $(seq "${nj}"); do
cat "${output_dir}/output.${i}/1best_recog/${f}"
done | sort -k1 >"${output_dir}/1best_recog/${f}"
fi
done
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
echo "Computing WER ..."
# Normalize hypothesis and reference text, then score with compute_wer.py.
python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
tail -n 3 ${output_dir}/1best_recog/text.cer
fi

View File

@ -0,0 +1 @@
../../../../egs/aishell/transformer/utils

View File

@ -226,7 +226,7 @@ def inference_modelscope(
):
results = []
split_size = 10
cache_in = param_dict["cache"]
if raw_inputs != None:
line = raw_inputs.strip()
key = "demo"
@ -234,34 +234,12 @@ def inference_modelscope(
item = {'key': key, 'value': ""}
results.append(item)
return results
result, _, cache = text2punc(line, cache)
item = {'key': key, 'value': result, 'cache': cache}
result, _, cache = text2punc(line, cache_in)
param_dict["cache"] = cache
item = {'key': key, 'value': result}
results.append(item)
return results
for inference_text, _, _ in data_path_and_name_and_type:
with open(inference_text, "r", encoding="utf-8") as fin:
for line in fin:
line = line.strip()
segs = line.split("\t")
if len(segs) != 2:
continue
key = segs[0]
if len(segs[1]) == 0:
continue
result, _ = text2punc(segs[1])
item = {'key': key, 'value': result}
results.append(item)
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
if output_path != None:
output_file_name = "infer.out"
Path(output_path).mkdir(parents=True, exist_ok=True)
output_file_path = (Path(output_path) / output_file_name).absolute()
with open(output_file_path, "w", encoding="utf-8") as fout:
for item_i in results:
key_out = item_i["key"]
value_out = item_i["value"]
fout.write(f"{key_out}\t{value_out}\n")
return results
return _forward

View File

@ -30,14 +30,7 @@ from funasr.models.frontend.wav_frontend import WavFrontendOnline
from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.bin.vad_inference import Speech2VadSegment
header_colors = '\033[95m'
end_colors = '\033[0m'
global_asr_language: str = 'zh-cn'
global_sample_rate: Union[int, Dict[Any, int]] = {
'audio_fs': 16000,
'model_fs': 16000
}
class Speech2VadSegmentOnline(Speech2VadSegment):

View File

@ -14,7 +14,7 @@ from funasr.utils.types import str2bool
# torch_version = float(".".join(torch.__version__.split(".")[:2]))
# assert torch_version > 1.9
class ASRModelExportParaformer:
class ModelExport:
def __init__(
self,
cache_dir: Union[Path, str] = None,
@ -240,7 +240,7 @@ if __name__ == '__main__':
parser.add_argument('--calib_num', type=int, default=200, help='calib max num')
args = parser.parse_args()
export_model = ASRModelExportParaformer(
export_model = ModelExport(
cache_dir=args.export_dir,
onnx=args.type == 'onnx',
quant=args.quantize,

View File

@ -42,7 +42,7 @@ pip install --editable ./
导出onnx模型[详见](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)参考示例从modelscope中模型导出
```shell
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True
```
## Building Guidance for Linux/Unix

View File

@ -25,6 +25,16 @@ Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 3
## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
Number of Parameter: 220M
Storage size: 880MB
Storage size after int8-quant: 237MB
CER: 1.95%
CER after int8-quant: 1.95%
### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni
| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
@ -73,6 +83,16 @@ Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 3
## [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)
Number of Parameter: 68M
Storage size: 275MB
Storage size after int8-quant: 81MB
CER: 3.73%
CER after int8-quant: 3.78%
### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni
| concurrent-tasks | processing time(s) | RTF | Speedup Rate |

View File

@ -26,23 +26,28 @@ cd FunASR/funasr/runtime/python/grpc/
Step 1-2) Optional, Prepare server onnxruntime environment (on server).
Install [`rapid_paraformer`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime).
Install [`onnx_paraformer`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime).
- Build the rapid_paraformer `whl`
- Build the onnx_paraformer `whl`
```
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/onnxruntime/rapid_paraformer
python setup.py bdist_wheel
python setup.py build
python setup.py install
```
- Install the build `whl`
```
pip install dist/rapid_paraformer-0.0.1-py3-none-any.whl
```
[//]: # ()
[//]: # (- Install the build `whl`)
[//]: # (```)
[//]: # (pip install dist/rapid_paraformer-0.0.1-py3-none-any.whl)
[//]: # (```)
Export the model, more details ref to [export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime).
```
python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
```shell
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True
```
Step 2) Optional, generate protobuf file (run on server, the two generated pb files are both used for server and client).

View File

@ -0,0 +1,157 @@
import os
import numpy as np
import sys
def compute_wer(ref_file,
                hyp_file,
                cer_detail_file):
    """Score a hypothesis file against a reference file and write a CER report.

    Both inputs are Kaldi-style text files: one utterance per line in the form
    "<utt-key> <token> <token> ...".  For every hypothesis key that also exists
    in the reference, a per-utterance alignment line is appended to
    *cer_detail_file*, followed by the corpus-level %WER / %SER summary.
    """
    rst = {
        'Wrd': 0,              # total reference words scored
        'Corr': 0,             # correctly recognized words
        'Ins': 0,
        'Del': 0,
        'Sub': 0,
        'Snt': 0,              # utterances scored (present in both files)
        'Err': 0.0,            # word error rate, percent
        'S.Err': 0.0,          # sentence error rate, percent
        'wrong_words': 0,
        'wrong_sentences': 0
    }

    def _load(path):
        # Parse "<key> <tokens...>" lines; skip blank lines, which previously
        # crashed the loader with an IndexError on split()[0].
        table = {}
        with open(path, 'r') as reader:
            for line in reader:
                parts = line.strip().split()
                if not parts:
                    continue
                table[parts[0]] = parts[1:]
        return table

    hyp_dict = _load(hyp_file)
    ref_dict = _load(ref_file)

    # BUGFIX: the detail file was opened without ever being closed; use a
    # context manager so buffered output is flushed even on error.
    with open(cer_detail_file, 'w') as cer_detail_writer:
        for hyp_key in hyp_dict:
            if hyp_key in ref_dict:
                out_item = compute_wer_by_line(hyp_dict[hyp_key], ref_dict[hyp_key])
                rst['Wrd'] += out_item['nwords']
                rst['Corr'] += out_item['cor']
                rst['wrong_words'] += out_item['wrong']
                rst['Ins'] += out_item['ins']
                rst['Del'] += out_item['del']
                rst['Sub'] += out_item['sub']
                rst['Snt'] += 1
                if out_item['wrong'] > 0:
                    rst['wrong_sentences'] += 1
                cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
                cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
                cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')

        if rst['Wrd'] > 0:
            rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
        if rst['Snt'] > 0:
            rst['S.Err'] = round(rst['wrong_sentences'] * 100 / rst['Snt'], 2)

        cer_detail_writer.write('\n')
        cer_detail_writer.write("%WER " + str(rst['Err']) + " [ " + str(rst['wrong_words']) + " / " + str(rst['Wrd']) +
                                ", " + str(rst['Ins']) + " ins, " + str(rst['Del']) + " del, " + str(rst['Sub']) + " sub ]" + '\n')
        cer_detail_writer.write("%SER " + str(rst['S.Err']) + " [ " + str(rst['wrong_sentences']) + " / " + str(rst['Snt']) + " ]" + '\n')
        # BUGFIX: the summary previously reported len(hyp_dict) as the scored
        # count and mislabeled the remainder; the unscored utterances are
        # hypothesis keys that are missing from the *reference*.
        cer_detail_writer.write("Scored " + str(rst['Snt']) + " sentences, " +
                                str(len(hyp_dict) - rst['Snt']) + " not present in ref." + '\n')
def compute_wer_by_line(hyp,
                        ref):
    """Align one hypothesis against one reference (case-insensitive Levenshtein).

    Args:
        hyp: list of hypothesis tokens.
        ref: list of reference tokens.

    Returns:
        dict with keys 'nwords' (reference length), 'cor', 'wrong' (total edit
        distance), 'ins', 'del', 'sub'.
    """
    hyp = [w.lower() for w in hyp]
    ref = [w.lower() for w in ref]
    len_hyp = len(hyp)
    len_ref = len(ref)

    # cost_matrix[i][j]: edit distance between hyp[:i] and ref[:j].
    # ops_matrix[i][j]: operation taken to reach (i, j):
    #   0 = match, 1 = substitution, 2 = insertion, 3 = deletion.
    cost_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int16)
    ops_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int8)

    # BUGFIX: the first row/column of ops_matrix were left as 0 ("match"), so
    # the backtrace silently walked through them and under-counted leading
    # insertions/deletions (e.g. hyp=[] vs ref=['a'] reported del=0).
    for i in range(1, len_hyp + 1):
        cost_matrix[i][0] = i
        ops_matrix[i][0] = 2   # only insertions can reach (i, 0)
    for j in range(1, len_ref + 1):
        cost_matrix[0][j] = j
        ops_matrix[0][j] = 3   # only deletions can reach (0, j)

    for i in range(1, len_hyp + 1):
        for j in range(1, len_ref + 1):
            if hyp[i - 1] == ref[j - 1]:
                cost_matrix[i][j] = cost_matrix[i - 1][j - 1]
            else:
                substitution = cost_matrix[i - 1][j - 1] + 1
                insertion = cost_matrix[i - 1][j] + 1
                deletion = cost_matrix[i][j - 1] + 1
                compare_val = [substitution, insertion, deletion]
                min_val = min(compare_val)
                # index 0/1/2 maps to op code 1/2/3
                cost_matrix[i][j] = min_val
                ops_matrix[i][j] = compare_val.index(min_val) + 1

    rst = {
        'nwords': len_ref,
        'cor': 0,
        'wrong': 0,
        'ins': 0,
        'del': 0,
        'sub': 0
    }

    # Backtrace from (len_hyp, len_ref) to (0, 0), counting each operation.
    i, j = len_hyp, len_ref
    while i > 0 or j > 0:
        op = ops_matrix[i][j]
        if op == 0:       # match
            rst['cor'] += 1
            i -= 1
            j -= 1
        elif op == 1:     # substitution
            rst['sub'] += 1
            i -= 1
            j -= 1
        elif op == 2:     # insertion
            rst['ins'] += 1
            i -= 1
        else:             # deletion
            rst['del'] += 1
            j -= 1

    # Total errors come straight from the DP table (always consistent).
    rst['wrong'] = int(cost_matrix[len_hyp][len_ref])
    return rst
def print_cer_detail(rst):
    """Format one utterance's alignment counts for the detail log.

    Args:
        rst: dict produced by compute_wer_by_line.

    Returns:
        "(nwords=..,cor=..,ins=..,del=..,sub=..) corr:XX.XX%,cer:YY.YY%".

    BUGFIX: guard against an empty reference (nwords == 0), which previously
    raised ZeroDivisionError; rates are reported as 0.00% in that case.
    """
    nwords = rst['nwords']
    corr_rate = rst['cor'] / nwords if nwords else 0.0
    err_rate = rst['wrong'] / nwords if nwords else 0.0
    return ("(" + "nwords=" + str(rst['nwords']) + ",cor=" + str(rst['cor'])
            + ",ins=" + str(rst['ins']) + ",del=" + str(rst['del']) + ",sub="
            + str(rst['sub']) + ") corr:" + '{:.2%}'.format(corr_rate)
            + ",cer:" + '{:.2%}'.format(err_rate))
if __name__ == '__main__':
    # CLI entry point: compute-wer.py <ref-file> <hyp-file> <detail-output>
    if len(sys.argv) != 4:
        print("usage : python compute-wer.py test.ref test.hyp test.wer")
        sys.exit(0)
    ref_file, hyp_file, cer_detail_file = sys.argv[1:4]
    compute_wer(ref_file, hyp_file, cer_detail_file)

View File

@ -0,0 +1,48 @@
"""Offline inference driver for an exported Paraformer model (onnx or libtorch).

Reads a Kaldi-style wav.scp ("<utt-name> <wav-path>" per line) and writes
"text" and "token" result files into --output_dir.
"""
import os
import time
import sys
import librosa
from funasr.utils.types import str2bool

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_dir', type=str, required=True)
parser.add_argument('--backend', type=str, default='onnx', help='["onnx", "torch"]')
# BUGFIX: the help texts for --wav_file/--output_dir were copy-pasted
# ("amp fallback number") and did not describe the arguments.
parser.add_argument('--wav_file', type=str, default=None, help='wav.scp file: "<name> <path>" per line')
parser.add_argument('--quantize', type=str2bool, default=False, help='quantized model')
parser.add_argument('--intra_op_num_threads', type=int, default=1, help='intra_op_num_threads for onnx')
parser.add_argument('--output_dir', type=str, default=None, help='directory for text/token result files')
args = parser.parse_args()

# BUGFIX: the libtorch backend was imported unconditionally, so the script
# crashed in onnx-only environments before the backend check ever ran.
if args.backend == "onnx":
    from funasr.runtime.python.onnxruntime.rapid_paraformer import Paraformer
else:
    from funasr.runtime.python.libtorch.torch_paraformer import Paraformer

model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads)

# BUGFIX: file handles were opened but never closed; use context managers.
with open(args.wav_file, 'r') as wav_file_f:
    wav_files = wav_file_f.readlines()

output_dir = args.output_dir
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Force the platform-appropriate newline so downstream tools read the files.
if os.name == 'nt':  # Windows
    newline = '\r\n'
else:  # Linux / Mac
    newline = '\n'

with open(os.path.join(output_dir, "text"), "w", newline=newline) as text_f, \
        open(os.path.join(output_dir, "token"), "w", newline=newline) as token_f:
    for wav_path_i in wav_files:
        wav_name, wav_path = wav_path_i.strip().split()
        result = model(wav_path)
        # Per the original field usage: preds[0] is the recognized text,
        # preds[1] the token sequence.
        text_f.write("{} {}\n".format(wav_name, result[0]['preds'][0]))
        text_f.flush()
        token_f.write("{} {}\n".format(wav_name, result[0]['preds'][1]))
        token_f.flush()

View File

@ -0,0 +1,74 @@
# Benchmark an exported (onnx / libtorch) Paraformer model on AISHELL-1 test:
# export the model, decode in parallel with per-core CPU pinning, score CER.

# helper tools
split_scps_tool=split_scp.pl
inference_tool=infer.py
proce_text_tool=proce_text.py
compute_wer_tool=compute_wer.py

nj=32 # number of parallel decoding jobs (one CPU core each)
stage=0
stop_stage=2

# test data and export locations
scp="/nfs/haoneng.lhn/funasr_data/aishell-1/data/test/wav.scp"
label_text="/nfs/haoneng.lhn/funasr_data/aishell-1/data/test/text"
export_root="/nfs/zhifu.gzf/export"

#:<<!
model_name="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
backend="onnx" # "torch"
quantize='true' # 'False'
fallback_op_num_torch=20
tag=${model_name}/${backend}_quantize_${quantize}_${fallback_op_num_torch}
!
# NOTE(review): the ':<<!' here-doc comment trick above is itself commented out
# by the leading '#', so the bare '!' runs as a command (a no-op that sets $?
# to 1). Confirm whether this block was meant to be toggleable.

output_dir=${export_root}/logs/${tag}/split$nj
mkdir -p ${output_dir}
echo ${output_dir}

# Stage 0: export the model from modelscope into ${export_root}.
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    python -m funasr.export.export_model --model-name ${model_name} --export-dir ${export_root} --type ${backend} --quantize ${quantize} --audio_in ${scp} --fallback-num ${fallback_op_num_torch}
fi

# Stage 1: split wav.scp into nj shards and decode each shard pinned to its
# own core via taskset; then merge per-job outputs sorted by utterance key.
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
    model_dir=${export_root}/${model_name}
    split_scps=""
    for JOB in $(seq ${nj}); do
        split_scps="$split_scps $output_dir/wav.$JOB.scp"
    done
    perl ${split_scps_tool} $scp ${split_scps}
    for JOB in $(seq ${nj}); do
        {
        core_id=`expr $JOB - 1`
        taskset -c ${core_id} python ${inference_tool} --backend ${backend} --model_dir ${model_dir} --wav_file ${output_dir}/wav.$JOB.scp --quantize ${quantize} --output_dir ${output_dir}/${JOB} &> ${output_dir}/log.$JOB.txt
        }&
    done
    wait

    # Merge only output types the first job actually produced.
    mkdir -p ${output_dir}/1best_recog
    for f in token text; do
        if [ -f "${output_dir}/1/${f}" ]; then
            for JOB in $(seq "${nj}"); do
                cat "${output_dir}/${JOB}/${f}"
            done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

# Stage 2: normalize hypothesis/reference text and compute CER.
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
    python ${proce_text_tool} ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
    python ${proce_text_tool} ${label_text} ${output_dir}/1best_recog/text.ref
    python ${compute_wer_tool} ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi

View File

@ -0,0 +1,31 @@
"""Normalize a Kaldi-style text file for CER scoring.

Strips sentence/BPE markers and spaces from each utterance, lower-cases it,
and rewrites the text as space-separated single characters:
"<utt-id> <c1> <c2> ...".
"""
import sys
import re

input_path = sys.argv[1]
output_path = sys.argv[2]

# Literal markers removed from the text, in this order ("@@" before "@").
_REMOVE_PATTERNS = ("</s>", "<s>", "@@", "@", "<unk>", " ")

# Read everything first so the script is safe even if input == output.
with open(input_path, "r", encoding="utf-8") as reader:
    all_lines = reader.readlines()

with open(output_path, "w", encoding="utf-8") as writer:
    for raw_line in all_lines:
        fields = raw_line.strip().split(" ", 1)
        if len(fields) == 2:
            utt_id, content = fields
            for pattern in _REMOVE_PATTERNS:
                content = re.sub(pattern, "", content)
            content = content.lower()
        else:
            # No transcript after the key: keep a single-space placeholder.
            utt_id = fields[0]
            content = " "
        # Split into characters separated by single spaces.
        writer.write("{} {}\n".format(utt_id, " ".join(content)))

View File

@ -0,0 +1,95 @@
"""WebSocket ASR demo client: captures microphone audio with pyaudio and
streams raw PCM chunks to the ASR server.

NOTE(review): ws_client() is run on the default event loop via
run_until_complete(), but send() is later awaited inside asyncio.run(main()),
which creates a *new* loop. Confirm the installed websockets version tolerates
using a connection across loops.
"""
import pyaudio
# import websocket  # unlike the server, this would be the websocket-client library
import time
import websockets
import asyncio
from queue import Queue
# import threading
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--host",
                    type=str,
                    default="localhost",
                    required=False,
                    help="host ip, localhost, 0.0.0.0")
parser.add_argument("--port",
                    type=int,
                    default=10095,
                    required=False,
                    help="grpc server port")
parser.add_argument("--chunk_size",
                    type=int,
                    default=300,
                    help="ms")
args = parser.parse_args()

# Queue of captured audio chunks awaiting transmission.
voices = Queue()

async def ws_client():
    """Open the websocket connection and store it in the global `ws`."""
    global ws  # global variable holding the websocket connection object
    # uri = "ws://11.167.134.197:8899"
    uri = "ws://{}:{}".format(args.host, args.port)
    ws = await websockets.connect(uri, subprotocols=["binary"])  # long-lived connection
    ws.max_size = 1024 * 1024 * 20
    print("connected ws server")

async def send(data):
    """Send one binary chunk over the shared websocket connection."""
    global ws  # use the global ws
    try:
        await ws.send(data)  # transmit the data through the ws object
    except Exception as e:
        print('Exception occurred:', e)

asyncio.get_event_loop().run_until_complete(ws_client())  # start the coroutine
# other functions can send data by calling send(data), e.g.:

async def test():
    """Capture microphone audio forever, pushing chunk_size-ms chunks to `voices`."""
    #print("2")
    global voices
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    # samples per chunk: RATE/1000 samples per ms times chunk_size ms
    CHUNK = int(RATE / 1000 * args.chunk_size)
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    while True:
        data = stream.read(CHUNK)
        voices.put(data)
        #print(voices.qsize())
        await asyncio.sleep(0.01)

async def ws_send():
    """Drain `voices` and forward each chunk to the server."""
    global voices
    print("started to sending data!")
    while True:
        while not voices.empty():
            data = voices.get()
            voices.task_done()
            await send(data)
            await asyncio.sleep(0.01)
        await asyncio.sleep(0.01)

async def main():
    task = asyncio.create_task(test())      # create a background task
    task2 = asyncio.create_task(ws_send())  # create a background task
    await asyncio.gather(task, task2)

asyncio.run(main())

View File

@ -0,0 +1,191 @@
"""WebSocket ASR server setup: CLI arguments and modelscope pipelines for
streaming VAD, offline ASR, and realtime punctuation restoration."""
import asyncio
import websockets
import time
from queue import Queue
import threading
import argparse
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
import logging

# Silence modelscope logging; only server prints remain.
logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)

parser = argparse.ArgumentParser()
parser.add_argument("--host",
                    type=str,
                    default="0.0.0.0",
                    required=False,
                    help="host ip, localhost, 0.0.0.0")
parser.add_argument("--port",
                    type=int,
                    default=10095,
                    required=False,
                    help="grpc server port")
parser.add_argument("--asr_model",
                    type=str,
                    default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                    help="model from modelscope")
parser.add_argument("--vad_model",
                    type=str,
                    default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                    help="model from modelscope")
parser.add_argument("--punc_model",
                    type=str,
                    default="damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727",
                    help="model from modelscope")
parser.add_argument("--ngpu",
                    type=int,
                    default=1,
                    help="0 for cpu, 1 for gpu")
args = parser.parse_args()

print("model loading")
# voices: raw audio chunks received from clients.
# speek: VAD-segmented utterances queued for ASR.
voices = Queue()
speek = Queue()

# vad (streaming mode; param_dict_vad carries its rolling cache between calls)
inference_pipeline_vad = pipeline(
    task=Tasks.voice_activity_detection,
    model=args.vad_model,
    model_revision=None,
    output_dir=None,
    batch_size=1,
    mode='online',
    ngpu=args.ngpu,
)
param_dict_vad = {'in_cache': dict(), "is_final": False}

# asr
param_dict_asr = {}
# param_dict["hotword"] = "小五 小五月"  # set hotwords, separated by spaces
inference_pipeline_asr = pipeline(
    task=Tasks.auto_speech_recognition,
    model=args.asr_model,
    param_dict=param_dict_asr,
    ngpu=args.ngpu,
)

# punctuation restoration (realtime variant; cache persists across calls)
param_dict_punc = {'cache': list()}
inference_pipeline_punc = pipeline(
    task=Tasks.punctuation,
    model=args.punc_model,
    model_revision=None,
    ngpu=args.ngpu,
)
print("model loaded")
async def ws_serve(websocket, path):
    """Receive binary audio messages from one client and enqueue them on `voices`."""
    global voices
    try:
        async for message in websocket:
            voices.put(message)
            #print("put")
    except websockets.exceptions.ConnectionClosedError as e:
        print('Connection closed with exception:', e)
    except Exception as e:
        print('Exception occurred:', e)

start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)

def vad(data):  # inference
    """Run streaming VAD on one audio chunk.

    Returns (speech_start, speech_end) flags derived from the segment
    boundaries reported by the VAD pipeline (-1 means "boundary not seen yet").
    """
    global vad_pipline, param_dict_vad
    # NOTE(review): 'vad_pipline' is declared global but never defined or used;
    # the pipeline actually invoked is inference_pipeline_vad — confirm and clean up.
    #print(type(data))
    # print(param_dict_vad)
    segments_result = inference_pipeline_vad(audio_in=data, param_dict=param_dict_vad)
    # print(segments_result)
    # print(param_dict_vad)
    speech_start = False
    speech_end = False
    # No segment yet, or more than one segment in a single chunk: report nothing.
    if len(segments_result) == 0 or len(segments_result["text"]) > 1:
        return speech_start, speech_end
    if segments_result["text"][0][0] != -1:
        speech_start = True
    if segments_result["text"][0][1] != -1:
        speech_end = True
    return speech_start, speech_end

def asr():  # inference
    """Worker thread: drain utterances from `speek`, run ASR then punctuation."""
    global inference_pipeline2
    global speek
    while True:
        while not speek.empty():
            audio_in = speek.get()
            speek.task_done()
            rec_result = inference_pipeline_asr(audio_in=audio_in)
            rec_result_punc = inference_pipeline_punc(text_in=rec_result['text'], param_dict=param_dict_punc)
            print(rec_result_punc)
            time.sleep(0.1)
        time.sleep(0.1)

def main():  # inference
    """Worker thread: segment the incoming chunk stream with VAD and hand
    complete utterances to the ASR queue."""
    frames = []  # all frames of the current utterance
    buffer = []  # cached frames (at most two chunks)
    # silence_count = 0  # count of consecutive silent chunks
    # speech_detected = False  # whether speech has been detected
    RECORD_NUM = 0
    global voices
    global speek
    speech_start, speech_end = False, False
    while True:
        while not voices.empty():
            data = voices.get()
            #print("queue backlog", voices.qsize())
            voices.task_done()
            buffer.append(data)
            if len(buffer) > 2:
                buffer.pop(0)  # drop the oldest chunk once more than two are cached
            if speech_start:
                frames.append(data)
                RECORD_NUM += 1
            speech_start_i, speech_end_i = vad(data)
            # print(speech_start_i, speech_end_i)
            if speech_start_i:
                speech_start = speech_start_i
                # if not speech_detected:
                # print("speech detected ...")
                # speech_detected = True  # mark that speech was detected
                frames = []
                frames.extend(buffer)  # seed with the two previously cached chunks
                # silence_count = 0  # reset the silence counter
            if speech_end_i or RECORD_NUM > 300:
                # silence_count += 1  # increment the silence counter
                # speech_end = speech_end_i
                speech_start = False
                # if RECORD_NUM > 300:  # 300 here can be tuned to a suitable chunk count
                # print("speech ended or exceeded the configured maximum duration ...")
                audio_in = b"".join(frames)
                #asrt = threading.Thread(target=asr,args=(audio_in,))
                #asrt.start()
                speek.put(audio_in)
                #rec_result = inference_pipeline2(audio_in=audio_in)  # run it through the ASR model
                frames = []  # clear all frame data
                buffer = []  # clear the cached frames (at most two chunks)
                # silence_count = 0  # reset the consecutive-silence counter
                # speech_detected = False  # reset the speech-detected flag
                RECORD_NUM = 0
            time.sleep(0.01)
        time.sleep(0.01)

s = threading.Thread(target=main)
s.start()
s = threading.Thread(target=asr)
s.start()
asyncio.get_event_loop().run_until_complete(start_server)
asyncio.get_event_loop().run_forever()

View File

@ -0,0 +1,46 @@
# Using funasr with websocket
We can send streaming audio data to the server in real time with the websocket client (e.g., every 300 ms) and receive the transcribed text once the speaker stops.
The audio data is streamed to the server, while the ASR inference itself runs in offline (non-streaming) mode.
# Steps
## For the Server
Install the modelscope and funasr
```shell
pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
git clone https://github.com/alibaba/FunASR.git && cd FunASR
pip install --editable ./
```
Install the requirements for server
```shell
cd funasr/runtime/python/websocket
pip install -r requirements_server.txt
```
Start server
```shell
python ASR_server.py --host "0.0.0.0" --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
```
## For the client
Install the requirements for client
```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/websocket
pip install -r requirements_client.txt
```
Start client
```shell
python ASR_client.py --host "127.0.0.1" --port 10095 --chunk_size 300
```
## Acknowledge
1. We acknowledge [cgisky1980](https://github.com/cgisky1980/FunASR) for contributing the websocket service.

View File

@ -0,0 +1,2 @@
websockets
pyaudio

View File

@ -0,0 +1 @@
websockets

View File

@ -1193,12 +1193,18 @@ class AbsTask(ABC):
# logging.basicConfig() is invoked in main_worker() instead of main()
# because it can be invoked only once in a process.
# FIXME(kamo): Should we use logging.getLogger()?
# BUGFIX: Remove previous handlers and reset log level
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
logging.basicConfig(
level=args.log_level,
format=f"[{os.uname()[1].split('.')[0]}]"
f" %(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
else:
# BUGFIX: Remove previous handlers and reset log level
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
# Suppress logging if RANK != 0
logging.basicConfig(
level="ERROR",