diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 901446797..cfb94f367 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,23 +11,23 @@ jobs: docs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 - - uses: ammaraskar/sphinx-action@master - with: - docs-folder: "docs/" - pre-build-command: "pip install sphinx-markdown-tables nbsphinx jinja2 recommonmark sphinx_rtd_theme" + - uses: actions/checkout@v1 + - uses: ammaraskar/sphinx-action@master + with: + docs-folder: "docs_cn/" + pre-build-command: "pip install sphinx-markdown-tables nbsphinx jinja2 recommonmark sphinx_rtd_theme" - - name: deploy copy - if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev' - run: | - mkdir public - touch public/.nojekyll - cp -r docs/_build/html/* public/ + - name: deploy copy + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev' + run: | + mkdir public + touch public/.nojekyll + cp -r docs_cn/_build/html/* public/ - - name: deploy github.io pages - if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev' - uses: peaceiris/actions-gh-pages@v2.3.1 - env: - GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }} - PUBLISH_BRANCH: gh-pages - PUBLISH_DIR: public \ No newline at end of file + - name: deploy github.io pages + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev' + uses: peaceiris/actions-gh-pages@v2.3.1 + env: + GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }} + PUBLISH_BRANCH: gh-pages + PUBLISH_DIR: public \ No newline at end of file diff --git a/README.md b/README.md index b70962ca1..79ad962d7 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # FunASR: A Fundamental End-to-End Speech Recognition Toolkit -FunASR hopes to build a bridge between academic research and industrial applications on speech recognition. By supporting the training & finetuning of the industrial-grade speech recognition model released on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition), researchers and developers can conduct research and production of speech recognition models more conveniently, and promote the development of speech recognition ecology. ASR for Fun![Model Zoo](docs/modelscope_models.md) +FunASR hopes to build a bridge between academic research and industrial applications on speech recognition. By supporting the training & finetuning of the industrial-grade speech recognition model released on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition), researchers and developers can conduct research and production of speech recognition models more conveniently, and promote the development of speech recognition ecology. ASR for Fun! ## Release Notes: ### 2023.1.16, funasr-0.1.6 @@ -31,6 +31,9 @@ pip install --editable ./ ``` For more details, please ref to [installation](https://github.com/alibaba-damo-academy/FunASR/wiki#%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85) +## Usage +For users who are new to FunASR and ModelScope, please refer to [FunASR Docs](https://alibaba-damo-academy.github.io/FunASR/index.html). + ## Contact If you have any questions about FunASR, please contact us by diff --git a/docs_cn/Makefile b/docs_cn/Makefile new file mode 100644 index 000000000..d58379b8b --- /dev/null +++ b/docs_cn/Makefile @@ -0,0 +1,21 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. 
+SPHINXOPTS = 
+SPHINXBUILD = sphinx-build 
+SPHINXPROJ = FunASR 
+SOURCEDIR = . 
+BUILDDIR = _build 
+
+# Put it first so that "make" without argument is like "make help". 
+help: 
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 
+
+.PHONY: help Makefile 
+
+# Catch-all target: route all unknown targets to Sphinx using the new 
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile 
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs_cn/conf.py b/docs_cn/conf.py
new file mode 100644
index 000000000..01899914f
--- /dev/null
+++ b/docs_cn/conf.py
@@ -0,0 +1,67 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'FunASR'
+copyright = '2022, Speech Lab, Alibaba Group'
+author = 'Speech Lab, Alibaba Group'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    "nbsphinx",
+    "sphinx.ext.autodoc",
+    'sphinx.ext.napoleon',
+    'sphinx.ext.viewcode',
+    "sphinx.ext.mathjax",
+    "sphinx.ext.todo",
+    # "sphinxarg.ext",
+    "sphinx_markdown_tables",
+    'recommonmark',
+    'sphinx_rtd_theme',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+source_suffix = [".rst", ".md"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = "sphinx"
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+
+html_theme = "sphinx_rtd_theme"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
\ No newline at end of file
diff --git a/docs_cn/get_started.md b/docs_cn/get_started.md
new file mode 100644
index 000000000..6e077e022
--- /dev/null
+++ b/docs_cn/get_started.md
@@ -0,0 +1,130 @@
+# 快速开始
+在此我们将以"使用AISHELL-1数据集,从随机初始化训练一个paraformer模型"为例,介绍如何使用FunASR。根据这个例子,用户可以类似地使用别的数据集(如AISHELL-2数据集等)训练别的模型(如conformer,transformer等)。
+
+## 整体介绍
+
+我们提供了`egs/aishell/paraformer/run.sh`来实现使用AISHELL-1数据集训练一个paraformer模型。该脚本包含5个阶段,覆盖从数据处理到训练解码的整个流程,同时提供了单/多GPU训练和CPU/GPU解码。在详细介绍每个阶段之前,我们先对用户需要手动设置的一些参数进行说明。
+- `CUDA_VISIBLE_DEVICES`: 可用的GPU列表
+- `gpu_num`: 用于训练的GPU数量
+- `gpu_inference`: 是否使用GPU进行解码
+- `njob`: 对于CPU解码,表示总的解码任务数;对于GPU解码,表示每块GPU上的解码任务数
+- `data_aishell`: AISHELL-1原始数据的路径
+- `feats_dir`: 经过处理得到的特征的保存路径
+- `nj`: 数据处理时的并行任务数
+- `speed_perturb`: 变速设置
+- `exp_dir`: 实验结果的保存路径
+- `tag`: 实验结果目录的后缀名
+
+## 阶段 0: 数据准备
+本阶段用于处理原始的AISHELL-1数据,并生成相应的`wav.scp`和`text`,保存在`$feats_dir/data/xxx`目录下,这里的`xxx`表示`train`, `dev` 或 `test`(下同)。这里我们假设用户已经下载好了AISHELL-1数据集。如果没有,用户可以在[这里](https://www.openslr.org/33/) 下载数据,并将`$data_aishell`设置为相应的路径。下面给出生成的`wav.scp`和`text`的示例:
+* `wav.scp`
+```
+BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
+BAC009S0002W0123 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0123.wav
+BAC009S0002W0124 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0124.wav
+...
+```
+* `text`
+```
+BAC009S0002W0122 而 对 楼 市 成 交 抑 制 作 用 最 大 的 限 购
+BAC009S0002W0123 也 成 为 地 方 政 府 的 眼 中 钉
+BAC009S0002W0124 自 六 月 底 呼 和 浩 特 市 率 先 宣 布 取 消 限 购 后
+...
+```
+可以看到,这两个文件均包括两列,第一列是音频的id,第二列分别是音频路径和音频对应的抄本。
+
+## 阶段 1:特征提取
+本阶段将会基于原始的音频`wav.scp`提取FBank特征。如果指定了参数`speed_perturb`,则会额外对音频进行变速来实现数据增强。用户可以设置`nj`参数来控制特征提取的并行任务数。处理后的特征保存在目录`$feats_dir/dump/xxx/ark`下,相应的`feats.scp`文件路径为`$feats_dir/dump/xxx/feats.scp`。下面给出`feats.scp`的示例:
+* `feats.scp`
+```
+...
+BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
+...
+```
+注意,该文件的样本顺序已经进行了随机打乱。该文件包括两列,第一列是音频的id,第二列是对应的kaldi-ark格式的特征。另外,在此阶段还会生成训练需要用到的`speech_shape`和`text_shape`两个文件,分别记录了每个样本的特征维度和抄本长度。下面给出这两个文件的示例:
+* `speech_shape`
+```
+...
+BAC009S0002W0122_sp0.9 665,80
+...
+```
+* `text_shape`
+```
+...
+BAC009S0002W0122_sp0.9 15
+...
+```
+可以看到,这两个文件均包括两列,第一列是音频的id,第二列是对应的特征的维度和抄本的长度。
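+
+下面给出一个简单的校验示例(仅作演示,并非`run.sh`的一部分;假设环境中已安装`kaldiio`,文件路径请替换为实际生成的路径):读取`feats.scp`中的一条特征,并确认其形状与`speech_shape`中记录的一致。
+```python
+import kaldiio
+
+# feats.scp 的每一行为 "音频id ark路径:偏移量",kaldiio 可以直接按该格式懒加载
+feats = kaldiio.load_scp("dump/train/feats.scp")  # 示意路径,请替换为 $feats_dir/dump/train/feats.scp
+utt_id = "BAC009S0002W0122_sp0.9"
+mat = feats[utt_id]          # numpy 数组,形状为 (帧数, 80)
+print(utt_id, mat.shape)     # 应与 speech_shape 中记录的 "665,80" 一致
+```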
+
+## 阶段 2:字典准备
+本阶段用于生成字典,用作训练过程中字符到整数索引之间的映射。生成的字典文件的路径为`$feats_dir/data/zh_token_list/char/tokens.txt`。下面给出`tokens.txt`的示例:
+* `tokens.txt`
+```
+<blank>
+<s>
+</s>
+一
+丁
+...
+龚
+龟
+<unk>
+```
+* `<blank>`: 表示CTC训练中的blank
+* `<s>`: 表示句子的起始符
+* `</s>`: 表示句子的终止符
+* `<unk>`: 表示字典外的字符
+
+## 阶段 3:训练
+本阶段对应模型的训练。在开始训练之前,需要指定实验结果保存目录`exp_dir`、训练可用的GPU列表`CUDA_VISIBLE_DEVICES`和训练使用的GPU数量`gpu_num`。默认情况下,性能最好的`$keep_nbest_models`个模型的参数会被平均,从而获得更好的性能。
+
+* DDP Training
+
+我们提供了分布式训练(DDP)功能,具体的细节可以在[这里](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) 找到。为了开启分布式训练,需要设置`gpu_num`大于1。例如,设置`CUDA_VISIBLE_DEVICES=0,1,5,6,7`,`gpu_num=3`,则编号为0,1和5的GPU会被用于训练。
+
+* DataLoader
+
+我们提供了基于[Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) 实现的大数据DataLoader,用户可以通过设置`dataset_type=large`来启用。
+
+* Configuration
+
+训练相关的参数,包括模型、优化器、数据等,均可以通过`conf`目录下的config文件指定。同时,用户也可以直接在`run.sh`脚本中指定相关参数。请避免在config文件和`run.sh`脚本中设置相同的参数,以免造成歧义。
+
+* Training Steps
+
+我们提供了两种方式来控制训练的总步数,对应的参数分别为`max_epoch`和`max_update`。`max_epoch`表示训练的最大epoch数,`max_update`表示训练的最大迭代次数。如果这两个参数同时被指定,则训练步数一旦达到其中任意一个,训练即结束。
+
+* Tensorboard
+
+用户可以通过tensorboard来观察训练过程中的损失、学习率等。可以通过下述指令来实现:
+```
+tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
+```
+
+## 阶段 4: 解码
+本阶段用于解码得到识别结果,同时计算CER来验证训练得到的模型性能。
+
+* Mode Selection
+由于我们提供了paraformer,uniasr和conformer等模型,因此在解码时,需要指定相应的解码模式。对应的参数为`mode`,相应的可选设置为`asr/paraformer/uniasr`等。
+
+* Configuration
+
+我们提供了ctc解码、attention解码和ctc-attention混合解码。这几种解码方式可以通过`conf`下的解码配置文件中的`ctc_weight`参数来指定。具体的,`ctc_weight=1.0`表示CTC解码,`ctc_weight=0.0`表示attention解码,`0.0<ctc_weight<1.0`表示ctc-attention混合解码。
diff --git a/docs_cn/index.rst b/docs_cn/index.rst
--- /dev/null
+++ b/docs_cn/index.rst
+FunASR希望在学术研究和工业应用的语音识别之间架起一座桥梁。通过在 `ModelScope <https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition>`_ 上发布工业级语音识别模型以及支持相关的训练和微调,研究者和开发者们可以更方便地进行语音识别模型的研究和生产,促进语音识别生态的发展。ASR for Fun!
+
+.. toctree::
+   :maxdepth: 1
+   :caption: 教程:
+
+   ./installation.md
+   ./papers.md
+   ./get_started.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: ModelScope:
+
+   ./modelscope_models.md
+   ./modelscope_usages.md
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/docs_cn/installation.md b/docs_cn/installation.md
new file mode 100755
index 000000000..fc747808e
--- /dev/null
+++ b/docs_cn/installation.md
@@ -0,0 +1,36 @@
+# 安装
+FunASR的安装十分便捷,下面将给出详细的安装步骤:
+
+- 安装Conda并创建虚拟环境
+``` sh
+wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+sh Miniconda3-latest-Linux-x86_64.sh
+conda create -n funasr python=3.7
+conda activate funasr
+```
+
+- 安装Pytorch (版本 >= 1.7.0):
+
+```sh
+pip install torch torchvision torchaudio
+```
+
+关于更多的版本, 请参照 [https://pytorch.org/get-started/locally](https://pytorch.org/get-started/locally)
+
+- 安装 ModelScope
+
+对于国内用户,可以通过配置下述镜像源来加快下载速度
+```sh
+pip config set global.index-url https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+
+安装或更新ModelScope
+``` sh
+pip install "modelscope[audio]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+```
+
+- 下载FunASR仓库,并安装剩余所需依赖
+``` sh
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+pip install --editable ./
+```
\ No newline at end of file
diff --git a/docs_cn/make.bat b/docs_cn/make.bat
new file mode 100644
index 000000000..747ffb7b3
--- /dev/null
+++ b/docs_cn/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs_cn/modelscope_models.md b/docs_cn/modelscope_models.md new file mode 100644 index 000000000..8501c1ffa --- /dev/null +++ b/docs_cn/modelscope_models.md @@ -0,0 +1,34 @@ +# ModelScope上的预训练模型 + +## 模型许可证 +- Apache License 2.0 + +## 模型库 +这里我们提供了一些基于不同数据集训练得到的几种预训练模型,所有的预训练模型和更多细节可以参见 [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) 。 + +| Datasets | Hours | Model | Online/Offline | Language | Framework | Checkpoint | +|:-----:|:-----:|:--------------:|:--------------:| :---: | :---: | --- | +| Alibaba Speech Data | 60000 | Paraformer | Offline | CN | Pytorch |[speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) | +| Alibaba Speech Data | 50000 | Paraformer | Offline | CN | Tensorflow |[speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary) | +| Alibaba Speech Data | 50000 | Paraformer | Offline | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary) | +| Alibaba Speech Data | 50000 | Paraformer | Online | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online](http://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online/summary) | +| Alibaba Speech Data | 50000 | UniASR | Online | CN | Tensorflow |[speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/summary) | +| Alibaba Speech Data | 50000 | UniASR | Offline | CN | Tensorflow |[speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary) | +| Alibaba Speech Data | 50000 | UniASR | Online | CN&EN | Tensorflow |[speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online/summary) | +| Alibaba Speech Data | 50000 | UniASR | Offline | CN&EN | Tensorflow |[speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline/summary) | +| Alibaba Speech Data | 20000 | UniASR | Online | CN-Accent | Tensorflow |[speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online/summary) | +| Alibaba Speech Data | 20000 | UniASR | Offline | CN-Accent | Tensorflow |[speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/summary) | +| Alibaba Speech Data | 30000 | Paraformer-8K | Online | CN | Tensorflow 
|[speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online/summary) | +| Alibaba Speech Data | 30000 | Paraformer-8K | Offline | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/summary) | +| Alibaba Speech Data | 30000 | Paraformer-8K | Online | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary) | +| Alibaba Speech Data | 30000 | Paraformer-8K | Offline | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/summary) | +| Alibaba Speech Data | 30000 | UniASR-8K | Online | CN | Tensorflow |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online/summary) | +| Alibaba Speech Data | 30000 | UniASR-8K | Offline | CN | Tensorflow |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/summary) | +| Alibaba Speech Data | 30000 | UniASR-8K | Online | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary) | +| Alibaba Speech Data | 30000 | UniASR-8K | Offline | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/summary) | +| AISHELL-1 | 178 | Paraformer | Offline | CN | Pytorch | [speech_paraformer_asr_nat-aishell1-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell1-pytorch/summary) | +| AISHELL-2 | 1000 | Paraformer | Offline | CN | Pytorch | [speech_paraformer_asr_nat-aishell2-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell2-pytorch/summary) | +| AISHELL-1 | 178 | ParaformerBert | Offline | CN | Pytorch | [speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary) | +| AISHELL-2 | 1000 | ParaformerBert | Offline | CN | Pytorch | [speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary) | +| AISHELL-1 | 178 | Conformer | Offline | CN | Pytorch | [speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary) | +| AISHELL-2 | 1000 | Conformer | Offline | CN | Pytorch | [speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary) | diff --git a/docs_cn/modelscope_usages.md b/docs_cn/modelscope_usages.md new file mode 100644 index 000000000..6e1420a03 --- /dev/null +++ b/docs_cn/modelscope_usages.md @@ -0,0 +1,52 @@ +# 快速使用ModelScope 
+ModelScope是阿里巴巴推出的开源模型即服务共享平台,为广大学术界用户和工业界用户提供灵活、便捷的模型应用支持。具体的使用方法和开源模型可以参见[ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) 。在语音方向,我们提供了自回归/非自回归语音识别、语音预训练、标点预测等模型,用户可以方便地使用。
+
+## 整体介绍
+我们在egs_modelscope目录下提供了相关模型的使用脚本,支持直接用我们提供的模型进行推理,同时也支持以我们提供的预训练模型作为初始模型进行微调。下面,我们将以egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch目录中提供的模型来进行介绍,包括`infer.py`,`finetune.py`和`infer_after_finetune.py`三个脚本,对应的功能如下:
+- `infer.py`: 基于我们提供的模型,对指定的数据集进行推理
+- `finetune.py`: 将我们提供的模型作为初始模型进行微调
+- `infer_after_finetune.py`: 基于微调得到的模型,对指定的数据集进行推理
+
+## 模型推理
+我们提供了`infer.py`来实现模型推理。基于此文件,用户可以基于我们提供的模型,对指定的数据集进行推理,得到相应的识别结果。如果同时给定了抄本,则会同时计算CER。在开始推理前,用户可以指定如下参数来修改推理配置:
+* `data_dir`:数据集目录。目录下应该包括音频列表文件`wav.scp`和抄本文件`text`(可选),具体格式可以参见[快速开始](./get_started.md)中的说明。如果`text`文件存在,则会相应地计算CER,否则会跳过。
+* `output_dir`:推理结果保存目录
+* `batch_size`:推理时的batch大小
+* `ctc_weight`:部分模型包含CTC模块,可以设置该参数来指定推理时CTC模块的权重
+
+除了直接在`infer.py`中设置参数外,用户也可以通过手动修改模型下载目录下的`decoding.yaml`文件中的参数来修改推理配置。
+
+## 模型微调
+我们提供了`finetune.py`来实现模型微调。基于此文件,用户可以以我们提供的模型作为初始模型,在指定的数据集上进行微调,从而在特定领域取得更好的性能。在微调开始前,用户可以指定如下参数来修改微调配置:
+* `data_path`:数据目录。该目录下应该包括存放训练集数据的`train`目录和存放验证集数据的`dev`目录。每个目录中需要包括音频列表文件`wav.scp`和抄本文件`text`
+* `output_dir`:微调结果保存目录
+* `dataset_type`:对于小数据集,设置为`small`;当数据量大于1000小时时,设置为`large`
+* `batch_bins`:batch size,如果`dataset_type`设置为`small`,`batch_bins`单位为fbank特征帧数;如果设置为`large`,单位为毫秒
+* `max_epoch`:最大的训练轮数
+
+以下参数也可以进行设置。但是如果没有特别的需求,可以忽略,直接使用我们给定的默认值:
+* `accum_grad`:梯度累积
+* `keep_nbest_models`:选择性能最好的`keep_nbest_models`个模型的参数进行平均,得到性能更好的模型
+* `optim`:设置微调时的优化器
+* `lr`:设置微调时的学习率
+* `scheduler`:设置学习率调整策略
+* `scheduler_conf`:学习率调整策略的相关参数
+* `specaug`:设置谱增广
+* `specaug_conf`:谱增广的相关参数
+
+除了直接在`finetune.py`中设置参数外,用户也可以通过手动修改模型下载目录下的`finetune.yaml`文件中的参数来修改微调配置。
+
+## 基于微调后的模型推理
+我们提供了`infer_after_finetune.py`来实现基于用户自己微调得到的模型进行推理。基于此文件,用户可以基于微调后的模型,对指定的数据集进行推理,得到相应的识别结果。如果同时给定了抄本,则会同时计算CER。在开始推理前,用户可以指定如下参数来修改推理配置:
+* `data_dir`:数据集目录。目录下应该包括音频列表文件`wav.scp`和抄本文件`text`(可选)。如果`text`文件存在,则会相应地计算CER,否则会跳过。
+* `output_dir`:推理结果保存目录
+* `batch_size`:推理时的batch大小
+* `ctc_weight`:部分模型包含CTC模块,可以设置该参数来指定推理时CTC模块的权重
+* `decoding_model_name`:指定用于推理的模型名
+
+以下参数也可以进行设置。但是如果没有特别的需求,可以忽略,直接使用我们给定的默认值:
+* `modelscope_model_name`:微调时使用的初始模型
+* `required_files`:使用modelscope接口进行推理时需要用到的文件
+
+## 注意事项
+部分模型可能在微调、推理时存在一些特有的参数,这部分参数可以在对应目录的README.md文件中找到具体用法。
\ No newline at end of file
diff --git a/docs_cn/papers.md b/docs_cn/papers.md
new file mode 100644
index 000000000..34a815033
--- /dev/null
+++ b/docs_cn/papers.md
@@ -0,0 +1,4 @@
+# 论文
+
+- [Universal ASR: Unifying Streaming and Non-Streaming ASR Using a Single Encoder-Decoder Model](https://arxiv.org/abs/2010.14099), arXiv preprint arXiv:2010.14099, 2020.
+- [Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition](https://arxiv.org/abs/2206.08317), INTERSPEECH 2022.
\ No newline at end of file diff --git a/funasr/bin/build_trainer.py b/funasr/bin/build_trainer.py index c13f91b2f..5ef736a19 100644 --- a/funasr/bin/build_trainer.py +++ b/funasr/bin/build_trainer.py @@ -34,8 +34,22 @@ def parse_args(mode): return args, ASRTask -def build_trainer(modelscope_dict, data_dir, output_dir, train_set="train", dev_set="validation", distributed=False, - dataset_type="small", lr=None, batch_bins=None, max_epoch=None, mate_params=None): +def build_trainer(modelscope_dict, + data_dir, + output_dir, + train_set="train", + dev_set="validation", + distributed=False, + dataset_type="small", + batch_bins=None, + max_epoch=None, + optim=None, + lr=None, + scheduler=None, + scheduler_conf=None, + specaug=None, + specaug_conf=None, + param_dict=None): mode = modelscope_dict['mode'] args, ASRTask = parse_args(mode=mode) # ddp related @@ -94,8 +108,18 @@ def build_trainer(modelscope_dict, data_dir, output_dir, train_set="train", dev_ args.output_dir = output_dir args.gpu_id = args.local_rank args.config = finetune_config + if optim is not None: + args.optim = optim if lr is not None: args.optim_conf["lr"] = lr + if scheduler is not None: + args.scheduler = scheduler + if scheduler_conf is not None: + args.scheduler_conf = scheduler_conf + if specaug is not None: + args.specaug = specaug + if specaug_conf is not None: + args.specaug_conf = specaug_conf if max_epoch is not None: args.max_epoch = max_epoch if batch_bins is not None: diff --git a/funasr/utils/modelscope_param.py b/funasr/utils/modelscope_param.py index 5d6bffb25..9ff196a75 100644 --- a/funasr/utils/modelscope_param.py +++ b/funasr/utils/modelscope_param.py @@ -1,25 +1,35 @@ - class modelscope_args(): - def __init__(self, - task: str = "", - model: str = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", - data_path: str = None, - output_dir: str = None, - model_revision: str = None, - dataset_type: str = "small", - batch_bins: int = 2000, - max_epoch: int = None, - lr: float = None, - ): - self.task = task - self.model = model - self.data_path = data_path - self.output_dir = output_dir - self.model_revision = model_revision - self.dataset_type = dataset_type - self.batch_bins = batch_bins - self.max_epoch = max_epoch - self.lr = lr - - - \ No newline at end of file + def __init__(self, + task: str = "", + model: str = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", + data_path: str = None, + output_dir: str = None, + model_revision: str = None, + dataset_type: str = "small", + batch_bins: int = 2000, + max_epoch: int = None, + accum_grad: int = None, + keep_nbest_models: int = None, + optim: str = None, + lr: float = None, + scheduler: str = None, + scheduler_conf: dict = None, + specaug: str = None, + specaug_conf: dict = None, + ): + self.task = task + self.model = model + self.data_path = data_path + self.output_dir = output_dir + self.model_revision = model_revision + self.dataset_type = dataset_type + self.batch_bins = batch_bins + self.max_epoch = max_epoch + self.accum_grad = accum_grad + self.keep_nbest_models = keep_nbest_models + self.optim = optim + self.lr = lr + self.scheduler = scheduler + self.scheduler_conf = scheduler_conf + self.specaug = specaug + self.specaug_conf = specaug_conf
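For reference, a minimal sketch of how the extended finetuning options might be filled in from the user side. The entry point that actually consumes `modelscope_args` (e.g. a `finetune.py` under `egs_modelscope`) is not part of this diff, and every concrete value below is illustrative rather than a recommended setting:

```python
from funasr.utils.modelscope_param import modelscope_args

# Hypothetical finetuning configuration; values are placeholders, not defaults.
params = modelscope_args(
    task="asr",
    model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
    data_path="./data",          # expects train/ and dev/ dirs with wav.scp + text
    output_dir="./checkpoint",
    dataset_type="small",
    batch_bins=2000,
    max_epoch=20,
    optim="adam",                # maps to args.optim in build_trainer
    lr=0.0002,                   # maps to args.optim_conf["lr"]
    scheduler="warmuplr",        # maps to args.scheduler
    scheduler_conf={"warmup_steps": 30000},
    specaug=None,                # leave None to keep the downloaded config's specaug
    specaug_conf=None,
)
```

Any option left as `None` falls through to the values in the downloaded `finetune.yaml`, since `build_trainer` only overrides an attribute when the corresponding argument is not `None`.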