
|

|
|
+|:---------------------------------------------------------------:|:---------------------------------------------------------------:|:-----------------------------------------------------------:|
## Acknowledge
@@ -111,4 +86,4 @@ This project is licensed under the [The MIT License](https://opensource.org/lice
booktitle={arXiv preprint arXiv:2301.12343}
year={2023}
}
-```
\ No newline at end of file
+```
diff --git a/docs/images/damo.png b/docs/images/damo.png
new file mode 100644
index 000000000..4e692ed0e
Binary files /dev/null and b/docs/images/damo.png differ
diff --git a/docs/images/nwpu.png b/docs/images/nwpu.png
new file mode 100644
index 000000000..6b4713c98
Binary files /dev/null and b/docs/images/nwpu.png differ
diff --git a/egs/aishell/conformer/run.sh b/egs/aishell/conformer/run.sh
index 41db45dfd..09ddab8a5 100755
--- a/egs/aishell/conformer/run.sh
+++ b/egs/aishell/conformer/run.sh
@@ -52,7 +52,7 @@ asr_config=conf/train_asr_conformer.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/data2vec_paraformer_finetune/run.sh b/egs/aishell/data2vec_paraformer_finetune/run.sh
index cada164dc..d033ce26a 100755
--- a/egs/aishell/data2vec_paraformer_finetune/run.sh
+++ b/egs/aishell/data2vec_paraformer_finetune/run.sh
@@ -55,7 +55,7 @@ asr_config=conf/train_asr_paraformer_transformer_12e_6d_3072_768.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/data2vec_transformer_finetune/run.sh b/egs/aishell/data2vec_transformer_finetune/run.sh
index 7ab8626bb..26222e666 100755
--- a/egs/aishell/data2vec_transformer_finetune/run.sh
+++ b/egs/aishell/data2vec_transformer_finetune/run.sh
@@ -55,7 +55,7 @@ asr_config=conf/train_asr_transformer_12e_6d_3072_768.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.cer_ctc.ave_10best.pth
+inference_asr_model=valid.cer_ctc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/paraformer/run.sh b/egs/aishell/paraformer/run.sh
index 2b0f1449b..53b5f906d 100755
--- a/egs/aishell/paraformer/run.sh
+++ b/egs/aishell/paraformer/run.sh
@@ -52,7 +52,7 @@ asr_config=conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/paraformerbert/run.sh b/egs/aishell/paraformerbert/run.sh
index 96310ab84..2487eacd8 100755
--- a/egs/aishell/paraformerbert/run.sh
+++ b/egs/aishell/paraformerbert/run.sh
@@ -56,7 +56,7 @@ asr_config=conf/train_asr_paraformerbert_conformer_12e_6d_2048_256.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/transformer/run.sh b/egs/aishell/transformer/run.sh
index 4c307b07c..f66a338ba 100755
--- a/egs/aishell/transformer/run.sh
+++ b/egs/aishell/transformer/run.sh
@@ -52,7 +52,7 @@ asr_config=conf/train_asr_conformer.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell2/conformer/run.sh b/egs/aishell2/conformer/run.sh
index bd6d81ea9..f9ea69ada 100755
--- a/egs/aishell2/conformer/run.sh
+++ b/egs/aishell2/conformer/run.sh
@@ -54,7 +54,7 @@ asr_config=conf/train_asr_conformer.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/paraformer/run.sh b/egs/aishell2/paraformer/run.sh
index 2b7d84131..e1ea4fe73 100755
--- a/egs/aishell2/paraformer/run.sh
+++ b/egs/aishell2/paraformer/run.sh
@@ -54,7 +54,7 @@ asr_config=conf/train_asr_paraformer_conformer_20e_1280_320_6d_1280_320.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/paraformerbert/run.sh b/egs/aishell2/paraformerbert/run.sh
index d0407d480..239a7e339 100755
--- a/egs/aishell2/paraformerbert/run.sh
+++ b/egs/aishell2/paraformerbert/run.sh
@@ -58,7 +58,7 @@ asr_config=conf/train_asr_paraformerbert_conformer_20e_6d_1280_320.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/transformer/run.sh b/egs/aishell2/transformer/run.sh
index a5a14ec09..6f2dd4d8d 100755
--- a/egs/aishell2/transformer/run.sh
+++ b/egs/aishell2/transformer/run.sh
@@ -54,7 +54,7 @@ asr_config=conf/train_asr_transformer.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/transformerLM/run.sh b/egs/aishell2/transformerLM/run.sh
index 28e376287..9e7a7135b 100755
--- a/egs/aishell2/transformerLM/run.sh
+++ b/egs/aishell2/transformerLM/run.sh
@@ -34,7 +34,7 @@ exp_dir=./data
tag=exp1
model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}"
lm_exp=${exp_dir}/exp/${model_dir}
-inference_lm=valid.loss.ave.pth # Language model path for decoding.
+inference_lm=valid.loss.ave.pb # Language model path for decoding.
stage=0
stop_stage=3
diff --git a/egs/alimeeting/diarization/sond/infer_alimeeting_test.py b/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
index 0988f5d03..b4d534bee 100644
--- a/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
+++ b/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
@@ -4,7 +4,7 @@ import sys
def main():
diar_config_path = sys.argv[1] if len(sys.argv) > 1 else "sond_fbank.yaml"
- diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pth"
+ diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pb"
output_dir = sys.argv[3] if len(sys.argv) > 3 else "./outputs"
data_path_and_name_and_type = [
("data/test_rmsil/feats.scp", "speech", "kaldi_ark"),
diff --git a/egs/alimeeting/diarization/sond/run.sh b/egs/alimeeting/diarization/sond/run.sh
index 7e9a7f7ba..19ae40cdd 100644
--- a/egs/alimeeting/diarization/sond/run.sh
+++ b/egs/alimeeting/diarization/sond/run.sh
@@ -17,9 +17,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Downloading Pre-trained model..."
git clone https://www.modelscope.cn/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch.git
git clone https://www.modelscope.cn/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch.git
- ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth ./sv.pth
+ ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb ./sv.pb
cp speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.yaml ./sv.yaml
- ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pth ./sond.pth
+ ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pb ./sond.pb
cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond_fbank.yaml ./sond_fbank.yaml
cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.yaml ./sond.yaml
echo "Done."
@@ -30,7 +30,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Calculating diarization results..."
- python infer_alimeeting_test.py sond_fbank.yaml sond.pth outputs
+ python infer_alimeeting_test.py sond_fbank.yaml sond.pb outputs
python local/convert_label_to_rttm.py \
outputs/labels.txt \
data/test_rmsil/raw_rmsil_map.scp \
diff --git a/egs/alimeeting/diarization/sond/unit_test.py b/egs/alimeeting/diarization/sond/unit_test.py
index 84a424762..0f40ab29e 100644
--- a/egs/alimeeting/diarization/sond/unit_test.py
+++ b/egs/alimeeting/diarization/sond/unit_test.py
@@ -4,7 +4,7 @@ import os
def test_fbank_cpu_infer():
diar_config_path = "config_fbank.yaml"
- diar_model_path = "sond.pth"
+ diar_model_path = "sond.pb"
output_dir = "./outputs"
data_path_and_name_and_type = [
("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@ def test_fbank_cpu_infer():
def test_fbank_gpu_infer():
diar_config_path = "config_fbank.yaml"
- diar_model_path = "sond.pth"
+ diar_model_path = "sond.pb"
output_dir = "./outputs"
data_path_and_name_and_type = [
("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@ def test_fbank_gpu_infer():
def test_wav_gpu_infer():
diar_config_path = "config.yaml"
- diar_model_path = "sond.pth"
+ diar_model_path = "sond.pb"
output_dir = "./outputs"
data_path_and_name_and_type = [
("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@ def test_wav_gpu_infer():
def test_without_profile_gpu_infer():
diar_config_path = "config.yaml"
- diar_model_path = "sond.pth"
+ diar_model_path = "sond.pb"
output_dir = "./outputs"
raw_inputs = [[
"data/unit_test/raw_inputs/record.wav",
diff --git a/egs/callhome/diarization/sond/sond.yaml b/egs/callhome/diarization/sond/sond.yaml
new file mode 100644
index 000000000..868163f0a
--- /dev/null
+++ b/egs/callhome/diarization/sond/sond.yaml
@@ -0,0 +1,2739 @@
+config: finetune.yaml
+print_config: false
+log_level: INFO
+dry_run: false
+iterator_type: sequence
+output_dir: exp/sond
+ngpu: 1
+seed: 0
+num_workers: 16
+num_att_plot: 0
+dist_backend: nccl
+dist_init_method: env://
+dist_world_size: null
+dist_rank: null
+local_rank: 0
+dist_master_addr: null
+dist_master_port: null
+dist_launcher: null
+multiprocessing_distributed: true
+distributed: false
+unused_parameters: true
+sharded_ddp: false
+ddp_backend: pytorch_ddp
+cudnn_enabled: true
+cudnn_benchmark: false
+cudnn_deterministic: true
+collect_stats: false
+write_collected_feats: false
+max_epoch: 50
+patience: null
+val_scheduler_criterion:
+- valid
+- acc
+early_stopping_criterion:
+- valid
+- loss
+- min
+best_model_criterion:
+- - valid
+ - acc
+ - max
+keep_nbest_models: 10
+nbest_averaging_interval: 0
+grad_clip: 5
+grad_clip_type: 2.0
+grad_noise: false
+accum_grad: 1
+no_forward_run: false
+resume: true
+train_dtype: float32
+use_amp: false
+log_interval: 50
+use_matplotlib: false
+use_tensorboard: true
+use_wandb: false
+wandb_project: null
+wandb_id: null
+wandb_entity: null
+wandb_name: null
+wandb_model_log_interval: -1
+use_pai: true
+detect_anomaly: false
+pretrain_path: null
+init_param: []
+ignore_init_mismatch: false
+freeze_param: []
+num_iters_per_epoch: null
+batch_size: 20
+valid_batch_size: null
+batch_bins: 10000
+valid_batch_bins: null
+train_shape_file:
+- /data/volume1/youyan/aishell/ark/train/speech_shape.1
+- /data/volume1/youyan/aishell/ark/train/text_shape.1
+valid_shape_file:
+- /data/volume1/youyan/aishell/ark/dev/speech_shape.1
+- /data/volume1/youyan/aishell/ark/dev/text_shape.1
+batch_type: length
+valid_batch_type: null
+fold_length:
+- 512
+- 150
+sort_in_batch: descending
+sort_batch: descending
+multiple_iterator: false
+chunk_length: 500
+chunk_shift_ratio: 0.5
+num_cache_chunks: 1024
+train_data_path_and_name_and_type:
+- - /data/volume1/youyan/aishell/ark/train/data.scp
+ - speech
+ - kaldi_ark
+- - /data/volume1/youyan/aishell/ark/train/data.text.1
+ - text
+ - text
+valid_data_path_and_name_and_type:
+- - /data/volume1/youyan/aishell/ark/dev/data.scp
+ - speech
+ - kaldi_ark
+- - /data/volume1/youyan/aishell/ark/dev/data.text.1
+ - text
+ - text
+allow_variable_data_keys: false
+max_cache_size: 0.0
+max_cache_fd: 32
+valid_max_cache_size: null
+optim: adam
+optim_conf:
+ lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 30000
+token_list:
+- '0'
+- '1'
+- '2'
+- '3'
+- '4'
+- '5'
+- '6'
+- '7'
+- '8'
+- '9'
+- '10'
+- '11'
+- '12'
+- '13'
+- '14'
+- '15'
+- '16'
+- '17'
+- '18'
+- '19'
+- '20'
+- '21'
+- '22'
+- '23'
+- '24'
+- '25'
+- '26'
+- '27'
+- '28'
+- '29'
+- '30'
+- '32'
+- '33'
+- '34'
+- '35'
+- '36'
+- '37'
+- '38'
+- '39'
+- '40'
+- '41'
+- '42'
+- '43'
+- '44'
+- '45'
+- '46'
+- '48'
+- '49'
+- '50'
+- '51'
+- '52'
+- '53'
+- '54'
+- '56'
+- '57'
+- '58'
+- '60'
+- '64'
+- '65'
+- '66'
+- '67'
+- '68'
+- '69'
+- '70'
+- '71'
+- '72'
+- '73'
+- '74'
+- '75'
+- '76'
+- '77'
+- '78'
+- '80'
+- '81'
+- '82'
+- '83'
+- '84'
+- '85'
+- '86'
+- '88'
+- '89'
+- '90'
+- '92'
+- '96'
+- '97'
+- '98'
+- '99'
+- '100'
+- '101'
+- '102'
+- '104'
+- '105'
+- '106'
+- '108'
+- '112'
+- '113'
+- '114'
+- '116'
+- '120'
+- '128'
+- '129'
+- '130'
+- '131'
+- '132'
+- '133'
+- '134'
+- '135'
+- '136'
+- '137'
+- '138'
+- '139'
+- '140'
+- '141'
+- '142'
+- '144'
+- '145'
+- '146'
+- '147'
+- '148'
+- '149'
+- '150'
+- '152'
+- '153'
+- '154'
+- '156'
+- '160'
+- '161'
+- '162'
+- '163'
+- '164'
+- '165'
+- '166'
+- '168'
+- '169'
+- '170'
+- '172'
+- '176'
+- '177'
+- '178'
+- '180'
+- '184'
+- '192'
+- '193'
+- '194'
+- '195'
+- '196'
+- '197'
+- '198'
+- '200'
+- '201'
+- '202'
+- '204'
+- '208'
+- '209'
+- '210'
+- '212'
+- '216'
+- '224'
+- '225'
+- '226'
+- '228'
+- '232'
+- '240'
+- '256'
+- '257'
+- '258'
+- '259'
+- '260'
+- '261'
+- '262'
+- '263'
+- '264'
+- '265'
+- '266'
+- '267'
+- '268'
+- '269'
+- '270'
+- '272'
+- '273'
+- '274'
+- '275'
+- '276'
+- '277'
+- '278'
+- '280'
+- '281'
+- '282'
+- '284'
+- '288'
+- '289'
+- '290'
+- '291'
+- '292'
+- '293'
+- '294'
+- '296'
+- '297'
+- '298'
+- '300'
+- '304'
+- '305'
+- '306'
+- '308'
+- '312'
+- '320'
+- '321'
+- '322'
+- '323'
+- '324'
+- '325'
+- '326'
+- '328'
+- '329'
+- '330'
+- '332'
+- '336'
+- '337'
+- '338'
+- '340'
+- '344'
+- '352'
+- '353'
+- '354'
+- '356'
+- '360'
+- '368'
+- '384'
+- '385'
+- '386'
+- '387'
+- '388'
+- '389'
+- '390'
+- '392'
+- '393'
+- '394'
+- '396'
+- '400'
+- '401'
+- '402'
+- '404'
+- '408'
+- '416'
+- '417'
+- '418'
+- '420'
+- '424'
+- '432'
+- '448'
+- '449'
+- '450'
+- '452'
+- '456'
+- '464'
+- '480'
+- '512'
+- '513'
+- '514'
+- '515'
+- '516'
+- '517'
+- '518'
+- '519'
+- '520'
+- '521'
+- '522'
+- '523'
+- '524'
+- '525'
+- '526'
+- '528'
+- '529'
+- '530'
+- '531'
+- '532'
+- '533'
+- '534'
+- '536'
+- '537'
+- '538'
+- '540'
+- '544'
+- '545'
+- '546'
+- '547'
+- '548'
+- '549'
+- '550'
+- '552'
+- '553'
+- '554'
+- '556'
+- '560'
+- '561'
+- '562'
+- '564'
+- '568'
+- '576'
+- '577'
+- '578'
+- '579'
+- '580'
+- '581'
+- '582'
+- '584'
+- '585'
+- '586'
+- '588'
+- '592'
+- '593'
+- '594'
+- '596'
+- '600'
+- '608'
+- '609'
+- '610'
+- '612'
+- '616'
+- '624'
+- '640'
+- '641'
+- '642'
+- '643'
+- '644'
+- '645'
+- '646'
+- '648'
+- '649'
+- '650'
+- '652'
+- '656'
+- '657'
+- '658'
+- '660'
+- '664'
+- '672'
+- '673'
+- '674'
+- '676'
+- '680'
+- '688'
+- '704'
+- '705'
+- '706'
+- '708'
+- '712'
+- '720'
+- '736'
+- '768'
+- '769'
+- '770'
+- '771'
+- '772'
+- '773'
+- '774'
+- '776'
+- '777'
+- '778'
+- '780'
+- '784'
+- '785'
+- '786'
+- '788'
+- '792'
+- '800'
+- '801'
+- '802'
+- '804'
+- '808'
+- '816'
+- '832'
+- '833'
+- '834'
+- '836'
+- '840'
+- '848'
+- '864'
+- '896'
+- '897'
+- '898'
+- '900'
+- '904'
+- '912'
+- '928'
+- '960'
+- '1024'
+- '1025'
+- '1026'
+- '1027'
+- '1028'
+- '1029'
+- '1030'
+- '1031'
+- '1032'
+- '1033'
+- '1034'
+- '1035'
+- '1036'
+- '1037'
+- '1038'
+- '1040'
+- '1041'
+- '1042'
+- '1043'
+- '1044'
+- '1045'
+- '1046'
+- '1048'
+- '1049'
+- '1050'
+- '1052'
+- '1056'
+- '1057'
+- '1058'
+- '1059'
+- '1060'
+- '1061'
+- '1062'
+- '1064'
+- '1065'
+- '1066'
+- '1068'
+- '1072'
+- '1073'
+- '1074'
+- '1076'
+- '1080'
+- '1088'
+- '1089'
+- '1090'
+- '1091'
+- '1092'
+- '1093'
+- '1094'
+- '1096'
+- '1097'
+- '1098'
+- '1100'
+- '1104'
+- '1105'
+- '1106'
+- '1108'
+- '1112'
+- '1120'
+- '1121'
+- '1122'
+- '1124'
+- '1128'
+- '1136'
+- '1152'
+- '1153'
+- '1154'
+- '1155'
+- '1156'
+- '1157'
+- '1158'
+- '1160'
+- '1161'
+- '1162'
+- '1164'
+- '1168'
+- '1169'
+- '1170'
+- '1172'
+- '1176'
+- '1184'
+- '1185'
+- '1186'
+- '1188'
+- '1192'
+- '1200'
+- '1216'
+- '1217'
+- '1218'
+- '1220'
+- '1224'
+- '1232'
+- '1248'
+- '1280'
+- '1281'
+- '1282'
+- '1283'
+- '1284'
+- '1285'
+- '1286'
+- '1288'
+- '1289'
+- '1290'
+- '1292'
+- '1296'
+- '1297'
+- '1298'
+- '1300'
+- '1304'
+- '1312'
+- '1313'
+- '1314'
+- '1316'
+- '1320'
+- '1328'
+- '1344'
+- '1345'
+- '1346'
+- '1348'
+- '1352'
+- '1360'
+- '1376'
+- '1408'
+- '1409'
+- '1410'
+- '1412'
+- '1416'
+- '1424'
+- '1440'
+- '1472'
+- '1536'
+- '1537'
+- '1538'
+- '1539'
+- '1540'
+- '1541'
+- '1542'
+- '1544'
+- '1545'
+- '1546'
+- '1548'
+- '1552'
+- '1553'
+- '1554'
+- '1556'
+- '1560'
+- '1568'
+- '1569'
+- '1570'
+- '1572'
+- '1576'
+- '1584'
+- '1600'
+- '1601'
+- '1602'
+- '1604'
+- '1608'
+- '1616'
+- '1632'
+- '1664'
+- '1665'
+- '1666'
+- '1668'
+- '1672'
+- '1680'
+- '1696'
+- '1728'
+- '1792'
+- '1793'
+- '1794'
+- '1796'
+- '1800'
+- '1808'
+- '1824'
+- '1856'
+- '1920'
+- '2048'
+- '2049'
+- '2050'
+- '2051'
+- '2052'
+- '2053'
+- '2054'
+- '2055'
+- '2056'
+- '2057'
+- '2058'
+- '2059'
+- '2060'
+- '2061'
+- '2062'
+- '2064'
+- '2065'
+- '2066'
+- '2067'
+- '2068'
+- '2069'
+- '2070'
+- '2072'
+- '2073'
+- '2074'
+- '2076'
+- '2080'
+- '2081'
+- '2082'
+- '2083'
+- '2084'
+- '2085'
+- '2086'
+- '2088'
+- '2089'
+- '2090'
+- '2092'
+- '2096'
+- '2097'
+- '2098'
+- '2100'
+- '2104'
+- '2112'
+- '2113'
+- '2114'
+- '2115'
+- '2116'
+- '2117'
+- '2118'
+- '2120'
+- '2121'
+- '2122'
+- '2124'
+- '2128'
+- '2129'
+- '2130'
+- '2132'
+- '2136'
+- '2144'
+- '2145'
+- '2146'
+- '2148'
+- '2152'
+- '2160'
+- '2176'
+- '2177'
+- '2178'
+- '2179'
+- '2180'
+- '2181'
+- '2182'
+- '2184'
+- '2185'
+- '2186'
+- '2188'
+- '2192'
+- '2193'
+- '2194'
+- '2196'
+- '2200'
+- '2208'
+- '2209'
+- '2210'
+- '2212'
+- '2216'
+- '2224'
+- '2240'
+- '2241'
+- '2242'
+- '2244'
+- '2248'
+- '2256'
+- '2272'
+- '2304'
+- '2305'
+- '2306'
+- '2307'
+- '2308'
+- '2309'
+- '2310'
+- '2312'
+- '2313'
+- '2314'
+- '2316'
+- '2320'
+- '2321'
+- '2322'
+- '2324'
+- '2328'
+- '2336'
+- '2337'
+- '2338'
+- '2340'
+- '2344'
+- '2352'
+- '2368'
+- '2369'
+- '2370'
+- '2372'
+- '2376'
+- '2384'
+- '2400'
+- '2432'
+- '2433'
+- '2434'
+- '2436'
+- '2440'
+- '2448'
+- '2464'
+- '2496'
+- '2560'
+- '2561'
+- '2562'
+- '2563'
+- '2564'
+- '2565'
+- '2566'
+- '2568'
+- '2569'
+- '2570'
+- '2572'
+- '2576'
+- '2577'
+- '2578'
+- '2580'
+- '2584'
+- '2592'
+- '2593'
+- '2594'
+- '2596'
+- '2600'
+- '2608'
+- '2624'
+- '2625'
+- '2626'
+- '2628'
+- '2632'
+- '2640'
+- '2656'
+- '2688'
+- '2689'
+- '2690'
+- '2692'
+- '2696'
+- '2704'
+- '2720'
+- '2752'
+- '2816'
+- '2817'
+- '2818'
+- '2820'
+- '2824'
+- '2832'
+- '2848'
+- '2880'
+- '2944'
+- '3072'
+- '3073'
+- '3074'
+- '3075'
+- '3076'
+- '3077'
+- '3078'
+- '3080'
+- '3081'
+- '3082'
+- '3084'
+- '3088'
+- '3089'
+- '3090'
+- '3092'
+- '3096'
+- '3104'
+- '3105'
+- '3106'
+- '3108'
+- '3112'
+- '3120'
+- '3136'
+- '3137'
+- '3138'
+- '3140'
+- '3144'
+- '3152'
+- '3168'
+- '3200'
+- '3201'
+- '3202'
+- '3204'
+- '3208'
+- '3216'
+- '3232'
+- '3264'
+- '3328'
+- '3329'
+- '3330'
+- '3332'
+- '3336'
+- '3344'
+- '3360'
+- '3392'
+- '3456'
+- '3584'
+- '3585'
+- '3586'
+- '3588'
+- '3592'
+- '3600'
+- '3616'
+- '3648'
+- '3712'
+- '3840'
+- '4096'
+- '4097'
+- '4098'
+- '4099'
+- '4100'
+- '4101'
+- '4102'
+- '4103'
+- '4104'
+- '4105'
+- '4106'
+- '4107'
+- '4108'
+- '4109'
+- '4110'
+- '4112'
+- '4113'
+- '4114'
+- '4115'
+- '4116'
+- '4117'
+- '4118'
+- '4120'
+- '4121'
+- '4122'
+- '4124'
+- '4128'
+- '4129'
+- '4130'
+- '4131'
+- '4132'
+- '4133'
+- '4134'
+- '4136'
+- '4137'
+- '4138'
+- '4140'
+- '4144'
+- '4145'
+- '4146'
+- '4148'
+- '4152'
+- '4160'
+- '4161'
+- '4162'
+- '4163'
+- '4164'
+- '4165'
+- '4166'
+- '4168'
+- '4169'
+- '4170'
+- '4172'
+- '4176'
+- '4177'
+- '4178'
+- '4180'
+- '4184'
+- '4192'
+- '4193'
+- '4194'
+- '4196'
+- '4200'
+- '4208'
+- '4224'
+- '4225'
+- '4226'
+- '4227'
+- '4228'
+- '4229'
+- '4230'
+- '4232'
+- '4233'
+- '4234'
+- '4236'
+- '4240'
+- '4241'
+- '4242'
+- '4244'
+- '4248'
+- '4256'
+- '4257'
+- '4258'
+- '4260'
+- '4264'
+- '4272'
+- '4288'
+- '4289'
+- '4290'
+- '4292'
+- '4296'
+- '4304'
+- '4320'
+- '4352'
+- '4353'
+- '4354'
+- '4355'
+- '4356'
+- '4357'
+- '4358'
+- '4360'
+- '4361'
+- '4362'
+- '4364'
+- '4368'
+- '4369'
+- '4370'
+- '4372'
+- '4376'
+- '4384'
+- '4385'
+- '4386'
+- '4388'
+- '4392'
+- '4400'
+- '4416'
+- '4417'
+- '4418'
+- '4420'
+- '4424'
+- '4432'
+- '4448'
+- '4480'
+- '4481'
+- '4482'
+- '4484'
+- '4488'
+- '4496'
+- '4512'
+- '4544'
+- '4608'
+- '4609'
+- '4610'
+- '4611'
+- '4612'
+- '4613'
+- '4614'
+- '4616'
+- '4617'
+- '4618'
+- '4620'
+- '4624'
+- '4625'
+- '4626'
+- '4628'
+- '4632'
+- '4640'
+- '4641'
+- '4642'
+- '4644'
+- '4648'
+- '4656'
+- '4672'
+- '4673'
+- '4674'
+- '4676'
+- '4680'
+- '4688'
+- '4704'
+- '4736'
+- '4737'
+- '4738'
+- '4740'
+- '4744'
+- '4752'
+- '4768'
+- '4800'
+- '4864'
+- '4865'
+- '4866'
+- '4868'
+- '4872'
+- '4880'
+- '4896'
+- '4928'
+- '4992'
+- '5120'
+- '5121'
+- '5122'
+- '5123'
+- '5124'
+- '5125'
+- '5126'
+- '5128'
+- '5129'
+- '5130'
+- '5132'
+- '5136'
+- '5137'
+- '5138'
+- '5140'
+- '5144'
+- '5152'
+- '5153'
+- '5154'
+- '5156'
+- '5160'
+- '5168'
+- '5184'
+- '5185'
+- '5186'
+- '5188'
+- '5192'
+- '5200'
+- '5216'
+- '5248'
+- '5249'
+- '5250'
+- '5252'
+- '5256'
+- '5264'
+- '5280'
+- '5312'
+- '5376'
+- '5377'
+- '5378'
+- '5380'
+- '5384'
+- '5392'
+- '5408'
+- '5440'
+- '5504'
+- '5632'
+- '5633'
+- '5634'
+- '5636'
+- '5640'
+- '5648'
+- '5664'
+- '5696'
+- '5760'
+- '5888'
+- '6144'
+- '6145'
+- '6146'
+- '6147'
+- '6148'
+- '6149'
+- '6150'
+- '6152'
+- '6153'
+- '6154'
+- '6156'
+- '6160'
+- '6161'
+- '6162'
+- '6164'
+- '6168'
+- '6176'
+- '6177'
+- '6178'
+- '6180'
+- '6184'
+- '6192'
+- '6208'
+- '6209'
+- '6210'
+- '6212'
+- '6216'
+- '6224'
+- '6240'
+- '6272'
+- '6273'
+- '6274'
+- '6276'
+- '6280'
+- '6288'
+- '6304'
+- '6336'
+- '6400'
+- '6401'
+- '6402'
+- '6404'
+- '6408'
+- '6416'
+- '6432'
+- '6464'
+- '6528'
+- '6656'
+- '6657'
+- '6658'
+- '6660'
+- '6664'
+- '6672'
+- '6688'
+- '6720'
+- '6784'
+- '6912'
+- '7168'
+- '7169'
+- '7170'
+- '7172'
+- '7176'
+- '7184'
+- '7200'
+- '7232'
+- '7296'
+- '7424'
+- '7680'
+- '8192'
+- '8193'
+- '8194'
+- '8195'
+- '8196'
+- '8197'
+- '8198'
+- '8199'
+- '8200'
+- '8201'
+- '8202'
+- '8203'
+- '8204'
+- '8205'
+- '8206'
+- '8208'
+- '8209'
+- '8210'
+- '8211'
+- '8212'
+- '8213'
+- '8214'
+- '8216'
+- '8217'
+- '8218'
+- '8220'
+- '8224'
+- '8225'
+- '8226'
+- '8227'
+- '8228'
+- '8229'
+- '8230'
+- '8232'
+- '8233'
+- '8234'
+- '8236'
+- '8240'
+- '8241'
+- '8242'
+- '8244'
+- '8248'
+- '8256'
+- '8257'
+- '8258'
+- '8259'
+- '8260'
+- '8261'
+- '8262'
+- '8264'
+- '8265'
+- '8266'
+- '8268'
+- '8272'
+- '8273'
+- '8274'
+- '8276'
+- '8280'
+- '8288'
+- '8289'
+- '8290'
+- '8292'
+- '8296'
+- '8304'
+- '8320'
+- '8321'
+- '8322'
+- '8323'
+- '8324'
+- '8325'
+- '8326'
+- '8328'
+- '8329'
+- '8330'
+- '8332'
+- '8336'
+- '8337'
+- '8338'
+- '8340'
+- '8344'
+- '8352'
+- '8353'
+- '8354'
+- '8356'
+- '8360'
+- '8368'
+- '8384'
+- '8385'
+- '8386'
+- '8388'
+- '8392'
+- '8400'
+- '8416'
+- '8448'
+- '8449'
+- '8450'
+- '8451'
+- '8452'
+- '8453'
+- '8454'
+- '8456'
+- '8457'
+- '8458'
+- '8460'
+- '8464'
+- '8465'
+- '8466'
+- '8468'
+- '8472'
+- '8480'
+- '8481'
+- '8482'
+- '8484'
+- '8488'
+- '8496'
+- '8512'
+- '8513'
+- '8514'
+- '8516'
+- '8520'
+- '8528'
+- '8544'
+- '8576'
+- '8577'
+- '8578'
+- '8580'
+- '8584'
+- '8592'
+- '8608'
+- '8640'
+- '8704'
+- '8705'
+- '8706'
+- '8707'
+- '8708'
+- '8709'
+- '8710'
+- '8712'
+- '8713'
+- '8714'
+- '8716'
+- '8720'
+- '8721'
+- '8722'
+- '8724'
+- '8728'
+- '8736'
+- '8737'
+- '8738'
+- '8740'
+- '8744'
+- '8752'
+- '8768'
+- '8769'
+- '8770'
+- '8772'
+- '8776'
+- '8784'
+- '8800'
+- '8832'
+- '8833'
+- '8834'
+- '8836'
+- '8840'
+- '8848'
+- '8864'
+- '8896'
+- '8960'
+- '8961'
+- '8962'
+- '8964'
+- '8968'
+- '8976'
+- '8992'
+- '9024'
+- '9088'
+- '9216'
+- '9217'
+- '9218'
+- '9219'
+- '9220'
+- '9221'
+- '9222'
+- '9224'
+- '9225'
+- '9226'
+- '9228'
+- '9232'
+- '9233'
+- '9234'
+- '9236'
+- '9240'
+- '9248'
+- '9249'
+- '9250'
+- '9252'
+- '9256'
+- '9264'
+- '9280'
+- '9281'
+- '9282'
+- '9284'
+- '9288'
+- '9296'
+- '9312'
+- '9344'
+- '9345'
+- '9346'
+- '9348'
+- '9352'
+- '9360'
+- '9376'
+- '9408'
+- '9472'
+- '9473'
+- '9474'
+- '9476'
+- '9480'
+- '9488'
+- '9504'
+- '9536'
+- '9600'
+- '9728'
+- '9729'
+- '9730'
+- '9732'
+- '9736'
+- '9744'
+- '9760'
+- '9792'
+- '9856'
+- '9984'
+- '10240'
+- '10241'
+- '10242'
+- '10243'
+- '10244'
+- '10245'
+- '10246'
+- '10248'
+- '10249'
+- '10250'
+- '10252'
+- '10256'
+- '10257'
+- '10258'
+- '10260'
+- '10264'
+- '10272'
+- '10273'
+- '10274'
+- '10276'
+- '10280'
+- '10288'
+- '10304'
+- '10305'
+- '10306'
+- '10308'
+- '10312'
+- '10320'
+- '10336'
+- '10368'
+- '10369'
+- '10370'
+- '10372'
+- '10376'
+- '10384'
+- '10400'
+- '10432'
+- '10496'
+- '10497'
+- '10498'
+- '10500'
+- '10504'
+- '10512'
+- '10528'
+- '10560'
+- '10624'
+- '10752'
+- '10753'
+- '10754'
+- '10756'
+- '10760'
+- '10768'
+- '10784'
+- '10816'
+- '10880'
+- '11008'
+- '11264'
+- '11265'
+- '11266'
+- '11268'
+- '11272'
+- '11280'
+- '11296'
+- '11328'
+- '11392'
+- '11520'
+- '11776'
+- '12288'
+- '12289'
+- '12290'
+- '12291'
+- '12292'
+- '12293'
+- '12294'
+- '12296'
+- '12297'
+- '12298'
+- '12300'
+- '12304'
+- '12305'
+- '12306'
+- '12308'
+- '12312'
+- '12320'
+- '12321'
+- '12322'
+- '12324'
+- '12328'
+- '12336'
+- '12352'
+- '12353'
+- '12354'
+- '12356'
+- '12360'
+- '12368'
+- '12384'
+- '12416'
+- '12417'
+- '12418'
+- '12420'
+- '12424'
+- '12432'
+- '12448'
+- '12480'
+- '12544'
+- '12545'
+- '12546'
+- '12548'
+- '12552'
+- '12560'
+- '12576'
+- '12608'
+- '12672'
+- '12800'
+- '12801'
+- '12802'
+- '12804'
+- '12808'
+- '12816'
+- '12832'
+- '12864'
+- '12928'
+- '13056'
+- '13312'
+- '13313'
+- '13314'
+- '13316'
+- '13320'
+- '13328'
+- '13344'
+- '13376'
+- '13440'
+- '13568'
+- '13824'
+- '14336'
+- '14337'
+- '14338'
+- '14340'
+- '14344'
+- '14352'
+- '14368'
+- '14400'
+- '14464'
+- '14592'
+- '14848'
+- '15360'
+- '16384'
+- '16385'
+- '16386'
+- '16387'
+- '16388'
+- '16389'
+- '16390'
+- '16391'
+- '16392'
+- '16393'
+- '16394'
+- '16395'
+- '16396'
+- '16397'
+- '16398'
+- '16400'
+- '16401'
+- '16402'
+- '16403'
+- '16404'
+- '16405'
+- '16406'
+- '16408'
+- '16409'
+- '16410'
+- '16412'
+- '16416'
+- '16417'
+- '16418'
+- '16419'
+- '16420'
+- '16421'
+- '16422'
+- '16424'
+- '16425'
+- '16426'
+- '16428'
+- '16432'
+- '16433'
+- '16434'
+- '16436'
+- '16440'
+- '16448'
+- '16449'
+- '16450'
+- '16451'
+- '16452'
+- '16453'
+- '16454'
+- '16456'
+- '16457'
+- '16458'
+- '16460'
+- '16464'
+- '16465'
+- '16466'
+- '16468'
+- '16472'
+- '16480'
+- '16481'
+- '16482'
+- '16484'
+- '16488'
+- '16496'
+- '16512'
+- '16513'
+- '16514'
+- '16515'
+- '16516'
+- '16517'
+- '16518'
+- '16520'
+- '16521'
+- '16522'
+- '16524'
+- '16528'
+- '16529'
+- '16530'
+- '16532'
+- '16536'
+- '16544'
+- '16545'
+- '16546'
+- '16548'
+- '16552'
+- '16560'
+- '16576'
+- '16577'
+- '16578'
+- '16580'
+- '16584'
+- '16592'
+- '16608'
+- '16640'
+- '16641'
+- '16642'
+- '16643'
+- '16644'
+- '16645'
+- '16646'
+- '16648'
+- '16649'
+- '16650'
+- '16652'
+- '16656'
+- '16657'
+- '16658'
+- '16660'
+- '16664'
+- '16672'
+- '16673'
+- '16674'
+- '16676'
+- '16680'
+- '16688'
+- '16704'
+- '16705'
+- '16706'
+- '16708'
+- '16712'
+- '16720'
+- '16736'
+- '16768'
+- '16769'
+- '16770'
+- '16772'
+- '16776'
+- '16784'
+- '16800'
+- '16832'
+- '16896'
+- '16897'
+- '16898'
+- '16899'
+- '16900'
+- '16901'
+- '16902'
+- '16904'
+- '16905'
+- '16906'
+- '16908'
+- '16912'
+- '16913'
+- '16914'
+- '16916'
+- '16920'
+- '16928'
+- '16929'
+- '16930'
+- '16932'
+- '16936'
+- '16944'
+- '16960'
+- '16961'
+- '16962'
+- '16964'
+- '16968'
+- '16976'
+- '16992'
+- '17024'
+- '17025'
+- '17026'
+- '17028'
+- '17032'
+- '17040'
+- '17056'
+- '17088'
+- '17152'
+- '17153'
+- '17154'
+- '17156'
+- '17160'
+- '17168'
+- '17184'
+- '17216'
+- '17280'
+- '17408'
+- '17409'
+- '17410'
+- '17411'
+- '17412'
+- '17413'
+- '17414'
+- '17416'
+- '17417'
+- '17418'
+- '17420'
+- '17424'
+- '17425'
+- '17426'
+- '17428'
+- '17432'
+- '17440'
+- '17441'
+- '17442'
+- '17444'
+- '17448'
+- '17456'
+- '17472'
+- '17473'
+- '17474'
+- '17476'
+- '17480'
+- '17488'
+- '17504'
+- '17536'
+- '17537'
+- '17538'
+- '17540'
+- '17544'
+- '17552'
+- '17568'
+- '17600'
+- '17664'
+- '17665'
+- '17666'
+- '17668'
+- '17672'
+- '17680'
+- '17696'
+- '17728'
+- '17792'
+- '17920'
+- '17921'
+- '17922'
+- '17924'
+- '17928'
+- '17936'
+- '17952'
+- '17984'
+- '18048'
+- '18176'
+- '18432'
+- '18433'
+- '18434'
+- '18435'
+- '18436'
+- '18437'
+- '18438'
+- '18440'
+- '18441'
+- '18442'
+- '18444'
+- '18448'
+- '18449'
+- '18450'
+- '18452'
+- '18456'
+- '18464'
+- '18465'
+- '18466'
+- '18468'
+- '18472'
+- '18480'
+- '18496'
+- '18497'
+- '18498'
+- '18500'
+- '18504'
+- '18512'
+- '18528'
+- '18560'
+- '18561'
+- '18562'
+- '18564'
+- '18568'
+- '18576'
+- '18592'
+- '18624'
+- '18688'
+- '18689'
+- '18690'
+- '18692'
+- '18696'
+- '18704'
+- '18720'
+- '18752'
+- '18816'
+- '18944'
+- '18945'
+- '18946'
+- '18948'
+- '18952'
+- '18960'
+- '18976'
+- '19008'
+- '19072'
+- '19200'
+- '19456'
+- '19457'
+- '19458'
+- '19460'
+- '19464'
+- '19472'
+- '19488'
+- '19520'
+- '19584'
+- '19712'
+- '19968'
+- '20480'
+- '20481'
+- '20482'
+- '20483'
+- '20484'
+- '20485'
+- '20486'
+- '20488'
+- '20489'
+- '20490'
+- '20492'
+- '20496'
+- '20497'
+- '20498'
+- '20500'
+- '20504'
+- '20512'
+- '20513'
+- '20514'
+- '20516'
+- '20520'
+- '20528'
+- '20544'
+- '20545'
+- '20546'
+- '20548'
+- '20552'
+- '20560'
+- '20576'
+- '20608'
+- '20609'
+- '20610'
+- '20612'
+- '20616'
+- '20624'
+- '20640'
+- '20672'
+- '20736'
+- '20737'
+- '20738'
+- '20740'
+- '20744'
+- '20752'
+- '20768'
+- '20800'
+- '20864'
+- '20992'
+- '20993'
+- '20994'
+- '20996'
+- '21000'
+- '21008'
+- '21024'
+- '21056'
+- '21120'
+- '21248'
+- '21504'
+- '21505'
+- '21506'
+- '21508'
+- '21512'
+- '21520'
+- '21536'
+- '21568'
+- '21632'
+- '21760'
+- '22016'
+- '22528'
+- '22529'
+- '22530'
+- '22532'
+- '22536'
+- '22544'
+- '22560'
+- '22592'
+- '22656'
+- '22784'
+- '23040'
+- '23552'
+- '24576'
+- '24577'
+- '24578'
+- '24579'
+- '24580'
+- '24581'
+- '24582'
+- '24584'
+- '24585'
+- '24586'
+- '24588'
+- '24592'
+- '24593'
+- '24594'
+- '24596'
+- '24600'
+- '24608'
+- '24609'
+- '24610'
+- '24612'
+- '24616'
+- '24624'
+- '24640'
+- '24641'
+- '24642'
+- '24644'
+- '24648'
+- '24656'
+- '24672'
+- '24704'
+- '24705'
+- '24706'
+- '24708'
+- '24712'
+- '24720'
+- '24736'
+- '24768'
+- '24832'
+- '24833'
+- '24834'
+- '24836'
+- '24840'
+- '24848'
+- '24864'
+- '24896'
+- '24960'
+- '25088'
+- '25089'
+- '25090'
+- '25092'
+- '25096'
+- '25104'
+- '25120'
+- '25152'
+- '25216'
+- '25344'
+- '25600'
+- '25601'
+- '25602'
+- '25604'
+- '25608'
+- '25616'
+- '25632'
+- '25664'
+- '25728'
+- '25856'
+- '26112'
+- '26624'
+- '26625'
+- '26626'
+- '26628'
+- '26632'
+- '26640'
+- '26656'
+- '26688'
+- '26752'
+- '26880'
+- '27136'
+- '27648'
+- '28672'
+- '28673'
+- '28674'
+- '28676'
+- '28680'
+- '28688'
+- '28704'
+- '28736'
+- '28800'
+- '28928'
+- '29184'
+- '29696'
+- '30720'
+- '32768'
+- '32769'
+- '32770'
+- '32771'
+- '32772'
+- '32773'
+- '32774'
+- '32775'
+- '32776'
+- '32777'
+- '32778'
+- '32779'
+- '32780'
+- '32781'
+- '32782'
+- '32784'
+- '32785'
+- '32786'
+- '32787'
+- '32788'
+- '32789'
+- '32790'
+- '32792'
+- '32793'
+- '32794'
+- '32796'
+- '32800'
+- '32801'
+- '32802'
+- '32803'
+- '32804'
+- '32805'
+- '32806'
+- '32808'
+- '32809'
+- '32810'
+- '32812'
+- '32816'
+- '32817'
+- '32818'
+- '32820'
+- '32824'
+- '32832'
+- '32833'
+- '32834'
+- '32835'
+- '32836'
+- '32837'
+- '32838'
+- '32840'
+- '32841'
+- '32842'
+- '32844'
+- '32848'
+- '32849'
+- '32850'
+- '32852'
+- '32856'
+- '32864'
+- '32865'
+- '32866'
+- '32868'
+- '32872'
+- '32880'
+- '32896'
+- '32897'
+- '32898'
+- '32899'
+- '32900'
+- '32901'
+- '32902'
+- '32904'
+- '32905'
+- '32906'
+- '32908'
+- '32912'
+- '32913'
+- '32914'
+- '32916'
+- '32920'
+- '32928'
+- '32929'
+- '32930'
+- '32932'
+- '32936'
+- '32944'
+- '32960'
+- '32961'
+- '32962'
+- '32964'
+- '32968'
+- '32976'
+- '32992'
+- '33024'
+- '33025'
+- '33026'
+- '33027'
+- '33028'
+- '33029'
+- '33030'
+- '33032'
+- '33033'
+- '33034'
+- '33036'
+- '33040'
+- '33041'
+- '33042'
+- '33044'
+- '33048'
+- '33056'
+- '33057'
+- '33058'
+- '33060'
+- '33064'
+- '33072'
+- '33088'
+- '33089'
+- '33090'
+- '33092'
+- '33096'
+- '33104'
+- '33120'
+- '33152'
+- '33153'
+- '33154'
+- '33156'
+- '33160'
+- '33168'
+- '33184'
+- '33216'
+- '33280'
+- '33281'
+- '33282'
+- '33283'
+- '33284'
+- '33285'
+- '33286'
+- '33288'
+- '33289'
+- '33290'
+- '33292'
+- '33296'
+- '33297'
+- '33298'
+- '33300'
+- '33304'
+- '33312'
+- '33313'
+- '33314'
+- '33316'
+- '33320'
+- '33328'
+- '33344'
+- '33345'
+- '33346'
+- '33348'
+- '33352'
+- '33360'
+- '33376'
+- '33408'
+- '33409'
+- '33410'
+- '33412'
+- '33416'
+- '33424'
+- '33440'
+- '33472'
+- '33536'
+- '33537'
+- '33538'
+- '33540'
+- '33544'
+- '33552'
+- '33568'
+- '33600'
+- '33664'
+- '33792'
+- '33793'
+- '33794'
+- '33795'
+- '33796'
+- '33797'
+- '33798'
+- '33800'
+- '33801'
+- '33802'
+- '33804'
+- '33808'
+- '33809'
+- '33810'
+- '33812'
+- '33816'
+- '33824'
+- '33825'
+- '33826'
+- '33828'
+- '33832'
+- '33840'
+- '33856'
+- '33857'
+- '33858'
+- '33860'
+- '33864'
+- '33872'
+- '33888'
+- '33920'
+- '33921'
+- '33922'
+- '33924'
+- '33928'
+- '33936'
+- '33952'
+- '33984'
+- '34048'
+- '34049'
+- '34050'
+- '34052'
+- '34056'
+- '34064'
+- '34080'
+- '34112'
+- '34176'
+- '34304'
+- '34305'
+- '34306'
+- '34308'
+- '34312'
+- '34320'
+- '34336'
+- '34368'
+- '34432'
+- '34560'
+- '34816'
+- '34817'
+- '34818'
+- '34819'
+- '34820'
+- '34821'
+- '34822'
+- '34824'
+- '34825'
+- '34826'
+- '34828'
+- '34832'
+- '34833'
+- '34834'
+- '34836'
+- '34840'
+- '34848'
+- '34849'
+- '34850'
+- '34852'
+- '34856'
+- '34864'
+- '34880'
+- '34881'
+- '34882'
+- '34884'
+- '34888'
+- '34896'
+- '34912'
+- '34944'
+- '34945'
+- '34946'
+- '34948'
+- '34952'
+- '34960'
+- '34976'
+- '35008'
+- '35072'
+- '35073'
+- '35074'
+- '35076'
+- '35080'
+- '35088'
+- '35104'
+- '35136'
+- '35200'
+- '35328'
+- '35329'
+- '35330'
+- '35332'
+- '35336'
+- '35344'
+- '35360'
+- '35392'
+- '35456'
+- '35584'
+- '35840'
+- '35841'
+- '35842'
+- '35844'
+- '35848'
+- '35856'
+- '35872'
+- '35904'
+- '35968'
+- '36096'
+- '36352'
+- '36864'
+- '36865'
+- '36866'
+- '36867'
+- '36868'
+- '36869'
+- '36870'
+- '36872'
+- '36873'
+- '36874'
+- '36876'
+- '36880'
+- '36881'
+- '36882'
+- '36884'
+- '36888'
+- '36896'
+- '36897'
+- '36898'
+- '36900'
+- '36904'
+- '36912'
+- '36928'
+- '36929'
+- '36930'
+- '36932'
+- '36936'
+- '36944'
+- '36960'
+- '36992'
+- '36993'
+- '36994'
+- '36996'
+- '37000'
+- '37008'
+- '37024'
+- '37056'
+- '37120'
+- '37121'
+- '37122'
+- '37124'
+- '37128'
+- '37136'
+- '37152'
+- '37184'
+- '37248'
+- '37376'
+- '37377'
+- '37378'
+- '37380'
+- '37384'
+- '37392'
+- '37408'
+- '37440'
+- '37504'
+- '37632'
+- '37888'
+- '37889'
+- '37890'
+- '37892'
+- '37896'
+- '37904'
+- '37920'
+- '37952'
+- '38016'
+- '38144'
+- '38400'
+- '38912'
+- '38913'
+- '38914'
+- '38916'
+- '38920'
+- '38928'
+- '38944'
+- '38976'
+- '39040'
+- '39168'
+- '39424'
+- '39936'
+- '40960'
+- '40961'
+- '40962'
+- '40963'
+- '40964'
+- '40965'
+- '40966'
+- '40968'
+- '40969'
+- '40970'
+- '40972'
+- '40976'
+- '40977'
+- '40978'
+- '40980'
+- '40984'
+- '40992'
+- '40993'
+- '40994'
+- '40996'
+- '41000'
+- '41008'
+- '41024'
+- '41025'
+- '41026'
+- '41028'
+- '41032'
+- '41040'
+- '41056'
+- '41088'
+- '41089'
+- '41090'
+- '41092'
+- '41096'
+- '41104'
+- '41120'
+- '41152'
+- '41216'
+- '41217'
+- '41218'
+- '41220'
+- '41224'
+- '41232'
+- '41248'
+- '41280'
+- '41344'
+- '41472'
+- '41473'
+- '41474'
+- '41476'
+- '41480'
+- '41488'
+- '41504'
+- '41536'
+- '41600'
+- '41728'
+- '41984'
+- '41985'
+- '41986'
+- '41988'
+- '41992'
+- '42000'
+- '42016'
+- '42048'
+- '42112'
+- '42240'
+- '42496'
+- '43008'
+- '43009'
+- '43010'
+- '43012'
+- '43016'
+- '43024'
+- '43040'
+- '43072'
+- '43136'
+- '43264'
+- '43520'
+- '44032'
+- '45056'
+- '45057'
+- '45058'
+- '45060'
+- '45064'
+- '45072'
+- '45088'
+- '45120'
+- '45184'
+- '45312'
+- '45568'
+- '46080'
+- '47104'
+- '49152'
+- '49153'
+- '49154'
+- '49155'
+- '49156'
+- '49157'
+- '49158'
+- '49160'
+- '49161'
+- '49162'
+- '49164'
+- '49168'
+- '49169'
+- '49170'
+- '49172'
+- '49176'
+- '49184'
+- '49185'
+- '49186'
+- '49188'
+- '49192'
+- '49200'
+- '49216'
+- '49217'
+- '49218'
+- '49220'
+- '49224'
+- '49232'
+- '49248'
+- '49280'
+- '49281'
+- '49282'
+- '49284'
+- '49288'
+- '49296'
+- '49312'
+- '49344'
+- '49408'
+- '49409'
+- '49410'
+- '49412'
+- '49416'
+- '49424'
+- '49440'
+- '49472'
+- '49536'
+- '49664'
+- '49665'
+- '49666'
+- '49668'
+- '49672'
+- '49680'
+- '49696'
+- '49728'
+- '49792'
+- '49920'
+- '50176'
+- '50177'
+- '50178'
+- '50180'
+- '50184'
+- '50192'
+- '50208'
+- '50240'
+- '50304'
+- '50432'
+- '50688'
+- '51200'
+- '51201'
+- '51202'
+- '51204'
+- '51208'
+- '51216'
+- '51232'
+- '51264'
+- '51328'
+- '51456'
+- '51712'
+- '52224'
+- '53248'
+- '53249'
+- '53250'
+- '53252'
+- '53256'
+- '53264'
+- '53280'
+- '53312'
+- '53376'
+- '53504'
+- '53760'
+- '54272'
+- '55296'
+- '57344'
+- '57345'
+- '57346'
+- '57348'
+- '57352'
+- '57360'
+- '57376'
+- '57408'
+- '57472'
+- '57600'
+- '57856'
+- '58368'
+- '59392'
+- '61440'
+init: null
+input_size: null
+cmvn_file: null
+ctc_conf:
+ dropout_rate: 0.0
+ ctc_type: builtin
+ reduce: true
+ ignore_nan_grad: true
+joint_net_conf: null
+use_preprocessor: true
+token_type: char
+bpemodel: null
+non_linguistic_symbols: null
+cleaner: null
+g2p: null
+speech_volume_normalize: null
+rir_scp: null
+rir_apply_prob: 1.0
+noise_scp: null
+noise_apply_prob: 1.0
+noise_db_range: '13_15'
+specaug: null
+specaug_conf: {}
+normalize: null
+normalize_conf: {}
+label_aggregator: null
+label_aggregator_conf: {}
+model: sond
+model_conf:
+ lsm_weight: 0.1
+ length_normalized_loss: true
+ max_spk_num: 16
+ normalize_speech_speaker: true
+# speech encoder
+encoder: resnet34_sp_l2reg
+encoder_conf:
+ # pass by model, equal to feature dim
+ # input_size: 80
+ pooling_type: "window_shift"
+ batchnorm_momentum: 0.01
+ pool_size: 20
+ stride: 1
+ tf2torch_tensor_name_prefix_torch: encoder
+ tf2torch_tensor_name_prefix_tf: EAND/speech_encoder
+speaker_encoder: null
+speaker_encoder_conf: {}
+ci_scorer: conv
+ci_scorer_conf:
+ input_units: 512
+ num_layers: 3
+ num_units: 512
+ kernel_size: 1
+ dropout_rate: 0.0
+ position_encoder: null
+ out_units: 1
+ out_norm: false
+ auxiliary_states: false
+ tf2torch_tensor_name_prefix_torch: ci_scorer
+ tf2torch_tensor_name_prefix_tf: EAND/compute_distance_layer/ci_scorer
+cd_scorer: san
+cd_scorer_conf:
+ input_size: 512
+ output_size: 512
+ out_units: 1
+ attention_heads: 4
+ linear_units: 1024
+ num_blocks: 4
+ dropout_rate: 0.0
+ positional_dropout_rate: 0.0
+ attention_dropout_rate: 0.0
+ # use string "null" to remove input layer
+ input_layer: "null"
+ pos_enc_class: null
+ normalize_before: true
+ tf2torch_tensor_name_prefix_torch: cd_scorer
+ tf2torch_tensor_name_prefix_tf: EAND/compute_distance_layer/cd_scorer
+# post net
+decoder: fsmn
+decoder_conf:
+ in_units: 32
+ out_units: 2517
+ filter_size: 31
+ fsmn_num_layers: 6
+ dnn_num_layers: 1
+ num_memory_units: 16
+ ffn_inner_dim: 512
+ dropout_rate: 0.0
+ tf2torch_tensor_name_prefix_torch: decoder
+ tf2torch_tensor_name_prefix_tf: EAND/post_net
+frontend: wav_frontend
+frontend_conf:
+ fs: 8000
+ window: povey
+ n_mels: 80
+ frame_length: 25
+ frame_shift: 10
+ filter_length_min: -1
+ filter_length_max: -1
+ lfr_m: 1
+ lfr_n: 1
+ dither: 0.0
+ snip_edges: false
+ upsacle_samples: false
+num_worker_count: 1
+required:
+- output_dir
+- token_list
+oss_bucket: 'null'
+version: 0.1.4
diff --git a/egs/callhome/diarization/sond/sond_fbank.yaml b/egs/callhome/diarization/sond/sond_fbank.yaml
new file mode 100644
index 000000000..fc76259f4
--- /dev/null
+++ b/egs/callhome/diarization/sond/sond_fbank.yaml
@@ -0,0 +1,2739 @@
+config: finetune.yaml
+print_config: false
+log_level: INFO
+dry_run: false
+iterator_type: sequence
+output_dir: exp/sond
+ngpu: 1
+seed: 0
+num_workers: 16
+num_att_plot: 0
+dist_backend: nccl
+dist_init_method: env://
+dist_world_size: null
+dist_rank: null
+local_rank: 0
+dist_master_addr: null
+dist_master_port: null
+dist_launcher: null
+multiprocessing_distributed: true
+distributed: false
+unused_parameters: true
+sharded_ddp: false
+ddp_backend: pytorch_ddp
+cudnn_enabled: true
+cudnn_benchmark: false
+cudnn_deterministic: true
+collect_stats: false
+write_collected_feats: false
+max_epoch: 50
+patience: null
+val_scheduler_criterion:
+- valid
+- acc
+early_stopping_criterion:
+- valid
+- loss
+- min
+best_model_criterion:
+- - valid
+ - acc
+ - max
+keep_nbest_models: 10
+nbest_averaging_interval: 0
+grad_clip: 5
+grad_clip_type: 2.0
+grad_noise: false
+accum_grad: 1
+no_forward_run: false
+resume: true
+train_dtype: float32
+use_amp: false
+log_interval: 50
+use_matplotlib: false
+use_tensorboard: true
+use_wandb: false
+wandb_project: null
+wandb_id: null
+wandb_entity: null
+wandb_name: null
+wandb_model_log_interval: -1
+use_pai: true
+detect_anomaly: false
+pretrain_path: null
+init_param: []
+ignore_init_mismatch: false
+freeze_param: []
+num_iters_per_epoch: null
+batch_size: 20
+valid_batch_size: null
+batch_bins: 10000
+valid_batch_bins: null
+train_shape_file:
+- /data/volume1/youyan/aishell/ark/train/speech_shape.1
+- /data/volume1/youyan/aishell/ark/train/text_shape.1
+valid_shape_file:
+- /data/volume1/youyan/aishell/ark/dev/speech_shape.1
+- /data/volume1/youyan/aishell/ark/dev/text_shape.1
+batch_type: length
+valid_batch_type: null
+fold_length:
+- 512
+- 150
+sort_in_batch: descending
+sort_batch: descending
+multiple_iterator: false
+chunk_length: 500
+chunk_shift_ratio: 0.5
+num_cache_chunks: 1024
+train_data_path_and_name_and_type:
+- - /data/volume1/youyan/aishell/ark/train/data.scp
+ - speech
+ - kaldi_ark
+- - /data/volume1/youyan/aishell/ark/train/data.text.1
+ - text
+ - text
+valid_data_path_and_name_and_type:
+- - /data/volume1/youyan/aishell/ark/dev/data.scp
+ - speech
+ - kaldi_ark
+- - /data/volume1/youyan/aishell/ark/dev/data.text.1
+ - text
+ - text
+allow_variable_data_keys: false
+max_cache_size: 0.0
+max_cache_fd: 32
+valid_max_cache_size: null
+optim: adam
+optim_conf:
+ lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 30000
+token_list:
+- '0'
+- '1'
+- '2'
+- '3'
+- '4'
+- '5'
+- '6'
+- '7'
+- '8'
+- '9'
+- '10'
+- '11'
+- '12'
+- '13'
+- '14'
+- '15'
+- '16'
+- '17'
+- '18'
+- '19'
+- '20'
+- '21'
+- '22'
+- '23'
+- '24'
+- '25'
+- '26'
+- '27'
+- '28'
+- '29'
+- '30'
+- '32'
+- '33'
+- '34'
+- '35'
+- '36'
+- '37'
+- '38'
+- '39'
+- '40'
+- '41'
+- '42'
+- '43'
+- '44'
+- '45'
+- '46'
+- '48'
+- '49'
+- '50'
+- '51'
+- '52'
+- '53'
+- '54'
+- '56'
+- '57'
+- '58'
+- '60'
+- '64'
+- '65'
+- '66'
+- '67'
+- '68'
+- '69'
+- '70'
+- '71'
+- '72'
+- '73'
+- '74'
+- '75'
+- '76'
+- '77'
+- '78'
+- '80'
+- '81'
+- '82'
+- '83'
+- '84'
+- '85'
+- '86'
+- '88'
+- '89'
+- '90'
+- '92'
+- '96'
+- '97'
+- '98'
+- '99'
+- '100'
+- '101'
+- '102'
+- '104'
+- '105'
+- '106'
+- '108'
+- '112'
+- '113'
+- '114'
+- '116'
+- '120'
+- '128'
+- '129'
+- '130'
+- '131'
+- '132'
+- '133'
+- '134'
+- '135'
+- '136'
+- '137'
+- '138'
+- '139'
+- '140'
+- '141'
+- '142'
+- '144'
+- '145'
+- '146'
+- '147'
+- '148'
+- '149'
+- '150'
+- '152'
+- '153'
+- '154'
+- '156'
+- '160'
+- '161'
+- '162'
+- '163'
+- '164'
+- '165'
+- '166'
+- '168'
+- '169'
+- '170'
+- '172'
+- '176'
+- '177'
+- '178'
+- '180'
+- '184'
+- '192'
+- '193'
+- '194'
+- '195'
+- '196'
+- '197'
+- '198'
+- '200'
+- '201'
+- '202'
+- '204'
+- '208'
+- '209'
+- '210'
+- '212'
+- '216'
+- '224'
+- '225'
+- '226'
+- '228'
+- '232'
+- '240'
+- '256'
+- '257'
+- '258'
+- '259'
+- '260'
+- '261'
+- '262'
+- '263'
+- '264'
+- '265'
+- '266'
+- '267'
+- '268'
+- '269'
+- '270'
+- '272'
+- '273'
+- '274'
+- '275'
+- '276'
+- '277'
+- '278'
+- '280'
+- '281'
+- '282'
+- '284'
+- '288'
+- '289'
+- '290'
+- '291'
+- '292'
+- '293'
+- '294'
+- '296'
+- '297'
+- '298'
+- '300'
+- '304'
+- '305'
+- '306'
+- '308'
+- '312'
+- '320'
+- '321'
+- '322'
+- '323'
+- '324'
+- '325'
+- '326'
+- '328'
+- '329'
+- '330'
+- '332'
+- '336'
+- '337'
+- '338'
+- '340'
+- '344'
+- '352'
+- '353'
+- '354'
+- '356'
+- '360'
+- '368'
+- '384'
+- '385'
+- '386'
+- '387'
+- '388'
+- '389'
+- '390'
+- '392'
+- '393'
+- '394'
+- '396'
+- '400'
+- '401'
+- '402'
+- '404'
+- '408'
+- '416'
+- '417'
+- '418'
+- '420'
+- '424'
+- '432'
+- '448'
+- '449'
+- '450'
+- '452'
+- '456'
+- '464'
+- '480'
+- '512'
+- '513'
+- '514'
+- '515'
+- '516'
+- '517'
+- '518'
+- '519'
+- '520'
+- '521'
+- '522'
+- '523'
+- '524'
+- '525'
+- '526'
+- '528'
+- '529'
+- '530'
+- '531'
+- '532'
+- '533'
+- '534'
+- '536'
+- '537'
+- '538'
+- '540'
+- '544'
+- '545'
+- '546'
+- '547'
+- '548'
+- '549'
+- '550'
+- '552'
+- '553'
+- '554'
+- '556'
+- '560'
+- '561'
+- '562'
+- '564'
+- '568'
+- '576'
+- '577'
+- '578'
+- '579'
+- '580'
+- '581'
+- '582'
+- '584'
+- '585'
+- '586'
+- '588'
+- '592'
+- '593'
+- '594'
+- '596'
+- '600'
+- '608'
+- '609'
+- '610'
+- '612'
+- '616'
+- '624'
+- '640'
+- '641'
+- '642'
+- '643'
+- '644'
+- '645'
+- '646'
+- '648'
+- '649'
+- '650'
+- '652'
+- '656'
+- '657'
+- '658'
+- '660'
+- '664'
+- '672'
+- '673'
+- '674'
+- '676'
+- '680'
+- '688'
+- '704'
+- '705'
+- '706'
+- '708'
+- '712'
+- '720'
+- '736'
+- '768'
+- '769'
+- '770'
+- '771'
+- '772'
+- '773'
+- '774'
+- '776'
+- '777'
+- '778'
+- '780'
+- '784'
+- '785'
+- '786'
+- '788'
+- '792'
+- '800'
+- '801'
+- '802'
+- '804'
+- '808'
+- '816'
+- '832'
+- '833'
+- '834'
+- '836'
+- '840'
+- '848'
+- '864'
+- '896'
+- '897'
+- '898'
+- '900'
+- '904'
+- '912'
+- '928'
+- '960'
+- '1024'
+- '1025'
+- '1026'
+- '1027'
+- '1028'
+- '1029'
+- '1030'
+- '1031'
+- '1032'
+- '1033'
+- '1034'
+- '1035'
+- '1036'
+- '1037'
+- '1038'
+- '1040'
+- '1041'
+- '1042'
+- '1043'
+- '1044'
+- '1045'
+- '1046'
+- '1048'
+- '1049'
+- '1050'
+- '1052'
+- '1056'
+- '1057'
+- '1058'
+- '1059'
+- '1060'
+- '1061'
+- '1062'
+- '1064'
+- '1065'
+- '1066'
+- '1068'
+- '1072'
+- '1073'
+- '1074'
+- '1076'
+- '1080'
+- '1088'
+- '1089'
+- '1090'
+- '1091'
+- '1092'
+- '1093'
+- '1094'
+- '1096'
+- '1097'
+- '1098'
+- '1100'
+- '1104'
+- '1105'
+- '1106'
+- '1108'
+- '1112'
+- '1120'
+- '1121'
+- '1122'
+- '1124'
+- '1128'
+- '1136'
+- '1152'
+- '1153'
+- '1154'
+- '1155'
+- '1156'
+- '1157'
+- '1158'
+- '1160'
+- '1161'
+- '1162'
+- '1164'
+- '1168'
+- '1169'
+- '1170'
+- '1172'
+- '1176'
+- '1184'
+- '1185'
+- '1186'
+- '1188'
+- '1192'
+- '1200'
+- '1216'
+- '1217'
+- '1218'
+- '1220'
+- '1224'
+- '1232'
+- '1248'
+- '1280'
+- '1281'
+- '1282'
+- '1283'
+- '1284'
+- '1285'
+- '1286'
+- '1288'
+- '1289'
+- '1290'
+- '1292'
+- '1296'
+- '1297'
+- '1298'
+- '1300'
+- '1304'
+- '1312'
+- '1313'
+- '1314'
+- '1316'
+- '1320'
+- '1328'
+- '1344'
+- '1345'
+- '1346'
+- '1348'
+- '1352'
+- '1360'
+- '1376'
+- '1408'
+- '1409'
+- '1410'
+- '1412'
+- '1416'
+- '1424'
+- '1440'
+- '1472'
+- '1536'
+- '1537'
+- '1538'
+- '1539'
+- '1540'
+- '1541'
+- '1542'
+- '1544'
+- '1545'
+- '1546'
+- '1548'
+- '1552'
+- '1553'
+- '1554'
+- '1556'
+- '1560'
+- '1568'
+- '1569'
+- '1570'
+- '1572'
+- '1576'
+- '1584'
+- '1600'
+- '1601'
+- '1602'
+- '1604'
+- '1608'
+- '1616'
+- '1632'
+- '1664'
+- '1665'
+- '1666'
+- '1668'
+- '1672'
+- '1680'
+- '1696'
+- '1728'
+- '1792'
+- '1793'
+- '1794'
+- '1796'
+- '1800'
+- '1808'
+- '1824'
+- '1856'
+- '1920'
+- '2048'
+- '2049'
+- '2050'
+- '2051'
+- '2052'
+- '2053'
+- '2054'
+- '2055'
+- '2056'
+- '2057'
+- '2058'
+- '2059'
+- '2060'
+- '2061'
+- '2062'
+- '2064'
+- '2065'
+- '2066'
+- '2067'
+- '2068'
+- '2069'
+- '2070'
+- '2072'
+- '2073'
+- '2074'
+- '2076'
+- '2080'
+- '2081'
+- '2082'
+- '2083'
+- '2084'
+- '2085'
+- '2086'
+- '2088'
+- '2089'
+- '2090'
+- '2092'
+- '2096'
+- '2097'
+- '2098'
+- '2100'
+- '2104'
+- '2112'
+- '2113'
+- '2114'
+- '2115'
+- '2116'
+- '2117'
+- '2118'
+- '2120'
+- '2121'
+- '2122'
+- '2124'
+- '2128'
+- '2129'
+- '2130'
+- '2132'
+- '2136'
+- '2144'
+- '2145'
+- '2146'
+- '2148'
+- '2152'
+- '2160'
+- '2176'
+- '2177'
+- '2178'
+- '2179'
+- '2180'
+- '2181'
+- '2182'
+- '2184'
+- '2185'
+- '2186'
+- '2188'
+- '2192'
+- '2193'
+- '2194'
+- '2196'
+- '2200'
+- '2208'
+- '2209'
+- '2210'
+- '2212'
+- '2216'
+- '2224'
+- '2240'
+- '2241'
+- '2242'
+- '2244'
+- '2248'
+- '2256'
+- '2272'
+- '2304'
+- '2305'
+- '2306'
+- '2307'
+- '2308'
+- '2309'
+- '2310'
+- '2312'
+- '2313'
+- '2314'
+- '2316'
+- '2320'
+- '2321'
+- '2322'
+- '2324'
+- '2328'
+- '2336'
+- '2337'
+- '2338'
+- '2340'
+- '2344'
+- '2352'
+- '2368'
+- '2369'
+- '2370'
+- '2372'
+- '2376'
+- '2384'
+- '2400'
+- '2432'
+- '2433'
+- '2434'
+- '2436'
+- '2440'
+- '2448'
+- '2464'
+- '2496'
+- '2560'
+- '2561'
+- '2562'
+- '2563'
+- '2564'
+- '2565'
+- '2566'
+- '2568'
+- '2569'
+- '2570'
+- '2572'
+- '2576'
+- '2577'
+- '2578'
+- '2580'
+- '2584'
+- '2592'
+- '2593'
+- '2594'
+- '2596'
+- '2600'
+- '2608'
+- '2624'
+- '2625'
+- '2626'
+- '2628'
+- '2632'
+- '2640'
+- '2656'
+- '2688'
+- '2689'
+- '2690'
+- '2692'
+- '2696'
+- '2704'
+- '2720'
+- '2752'
+- '2816'
+- '2817'
+- '2818'
+- '2820'
+- '2824'
+- '2832'
+- '2848'
+- '2880'
+- '2944'
+- '3072'
+- '3073'
+- '3074'
+- '3075'
+- '3076'
+- '3077'
+- '3078'
+- '3080'
+- '3081'
+- '3082'
+- '3084'
+- '3088'
+- '3089'
+- '3090'
+- '3092'
+- '3096'
+- '3104'
+- '3105'
+- '3106'
+- '3108'
+- '3112'
+- '3120'
+- '3136'
+- '3137'
+- '3138'
+- '3140'
+- '3144'
+- '3152'
+- '3168'
+- '3200'
+- '3201'
+- '3202'
+- '3204'
+- '3208'
+- '3216'
+- '3232'
+- '3264'
+- '3328'
+- '3329'
+- '3330'
+- '3332'
+- '3336'
+- '3344'
+- '3360'
+- '3392'
+- '3456'
+- '3584'
+- '3585'
+- '3586'
+- '3588'
+- '3592'
+- '3600'
+- '3616'
+- '3648'
+- '3712'
+- '3840'
+- '4096'
+- '4097'
+- '4098'
+- '4099'
+- '4100'
+- '4101'
+- '4102'
+- '4103'
+- '4104'
+- '4105'
+- '4106'
+- '4107'
+- '4108'
+- '4109'
+- '4110'
+- '4112'
+- '4113'
+- '4114'
+- '4115'
+- '4116'
+- '4117'
+- '4118'
+- '4120'
+- '4121'
+- '4122'
+- '4124'
+- '4128'
+- '4129'
+- '4130'
+- '4131'
+- '4132'
+- '4133'
+- '4134'
+- '4136'
+- '4137'
+- '4138'
+- '4140'
+- '4144'
+- '4145'
+- '4146'
+- '4148'
+- '4152'
+- '4160'
+- '4161'
+- '4162'
+- '4163'
+- '4164'
+- '4165'
+- '4166'
+- '4168'
+- '4169'
+- '4170'
+- '4172'
+- '4176'
+- '4177'
+- '4178'
+- '4180'
+- '4184'
+- '4192'
+- '4193'
+- '4194'
+- '4196'
+- '4200'
+- '4208'
+- '4224'
+- '4225'
+- '4226'
+- '4227'
+- '4228'
+- '4229'
+- '4230'
+- '4232'
+- '4233'
+- '4234'
+- '4236'
+- '4240'
+- '4241'
+- '4242'
+- '4244'
+- '4248'
+- '4256'
+- '4257'
+- '4258'
+- '4260'
+- '4264'
+- '4272'
+- '4288'
+- '4289'
+- '4290'
+- '4292'
+- '4296'
+- '4304'
+- '4320'
+- '4352'
+- '4353'
+- '4354'
+- '4355'
+- '4356'
+- '4357'
+- '4358'
+- '4360'
+- '4361'
+- '4362'
+- '4364'
+- '4368'
+- '4369'
+- '4370'
+- '4372'
+- '4376'
+- '4384'
+- '4385'
+- '4386'
+- '4388'
+- '4392'
+- '4400'
+- '4416'
+- '4417'
+- '4418'
+- '4420'
+- '4424'
+- '4432'
+- '4448'
+- '4480'
+- '4481'
+- '4482'
+- '4484'
+- '4488'
+- '4496'
+- '4512'
+- '4544'
+- '4608'
+- '4609'
+- '4610'
+- '4611'
+- '4612'
+- '4613'
+- '4614'
+- '4616'
+- '4617'
+- '4618'
+- '4620'
+- '4624'
+- '4625'
+- '4626'
+- '4628'
+- '4632'
+- '4640'
+- '4641'
+- '4642'
+- '4644'
+- '4648'
+- '4656'
+- '4672'
+- '4673'
+- '4674'
+- '4676'
+- '4680'
+- '4688'
+- '4704'
+- '4736'
+- '4737'
+- '4738'
+- '4740'
+- '4744'
+- '4752'
+- '4768'
+- '4800'
+- '4864'
+- '4865'
+- '4866'
+- '4868'
+- '4872'
+- '4880'
+- '4896'
+- '4928'
+- '4992'
+- '5120'
+- '5121'
+- '5122'
+- '5123'
+- '5124'
+- '5125'
+- '5126'
+- '5128'
+- '5129'
+- '5130'
+- '5132'
+- '5136'
+- '5137'
+- '5138'
+- '5140'
+- '5144'
+- '5152'
+- '5153'
+- '5154'
+- '5156'
+- '5160'
+- '5168'
+- '5184'
+- '5185'
+- '5186'
+- '5188'
+- '5192'
+- '5200'
+- '5216'
+- '5248'
+- '5249'
+- '5250'
+- '5252'
+- '5256'
+- '5264'
+- '5280'
+- '5312'
+- '5376'
+- '5377'
+- '5378'
+- '5380'
+- '5384'
+- '5392'
+- '5408'
+- '5440'
+- '5504'
+- '5632'
+- '5633'
+- '5634'
+- '5636'
+- '5640'
+- '5648'
+- '5664'
+- '5696'
+- '5760'
+- '5888'
+- '6144'
+- '6145'
+- '6146'
+- '6147'
+- '6148'
+- '6149'
+- '6150'
+- '6152'
+- '6153'
+- '6154'
+- '6156'
+- '6160'
+- '6161'
+- '6162'
+- '6164'
+- '6168'
+- '6176'
+- '6177'
+- '6178'
+- '6180'
+- '6184'
+- '6192'
+- '6208'
+- '6209'
+- '6210'
+- '6212'
+- '6216'
+- '6224'
+- '6240'
+- '6272'
+- '6273'
+- '6274'
+- '6276'
+- '6280'
+- '6288'
+- '6304'
+- '6336'
+- '6400'
+- '6401'
+- '6402'
+- '6404'
+- '6408'
+- '6416'
+- '6432'
+- '6464'
+- '6528'
+- '6656'
+- '6657'
+- '6658'
+- '6660'
+- '6664'
+- '6672'
+- '6688'
+- '6720'
+- '6784'
+- '6912'
+- '7168'
+- '7169'
+- '7170'
+- '7172'
+- '7176'
+- '7184'
+- '7200'
+- '7232'
+- '7296'
+- '7424'
+- '7680'
+- '8192'
+- '8193'
+- '8194'
+- '8195'
+- '8196'
+- '8197'
+- '8198'
+- '8199'
+- '8200'
+- '8201'
+- '8202'
+- '8203'
+- '8204'
+- '8205'
+- '8206'
+- '8208'
+- '8209'
+- '8210'
+- '8211'
+- '8212'
+- '8213'
+- '8214'
+- '8216'
+- '8217'
+- '8218'
+- '8220'
+- '8224'
+- '8225'
+- '8226'
+- '8227'
+- '8228'
+- '8229'
+- '8230'
+- '8232'
+- '8233'
+- '8234'
+- '8236'
+- '8240'
+- '8241'
+- '8242'
+- '8244'
+- '8248'
+- '8256'
+- '8257'
+- '8258'
+- '8259'
+- '8260'
+- '8261'
+- '8262'
+- '8264'
+- '8265'
+- '8266'
+- '8268'
+- '8272'
+- '8273'
+- '8274'
+- '8276'
+- '8280'
+- '8288'
+- '8289'
+- '8290'
+- '8292'
+- '8296'
+- '8304'
+- '8320'
+- '8321'
+- '8322'
+- '8323'
+- '8324'
+- '8325'
+- '8326'
+- '8328'
+- '8329'
+- '8330'
+- '8332'
+- '8336'
+- '8337'
+- '8338'
+- '8340'
+- '8344'
+- '8352'
+- '8353'
+- '8354'
+- '8356'
+- '8360'
+- '8368'
+- '8384'
+- '8385'
+- '8386'
+- '8388'
+- '8392'
+- '8400'
+- '8416'
+- '8448'
+- '8449'
+- '8450'
+- '8451'
+- '8452'
+- '8453'
+- '8454'
+- '8456'
+- '8457'
+- '8458'
+- '8460'
+- '8464'
+- '8465'
+- '8466'
+- '8468'
+- '8472'
+- '8480'
+- '8481'
+- '8482'
+- '8484'
+- '8488'
+- '8496'
+- '8512'
+- '8513'
+- '8514'
+- '8516'
+- '8520'
+- '8528'
+- '8544'
+- '8576'
+- '8577'
+- '8578'
+- '8580'
+- '8584'
+- '8592'
+- '8608'
+- '8640'
+- '8704'
+- '8705'
+- '8706'
+- '8707'
+- '8708'
+- '8709'
+- '8710'
+- '8712'
+- '8713'
+- '8714'
+- '8716'
+- '8720'
+- '8721'
+- '8722'
+- '8724'
+- '8728'
+- '8736'
+- '8737'
+- '8738'
+- '8740'
+- '8744'
+- '8752'
+- '8768'
+- '8769'
+- '8770'
+- '8772'
+- '8776'
+- '8784'
+- '8800'
+- '8832'
+- '8833'
+- '8834'
+- '8836'
+- '8840'
+- '8848'
+- '8864'
+- '8896'
+- '8960'
+- '8961'
+- '8962'
+- '8964'
+- '8968'
+- '8976'
+- '8992'
+- '9024'
+- '9088'
+- '9216'
+- '9217'
+- '9218'
+- '9219'
+- '9220'
+- '9221'
+- '9222'
+- '9224'
+- '9225'
+- '9226'
+- '9228'
+- '9232'
+- '9233'
+- '9234'
+- '9236'
+- '9240'
+- '9248'
+- '9249'
+- '9250'
+- '9252'
+- '9256'
+- '9264'
+- '9280'
+- '9281'
+- '9282'
+- '9284'
+- '9288'
+- '9296'
+- '9312'
+- '9344'
+- '9345'
+- '9346'
+- '9348'
+- '9352'
+- '9360'
+- '9376'
+- '9408'
+- '9472'
+- '9473'
+- '9474'
+- '9476'
+- '9480'
+- '9488'
+- '9504'
+- '9536'
+- '9600'
+- '9728'
+- '9729'
+- '9730'
+- '9732'
+- '9736'
+- '9744'
+- '9760'
+- '9792'
+- '9856'
+- '9984'
+- '10240'
+- '10241'
+- '10242'
+- '10243'
+- '10244'
+- '10245'
+- '10246'
+- '10248'
+- '10249'
+- '10250'
+- '10252'
+- '10256'
+- '10257'
+- '10258'
+- '10260'
+- '10264'
+- '10272'
+- '10273'
+- '10274'
+- '10276'
+- '10280'
+- '10288'
+- '10304'
+- '10305'
+- '10306'
+- '10308'
+- '10312'
+- '10320'
+- '10336'
+- '10368'
+- '10369'
+- '10370'
+- '10372'
+- '10376'
+- '10384'
+- '10400'
+- '10432'
+- '10496'
+- '10497'
+- '10498'
+- '10500'
+- '10504'
+- '10512'
+- '10528'
+- '10560'
+- '10624'
+- '10752'
+- '10753'
+- '10754'
+- '10756'
+- '10760'
+- '10768'
+- '10784'
+- '10816'
+- '10880'
+- '11008'
+- '11264'
+- '11265'
+- '11266'
+- '11268'
+- '11272'
+- '11280'
+- '11296'
+- '11328'
+- '11392'
+- '11520'
+- '11776'
+- '12288'
+- '12289'
+- '12290'
+- '12291'
+- '12292'
+- '12293'
+- '12294'
+- '12296'
+- '12297'
+- '12298'
+- '12300'
+- '12304'
+- '12305'
+- '12306'
+- '12308'
+- '12312'
+- '12320'
+- '12321'
+- '12322'
+- '12324'
+- '12328'
+- '12336'
+- '12352'
+- '12353'
+- '12354'
+- '12356'
+- '12360'
+- '12368'
+- '12384'
+- '12416'
+- '12417'
+- '12418'
+- '12420'
+- '12424'
+- '12432'
+- '12448'
+- '12480'
+- '12544'
+- '12545'
+- '12546'
+- '12548'
+- '12552'
+- '12560'
+- '12576'
+- '12608'
+- '12672'
+- '12800'
+- '12801'
+- '12802'
+- '12804'
+- '12808'
+- '12816'
+- '12832'
+- '12864'
+- '12928'
+- '13056'
+- '13312'
+- '13313'
+- '13314'
+- '13316'
+- '13320'
+- '13328'
+- '13344'
+- '13376'
+- '13440'
+- '13568'
+- '13824'
+- '14336'
+- '14337'
+- '14338'
+- '14340'
+- '14344'
+- '14352'
+- '14368'
+- '14400'
+- '14464'
+- '14592'
+- '14848'
+- '15360'
+- '16384'
+- '16385'
+- '16386'
+- '16387'
+- '16388'
+- '16389'
+- '16390'
+- '16391'
+- '16392'
+- '16393'
+- '16394'
+- '16395'
+- '16396'
+- '16397'
+- '16398'
+- '16400'
+- '16401'
+- '16402'
+- '16403'
+- '16404'
+- '16405'
+- '16406'
+- '16408'
+- '16409'
+- '16410'
+- '16412'
+- '16416'
+- '16417'
+- '16418'
+- '16419'
+- '16420'
+- '16421'
+- '16422'
+- '16424'
+- '16425'
+- '16426'
+- '16428'
+- '16432'
+- '16433'
+- '16434'
+- '16436'
+- '16440'
+- '16448'
+- '16449'
+- '16450'
+- '16451'
+- '16452'
+- '16453'
+- '16454'
+- '16456'
+- '16457'
+- '16458'
+- '16460'
+- '16464'
+- '16465'
+- '16466'
+- '16468'
+- '16472'
+- '16480'
+- '16481'
+- '16482'
+- '16484'
+- '16488'
+- '16496'
+- '16512'
+- '16513'
+- '16514'
+- '16515'
+- '16516'
+- '16517'
+- '16518'
+- '16520'
+- '16521'
+- '16522'
+- '16524'
+- '16528'
+- '16529'
+- '16530'
+- '16532'
+- '16536'
+- '16544'
+- '16545'
+- '16546'
+- '16548'
+- '16552'
+- '16560'
+- '16576'
+- '16577'
+- '16578'
+- '16580'
+- '16584'
+- '16592'
+- '16608'
+- '16640'
+- '16641'
+- '16642'
+- '16643'
+- '16644'
+- '16645'
+- '16646'
+- '16648'
+- '16649'
+- '16650'
+- '16652'
+- '16656'
+- '16657'
+- '16658'
+- '16660'
+- '16664'
+- '16672'
+- '16673'
+- '16674'
+- '16676'
+- '16680'
+- '16688'
+- '16704'
+- '16705'
+- '16706'
+- '16708'
+- '16712'
+- '16720'
+- '16736'
+- '16768'
+- '16769'
+- '16770'
+- '16772'
+- '16776'
+- '16784'
+- '16800'
+- '16832'
+- '16896'
+- '16897'
+- '16898'
+- '16899'
+- '16900'
+- '16901'
+- '16902'
+- '16904'
+- '16905'
+- '16906'
+- '16908'
+- '16912'
+- '16913'
+- '16914'
+- '16916'
+- '16920'
+- '16928'
+- '16929'
+- '16930'
+- '16932'
+- '16936'
+- '16944'
+- '16960'
+- '16961'
+- '16962'
+- '16964'
+- '16968'
+- '16976'
+- '16992'
+- '17024'
+- '17025'
+- '17026'
+- '17028'
+- '17032'
+- '17040'
+- '17056'
+- '17088'
+- '17152'
+- '17153'
+- '17154'
+- '17156'
+- '17160'
+- '17168'
+- '17184'
+- '17216'
+- '17280'
+- '17408'
+- '17409'
+- '17410'
+- '17411'
+- '17412'
+- '17413'
+- '17414'
+- '17416'
+- '17417'
+- '17418'
+- '17420'
+- '17424'
+- '17425'
+- '17426'
+- '17428'
+- '17432'
+- '17440'
+- '17441'
+- '17442'
+- '17444'
+- '17448'
+- '17456'
+- '17472'
+- '17473'
+- '17474'
+- '17476'
+- '17480'
+- '17488'
+- '17504'
+- '17536'
+- '17537'
+- '17538'
+- '17540'
+- '17544'
+- '17552'
+- '17568'
+- '17600'
+- '17664'
+- '17665'
+- '17666'
+- '17668'
+- '17672'
+- '17680'
+- '17696'
+- '17728'
+- '17792'
+- '17920'
+- '17921'
+- '17922'
+- '17924'
+- '17928'
+- '17936'
+- '17952'
+- '17984'
+- '18048'
+- '18176'
+- '18432'
+- '18433'
+- '18434'
+- '18435'
+- '18436'
+- '18437'
+- '18438'
+- '18440'
+- '18441'
+- '18442'
+- '18444'
+- '18448'
+- '18449'
+- '18450'
+- '18452'
+- '18456'
+- '18464'
+- '18465'
+- '18466'
+- '18468'
+- '18472'
+- '18480'
+- '18496'
+- '18497'
+- '18498'
+- '18500'
+- '18504'
+- '18512'
+- '18528'
+- '18560'
+- '18561'
+- '18562'
+- '18564'
+- '18568'
+- '18576'
+- '18592'
+- '18624'
+- '18688'
+- '18689'
+- '18690'
+- '18692'
+- '18696'
+- '18704'
+- '18720'
+- '18752'
+- '18816'
+- '18944'
+- '18945'
+- '18946'
+- '18948'
+- '18952'
+- '18960'
+- '18976'
+- '19008'
+- '19072'
+- '19200'
+- '19456'
+- '19457'
+- '19458'
+- '19460'
+- '19464'
+- '19472'
+- '19488'
+- '19520'
+- '19584'
+- '19712'
+- '19968'
+- '20480'
+- '20481'
+- '20482'
+- '20483'
+- '20484'
+- '20485'
+- '20486'
+- '20488'
+- '20489'
+- '20490'
+- '20492'
+- '20496'
+- '20497'
+- '20498'
+- '20500'
+- '20504'
+- '20512'
+- '20513'
+- '20514'
+- '20516'
+- '20520'
+- '20528'
+- '20544'
+- '20545'
+- '20546'
+- '20548'
+- '20552'
+- '20560'
+- '20576'
+- '20608'
+- '20609'
+- '20610'
+- '20612'
+- '20616'
+- '20624'
+- '20640'
+- '20672'
+- '20736'
+- '20737'
+- '20738'
+- '20740'
+- '20744'
+- '20752'
+- '20768'
+- '20800'
+- '20864'
+- '20992'
+- '20993'
+- '20994'
+- '20996'
+- '21000'
+- '21008'
+- '21024'
+- '21056'
+- '21120'
+- '21248'
+- '21504'
+- '21505'
+- '21506'
+- '21508'
+- '21512'
+- '21520'
+- '21536'
+- '21568'
+- '21632'
+- '21760'
+- '22016'
+- '22528'
+- '22529'
+- '22530'
+- '22532'
+- '22536'
+- '22544'
+- '22560'
+- '22592'
+- '22656'
+- '22784'
+- '23040'
+- '23552'
+- '24576'
+- '24577'
+- '24578'
+- '24579'
+- '24580'
+- '24581'
+- '24582'
+- '24584'
+- '24585'
+- '24586'
+- '24588'
+- '24592'
+- '24593'
+- '24594'
+- '24596'
+- '24600'
+- '24608'
+- '24609'
+- '24610'
+- '24612'
+- '24616'
+- '24624'
+- '24640'
+- '24641'
+- '24642'
+- '24644'
+- '24648'
+- '24656'
+- '24672'
+- '24704'
+- '24705'
+- '24706'
+- '24708'
+- '24712'
+- '24720'
+- '24736'
+- '24768'
+- '24832'
+- '24833'
+- '24834'
+- '24836'
+- '24840'
+- '24848'
+- '24864'
+- '24896'
+- '24960'
+- '25088'
+- '25089'
+- '25090'
+- '25092'
+- '25096'
+- '25104'
+- '25120'
+- '25152'
+- '25216'
+- '25344'
+- '25600'
+- '25601'
+- '25602'
+- '25604'
+- '25608'
+- '25616'
+- '25632'
+- '25664'
+- '25728'
+- '25856'
+- '26112'
+- '26624'
+- '26625'
+- '26626'
+- '26628'
+- '26632'
+- '26640'
+- '26656'
+- '26688'
+- '26752'
+- '26880'
+- '27136'
+- '27648'
+- '28672'
+- '28673'
+- '28674'
+- '28676'
+- '28680'
+- '28688'
+- '28704'
+- '28736'
+- '28800'
+- '28928'
+- '29184'
+- '29696'
+- '30720'
+- '32768'
+- '32769'
+- '32770'
+- '32771'
+- '32772'
+- '32773'
+- '32774'
+- '32775'
+- '32776'
+- '32777'
+- '32778'
+- '32779'
+- '32780'
+- '32781'
+- '32782'
+- '32784'
+- '32785'
+- '32786'
+- '32787'
+- '32788'
+- '32789'
+- '32790'
+- '32792'
+- '32793'
+- '32794'
+- '32796'
+- '32800'
+- '32801'
+- '32802'
+- '32803'
+- '32804'
+- '32805'
+- '32806'
+- '32808'
+- '32809'
+- '32810'
+- '32812'
+- '32816'
+- '32817'
+- '32818'
+- '32820'
+- '32824'
+- '32832'
+- '32833'
+- '32834'
+- '32835'
+- '32836'
+- '32837'
+- '32838'
+- '32840'
+- '32841'
+- '32842'
+- '32844'
+- '32848'
+- '32849'
+- '32850'
+- '32852'
+- '32856'
+- '32864'
+- '32865'
+- '32866'
+- '32868'
+- '32872'
+- '32880'
+- '32896'
+- '32897'
+- '32898'
+- '32899'
+- '32900'
+- '32901'
+- '32902'
+- '32904'
+- '32905'
+- '32906'
+- '32908'
+- '32912'
+- '32913'
+- '32914'
+- '32916'
+- '32920'
+- '32928'
+- '32929'
+- '32930'
+- '32932'
+- '32936'
+- '32944'
+- '32960'
+- '32961'
+- '32962'
+- '32964'
+- '32968'
+- '32976'
+- '32992'
+- '33024'
+- '33025'
+- '33026'
+- '33027'
+- '33028'
+- '33029'
+- '33030'
+- '33032'
+- '33033'
+- '33034'
+- '33036'
+- '33040'
+- '33041'
+- '33042'
+- '33044'
+- '33048'
+- '33056'
+- '33057'
+- '33058'
+- '33060'
+- '33064'
+- '33072'
+- '33088'
+- '33089'
+- '33090'
+- '33092'
+- '33096'
+- '33104'
+- '33120'
+- '33152'
+- '33153'
+- '33154'
+- '33156'
+- '33160'
+- '33168'
+- '33184'
+- '33216'
+- '33280'
+- '33281'
+- '33282'
+- '33283'
+- '33284'
+- '33285'
+- '33286'
+- '33288'
+- '33289'
+- '33290'
+- '33292'
+- '33296'
+- '33297'
+- '33298'
+- '33300'
+- '33304'
+- '33312'
+- '33313'
+- '33314'
+- '33316'
+- '33320'
+- '33328'
+- '33344'
+- '33345'
+- '33346'
+- '33348'
+- '33352'
+- '33360'
+- '33376'
+- '33408'
+- '33409'
+- '33410'
+- '33412'
+- '33416'
+- '33424'
+- '33440'
+- '33472'
+- '33536'
+- '33537'
+- '33538'
+- '33540'
+- '33544'
+- '33552'
+- '33568'
+- '33600'
+- '33664'
+- '33792'
+- '33793'
+- '33794'
+- '33795'
+- '33796'
+- '33797'
+- '33798'
+- '33800'
+- '33801'
+- '33802'
+- '33804'
+- '33808'
+- '33809'
+- '33810'
+- '33812'
+- '33816'
+- '33824'
+- '33825'
+- '33826'
+- '33828'
+- '33832'
+- '33840'
+- '33856'
+- '33857'
+- '33858'
+- '33860'
+- '33864'
+- '33872'
+- '33888'
+- '33920'
+- '33921'
+- '33922'
+- '33924'
+- '33928'
+- '33936'
+- '33952'
+- '33984'
+- '34048'
+- '34049'
+- '34050'
+- '34052'
+- '34056'
+- '34064'
+- '34080'
+- '34112'
+- '34176'
+- '34304'
+- '34305'
+- '34306'
+- '34308'
+- '34312'
+- '34320'
+- '34336'
+- '34368'
+- '34432'
+- '34560'
+- '34816'
+- '34817'
+- '34818'
+- '34819'
+- '34820'
+- '34821'
+- '34822'
+- '34824'
+- '34825'
+- '34826'
+- '34828'
+- '34832'
+- '34833'
+- '34834'
+- '34836'
+- '34840'
+- '34848'
+- '34849'
+- '34850'
+- '34852'
+- '34856'
+- '34864'
+- '34880'
+- '34881'
+- '34882'
+- '34884'
+- '34888'
+- '34896'
+- '34912'
+- '34944'
+- '34945'
+- '34946'
+- '34948'
+- '34952'
+- '34960'
+- '34976'
+- '35008'
+- '35072'
+- '35073'
+- '35074'
+- '35076'
+- '35080'
+- '35088'
+- '35104'
+- '35136'
+- '35200'
+- '35328'
+- '35329'
+- '35330'
+- '35332'
+- '35336'
+- '35344'
+- '35360'
+- '35392'
+- '35456'
+- '35584'
+- '35840'
+- '35841'
+- '35842'
+- '35844'
+- '35848'
+- '35856'
+- '35872'
+- '35904'
+- '35968'
+- '36096'
+- '36352'
+- '36864'
+- '36865'
+- '36866'
+- '36867'
+- '36868'
+- '36869'
+- '36870'
+- '36872'
+- '36873'
+- '36874'
+- '36876'
+- '36880'
+- '36881'
+- '36882'
+- '36884'
+- '36888'
+- '36896'
+- '36897'
+- '36898'
+- '36900'
+- '36904'
+- '36912'
+- '36928'
+- '36929'
+- '36930'
+- '36932'
+- '36936'
+- '36944'
+- '36960'
+- '36992'
+- '36993'
+- '36994'
+- '36996'
+- '37000'
+- '37008'
+- '37024'
+- '37056'
+- '37120'
+- '37121'
+- '37122'
+- '37124'
+- '37128'
+- '37136'
+- '37152'
+- '37184'
+- '37248'
+- '37376'
+- '37377'
+- '37378'
+- '37380'
+- '37384'
+- '37392'
+- '37408'
+- '37440'
+- '37504'
+- '37632'
+- '37888'
+- '37889'
+- '37890'
+- '37892'
+- '37896'
+- '37904'
+- '37920'
+- '37952'
+- '38016'
+- '38144'
+- '38400'
+- '38912'
+- '38913'
+- '38914'
+- '38916'
+- '38920'
+- '38928'
+- '38944'
+- '38976'
+- '39040'
+- '39168'
+- '39424'
+- '39936'
+- '40960'
+- '40961'
+- '40962'
+- '40963'
+- '40964'
+- '40965'
+- '40966'
+- '40968'
+- '40969'
+- '40970'
+- '40972'
+- '40976'
+- '40977'
+- '40978'
+- '40980'
+- '40984'
+- '40992'
+- '40993'
+- '40994'
+- '40996'
+- '41000'
+- '41008'
+- '41024'
+- '41025'
+- '41026'
+- '41028'
+- '41032'
+- '41040'
+- '41056'
+- '41088'
+- '41089'
+- '41090'
+- '41092'
+- '41096'
+- '41104'
+- '41120'
+- '41152'
+- '41216'
+- '41217'
+- '41218'
+- '41220'
+- '41224'
+- '41232'
+- '41248'
+- '41280'
+- '41344'
+- '41472'
+- '41473'
+- '41474'
+- '41476'
+- '41480'
+- '41488'
+- '41504'
+- '41536'
+- '41600'
+- '41728'
+- '41984'
+- '41985'
+- '41986'
+- '41988'
+- '41992'
+- '42000'
+- '42016'
+- '42048'
+- '42112'
+- '42240'
+- '42496'
+- '43008'
+- '43009'
+- '43010'
+- '43012'
+- '43016'
+- '43024'
+- '43040'
+- '43072'
+- '43136'
+- '43264'
+- '43520'
+- '44032'
+- '45056'
+- '45057'
+- '45058'
+- '45060'
+- '45064'
+- '45072'
+- '45088'
+- '45120'
+- '45184'
+- '45312'
+- '45568'
+- '46080'
+- '47104'
+- '49152'
+- '49153'
+- '49154'
+- '49155'
+- '49156'
+- '49157'
+- '49158'
+- '49160'
+- '49161'
+- '49162'
+- '49164'
+- '49168'
+- '49169'
+- '49170'
+- '49172'
+- '49176'
+- '49184'
+- '49185'
+- '49186'
+- '49188'
+- '49192'
+- '49200'
+- '49216'
+- '49217'
+- '49218'
+- '49220'
+- '49224'
+- '49232'
+- '49248'
+- '49280'
+- '49281'
+- '49282'
+- '49284'
+- '49288'
+- '49296'
+- '49312'
+- '49344'
+- '49408'
+- '49409'
+- '49410'
+- '49412'
+- '49416'
+- '49424'
+- '49440'
+- '49472'
+- '49536'
+- '49664'
+- '49665'
+- '49666'
+- '49668'
+- '49672'
+- '49680'
+- '49696'
+- '49728'
+- '49792'
+- '49920'
+- '50176'
+- '50177'
+- '50178'
+- '50180'
+- '50184'
+- '50192'
+- '50208'
+- '50240'
+- '50304'
+- '50432'
+- '50688'
+- '51200'
+- '51201'
+- '51202'
+- '51204'
+- '51208'
+- '51216'
+- '51232'
+- '51264'
+- '51328'
+- '51456'
+- '51712'
+- '52224'
+- '53248'
+- '53249'
+- '53250'
+- '53252'
+- '53256'
+- '53264'
+- '53280'
+- '53312'
+- '53376'
+- '53504'
+- '53760'
+- '54272'
+- '55296'
+- '57344'
+- '57345'
+- '57346'
+- '57348'
+- '57352'
+- '57360'
+- '57376'
+- '57408'
+- '57472'
+- '57600'
+- '57856'
+- '58368'
+- '59392'
+- '61440'
+init: null
+input_size: 80
+cmvn_file: null
+ctc_conf:
+ dropout_rate: 0.0
+ ctc_type: builtin
+ reduce: true
+ ignore_nan_grad: true
+joint_net_conf: null
+use_preprocessor: true
+token_type: char
+bpemodel: null
+non_linguistic_symbols: null
+cleaner: null
+g2p: null
+speech_volume_normalize: null
+rir_scp: null
+rir_apply_prob: 1.0
+noise_scp: null
+noise_apply_prob: 1.0
+noise_db_range: '13_15'
+specaug: null
+specaug_conf: {}
+normalize: null
+normalize_conf: {}
+label_aggregator: null
+label_aggregator_conf: {}
+model: sond
+model_conf:
+ lsm_weight: 0.1
+ length_normalized_loss: true
+ max_spk_num: 16
+ normalize_speech_speaker: true
+# speech encoder
+encoder: resnet34_sp_l2reg
+encoder_conf:
+ # pass by model, equal to feature dim
+ # input_size: 80
+ batchnorm_momentum: 0.01
+ pooling_type: "window_shift"
+ pool_size: 20
+ stride: 1
+ tf2torch_tensor_name_prefix_torch: encoder
+ tf2torch_tensor_name_prefix_tf: EAND/speech_encoder
+speaker_encoder: null
+speaker_encoder_conf: {}
+ci_scorer: conv
+ci_scorer_conf:
+ input_units: 512
+ num_layers: 3
+ num_units: 512
+ kernel_size: 1
+ dropout_rate: 0.0
+ position_encoder: null
+ out_units: 1
+ out_norm: false
+ auxiliary_states: false
+ tf2torch_tensor_name_prefix_torch: ci_scorer
+ tf2torch_tensor_name_prefix_tf: EAND/compute_distance_layer/ci_scorer
+cd_scorer: san
+cd_scorer_conf:
+ input_size: 512
+ output_size: 512
+ out_units: 1
+ attention_heads: 4
+ linear_units: 1024
+ num_blocks: 4
+ dropout_rate: 0.0
+ positional_dropout_rate: 0.0
+ attention_dropout_rate: 0.0
+ # use string "null" to remove input layer
+ input_layer: "null"
+ pos_enc_class: null
+ normalize_before: true
+ tf2torch_tensor_name_prefix_torch: cd_scorer
+ tf2torch_tensor_name_prefix_tf: EAND/compute_distance_layer/cd_scorer
+# post net
+decoder: fsmn
+decoder_conf:
+ in_units: 32
+ out_units: 2517
+ filter_size: 31
+ fsmn_num_layers: 6
+ dnn_num_layers: 1
+ num_memory_units: 16
+ ffn_inner_dim: 512
+ dropout_rate: 0.0
+ tf2torch_tensor_name_prefix_torch: decoder
+ tf2torch_tensor_name_prefix_tf: EAND/post_net
+frontend: null
+frontend_conf:
+ fs: 8000
+ window: povey
+ n_mels: 80
+ frame_length: 25
+ frame_shift: 10
+ filter_length_min: -1
+ filter_length_max: -1
+ lfr_m: 1
+ lfr_n: 1
+ dither: 0.0
+ snip_edges: false
+ upsacle_samples: false
+num_worker_count: 0
+required:
+- output_dir
+- token_list
+oss_bucket: 'null'
+version: 0.1.4
diff --git a/egs/callhome/diarization/sond/unit_test.py b/egs/callhome/diarization/sond/unit_test.py
new file mode 100644
index 000000000..a48eda148
--- /dev/null
+++ b/egs/callhome/diarization/sond/unit_test.py
@@ -0,0 +1,97 @@
+from funasr.bin.diar_inference_launch import inference_launch
+import os
+
+
+def test_fbank_cpu_infer():
+ diar_config_path = "sond_fbank.yaml"
+ diar_model_path = "sond.pb"
+ output_dir = "./outputs"
+ data_path_and_name_and_type = [
+ ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
+ ("data/unit_test/test_profile.scp", "profile", "kaldi_ark"),
+ ]
+ pipeline = inference_launch(
+ mode="sond",
+ diar_train_config=diar_config_path,
+ diar_model_file=diar_model_path,
+ output_dir=output_dir,
+ num_workers=0,
+ log_level="INFO",
+ )
+ results = pipeline(data_path_and_name_and_type)
+ print(results)
+
+
+def test_fbank_gpu_infer():
+ diar_config_path = "sond_fbank.yaml"
+ diar_model_path = "sond.pb"
+ output_dir = "./outputs"
+ data_path_and_name_and_type = [
+ ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
+ ("data/unit_test/test_profile.scp", "profile", "kaldi_ark"),
+ ]
+ pipeline = inference_launch(
+ mode="sond",
+ diar_train_config=diar_config_path,
+ diar_model_file=diar_model_path,
+ output_dir=output_dir,
+ ngpu=1,
+ num_workers=1,
+ log_level="INFO",
+ )
+ results = pipeline(data_path_and_name_and_type)
+ print(results)
+
+
+def test_wav_gpu_infer():
+ diar_config_path = "config.yaml"
+ diar_model_path = "sond.pb"
+ output_dir = "./outputs"
+ data_path_and_name_and_type = [
+ ("data/unit_test/test_wav.scp", "speech", "sound"),
+ ("data/unit_test/test_profile.scp", "profile", "kaldi_ark"),
+ ]
+ pipeline = inference_launch(
+ mode="sond",
+ diar_train_config=diar_config_path,
+ diar_model_file=diar_model_path,
+ output_dir=output_dir,
+ ngpu=1,
+ num_workers=1,
+ log_level="WARNING",
+ )
+ results = pipeline(data_path_and_name_and_type)
+ print(results)
+
+
+def test_without_profile_gpu_infer():
+ diar_config_path = "config.yaml"
+ diar_model_path = "sond.pb"
+ output_dir = "./outputs"
+ raw_inputs = [[
+ "data/unit_test/raw_inputs/record.wav",
+ "data/unit_test/raw_inputs/spk1.wav",
+ "data/unit_test/raw_inputs/spk2.wav",
+ "data/unit_test/raw_inputs/spk3.wav",
+ "data/unit_test/raw_inputs/spk4.wav"
+ ]]
+ pipeline = inference_launch(
+ mode="sond_demo",
+ diar_train_config=diar_config_path,
+ diar_model_file=diar_model_path,
+ output_dir=output_dir,
+ ngpu=1,
+ num_workers=1,
+ log_level="WARNING",
+ param_dict={},
+ )
+ results = pipeline(raw_inputs=raw_inputs)
+ print(results)
+
+
+if __name__ == '__main__':
+ os.environ["CUDA_VISIBLE_DEVICES"] = "7"
+ test_fbank_cpu_infer()
+ # test_fbank_gpu_infer()
+ # test_wav_gpu_infer()
+ # test_without_profile_gpu_infer()
diff --git a/egs/mars/sd/local_run.sh b/egs/mars/sd/local_run.sh
index 3b319f46e..4516e9f96 100755
--- a/egs/mars/sd/local_run.sh
+++ b/egs/mars/sd/local_run.sh
@@ -49,7 +49,7 @@ asr_config=conf/train_asr_conformer.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
index c2e4354c1..053986d3d 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
- Modify inference related parameters in `infer_after_finetune.py`
-
output_dir: # result dir
-
data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed~~~~
- -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
index 56c282ce2..b3260672c 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
@@ -48,5 +48,5 @@ if __name__ == '__main__':
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
+ params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
index c2e4354c1..053986d3d 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
- Modify inference related parameters in `infer_after_finetune.py`
-
output_dir: # result dir
-
data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed~~~~
- -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
index e163999b7..2f038a85a 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
@@ -48,5 +48,5 @@ if __name__ == '__main__':
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
+ params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
index 9097e7ab9..16aeada4b 100644
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.sp.cer` and `
- Modify inference related parameters in `infer_after_finetune.py`
-
output_dir: # result dir
-
data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/finetune.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/finetune.py
index bf8176ed1..7db085af1 100755
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/finetune.py
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/finetune.py
@@ -31,5 +31,5 @@ if __name__ == '__main__':
params.batch_bins = 1000 # batch size,如果dataset_type="small",batch_bins单位为fbank特征帧数,如果dataset_type="large",batch_bins单位为毫秒,
params.max_epoch = 10 # 最大训练轮数
params.lr = 0.0001 # 设置学习率
- params.model_revision = 'v1.0.0'
+ params.model_revision = 'v3.0.0'
modelscope_finetune(params)
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
index fa22aad05..b3bfe8e24 100755
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
@@ -19,7 +19,7 @@ def modelscope_infer_core(output_dir, split_dir, njob, idx):
inference_pipline = pipeline(
task=Tasks.auto_speech_recognition,
model='NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950',
- model_revision='v1.0.0',
+ model_revision='v3.0.0',
output_dir=output_dir_job,
batch_size=1,
)
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
index e714a3d03..333b66a72 100755
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
@@ -63,5 +63,5 @@ if __name__ == '__main__':
params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./example_data/validation"
- params["decoding_model_name"] = "valid.acc.ave.pth"
+ params["decoding_model_name"] = "valid.acc.ave.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py
index dd3fb4886..2fceb48f8 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py
@@ -8,9 +8,14 @@ from modelscope.utils.constant import Tasks
from funasr.utils.compute_wer import compute_wer
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
+def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
- gpu_id = (int(idx) - 1) // njob
+ if ngpu > 0:
+ use_gpu = 1
+ gpu_id = int(idx) - 1
+ else:
+ use_gpu = 0
+ gpu_id = -1
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@ def modelscope_infer_core(output_dir, split_dir, njob, idx):
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
inference_pipline = pipeline(
task=Tasks.auto_speech_recognition,
- model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch",
+ model=model,
output_dir=output_dir_job,
- batch_size=64
+ batch_size=batch_size,
+ ngpu=use_gpu,
)
audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@ def modelscope_infer(params):
# prepare for multi-GPU decoding
ngpu = params["ngpu"]
njob = params["njob"]
+ batch_size = params["batch_size"]
output_dir = params["output_dir"]
+ model = params["model"]
if os.path.exists(output_dir):
shutil.rmtree(output_dir)
os.mkdir(output_dir)
split_dir = os.path.join(output_dir, "split")
os.mkdir(split_dir)
- nj = ngpu * njob
+ if ngpu > 0:
+ nj = ngpu
+ elif ngpu == 0:
+ nj = njob
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
with open(wav_scp_file) as f:
lines = f.readlines()
@@ -56,7 +67,7 @@ def modelscope_infer(params):
p = Pool(nj)
for i in range(nj):
p.apply_async(modelscope_infer_core,
- args=(output_dir, split_dir, njob, str(i + 1)))
+ args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
p.close()
p.join()
@@ -81,8 +92,10 @@ def modelscope_infer(params):
if __name__ == "__main__":
params = {}
+ params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch"
params["data_dir"] = "./data/test"
params["output_dir"] = "./results"
- params["ngpu"] = 1
- params["njob"] = 1
- modelscope_infer(params)
+ params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
+ params["njob"] = 1 # if ngpu = 0, will use cpu decoding
+ params["batch_size"] = 64
+ modelscope_infer(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
index 6c34ed099..fafe565f1 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
@@ -4,23 +4,18 @@ import shutil
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
from funasr.utils.compute_wer import compute_wer
-
def modelscope_infer_after_finetune(params):
# prepare for decoding
- pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
- for file_name in params["required_files"]:
- if file_name == "configuration.json":
- with open(os.path.join(pretrained_model_path, file_name)) as f:
- config_dict = json.load(f)
- config_dict["model"]["am_model_name"] = params["decoding_model_name"]
- with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
- json.dump(config_dict, f, indent=4, separators=(',', ': '))
- else:
- shutil.copy(os.path.join(pretrained_model_path, file_name),
- os.path.join(params["output_dir"], file_name))
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+ raise BaseException(f"Please download pretrain model from ModelScope firstly.")
+ shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
decoding_path = os.path.join(params["output_dir"], "decode_results")
if os.path.exists(decoding_path):
shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@ def modelscope_infer_after_finetune(params):
# decoding
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
- model=params["output_dir"],
+ model=pretrained_model_path,
output_dir=decoding_path,
- batch_size=64
+ batch_size=params["batch_size"]
)
audio_in = os.path.join(params["data_dir"], "wav.scp")
inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@ def modelscope_infer_after_finetune(params):
if __name__ == '__main__':
params = {}
params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch"
- params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.acc.ave_10best.pth"
- modelscope_infer_after_finetune(params)
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py
index d616d3e22..d70af7245 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py
@@ -8,9 +8,14 @@ from modelscope.utils.constant import Tasks
from funasr.utils.compute_wer import compute_wer
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
+def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
- gpu_id = (int(idx) - 1) // njob
+ if ngpu > 0:
+ use_gpu = 1
+ gpu_id = int(idx) - 1
+ else:
+ use_gpu = 0
+ gpu_id = -1
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@ def modelscope_infer_core(output_dir, split_dir, njob, idx):
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
inference_pipline = pipeline(
task=Tasks.auto_speech_recognition,
- model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch",
+ model=model,
output_dir=output_dir_job,
- batch_size=64
+ batch_size=batch_size,
+ ngpu=use_gpu,
)
audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@ def modelscope_infer(params):
# prepare for multi-GPU decoding
ngpu = params["ngpu"]
njob = params["njob"]
+ batch_size = params["batch_size"]
output_dir = params["output_dir"]
+ model = params["model"]
if os.path.exists(output_dir):
shutil.rmtree(output_dir)
os.mkdir(output_dir)
split_dir = os.path.join(output_dir, "split")
os.mkdir(split_dir)
- nj = ngpu * njob
+ if ngpu > 0:
+ nj = ngpu
+ elif ngpu == 0:
+ nj = njob
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
with open(wav_scp_file) as f:
lines = f.readlines()
@@ -56,7 +67,7 @@ def modelscope_infer(params):
p = Pool(nj)
for i in range(nj):
p.apply_async(modelscope_infer_core,
- args=(output_dir, split_dir, njob, str(i + 1)))
+ args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
p.close()
p.join()
@@ -81,8 +92,10 @@ def modelscope_infer(params):
if __name__ == "__main__":
params = {}
+ params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch"
params["data_dir"] = "./data/test"
params["output_dir"] = "./results"
- params["ngpu"] = 1
- params["njob"] = 1
- modelscope_infer(params)
+ params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
+ params["njob"] = 1 # if ngpu = 0, will use cpu decoding
+ params["batch_size"] = 64
+ modelscope_infer(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
index 6140bb71f..731cafe15 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
@@ -4,23 +4,18 @@ import shutil
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
from funasr.utils.compute_wer import compute_wer
-
def modelscope_infer_after_finetune(params):
# prepare for decoding
- pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
- for file_name in params["required_files"]:
- if file_name == "configuration.json":
- with open(os.path.join(pretrained_model_path, file_name)) as f:
- config_dict = json.load(f)
- config_dict["model"]["am_model_name"] = params["decoding_model_name"]
- with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
- json.dump(config_dict, f, indent=4, separators=(',', ': '))
- else:
- shutil.copy(os.path.join(pretrained_model_path, file_name),
- os.path.join(params["output_dir"], file_name))
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+ raise BaseException(f"Please download pretrain model from ModelScope firstly.")
+ shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
decoding_path = os.path.join(params["output_dir"], "decode_results")
if os.path.exists(decoding_path):
shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@ def modelscope_infer_after_finetune(params):
# decoding
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
- model=params["output_dir"],
+ model=pretrained_model_path,
output_dir=decoding_path,
- batch_size=64
+ batch_size=params["batch_size"]
)
audio_in = os.path.join(params["data_dir"], "wav.scp")
inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@ def modelscope_infer_after_finetune(params):
if __name__ == '__main__':
params = {}
params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch"
- params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.acc.ave_10best.pth"
- modelscope_infer_after_finetune(params)
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index dfd509dd4..a0443614f 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -22,10 +22,12 @@
Or you can use the finetuned model for inference directly.
- Setting parameters in `infer.py`
+ -
model: # model name on ModelScope
-
data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-
output_dir: # result dir
- -
ngpu: # the number of GPUs for decoding
- -
njob: # the number of jobs for each GPU
+ -
ngpu: # the number of GPUs for decoding, if `ngpu` > 0, use GPU decoding
+ -
njob: # the number of jobs for CPU decoding, if `ngpu` = 0, use CPU decoding, please set `njob`
+ -
batch_size: # batch size for inference
- Then you can run the pipeline to infer with:
```python
@@ -39,9 +41,11 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
### Inference using local finetuned model
- Modify inference related parameters in `infer_after_finetune.py`
+ -
modelscope_model_name: # model name on ModelScope
-
output_dir: # result dir
-
data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
+ -
batch_size: # batch size for inference
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
index f9f61147e..795a1e7c5 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
@@ -8,9 +8,14 @@ from modelscope.utils.constant import Tasks
from funasr.utils.compute_wer import compute_wer
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
+def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
- gpu_id = (int(idx) - 1) // njob
+ if ngpu > 0:
+ use_gpu = 1
+ gpu_id = int(idx) - 1
+ else:
+ use_gpu = 0
+ gpu_id = -1
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@ def modelscope_infer_core(output_dir, split_dir, njob, idx):
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
inference_pipline = pipeline(
task=Tasks.auto_speech_recognition,
- model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+ model=model,
output_dir=output_dir_job,
- batch_size=64
+ batch_size=batch_size,
+ ngpu=use_gpu,
)
audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@ def modelscope_infer(params):
# prepare for multi-GPU decoding
ngpu = params["ngpu"]
njob = params["njob"]
+ batch_size = params["batch_size"]
output_dir = params["output_dir"]
+ model = params["model"]
if os.path.exists(output_dir):
shutil.rmtree(output_dir)
os.mkdir(output_dir)
split_dir = os.path.join(output_dir, "split")
os.mkdir(split_dir)
- nj = ngpu * njob
+ if ngpu > 0:
+ nj = ngpu
+ elif ngpu == 0:
+ nj = njob
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
with open(wav_scp_file) as f:
lines = f.readlines()
@@ -56,7 +67,7 @@ def modelscope_infer(params):
p = Pool(nj)
for i in range(nj):
p.apply_async(modelscope_infer_core,
- args=(output_dir, split_dir, njob, str(i + 1)))
+ args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
p.close()
p.join()
@@ -81,8 +92,10 @@ def modelscope_infer(params):
if __name__ == "__main__":
params = {}
+ params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
params["data_dir"] = "./data/test"
params["output_dir"] = "./results"
- params["ngpu"] = 1
- params["njob"] = 1
- modelscope_infer(params)
+ params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
+ params["njob"] = 1 # if ngpu = 0, will use cpu decoding
+ params["batch_size"] = 64
+ modelscope_infer(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
index 94393ec5e..295c95d7f 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -4,23 +4,18 @@ import shutil
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
from funasr.utils.compute_wer import compute_wer
-
def modelscope_infer_after_finetune(params):
# prepare for decoding
- pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
- for file_name in params["required_files"]:
- if file_name == "configuration.json":
- with open(os.path.join(pretrained_model_path, file_name)) as f:
- config_dict = json.load(f)
- config_dict["model"]["am_model_name"] = params["decoding_model_name"]
- with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
- json.dump(config_dict, f, indent=4, separators=(',', ': '))
- else:
- shutil.copy(os.path.join(pretrained_model_path, file_name),
- os.path.join(params["output_dir"], file_name))
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+ raise BaseException("Please download the pretrained model from ModelScope first.")
+ shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
decoding_path = os.path.join(params["output_dir"], "decode_results")
if os.path.exists(decoding_path):
shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@ def modelscope_infer_after_finetune(params):
# decoding
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
- model=params["output_dir"],
+ model=pretrained_model_path,
output_dir=decoding_path,
- batch_size=64
+ batch_size=params["batch_size"]
)
audio_in = os.path.join(params["data_dir"], "wav.scp")
inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@ def modelscope_infer_after_finetune(params):
if __name__ == '__main__':
params = {}
params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
- params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.acc.ave_10best.pth"
- modelscope_infer_after_finetune(params)
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
index f08b31f7d..0b508fbc2 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
@@ -8,9 +8,14 @@ from modelscope.utils.constant import Tasks
from funasr.utils.compute_wer import compute_wer
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
+def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
- gpu_id = (int(idx) - 1) // njob
+ if ngpu > 0:
+ use_gpu = 1
+ gpu_id = int(idx) - 1
+ else:
+ use_gpu = 0
+ gpu_id = -1
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@ def modelscope_infer_core(output_dir, split_dir, njob, idx):
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
inference_pipline = pipeline(
task=Tasks.auto_speech_recognition,
- model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1",
+ model=model,
output_dir=output_dir_job,
- batch_size=64
+ batch_size=batch_size,
+ ngpu=use_gpu,
)
audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@ def modelscope_infer(params):
# prepare for multi-GPU decoding
ngpu = params["ngpu"]
njob = params["njob"]
+ batch_size = params["batch_size"]
output_dir = params["output_dir"]
+ model = params["model"]
if os.path.exists(output_dir):
shutil.rmtree(output_dir)
os.mkdir(output_dir)
split_dir = os.path.join(output_dir, "split")
os.mkdir(split_dir)
- nj = ngpu * njob
+ if ngpu > 0:
+ nj = ngpu
+ elif ngpu == 0:
+ nj = njob
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
with open(wav_scp_file) as f:
lines = f.readlines()
@@ -56,7 +67,7 @@ def modelscope_infer(params):
p = Pool(nj)
for i in range(nj):
p.apply_async(modelscope_infer_core,
- args=(output_dir, split_dir, njob, str(i + 1)))
+ args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
p.close()
p.join()
@@ -81,8 +92,10 @@ def modelscope_infer(params):
if __name__ == "__main__":
params = {}
+ params["model"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
params["data_dir"] = "./data/test"
params["output_dir"] = "./results"
- params["ngpu"] = 1
- params["njob"] = 1
- modelscope_infer(params)
+ params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
+ params["njob"] = 1 # if ngpu = 0, will use cpu decoding
+ params["batch_size"] = 64
+ modelscope_infer(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
index 96102ccfa..e8fee02a0 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
@@ -4,23 +4,18 @@ import shutil
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
from funasr.utils.compute_wer import compute_wer
-
def modelscope_infer_after_finetune(params):
# prepare for decoding
- pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
- for file_name in params["required_files"]:
- if file_name == "configuration.json":
- with open(os.path.join(pretrained_model_path, file_name)) as f:
- config_dict = json.load(f)
- config_dict["model"]["am_model_name"] = params["decoding_model_name"]
- with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
- json.dump(config_dict, f, indent=4, separators=(',', ': '))
- else:
- shutil.copy(os.path.join(pretrained_model_path, file_name),
- os.path.join(params["output_dir"], file_name))
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+ raise BaseException("Please download the pretrained model from ModelScope first.")
+ shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
decoding_path = os.path.join(params["output_dir"], "decode_results")
if os.path.exists(decoding_path):
shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@ def modelscope_infer_after_finetune(params):
# decoding
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
- model=params["output_dir"],
+ model=pretrained_model_path,
output_dir=decoding_path,
- batch_size=64
+ batch_size=params["batch_size"]
)
audio_in = os.path.join(params["data_dir"], "wav.scp")
inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@ def modelscope_infer_after_finetune(params):
if __name__ == '__main__':
params = {}
params["modelscope_model_name"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
- params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.acc.ave_10best.pth"
- modelscope_infer_after_finetune(params)
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
index dfd509dd4..b68f1e921 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
- Modify inference related parameters in `infer_after_finetune.py`
-
output_dir: # result dir
-
data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
index d91a40a6c..6593f4e3f 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
@@ -50,5 +50,5 @@ if __name__ == '__main__':
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "20epoch.pth"
+ params["decoding_model_name"] = "20epoch.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
index dfd509dd4..b68f1e921 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
- Modify inference related parameters in `infer_after_finetune.py`
-
output_dir: # result dir
-
data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
index f9fb0db8a..f067c8193 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
@@ -50,5 +50,5 @@ if __name__ == '__main__':
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "20epoch.pth"
+ params["decoding_model_name"] = "20epoch.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py
new file mode 100644
index 000000000..56fb58302
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+ if not os.path.exists(params["output_dir"]):
+ os.makedirs(params["output_dir"], exist_ok=True)
+ # dataset split ["train", "validation"]
+ ds_dict = MsDataset.load(params["data_dir"])
+ kwargs = dict(
+ model=params["model"],
+ model_revision=params["model_revision"],
+ data_dir=ds_dict,
+ dataset_type=params["dataset_type"],
+ work_dir=params["output_dir"],
+ batch_bins=params["batch_bins"],
+ max_epoch=params["max_epoch"],
+ lr=params["lr"])
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ params = {}
+ params["output_dir"] = "./checkpoint"
+ params["data_dir"] = "./data"
+ params["batch_bins"] = 2000
+ params["dataset_type"] = "small"
+ params["max_epoch"] = 50
+ params["lr"] = 0.00005
+ params["model"] = "damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch"
+ params["model_revision"] = None
+ modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
new file mode 100644
index 000000000..c54ab8c83
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+ audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_he.wav"
+ output_dir = "./results"
+ inference_pipline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model="damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch",
+ output_dir=output_dir,
+ )
+ rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+ print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/finetune.py
index 4a5efdbe9..5485ff56e 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/finetune.py
@@ -30,6 +30,6 @@ if __name__ == '__main__':
params["dataset_type"] = "small"
params["max_epoch"] = 50
params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online"
+ params["model"] = "damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline"
params["model_revision"] = None
modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py
index a053957d3..1a174bbca 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py
@@ -6,7 +6,7 @@ if __name__ == "__main__":
output_dir = "./results"
inference_pipline = pipeline(
task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online",
+ model="damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline",
output_dir=output_dir,
)
rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
index dd947d329..9a84f9b57 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
- Modify inference related parameters in `infer_after_finetune.py`
-
output_dir: # result dir
-
data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
index 030c2e278..d4df29e01 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
@@ -50,5 +50,5 @@ if __name__ == '__main__':
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "20epoch.pth"
+ params["decoding_model_name"] = "20epoch.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py
new file mode 100644
index 000000000..8bbce606c
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+ if not os.path.exists(params["output_dir"]):
+ os.makedirs(params["output_dir"], exist_ok=True)
+ # dataset split ["train", "validation"]
+ ds_dict = MsDataset.load(params["data_dir"])
+ kwargs = dict(
+ model=params["model"],
+ model_revision=params["model_revision"],
+ data_dir=ds_dict,
+ dataset_type=params["dataset_type"],
+ work_dir=params["output_dir"],
+ batch_bins=params["batch_bins"],
+ max_epoch=params["max_epoch"],
+ lr=params["lr"])
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ params = {}
+ params["output_dir"] = "./checkpoint"
+ params["data_dir"] = "./data"
+ params["batch_bins"] = 2000
+ params["dataset_type"] = "small"
+ params["max_epoch"] = 50
+ params["lr"] = 0.00005
+ params["model"] = "damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch"
+ params["model_revision"] = None
+ modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
new file mode 100644
index 000000000..cfd869f04
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+ audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_my.wav"
+ output_dir = "./results"
+ inference_pipline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model="damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch",
+ output_dir=output_dir,
+ )
+ rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+ print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/finetune.py
index 60f3c8208..512b844c6 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/finetune.py
@@ -30,6 +30,6 @@ if __name__ == '__main__':
params["dataset_type"] = "small"
params["max_epoch"] = 50
params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online"
+ params["model"] = "damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline"
params["model_revision"] = None
modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py
index 30a11ffd3..2dcb6638a 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py
@@ -6,7 +6,7 @@ if __name__ == "__main__":
output_dir = "./results"
inference_pipline = pipeline(
task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online",
+ model="damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline",
output_dir=output_dir,
)
rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py
new file mode 100644
index 000000000..5e313e533
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+ if not os.path.exists(params["output_dir"]):
+ os.makedirs(params["output_dir"], exist_ok=True)
+ # dataset split ["train", "validation"]
+ ds_dict = MsDataset.load(params["data_dir"])
+ kwargs = dict(
+ model=params["model"],
+ model_revision=params["model_revision"],
+ data_dir=ds_dict,
+ dataset_type=params["dataset_type"],
+ work_dir=params["output_dir"],
+ batch_bins=params["batch_bins"],
+ max_epoch=params["max_epoch"],
+ lr=params["lr"])
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ params = {}
+ params["output_dir"] = "./checkpoint"
+ params["data_dir"] = "./data"
+ params["batch_bins"] = 2000
+ params["dataset_type"] = "small"
+ params["max_epoch"] = 50
+ params["lr"] = 0.00005
+ params["model"] = "damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch"
+ params["model_revision"] = None
+ modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
new file mode 100644
index 000000000..e8c5524f0
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+ audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ur.wav"
+ output_dir = "./results"
+ inference_pipline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model="damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch",
+ output_dir=output_dir,
+ )
+ rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+ print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
index dd947d329..9a84f9b57 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
@@ -41,7 +41,7 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
- Modify inference related parameters in `infer_after_finetune.py`
-
output_dir: # result dir
-
data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
index 3b39a1665..861fefb7f 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
@@ -49,5 +49,5 @@ if __name__ == '__main__':
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "20epoch.pth"
+ params["decoding_model_name"] = "20epoch.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
index dd947d329..eff933e8d 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
@@ -41,7 +41,8 @@ The decoding results can be found in `$output_dir/1best_recog/text.cer`, which i
- Modify inference related parameters in `infer_after_finetune.py`
-
output_dir: # result dir
-
data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ -
decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave
+ .pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
index 4860cf743..d73cae267 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
@@ -49,5 +49,5 @@ if __name__ == '__main__':
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "20epoch.pth"
+ params["decoding_model_name"] = "20epoch.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index 1094bb5ff..94144efa7 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -34,7 +34,7 @@ Or you can use the finetuned model for inference directly.
- Modify inference related parameters in `infer_after_finetune.py`
     - output_dir: # result dir
     - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-     - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+     - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
index 5f171b419..473019c70 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -4,27 +4,17 @@ import shutil
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
from funasr.utils.compute_wer import compute_wer
-
def modelscope_infer_after_finetune(params):
# prepare for decoding
- if not os.path.exists(os.path.join(params["output_dir"], "punc")):
- os.makedirs(os.path.join(params["output_dir"], "punc"))
- if not os.path.exists(os.path.join(params["output_dir"], "vad")):
- os.makedirs(os.path.join(params["output_dir"], "vad"))
- pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
- for file_name in params["required_files"]:
- if file_name == "configuration.json":
- with open(os.path.join(pretrained_model_path, file_name)) as f:
- config_dict = json.load(f)
- config_dict["model"]["am_model_name"] = params["decoding_model_name"]
- with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
- json.dump(config_dict, f, indent=4, separators=(',', ': '))
- else:
- shutil.copy(os.path.join(pretrained_model_path, file_name),
- os.path.join(params["output_dir"], file_name))
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+        raise BaseException(f"Please download pretrain model from ModelScope firstly.")
+    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
decoding_path = os.path.join(params["output_dir"], "decode_results")
if os.path.exists(decoding_path):
shutil.rmtree(decoding_path)
@@ -33,16 +23,16 @@ def modelscope_infer_after_finetune(params):
# decoding
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
- model=params["output_dir"],
+ model=pretrained_model_path,
output_dir=decoding_path,
- batch_size=64
+ batch_size=params["batch_size"]
)
audio_in = os.path.join(params["data_dir"], "wav.scp")
inference_pipeline(audio_in=audio_in)
# computer CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
- if text_in is not None:
+ if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
@@ -50,8 +40,8 @@ def modelscope_infer_after_finetune(params):
if __name__ == '__main__':
params = {}
params["modelscope_model_name"] = "damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
- params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json", "punc/punc.pb", "punc/punc.yaml", "vad/vad.mvn", "vad/vad.pb", "vad/vad.yaml"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.acc.ave_10best.pth"
- modelscope_infer_after_finetune(params)
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
new file mode 100644
index 000000000..540e3cf64
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
@@ -0,0 +1,26 @@
+
+##################text二进制数据#####################
+inputs = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+ task=Tasks.punctuation,
+ model='damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727',
+ model_revision="v1.0.0",
+ output_dir="./tmp/"
+)
+
+vads = inputs.split("|")
+
+cache_out = []
+rec_result_all="outputs:"
+for vad in vads:
+ rec_result = inference_pipeline(text_in=vad, cache=cache_out)
+ #print(rec_result)
+ cache_out = rec_result['cache']
+ rec_result_all += rec_result['text']
+
+print(rec_result_all)
+
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py
index 8dac29299..0da8d25a1 100644
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py
@@ -15,7 +15,7 @@ from modelscope.utils.constant import Tasks
inference_pipline = pipeline(
task=Tasks.punctuation,
model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
- model_revision="v1.1.6",
+ model_revision="v1.1.7",
output_dir="./tmp/"
)
diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
new file mode 100644
index 000000000..81cb2c629
--- /dev/null
+++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
@@ -0,0 +1,10 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_diar_pipline = pipeline(
+ task=Tasks.speaker_diarization,
+ model='damo/speech_diarization_eend-ola-en-us-callhome-8k',
+ model_revision="v1.0.0",
+)
+results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record2.wav"])
+print(results)
\ No newline at end of file
diff --git a/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
new file mode 100644
index 000000000..5f4563dbc
--- /dev/null
+++ b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
@@ -0,0 +1,25 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+# 初始化推理 pipeline
+# 当以原始音频作为输入时使用配置文件 sond.yaml,并设置 mode 为sond_demo
+inference_diar_pipline = pipeline(
+ mode="sond_demo",
+ num_workers=0,
+ task=Tasks.speaker_diarization,
+ diar_model_config="sond.yaml",
+ model='damo/speech_diarization_sond-en-us-callhome-8k-n16k4-pytorch',
+ sv_model="damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch",
+ sv_model_revision="master",
+)
+
+# 以 audio_list 作为输入,其中第一个音频为待检测语音,后面的音频为不同说话人的声纹注册语音
+audio_list = [
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B1.wav"
+]
+
+results = inference_diar_pipline(audio_in=audio_list)
+print(results)
diff --git a/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer.py b/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer.py
new file mode 100644
index 000000000..1fd9dc614
--- /dev/null
+++ b/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer.py
@@ -0,0 +1,39 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+import numpy as np
+
+if __name__ == '__main__':
+ inference_sv_pipline = pipeline(
+ task=Tasks.speaker_verification,
+ model='damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch'
+ )
+
+ # extract speaker embedding
+ # for url use "spk_embedding" as key
+ rec_result = inference_sv_pipline(
+ audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/sv_example_enroll.wav')
+ enroll = rec_result["spk_embedding"]
+
+ # for local file use "spk_embedding" as key
+ rec_result = inference_sv_pipline(audio_in='example/sv_example_same.wav')
+ same = rec_result["spk_embedding"]
+
+ import soundfile
+ wav = soundfile.read('example/sv_example_enroll.wav')[0]
+ # for raw inputs use "spk_embedding" as key
+ spk_embedding = inference_sv_pipline(audio_in=wav)["spk_embedding"]
+
+ rec_result = inference_sv_pipline(
+ audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/sv_example_different.wav')
+ different = rec_result["spk_embedding"]
+
+ # calculate cosine similarity for same speaker
+ sv_threshold = 0.80
+ same_cos = np.sum(enroll * same) / (np.linalg.norm(enroll) * np.linalg.norm(same))
+ same_cos = max(same_cos - sv_threshold, 0.0) / (1.0 - sv_threshold) * 100.0
+ print("Similarity:", same_cos)
+
+ # calculate cosine similarity for different speaker
+ diff_cos = np.sum(enroll * different) / (np.linalg.norm(enroll) * np.linalg.norm(different))
+ diff_cos = max(diff_cos - sv_threshold, 0.0) / (1.0 - sv_threshold) * 100.0
+ print("Similarity:", diff_cos)
diff --git a/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer_sv.py b/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer_sv.py
new file mode 100644
index 000000000..880b2d3b1
--- /dev/null
+++ b/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer_sv.py
@@ -0,0 +1,21 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == '__main__':
+ inference_sv_pipline = pipeline(
+ task=Tasks.speaker_verification,
+ model='damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch'
+ )
+
+ # the same speaker
+ rec_result = inference_sv_pipline(audio_in=(
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/sv_example_enroll.wav',
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/sv_example_same.wav'))
+ print("Similarity", rec_result["scores"])
+
+ # different speakers
+ rec_result = inference_sv_pipline(audio_in=(
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/sv_example_enroll.wav',
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/sv_example_different.wav'))
+
+ print("Similarity", rec_result["scores"])
diff --git a/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer.py b/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer.py
index e81297a3f..87f38013b 100644
--- a/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer.py
+++ b/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer.py
@@ -9,14 +9,20 @@ if __name__ == '__main__':
)
# 提取不同句子的说话人嵌入码
+ # for url use "utt_id" as key
rec_result = inference_sv_pipline(
audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav')
enroll = rec_result["spk_embedding"]
- rec_result = inference_sv_pipline(
- audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav')
+ # for local file use "utt_id" as key
+ rec_result = inference_sv_pipline(audio_in='sv_example_same.wav')["test1"]
same = rec_result["spk_embedding"]
+ import soundfile
+ wav = soundfile.read('sv_example_enroll.wav')[0]
+ # for raw inputs use "utt_id" as key
+ spk_embedding = inference_sv_pipline(audio_in=wav)["spk_embedding"]
+
rec_result = inference_sv_pipline(
audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav')
different = rec_result["spk_embedding"]
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md
new file mode 100644
index 000000000..5488aaa3c
--- /dev/null
+++ b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md
@@ -0,0 +1,25 @@
+# ModelScope Model
+
+## How to finetune and infer using a pretrained ModelScope Model
+
+### Inference
+
+Or you can use the finetuned model for inference directly.
+
+- Setting parameters in `infer.py`
+    - audio_in: # support wav, url, bytes, and parsed audio format.
+    - text_in: # support text, text url.
+    - output_dir: # If the input format is wav.scp, it needs to be set.
+
+- Then you can run the pipeline to infer with:
+```python
+ python infer.py
+```
+
+
+Modify inference related parameters in vad.yaml.
+
+- max_end_silence_time: The end-point silence duration to judge the end of sentence, the parameter range is 500ms~6000ms, and the default value is 800ms
+- speech_noise_thres: The balance of speech and silence scores, the parameter range is (-1,1)
+ - The value tends to -1, the greater probability of noise being judged as speech
+ - The value tends to 1, the greater probability of speech being judged as noise
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py
new file mode 100644
index 000000000..ff42e6857
--- /dev/null
+++ b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py
@@ -0,0 +1,12 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipline = pipeline(
+ task=Tasks.speech_timestamp,
+ model='damo/speech_timestamp_prediction-v1-16k-offline',
+ output_dir='./tmp')
+
+rec_result = inference_pipline(
+ audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav',
+ text_in='一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢')
+print(rec_result)
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
index c255474b8..2bf3251e3 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
@@ -7,7 +7,7 @@ if __name__ == '__main__':
inference_pipline = pipeline(
task=Tasks.voice_activity_detection,
model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
- model_revision=None,
+ model_revision='v1.2.0',
output_dir=output_dir,
batch_size=1,
)
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
new file mode 100644
index 000000000..66b816149
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
@@ -0,0 +1,33 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+import soundfile
+
+
+if __name__ == '__main__':
+ output_dir = None
+ inference_pipline = pipeline(
+ task=Tasks.voice_activity_detection,
+ model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+ model_revision='v1.2.0',
+ output_dir=output_dir,
+ batch_size=1,
+ mode='online',
+ )
+ speech, sample_rate = soundfile.read("./vad_example_16k.wav")
+ speech_length = speech.shape[0]
+
+ sample_offset = 0
+
+ step = 160 * 10
+ param_dict = {'in_cache': dict()}
+ for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
+ if sample_offset + step >= speech_length - 1:
+ step = speech_length - sample_offset
+ is_final = True
+ else:
+ is_final = False
+ param_dict['is_final'] = is_final
+ segments_result = inference_pipline(audio_in=speech[sample_offset: sample_offset + step],
+ param_dict=param_dict)
+ print(segments_result)
+
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
index 71af48656..2e5027500 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
@@ -7,8 +7,8 @@ if __name__ == '__main__':
inference_pipline = pipeline(
task=Tasks.voice_activity_detection,
model="damo/speech_fsmn_vad_zh-cn-8k-common",
- model_revision=None,
- output_dir='./output_dir',
+ model_revision='v1.2.0',
+ output_dir=output_dir,
batch_size=1,
)
segments_result = inference_pipline(audio_in=audio_in)
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
new file mode 100644
index 000000000..abf4ef555
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
@@ -0,0 +1,33 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+import soundfile
+
+
+if __name__ == '__main__':
+ output_dir = None
+ inference_pipline = pipeline(
+ task=Tasks.voice_activity_detection,
+ model="damo/speech_fsmn_vad_zh-cn-8k-common",
+ model_revision='v1.2.0',
+ output_dir=output_dir,
+ batch_size=1,
+ mode='online',
+ )
+ speech, sample_rate = soundfile.read("./vad_example_8k.wav")
+ speech_length = speech.shape[0]
+
+ sample_offset = 0
+
+ step = 80 * 10
+ param_dict = {'in_cache': dict()}
+ for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
+ if sample_offset + step >= speech_length - 1:
+ step = speech_length - sample_offset
+ is_final = True
+ else:
+ is_final = False
+ param_dict['is_final'] = is_final
+ segments_result = inference_pipline(audio_in=speech[sample_offset: sample_offset + step],
+ param_dict=param_dict)
+ print(segments_result)
+
diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py
index 318d3d7a2..f3b4d560a 100644
--- a/funasr/bin/asr_inference.py
+++ b/funasr/bin/asr_inference.py
@@ -52,7 +52,7 @@ class Speech2Text:
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 1fae766ea..da1241a66 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -216,6 +216,9 @@ def inference_launch(**kwargs):
elif mode == "paraformer":
from funasr.bin.asr_inference_paraformer import inference_modelscope
return inference_modelscope(**kwargs)
+ elif mode == "paraformer_streaming":
+ from funasr.bin.asr_inference_paraformer_streaming import inference_modelscope
+ return inference_modelscope(**kwargs)
elif mode == "paraformer_vad":
from funasr.bin.asr_inference_paraformer_vad import inference_modelscope
return inference_modelscope(**kwargs)
diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py
index e25b2a90b..888d4d2f8 100644
--- a/funasr/bin/asr_inference_mfcca.py
+++ b/funasr/bin/asr_inference_mfcca.py
@@ -55,7 +55,7 @@ class Speech2Text:
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
@@ -194,8 +194,8 @@ class Speech2Text:
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
-
-
+ if(speech.dim()==3):
+ speech = torch.squeeze(speech, 2)
#speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
speech = speech.to(getattr(torch, self.dtype))
# lenghts: (1,)
@@ -534,6 +534,8 @@ def inference_modelscope(
data_path_and_name_and_type,
dtype=dtype,
batch_size=batch_size,
+ fs=fs,
+ mc=True,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index b807a3452..e45e575ed 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -42,6 +42,7 @@ from funasr.utils import asr_utils, wav_utils, postprocess_utils
from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
+from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
class Speech2Text:
@@ -49,7 +50,7 @@ class Speech2Text:
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
@@ -190,7 +191,8 @@ class Speech2Text:
@torch.no_grad()
def __call__(
- self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
+ self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
+ begin_time: int = 0, end_time: int = None,
):
"""Inference
@@ -242,6 +244,10 @@ class Speech2Text:
decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list)
decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
+ if isinstance(self.asr_model, BiCifParaformer):
+ _, _, us_alphas, us_peaks = self.asr_model.calc_predictor_timestamp(enc, enc_len,
+ pre_token_length) # test no bias cif2
+
results = []
b, n, d = decoder_out.size()
for i in range(b):
@@ -284,7 +290,14 @@ class Speech2Text:
else:
text = None
- results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor))
+ if isinstance(self.asr_model, BiCifParaformer):
+ _, timestamp = ts_prediction_lfr6_standard(us_alphas[i],
+ us_peaks[i],
+ copy.copy(token),
+ vad_offset=begin_time)
+ results.append((text, token, token_int, hyp, timestamp, enc_len_batch_total, lfr_factor))
+ else:
+ results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor))
# assert check_return_type(results)
return results
@@ -683,6 +696,11 @@ def inference_modelscope(
inference=True,
)
+ if param_dict is not None:
+ use_timestamp = param_dict.get('use_timestamp', True)
+ else:
+ use_timestamp = True
+
forward_time_total = 0.0
length_total = 0.0
finish_count = 0
@@ -724,7 +742,9 @@ def inference_modelscope(
result = [results[batch_id][:-2]]
key = keys[batch_id]
- for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), result):
+ for n, result in zip(range(1, nbest + 1), result):
+ text, token, token_int, hyp = result[0], result[1], result[2], result[3]
+ time_stamp = None if len(result) < 5 else result[4]
# Create a directory: outdir/{n}best_recog
if writer is not None:
ibest_writer = writer[f"{n}best_recog"]
@@ -736,8 +756,20 @@ def inference_modelscope(
ibest_writer["rtf"][key] = rtf_cur
if text is not None:
- text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
+ if use_timestamp and time_stamp is not None:
+ postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
+ else:
+ postprocessed_result = postprocess_utils.sentence_postprocess(token)
+ time_stamp_postprocessed = ""
+ if len(postprocessed_result) == 3:
+ text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
+ postprocessed_result[1], \
+ postprocessed_result[2]
+ else:
+ text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
item = {'key': key, 'value': text_postprocessed}
+ if time_stamp_postprocessed != "":
+ item['time_stamp'] = time_stamp_postprocessed
asr_result_list.append(item)
finish_count += 1
# asr_utils.print_progress(finish_count / file_count)
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
new file mode 100644
index 000000000..9b572a0af
--- /dev/null
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -0,0 +1,907 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import sys
+import time
+import copy
+import os
+import codecs
+import tempfile
+import requests
+from pathlib import Path
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+from typing import Dict
+from typing import Any
+from typing import List
+
+import numpy as np
+import torch
+from typeguard import check_argument_types
+
+from funasr.fileio.datadir_writer import DatadirWriter
+from funasr.modules.beam_search.beam_search import BeamSearchPara as BeamSearch
+from funasr.modules.beam_search.beam_search import Hypothesis
+from funasr.modules.scorers.ctc import CTCPrefixScorer
+from funasr.modules.scorers.length_bonus import LengthBonus
+from funasr.modules.subsampling import TooShortUttError
+from funasr.tasks.asr import ASRTaskParaformer as ASRTask
+from funasr.tasks.lm import LMTask
+from funasr.text.build_tokenizer import build_tokenizer
+from funasr.text.token_id_converter import TokenIDConverter
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.utils import asr_utils, wav_utils, postprocess_utils
+from funasr.models.frontend.wav_frontend import WavFrontend
+from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
+from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
+
+class Speech2Text:
+ """Speech2Text class
+
+ Examples:
+ >>> import soundfile
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> audio, rate = soundfile.read("speech.wav")
+ >>> speech2text(audio)
+ [(text, token, token_int, hypothesis object), ...]
+
+ """
+
+ def __init__(
+ self,
+ asr_train_config: Union[Path, str] = None,
+ asr_model_file: Union[Path, str] = None,
+ cmvn_file: Union[Path, str] = None,
+ lm_train_config: Union[Path, str] = None,
+ lm_file: Union[Path, str] = None,
+ token_type: str = None,
+ bpemodel: str = None,
+ device: str = "cpu",
+ maxlenratio: float = 0.0,
+ minlenratio: float = 0.0,
+ dtype: str = "float32",
+ beam_size: int = 20,
+ ctc_weight: float = 0.5,
+ lm_weight: float = 1.0,
+ ngram_weight: float = 0.9,
+ penalty: float = 0.0,
+ nbest: int = 1,
+ frontend_conf: dict = None,
+ hotword_list_or_file: str = None,
+ **kwargs,
+ ):
+ assert check_argument_types()
+
+ # 1. Build ASR model
+ scorers = {}
+ asr_model, asr_train_args = ASRTask.build_model_from_file(
+ asr_train_config, asr_model_file, cmvn_file, device
+ )
+ frontend = None
+ if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
+ frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
+
+ logging.info("asr_model: {}".format(asr_model))
+ logging.info("asr_train_args: {}".format(asr_train_args))
+ asr_model.to(dtype=getattr(torch, dtype)).eval()
+
+ if asr_model.ctc != None:
+ ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ scorers.update(
+ ctc=ctc
+ )
+ token_list = asr_model.token_list
+ scorers.update(
+ length_bonus=LengthBonus(len(token_list)),
+ )
+
+ # 2. Build Language model
+ if lm_train_config is not None:
+ lm, lm_train_args = LMTask.build_model_from_file(
+ lm_train_config, lm_file, device
+ )
+ scorers["lm"] = lm.lm
+
+ # 3. Build ngram model
+ # ngram is not supported now
+ ngram = None
+ scorers["ngram"] = ngram
+
+ # 4. Build BeamSearch object
+ # transducer is not supported now
+ beam_search_transducer = None
+
+ weights = dict(
+ decoder=1.0 - ctc_weight,
+ ctc=ctc_weight,
+ lm=lm_weight,
+ ngram=ngram_weight,
+ length_bonus=penalty,
+ )
+ beam_search = BeamSearch(
+ beam_size=beam_size,
+ weights=weights,
+ scorers=scorers,
+ sos=asr_model.sos,
+ eos=asr_model.eos,
+ vocab_size=len(token_list),
+ token_list=token_list,
+ pre_beam_score_key=None if ctc_weight == 1.0 else "full",
+ )
+
+ beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
+ for scorer in scorers.values():
+ if isinstance(scorer, torch.nn.Module):
+ scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
+
+ logging.info(f"Decoding device={device}, dtype={dtype}")
+
+ # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
+ if token_type is None:
+ token_type = asr_train_args.token_type
+ if bpemodel is None:
+ bpemodel = asr_train_args.bpemodel
+
+ if token_type is None:
+ tokenizer = None
+ elif token_type == "bpe":
+ if bpemodel is not None:
+ tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
+ else:
+ tokenizer = None
+ else:
+ tokenizer = build_tokenizer(token_type=token_type)
+ converter = TokenIDConverter(token_list=token_list)
+ logging.info(f"Text tokenizer: {tokenizer}")
+
+ self.asr_model = asr_model
+ self.asr_train_args = asr_train_args
+ self.converter = converter
+ self.tokenizer = tokenizer
+
+ # 6. [Optional] Build hotword list from str, local file or url
+
+ is_use_lm = lm_weight != 0.0 and lm_file is not None
+ if (ctc_weight == 0.0 or asr_model.ctc == None) and not is_use_lm:
+ beam_search = None
+ self.beam_search = beam_search
+ logging.info(f"Beam_search: {self.beam_search}")
+ self.beam_search_transducer = beam_search_transducer
+ self.maxlenratio = maxlenratio
+ self.minlenratio = minlenratio
+ self.device = device
+ self.dtype = dtype
+ self.nbest = nbest
+ self.frontend = frontend
+ self.encoder_downsampling_factor = 1
+ if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d":
+ self.encoder_downsampling_factor = 4
+
+ @torch.no_grad()
+ def __call__(
+ self, cache: dict, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
+ begin_time: int = 0, end_time: int = None,
+ ):
+ """Inference
+
+ Args:
+ speech: Input speech data
+ Returns:
+ text, token, token_int, hyp
+
+ """
+ assert check_argument_types()
+
+ # Input as audio signal
+ if isinstance(speech, np.ndarray):
+ speech = torch.tensor(speech)
+
+ if self.frontend is not None:
+ feats, feats_len = self.frontend.forward(speech, speech_lengths)
+ feats = to_device(feats, device=self.device)
+ feats_len = feats_len.int()
+ self.asr_model.frontend = None
+ else:
+ feats = speech
+ feats_len = speech_lengths
+ lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+ batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}
+
+ # a. To device
+ batch = to_device(batch, device=self.device)
+
+ # b. Forward Encoder
+ enc, enc_len = self.asr_model.encode_chunk(**batch)
+ if isinstance(enc, tuple):
+ enc = enc[0]
+ # assert len(enc) == 1, len(enc)
+ enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor
+
+ predictor_outs = self.asr_model.calc_predictor_chunk(enc, cache)
+ pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
+ predictor_outs[2], predictor_outs[3]
+ pre_token_length = pre_token_length.floor().long()
+ if torch.max(pre_token_length) < 1:
+ return []
+ decoder_outs = self.asr_model.cal_decoder_with_predictor_chunk(enc, pre_acoustic_embeds, cache)
+ decoder_out = decoder_outs
+
+ results = []
+ b, n, d = decoder_out.size()
+ for i in range(b):
+ x = enc[i, :enc_len[i], :]
+ am_scores = decoder_out[i, :pre_token_length[i], :]
+ if self.beam_search is not None:
+ nbest_hyps = self.beam_search(
+ x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
+ )
+
+ nbest_hyps = nbest_hyps[: self.nbest]
+ else:
+ yseq = am_scores.argmax(dim=-1)
+ score = am_scores.max(dim=-1)[0]
+ score = torch.sum(score, dim=-1)
+ # pad with mask tokens to ensure compatibility with sos/eos tokens
+ yseq = torch.tensor(
+ [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device
+ )
+ nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
+
+ for hyp in nbest_hyps:
+ assert isinstance(hyp, (Hypothesis)), type(hyp)
+
+ # remove sos/eos and get results
+ last_pos = -1
+ if isinstance(hyp.yseq, list):
+ token_int = hyp.yseq[1:last_pos]
+ else:
+ token_int = hyp.yseq[1:last_pos].tolist()
+
+ # remove blank symbol id, which is assumed to be 0
+ token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
+
+ # Change integer-ids to tokens
+ token = self.converter.ids2tokens(token_int)
+
+ if self.tokenizer is not None:
+ text = self.tokenizer.tokens2text(token)
+ else:
+ text = None
+
+ results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor))
+
+ # assert check_return_type(results)
+ return results
+
+
+class Speech2TextExport:
+ """Speech2TextExport class
+
+ """
+
+ def __init__(
+ self,
+ asr_train_config: Union[Path, str] = None,
+ asr_model_file: Union[Path, str] = None,
+ cmvn_file: Union[Path, str] = None,
+ lm_train_config: Union[Path, str] = None,
+ lm_file: Union[Path, str] = None,
+ token_type: str = None,
+ bpemodel: str = None,
+ device: str = "cpu",
+ maxlenratio: float = 0.0,
+ minlenratio: float = 0.0,
+ dtype: str = "float32",
+ beam_size: int = 20,
+ ctc_weight: float = 0.5,
+ lm_weight: float = 1.0,
+ ngram_weight: float = 0.9,
+ penalty: float = 0.0,
+ nbest: int = 1,
+ frontend_conf: dict = None,
+ hotword_list_or_file: str = None,
+ **kwargs,
+ ):
+
+ # 1. Build ASR model
+ asr_model, asr_train_args = ASRTask.build_model_from_file(
+ asr_train_config, asr_model_file, cmvn_file, device
+ )
+ frontend = None
+ if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
+ frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
+
+ logging.info("asr_model: {}".format(asr_model))
+ logging.info("asr_train_args: {}".format(asr_train_args))
+ asr_model.to(dtype=getattr(torch, dtype)).eval()
+
+ token_list = asr_model.token_list
+
+ logging.info(f"Decoding device={device}, dtype={dtype}")
+
+ # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
+ if token_type is None:
+ token_type = asr_train_args.token_type
+ if bpemodel is None:
+ bpemodel = asr_train_args.bpemodel
+
+ if token_type is None:
+ tokenizer = None
+ elif token_type == "bpe":
+ if bpemodel is not None:
+ tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
+ else:
+ tokenizer = None
+ else:
+ tokenizer = build_tokenizer(token_type=token_type)
+ converter = TokenIDConverter(token_list=token_list)
+ logging.info(f"Text tokenizer: {tokenizer}")
+
+ # self.asr_model = asr_model
+ self.asr_train_args = asr_train_args
+ self.converter = converter
+ self.tokenizer = tokenizer
+
+ self.device = device
+ self.dtype = dtype
+ self.nbest = nbest
+ self.frontend = frontend
+
+ model = Paraformer_export(asr_model, onnx=False)
+ self.asr_model = model
+
+ @torch.no_grad()
+ def __call__(
+ self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
+ ):
+ """Inference
+
+ Args:
+ speech: Input speech data
+ Returns:
+ text, token, token_int, hyp
+
+ """
+ assert check_argument_types()
+
+ # Input as audio signal
+ if isinstance(speech, np.ndarray):
+ speech = torch.tensor(speech)
+
+ if self.frontend is not None:
+ feats, feats_len = self.frontend.forward(speech, speech_lengths)
+ feats = to_device(feats, device=self.device)
+ feats_len = feats_len.int()
+ self.asr_model.frontend = None
+ else:
+ feats = speech
+ feats_len = speech_lengths
+
+ enc_len_batch_total = feats_len.sum()
+ lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+ batch = {"speech": feats, "speech_lengths": feats_len}
+
+ # a. To device
+ batch = to_device(batch, device=self.device)
+
+ decoder_outs = self.asr_model(**batch)
+ decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
+
+ results = []
+ b, n, d = decoder_out.size()
+ for i in range(b):
+ am_scores = decoder_out[i, :ys_pad_lens[i], :]
+
+ yseq = am_scores.argmax(dim=-1)
+ score = am_scores.max(dim=-1)[0]
+ score = torch.sum(score, dim=-1)
+ # pad with mask tokens to ensure compatibility with sos/eos tokens
+ yseq = torch.tensor(
+ yseq.tolist(), device=yseq.device
+ )
+ nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
+
+ for hyp in nbest_hyps:
+ assert isinstance(hyp, (Hypothesis)), type(hyp)
+
+ # remove sos/eos and get results
+ last_pos = -1
+ if isinstance(hyp.yseq, list):
+ token_int = hyp.yseq[1:last_pos]
+ else:
+ token_int = hyp.yseq[1:last_pos].tolist()
+
+ # remove blank symbol id, which is assumed to be 0
+ token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
+
+ # Change integer-ids to tokens
+ token = self.converter.ids2tokens(token_int)
+
+ if self.tokenizer is not None:
+ text = self.tokenizer.tokens2text(token)
+ else:
+ text = None
+
+ results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor))
+
+ return results
+
+
+def inference(
+ maxlenratio: float,
+ minlenratio: float,
+ batch_size: int,
+ beam_size: int,
+ ngpu: int,
+ ctc_weight: float,
+ lm_weight: float,
+ penalty: float,
+ log_level: Union[int, str],
+ data_path_and_name_and_type,
+ asr_train_config: Optional[str],
+ asr_model_file: Optional[str],
+ cmvn_file: Optional[str] = None,
+ raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ lm_train_config: Optional[str] = None,
+ lm_file: Optional[str] = None,
+ token_type: Optional[str] = None,
+ key_file: Optional[str] = None,
+ word_lm_train_config: Optional[str] = None,
+ bpemodel: Optional[str] = None,
+ allow_variable_data_keys: bool = False,
+ streaming: bool = False,
+ output_dir: Optional[str] = None,
+ dtype: str = "float32",
+ seed: int = 0,
+ ngram_weight: float = 0.9,
+ nbest: int = 1,
+ num_workers: int = 1,
+
+ **kwargs,
+):
+ inference_pipeline = inference_modelscope(
+ maxlenratio=maxlenratio,
+ minlenratio=minlenratio,
+ batch_size=batch_size,
+ beam_size=beam_size,
+ ngpu=ngpu,
+ ctc_weight=ctc_weight,
+ lm_weight=lm_weight,
+ penalty=penalty,
+ log_level=log_level,
+ asr_train_config=asr_train_config,
+ asr_model_file=asr_model_file,
+ cmvn_file=cmvn_file,
+ raw_inputs=raw_inputs,
+ lm_train_config=lm_train_config,
+ lm_file=lm_file,
+ token_type=token_type,
+ key_file=key_file,
+ word_lm_train_config=word_lm_train_config,
+ bpemodel=bpemodel,
+ allow_variable_data_keys=allow_variable_data_keys,
+ streaming=streaming,
+ output_dir=output_dir,
+ dtype=dtype,
+ seed=seed,
+ ngram_weight=ngram_weight,
+ nbest=nbest,
+ num_workers=num_workers,
+
+ **kwargs,
+ )
+ return inference_pipeline(data_path_and_name_and_type, raw_inputs)
+
+
+def inference_modelscope(
+ maxlenratio: float,
+ minlenratio: float,
+ batch_size: int,
+ beam_size: int,
+ ngpu: int,
+ ctc_weight: float,
+ lm_weight: float,
+ penalty: float,
+ log_level: Union[int, str],
+ # data_path_and_name_and_type,
+ asr_train_config: Optional[str],
+ asr_model_file: Optional[str],
+ cmvn_file: Optional[str] = None,
+ lm_train_config: Optional[str] = None,
+ lm_file: Optional[str] = None,
+ token_type: Optional[str] = None,
+ key_file: Optional[str] = None,
+ word_lm_train_config: Optional[str] = None,
+ bpemodel: Optional[str] = None,
+ allow_variable_data_keys: bool = False,
+ dtype: str = "float32",
+ seed: int = 0,
+ ngram_weight: float = 0.9,
+ nbest: int = 1,
+ num_workers: int = 1,
+ output_dir: Optional[str] = None,
+ param_dict: dict = None,
+ **kwargs,
+):
+ assert check_argument_types()
+
+ if word_lm_train_config is not None:
+ raise NotImplementedError("Word LM is not implemented")
+ if ngpu > 1:
+ raise NotImplementedError("only single GPU decoding is supported")
+
+ logging.basicConfig(
+ level=log_level,
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+ )
+
+ export_mode = False
+ if param_dict is not None:
+ hotword_list_or_file = param_dict.get('hotword')
+ export_mode = param_dict.get("export_mode", False)
+ else:
+ hotword_list_or_file = None
+
+ if ngpu >= 1 and torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+ batch_size = 1
+
+ # 1. Set random-seed
+ set_all_random_seed(seed)
+
+ # 2. Build speech2text
+ speech2text_kwargs = dict(
+ asr_train_config=asr_train_config,
+ asr_model_file=asr_model_file,
+ cmvn_file=cmvn_file,
+ lm_train_config=lm_train_config,
+ lm_file=lm_file,
+ token_type=token_type,
+ bpemodel=bpemodel,
+ device=device,
+ maxlenratio=maxlenratio,
+ minlenratio=minlenratio,
+ dtype=dtype,
+ beam_size=beam_size,
+ ctc_weight=ctc_weight,
+ lm_weight=lm_weight,
+ ngram_weight=ngram_weight,
+ penalty=penalty,
+ nbest=nbest,
+ hotword_list_or_file=hotword_list_or_file,
+ )
+ if export_mode:
+ speech2text = Speech2TextExport(**speech2text_kwargs)
+ else:
+ speech2text = Speech2Text(**speech2text_kwargs)
+
+ def _forward(
+ data_path_and_name_and_type,
+ raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ output_dir_v2: Optional[str] = None,
+ fs: dict = None,
+ param_dict: dict = None,
+ **kwargs,
+ ):
+
+ hotword_list_or_file = None
+ if param_dict is not None:
+ hotword_list_or_file = param_dict.get('hotword')
+ if 'hotword' in kwargs:
+ hotword_list_or_file = kwargs['hotword']
+ if hotword_list_or_file is not None or 'hotword' in kwargs:
+ speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file)
+
+ # 3. Build data-iterator
+ if data_path_and_name_and_type is None and raw_inputs is not None:
+ if isinstance(raw_inputs, torch.Tensor):
+ raw_inputs = raw_inputs.numpy()
+ data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+ loader = ASRTask.build_streaming_iterator(
+ data_path_and_name_and_type,
+ dtype=dtype,
+ fs=fs,
+ batch_size=batch_size,
+ key_file=key_file,
+ num_workers=num_workers,
+ preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
+ collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
+ allow_variable_data_keys=allow_variable_data_keys,
+ inference=True,
+ )
+
+ if param_dict is not None:
+ use_timestamp = param_dict.get('use_timestamp', True)
+ else:
+ use_timestamp = True
+
+ forward_time_total = 0.0
+ length_total = 0.0
+ finish_count = 0
+ file_count = 1
+ cache = None
+ # 7 .Start for-loop
+ # FIXME(kamo): The output format should be discussed about
+ asr_result_list = []
+ output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+ if output_path is not None:
+ writer = DatadirWriter(output_path)
+ else:
+ writer = None
+ if param_dict is not None and "cache" in param_dict:
+ cache = param_dict["cache"]
+ for keys, batch in loader:
+ assert isinstance(batch, dict), type(batch)
+ assert all(isinstance(s, str) for s in keys), keys
+ _bs = len(next(iter(batch.values())))
+ assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+ # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}
+ logging.info("decoding, utt_id: {}".format(keys))
+ # N-best list of (text, token, token_int, hyp_object)
+
+ time_beg = time.time()
+ results = speech2text(cache=cache, **batch)
+ if len(results) < 1:
+ hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
+ results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
+ time_end = time.time()
+ forward_time = time_end - time_beg
+ lfr_factor = results[0][-1]
+ length = results[0][-2]
+ forward_time_total += forward_time
+ length_total += length
+ rtf_cur = "decoding, feature length: {}, forward_time: {:.4f}, rtf: {:.4f}".format(length, forward_time,
+ 100 * forward_time / (
+ length * lfr_factor))
+ logging.info(rtf_cur)
+
+ for batch_id in range(_bs):
+ result = [results[batch_id][:-2]]
+
+ key = keys[batch_id]
+ for n, result in zip(range(1, nbest + 1), result):
+ text, token, token_int, hyp = result[0], result[1], result[2], result[3]
+ time_stamp = None if len(result) < 5 else result[4]
+ # Create a directory: outdir/{n}best_recog
+ if writer is not None:
+ ibest_writer = writer[f"{n}best_recog"]
+
+ # Write the result to each file
+ ibest_writer["token"][key] = " ".join(token)
+ # ibest_writer["token_int"][key] = " ".join(map(str, token_int))
+ ibest_writer["score"][key] = str(hyp.score)
+ ibest_writer["rtf"][key] = rtf_cur
+
+ if text is not None:
+ if use_timestamp and time_stamp is not None:
+ postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
+ else:
+ postprocessed_result = postprocess_utils.sentence_postprocess(token)
+ time_stamp_postprocessed = ""
+ if len(postprocessed_result) == 3:
+ text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
+ postprocessed_result[1], \
+ postprocessed_result[2]
+ else:
+ text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
+ item = {'key': key, 'value': text_postprocessed}
+ if time_stamp_postprocessed != "":
+ item['time_stamp'] = time_stamp_postprocessed
+ asr_result_list.append(item)
+ finish_count += 1
+ # asr_utils.print_progress(finish_count / file_count)
+ if writer is not None:
+ ibest_writer["text"][key] = text_postprocessed
+
+ logging.info("decoding, utt: {}, predictions: {}".format(key, text))
+ rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total,
+ forward_time_total,
+ 100 * forward_time_total / (
+ length_total * lfr_factor))
+ logging.info(rtf_avg)
+ if writer is not None:
+ ibest_writer["rtf"]["rtf_avf"] = rtf_avg
+ return asr_result_list
+
+ return _forward
+
+
+def get_parser():
+ parser = config_argparse.ArgumentParser(
+ description="ASR Decoding",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+
+ # Note(kamo): Use '_' instead of '-' as separator.
+ # '-' is confusing if written in yaml.
+ parser.add_argument(
+ "--log_level",
+ type=lambda x: x.upper(),
+ default="INFO",
+ choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
+ help="The verbose level of logging",
+ )
+
+ parser.add_argument("--output_dir", type=str, required=True)
+ parser.add_argument(
+ "--ngpu",
+ type=int,
+ default=0,
+ help="The number of gpus. 0 indicates CPU mode",
+ )
+ parser.add_argument("--seed", type=int, default=0, help="Random seed")
+ parser.add_argument(
+ "--dtype",
+ default="float32",
+ choices=["float16", "float32", "float64"],
+ help="Data type",
+ )
+ parser.add_argument(
+ "--num_workers",
+ type=int,
+ default=1,
+ help="The number of workers used for DataLoader",
+ )
+ parser.add_argument(
+ "--hotword",
+ type=str_or_none,
+ default=None,
+        help="hotword file path or hotwords separated by space"
+ )
+ group = parser.add_argument_group("Input data related")
+ group.add_argument(
+ "--data_path_and_name_and_type",
+ type=str2triple_str,
+ required=False,
+ action="append",
+ )
+ group.add_argument("--key_file", type=str_or_none)
+ group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
+
+ group = parser.add_argument_group("The model configuration related")
+ group.add_argument(
+ "--asr_train_config",
+ type=str,
+ help="ASR training configuration",
+ )
+ group.add_argument(
+ "--asr_model_file",
+ type=str,
+ help="ASR model parameter file",
+ )
+ group.add_argument(
+ "--cmvn_file",
+ type=str,
+ help="Global cmvn file",
+ )
+ group.add_argument(
+ "--lm_train_config",
+ type=str,
+ help="LM training configuration",
+ )
+ group.add_argument(
+ "--lm_file",
+ type=str,
+ help="LM parameter file",
+ )
+ group.add_argument(
+ "--word_lm_train_config",
+ type=str,
+ help="Word LM training configuration",
+ )
+ group.add_argument(
+ "--word_lm_file",
+ type=str,
+ help="Word LM parameter file",
+ )
+ group.add_argument(
+ "--ngram_file",
+ type=str,
+ help="N-gram parameter file",
+ )
+ group.add_argument(
+ "--model_tag",
+ type=str,
+ help="Pretrained model tag. If specify this option, *_train_config and "
+ "*_file will be overwritten",
+ )
+
+ group = parser.add_argument_group("Beam-search related")
+ group.add_argument(
+ "--batch_size",
+ type=int,
+ default=1,
+ help="The batch size for inference",
+ )
+ group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
+ group.add_argument("--beam_size", type=int, default=20, help="Beam size")
+ group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
+ group.add_argument(
+ "--maxlenratio",
+ type=float,
+ default=0.0,
+ help="Input length ratio to obtain max output length. "
+ "If maxlenratio=0.0 (default), it uses a end-detect "
+ "function "
+ "to automatically find maximum hypothesis lengths."
+ "If maxlenratio<0.0, its absolute value is interpreted"
+ "as a constant max output length",
+ )
+ group.add_argument(
+ "--minlenratio",
+ type=float,
+ default=0.0,
+ help="Input length ratio to obtain min output length",
+ )
+ group.add_argument(
+ "--ctc_weight",
+ type=float,
+ default=0.5,
+ help="CTC weight in joint decoding",
+ )
+ group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
+ group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
+ group.add_argument("--streaming", type=str2bool, default=False)
+
+ group.add_argument(
+ "--frontend_conf",
+ default=None,
+ help="",
+ )
+ group.add_argument("--raw_inputs", type=list, default=None)
+ # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
+
+ group = parser.add_argument_group("Text converter related")
+ group.add_argument(
+ "--token_type",
+ type=str_or_none,
+ default=None,
+ choices=["char", "bpe", None],
+ help="The token type for ASR model. "
+ "If not given, refers from the training args",
+ )
+ group.add_argument(
+ "--bpemodel",
+ type=str_or_none,
+ default=None,
+ help="The model path of sentencepiece. "
+ "If not given, refers from the training args",
+ )
+
+ return parser
+
+
+def main(cmd=None):
+ print(get_commandline_args(), file=sys.stderr)
+ parser = get_parser()
+ args = parser.parse_args(cmd)
+ param_dict = {'hotword': args.hotword}
+ kwargs = vars(args)
+ kwargs.pop("config", None)
+ kwargs['param_dict'] = param_dict
+ inference(**kwargs)
+
+
+if __name__ == "__main__":
+ main()
+
+ # from modelscope.pipelines import pipeline
+ # from modelscope.utils.constant import Tasks
+ #
+ # inference_16k_pipline = pipeline(
+ # task=Tasks.auto_speech_recognition,
+ # model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
+ #
+ # rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+ # print(rec_result)
+
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index 13208778f..3f5775195 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -44,11 +44,10 @@ from funasr.utils import asr_utils, wav_utils, postprocess_utils
from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.tasks.vad import VADTask
from funasr.bin.vad_inference import Speech2VadSegment
-from funasr.utils.timestamp_tools import time_stamp_lfr6_pl
+from funasr.utils.timestamp_tools import time_stamp_sentence, ts_prediction_lfr6_standard
from funasr.bin.punctuation_infer import Text2Punc
from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
-from funasr.utils.timestamp_tools import time_stamp_sentence
header_colors = '\033[95m'
end_colors = '\033[0m'
@@ -59,7 +58,7 @@ class Speech2Text:
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
@@ -257,7 +256,7 @@ class Speech2Text:
decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
if isinstance(self.asr_model, BiCifParaformer):
- _, _, us_alphas, us_cif_peak = self.asr_model.calc_predictor_timestamp(enc, enc_len,
+ _, _, us_alphas, us_peaks = self.asr_model.calc_predictor_timestamp(enc, enc_len,
pre_token_length) # test no bias cif2
results = []
@@ -303,7 +302,10 @@ class Speech2Text:
text = None
if isinstance(self.asr_model, BiCifParaformer):
- timestamp = time_stamp_lfr6_pl(us_alphas[i], us_cif_peak[i], copy.copy(token), begin_time, end_time)
+ _, timestamp = ts_prediction_lfr6_standard(us_alphas[i],
+ us_peaks[i],
+ copy.copy(token),
+ vad_offset=begin_time)
results.append((text, token, token_int, timestamp, enc_len_batch_total, lfr_factor))
else:
results.append((text, token, token_int, enc_len_batch_total, lfr_factor))
diff --git a/funasr/bin/asr_inference_rnnt.py b/funasr/bin/asr_inference_rnnt.py
index 6cd70613b..4a9ff0bda 100644
--- a/funasr/bin/asr_inference_rnnt.py
+++ b/funasr/bin/asr_inference_rnnt.py
@@ -49,7 +49,7 @@ class Speech2Text:
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 8b31fad13..ac71538a6 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -46,7 +46,7 @@ class Speech2Text:
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_uniasr_vad.py b/funasr/bin/asr_inference_uniasr_vad.py
index e5815df11..7cb889b7d 100644
--- a/funasr/bin/asr_inference_uniasr_vad.py
+++ b/funasr/bin/asr_inference_uniasr_vad.py
@@ -46,7 +46,7 @@ class Speech2Text:
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/build_trainer.py b/funasr/bin/build_trainer.py
index 8dee75827..94f72627e 100644
--- a/funasr/bin/build_trainer.py
+++ b/funasr/bin/build_trainer.py
@@ -28,7 +28,9 @@ def parse_args(mode):
elif mode == "uniasr":
from funasr.tasks.asr import ASRTaskUniASR as ASRTask
elif mode == "mfcca":
- from funasr.tasks.asr import ASRTaskMFCCA as ASRTask
+ from funasr.tasks.asr import ASRTaskMFCCA as ASRTask
+ elif mode == "tp":
+ from funasr.tasks.asr import ASRTaskAligner as ASRTask
else:
raise ValueError("Unknown mode: {}".format(mode))
parser = ASRTask.get_parser()
diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py
index 7738f4f4f..85e451836 100755
--- a/funasr/bin/diar_inference_launch.py
+++ b/funasr/bin/diar_inference_launch.py
@@ -133,7 +133,7 @@ def inference_launch(mode, **kwargs):
param_dict = {
"extract_profile": True,
"sv_train_config": "sv.yaml",
- "sv_model_file": "sv.pth",
+ "sv_model_file": "sv.pb",
}
if "param_dict" in kwargs and kwargs["param_dict"] is not None:
for key in param_dict:
@@ -142,6 +142,9 @@ def inference_launch(mode, **kwargs):
else:
kwargs["param_dict"] = param_dict
return inference_modelscope(mode=mode, **kwargs)
+ elif mode == "eend-ola":
+ from funasr.bin.eend_ola_inference import inference_modelscope
+ return inference_modelscope(mode=mode, **kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
new file mode 100755
index 000000000..01d3f296a
--- /dev/null
+++ b/funasr/bin/eend_ola_inference.py
@@ -0,0 +1,427 @@
+#!/usr/bin/env python3
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+# MIT License (https://opensource.org/licenses/MIT)
+
+import argparse
+import logging
+import os
+import sys
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+
+import numpy as np
+import torch
+from scipy.signal import medfilt
+from typeguard import check_argument_types
+
+from funasr.models.frontend.wav_frontend import WavFrontendMel23
+from funasr.tasks.diar import EENDOLADiarTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.utils import config_argparse
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+
+
+class Speech2Diarization:
+    """Speech2Diarization class
+
+ Examples:
+ >>> import soundfile
+ >>> import numpy as np
+ >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
+ >>> profile = np.load("profiles.npy")
+ >>> audio, rate = soundfile.read("speech.wav")
+ >>> speech2diar(audio, profile)
+ {"spk1": [(int, int), ...], ...}
+
+ """
+
+ def __init__(
+ self,
+ diar_train_config: Union[Path, str] = None,
+ diar_model_file: Union[Path, str] = None,
+ device: str = "cpu",
+ dtype: str = "float32",
+ ):
+ assert check_argument_types()
+
+ # 1. Build Diarization model
+ diar_model, diar_train_args = EENDOLADiarTask.build_model_from_file(
+ config_file=diar_train_config,
+ model_file=diar_model_file,
+ device=device
+ )
+ frontend = None
+ if diar_train_args.frontend is not None and diar_train_args.frontend_conf is not None:
+ frontend = WavFrontendMel23(**diar_train_args.frontend_conf)
+
+ # set up seed for eda
+ np.random.seed(diar_train_args.seed)
+ torch.manual_seed(diar_train_args.seed)
+ torch.cuda.manual_seed(diar_train_args.seed)
+ os.environ['PYTORCH_SEED'] = str(diar_train_args.seed)
+ logging.info("diar_model: {}".format(diar_model))
+ logging.info("diar_train_args: {}".format(diar_train_args))
+ diar_model.to(dtype=getattr(torch, dtype)).eval()
+
+ self.diar_model = diar_model
+ self.diar_train_args = diar_train_args
+ self.device = device
+ self.dtype = dtype
+ self.frontend = frontend
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ speech: Union[torch.Tensor, np.ndarray],
+ speech_lengths: Union[torch.Tensor, np.ndarray] = None
+ ):
+ """Inference
+
+ Args:
+ speech: Input speech data
+ Returns:
+ diarization results
+
+ """
+ assert check_argument_types()
+ # Input as audio signal
+ if isinstance(speech, np.ndarray):
+ speech = torch.tensor(speech)
+
+ if self.frontend is not None:
+ feats, feats_len = self.frontend.forward(speech, speech_lengths)
+ feats = to_device(feats, device=self.device)
+ feats_len = feats_len.int()
+ self.diar_model.frontend = None
+ else:
+ feats = speech
+ feats_len = speech_lengths
+ batch = {"speech": feats, "speech_lengths": feats_len}
+ batch = to_device(batch, device=self.device)
+ results = self.diar_model.estimate_sequential(**batch)
+
+ return results
+
+ @staticmethod
+ def from_pretrained(
+ model_tag: Optional[str] = None,
+ **kwargs: Optional[Any],
+ ):
+ """Build Speech2Diarization instance from the pretrained model.
+
+ Args:
+ model_tag (Optional[str]): Model tag of the pretrained models.
+ Currently, the tags of espnet_model_zoo are supported.
+
+ Returns:
+ Speech2Diarization: Speech2Diarization instance.
+
+ """
+ if model_tag is not None:
+ try:
+ from espnet_model_zoo.downloader import ModelDownloader
+
+ except ImportError:
+ logging.error(
+ "`espnet_model_zoo` is not installed. "
+ "Please install via `pip install -U espnet_model_zoo`."
+ )
+ raise
+ d = ModelDownloader()
+ kwargs.update(**d.download_and_unpack(model_tag))
+
+ return Speech2Diarization(**kwargs)
+
+
+def inference_modelscope(
+ diar_train_config: str,
+ diar_model_file: str,
+ output_dir: Optional[str] = None,
+ batch_size: int = 1,
+ dtype: str = "float32",
+ ngpu: int = 1,
+ num_workers: int = 0,
+ log_level: Union[int, str] = "INFO",
+ key_file: Optional[str] = None,
+ model_tag: Optional[str] = None,
+ allow_variable_data_keys: bool = True,
+ streaming: bool = False,
+ param_dict: Optional[dict] = None,
+ **kwargs,
+):
+ assert check_argument_types()
+ if batch_size > 1:
+ raise NotImplementedError("batch decoding is not implemented")
+ if ngpu > 1:
+ raise NotImplementedError("only single GPU decoding is supported")
+
+ logging.basicConfig(
+ level=log_level,
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+ )
+ logging.info("param_dict: {}".format(param_dict))
+
+ if ngpu >= 1 and torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+
+ # 1. Build speech2diar
+ speech2diar_kwargs = dict(
+ diar_train_config=diar_train_config,
+ diar_model_file=diar_model_file,
+ device=device,
+ dtype=dtype,
+ )
+ logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
+ speech2diar = Speech2Diarization.from_pretrained(
+ model_tag=model_tag,
+ **speech2diar_kwargs,
+ )
+ speech2diar.diar_model.eval()
+
+ def output_results_str(results: dict, uttid: str):
+ rst = []
+ mid = uttid.rsplit("-", 1)[0]
+ for key in results:
+ results[key] = [(x[0] / 100, x[1] / 100) for x in results[key]]
+        template = "SPEAKER {} 0 {:.2f} {:.2f} <NA> <NA> {} <NA> <NA>"
+ for spk, segs in results.items():
+ rst.extend([template.format(mid, st, ed, spk) for st, ed in segs])
+
+ return "\n".join(rst)
+
+ def _forward(
+ data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
+ raw_inputs: List[List[Union[np.ndarray, torch.Tensor, str, bytes]]] = None,
+ output_dir_v2: Optional[str] = None,
+ param_dict: Optional[dict] = None,
+ ):
+ # 2. Build data-iterator
+ if data_path_and_name_and_type is None and raw_inputs is not None:
+ if isinstance(raw_inputs, torch.Tensor):
+ raw_inputs = raw_inputs.numpy()
+ data_path_and_name_and_type = [raw_inputs[0], "speech", "sound"]
+ loader = EENDOLADiarTask.build_streaming_iterator(
+ data_path_and_name_and_type,
+ dtype=dtype,
+ batch_size=batch_size,
+ key_file=key_file,
+ num_workers=num_workers,
+ preprocess_fn=EENDOLADiarTask.build_preprocess_fn(speech2diar.diar_train_args, False),
+ collate_fn=EENDOLADiarTask.build_collate_fn(speech2diar.diar_train_args, False),
+ allow_variable_data_keys=allow_variable_data_keys,
+ inference=True,
+ )
+
+ # 3. Start for-loop
+ output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+ if output_path is not None:
+ os.makedirs(output_path, exist_ok=True)
+ output_writer = open("{}/result.txt".format(output_path), "w")
+ result_list = []
+ for keys, batch in loader:
+ assert isinstance(batch, dict), type(batch)
+ assert all(isinstance(s, str) for s in keys), keys
+ _bs = len(next(iter(batch.values())))
+ assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+ # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
+
+ results = speech2diar(**batch)
+
+ # post process
+ a = results[0][0].cpu().numpy()
+ a = medfilt(a, (11, 1))
+ rst = []
+ for spkid, frames in enumerate(a.T):
+ frames = np.pad(frames, (1, 1), 'constant')
+ changes, = np.where(np.diff(frames, axis=0) != 0)
+            fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
+ for s, e in zip(changes[::2], changes[1::2]):
+ st = s / 10.
+ dur = (e - s) / 10.
+ rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid))))
+
+ # Only supporting batch_size==1
+ value = "\n".join(rst)
+ item = {"key": keys[0], "value": value}
+ result_list.append(item)
+ if output_path is not None:
+ output_writer.write(value)
+ output_writer.flush()
+
+ if output_path is not None:
+ output_writer.close()
+
+ return result_list
+
+ return _forward
+
+
+def inference(
+ data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
+ diar_train_config: Optional[str],
+ diar_model_file: Optional[str],
+ output_dir: Optional[str] = None,
+ batch_size: int = 1,
+ dtype: str = "float32",
+ ngpu: int = 0,
+ seed: int = 0,
+ num_workers: int = 1,
+ log_level: Union[int, str] = "INFO",
+ key_file: Optional[str] = None,
+ model_tag: Optional[str] = None,
+ allow_variable_data_keys: bool = True,
+ streaming: bool = False,
+ smooth_size: int = 83,
+ dur_threshold: int = 10,
+ out_format: str = "vad",
+ **kwargs,
+):
+ inference_pipeline = inference_modelscope(
+ diar_train_config=diar_train_config,
+ diar_model_file=diar_model_file,
+ output_dir=output_dir,
+ batch_size=batch_size,
+ dtype=dtype,
+ ngpu=ngpu,
+ seed=seed,
+ num_workers=num_workers,
+ log_level=log_level,
+ key_file=key_file,
+ model_tag=model_tag,
+ allow_variable_data_keys=allow_variable_data_keys,
+ streaming=streaming,
+ smooth_size=smooth_size,
+ dur_threshold=dur_threshold,
+ out_format=out_format,
+ **kwargs,
+ )
+
+ return inference_pipeline(data_path_and_name_and_type, raw_inputs=None)
+
+
+def get_parser():
+ parser = config_argparse.ArgumentParser(
+ description="Speaker verification/x-vector extraction",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+
+ # Note(kamo): Use '_' instead of '-' as separator.
+ # '-' is confusing if written in yaml.
+ parser.add_argument(
+ "--log_level",
+ type=lambda x: x.upper(),
+ default="INFO",
+ choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
+ help="The verbose level of logging",
+ )
+
+ parser.add_argument("--output_dir", type=str, required=False)
+ parser.add_argument(
+ "--ngpu",
+ type=int,
+ default=0,
+ help="The number of gpus. 0 indicates CPU mode",
+ )
+ parser.add_argument(
+ "--gpuid_list",
+ type=str,
+ default="",
+ help="The visible gpus",
+ )
+ parser.add_argument("--seed", type=int, default=0, help="Random seed")
+ parser.add_argument(
+ "--dtype",
+ default="float32",
+ choices=["float16", "float32", "float64"],
+ help="Data type",
+ )
+ parser.add_argument(
+ "--num_workers",
+ type=int,
+ default=1,
+ help="The number of workers used for DataLoader",
+ )
+
+ group = parser.add_argument_group("Input data related")
+ group.add_argument(
+ "--data_path_and_name_and_type",
+ type=str2triple_str,
+ required=False,
+ action="append",
+ )
+ group.add_argument("--key_file", type=str_or_none)
+ group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
+
+ group = parser.add_argument_group("The model configuration related")
+ group.add_argument(
+ "--diar_train_config",
+ type=str,
+ help="diarization training configuration",
+ )
+ group.add_argument(
+ "--diar_model_file",
+ type=str,
+ help="diarization model parameter file",
+ )
+ group.add_argument(
+ "--dur_threshold",
+ type=int,
+ default=10,
+ help="The threshold for short segments in number frames"
+ )
+ parser.add_argument(
+ "--smooth_size",
+ type=int,
+ default=83,
+ help="The smoothing window length in number frames"
+ )
+ group.add_argument(
+ "--model_tag",
+ type=str,
+ help="Pretrained model tag. If specify this option, *_train_config and "
+ "*_file will be overwritten",
+ )
+ parser.add_argument(
+ "--batch_size",
+ type=int,
+ default=1,
+ help="The batch size for inference",
+ )
+ parser.add_argument("--streaming", type=str2bool, default=False)
+
+ return parser
+
+
+def main(cmd=None):
+ print(get_commandline_args(), file=sys.stderr)
+ parser = get_parser()
+ args = parser.parse_args(cmd)
+ kwargs = vars(args)
+ kwargs.pop("config", None)
+ logging.info("args: {}".format(kwargs))
+ if args.output_dir is None:
+ jobid, n_gpu = 1, 1
+ gpuid = args.gpuid_list.split(",")[jobid - 1]
+ else:
+ jobid = int(args.output_dir.split(".")[-1])
+ n_gpu = len(args.gpuid_list.split(","))
+ gpuid = args.gpuid_list.split(",")[(jobid - 1) % n_gpu]
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+ os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
+ results_list = inference(**kwargs)
+ for results in results_list:
+ print("{} {}".format(results["key"], results["value"]))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/funasr/bin/punc_inference_launch.py b/funasr/bin/punc_inference_launch.py
index 53db1dfca..e7e3f1558 100755
--- a/funasr/bin/punc_inference_launch.py
+++ b/funasr/bin/punc_inference_launch.py
@@ -75,6 +75,9 @@ def inference_launch(mode, **kwargs):
if mode == "punc":
from funasr.bin.punctuation_infer import inference_modelscope
return inference_modelscope(**kwargs)
+ if mode == "punc_VadRealtime":
+ from funasr.bin.punctuation_infer_vadrealtime import inference_modelscope
+ return inference_modelscope(**kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
diff --git a/funasr/bin/punctuation_infer_vadrealtime.py b/funasr/bin/punctuation_infer_vadrealtime.py
new file mode 100644
index 000000000..d6cc15332
--- /dev/null
+++ b/funasr/bin/punctuation_infer_vadrealtime.py
@@ -0,0 +1,335 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+from pathlib import Path
+import sys
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+from typing import Any
+from typing import List
+
+import numpy as np
+import torch
+from typeguard import check_argument_types
+
+from funasr.datasets.preprocessor import CodeMixTokenizerCommonPreprocessor
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.tasks.punctuation import PunctuationTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.forward_adaptor import ForwardAdaptor
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.punctuation.text_preprocessor import split_to_mini_sentence
+
+
+class Text2Punc:
+
+ def __init__(
+ self,
+ train_config: Optional[str],
+ model_file: Optional[str],
+ device: str = "cpu",
+ dtype: str = "float32",
+ ):
+ # Build Model
+ model, train_args = PunctuationTask.build_model_from_file(train_config, model_file, device)
+ self.device = device
+        # Wrap model to make model.nll() data-parallel
+ self.wrapped_model = ForwardAdaptor(model, "inference")
+ self.wrapped_model.to(dtype=getattr(torch, dtype)).to(device=device).eval()
+ # logging.info(f"Model:\n{model}")
+ self.punc_list = train_args.punc_list
+ self.period = 0
+ for i in range(len(self.punc_list)):
+ if self.punc_list[i] == ",":
+                self.punc_list[i] = "，"
+ elif self.punc_list[i] == "?":
+                self.punc_list[i] = "？"
+ elif self.punc_list[i] == "。":
+ self.period = i
+ self.preprocessor = CodeMixTokenizerCommonPreprocessor(
+ train=False,
+ token_type=train_args.token_type,
+ token_list=train_args.token_list,
+ bpemodel=train_args.bpemodel,
+ text_cleaner=train_args.cleaner,
+ g2p_type=train_args.g2p,
+ text_name="text",
+ non_linguistic_symbols=train_args.non_linguistic_symbols,
+ )
+ print("start decoding!!!")
+
+ @torch.no_grad()
+ def __call__(self, text: Union[list, str], cache: list, split_size=20):
+ if cache is not None and len(cache) > 0:
+ precache = "".join(cache)
+ else:
+ precache = ""
+ data = {"text": precache + text}
+ result = self.preprocessor(data=data, uid="12938712838719")
+ split_text = self.preprocessor.pop_split_text_data(result)
+ mini_sentences = split_to_mini_sentence(split_text, split_size)
+ mini_sentences_id = split_to_mini_sentence(data["text"], split_size)
+ assert len(mini_sentences) == len(mini_sentences_id)
+ cache_sent = []
+ cache_sent_id = torch.from_numpy(np.array([], dtype='int32'))
+ sentence_punc_list = []
+ sentence_words_list= []
+ cache_pop_trigger_limit = 200
+ skip_num = 0
+ for mini_sentence_i in range(len(mini_sentences)):
+ mini_sentence = mini_sentences[mini_sentence_i]
+ mini_sentence_id = mini_sentences_id[mini_sentence_i]
+ mini_sentence = cache_sent + mini_sentence
+ mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0)
+ data = {
+ "text": torch.unsqueeze(torch.from_numpy(mini_sentence_id), 0),
+ "text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype='int32')),
+ "vad_indexes": torch.from_numpy(np.array([len(cache)-1], dtype='int32')),
+ }
+ data = to_device(data, self.device)
+ y, _ = self.wrapped_model(**data)
+ _, indices = y.view(-1, y.shape[-1]).topk(1, dim=1)
+ punctuations = indices
+ if indices.size()[0] != 1:
+ punctuations = torch.squeeze(indices)
+ assert punctuations.size()[0] == len(mini_sentence)
+
+ # Search for the last Period/QuestionMark as cache
+ if mini_sentence_i < len(mini_sentences) - 1:
+ sentenceEnd = -1
+ last_comma_index = -1
+ for i in range(len(punctuations) - 2, 1, -1):
+                    if self.punc_list[punctuations[i]] == "。" or self.punc_list[punctuations[i]] == "？":
+ sentenceEnd = i
+ break
+                    if last_comma_index < 0 and self.punc_list[punctuations[i]] == "，":
+ last_comma_index = i
+
+ if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0:
+ # The sentence it too long, cut off at a comma.
+ sentenceEnd = last_comma_index
+ punctuations[sentenceEnd] = self.period
+ cache_sent = mini_sentence[sentenceEnd + 1:]
+ cache_sent_id = mini_sentence_id[sentenceEnd + 1:]
+ mini_sentence = mini_sentence[0:sentenceEnd + 1]
+ punctuations = punctuations[0:sentenceEnd + 1]
+
+ punctuations_np = punctuations.cpu().numpy()
+ sentence_punc_list += [self.punc_list[int(x)] for x in punctuations_np]
+ sentence_words_list += mini_sentence
+
+ assert len(sentence_punc_list) == len(sentence_words_list)
+ words_with_punc = []
+ sentence_punc_list_out = []
+ for i in range(0, len(sentence_words_list)):
+ if i > 0:
+ if len(sentence_words_list[i][0].encode()) == 1 and len(sentence_words_list[i - 1][-1].encode()) == 1:
+ sentence_words_list[i] = " " + sentence_words_list[i]
+ if skip_num < len(cache):
+ skip_num += 1
+ else:
+ words_with_punc.append(sentence_words_list[i])
+ if skip_num >= len(cache):
+ sentence_punc_list_out.append(sentence_punc_list[i])
+ if sentence_punc_list[i] != "_":
+ words_with_punc.append(sentence_punc_list[i])
+ sentence_out = "".join(words_with_punc)
+
+ sentenceEnd = -1
+ for i in range(len(sentence_punc_list) - 2, 1, -1):
+            if sentence_punc_list[i] == "。" or sentence_punc_list[i] == "？":
+ sentenceEnd = i
+ break
+ cache_out = sentence_words_list[sentenceEnd + 1 :]
+ if sentence_out[-1] in self.punc_list:
+ sentence_out = sentence_out[:-1]
+ sentence_punc_list_out[-1] = "_"
+ return sentence_out, sentence_punc_list_out, cache_out
+
+
+def inference(
+ batch_size: int,
+ dtype: str,
+ ngpu: int,
+ seed: int,
+ num_workers: int,
+ output_dir: str,
+ log_level: Union[int, str],
+ train_config: Optional[str],
+ model_file: Optional[str],
+ key_file: Optional[str] = None,
+ data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
+ raw_inputs: Union[List[Any], bytes, str] = None,
+ cache: List[Any] = None,
+ param_dict: dict = None,
+ **kwargs,
+):
+ inference_pipeline = inference_modelscope(
+ output_dir=output_dir,
+ batch_size=batch_size,
+ dtype=dtype,
+ ngpu=ngpu,
+ seed=seed,
+ num_workers=num_workers,
+ log_level=log_level,
+ key_file=key_file,
+ train_config=train_config,
+ model_file=model_file,
+ param_dict=param_dict,
+ **kwargs,
+ )
+ return inference_pipeline(data_path_and_name_and_type, raw_inputs, cache)
+
+
+def inference_modelscope(
+ batch_size: int,
+ dtype: str,
+ ngpu: int,
+ seed: int,
+ num_workers: int,
+ log_level: Union[int, str],
+ #cache: list,
+ key_file: Optional[str],
+ train_config: Optional[str],
+ model_file: Optional[str],
+ output_dir: Optional[str] = None,
+ param_dict: dict = None,
+ **kwargs,
+):
+ assert check_argument_types()
+ logging.basicConfig(
+ level=log_level,
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+ )
+
+ if ngpu >= 1 and torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+
+ # 1. Set random-seed
+ set_all_random_seed(seed)
+ text2punc = Text2Punc(train_config, model_file, device)
+
+ def _forward(
+ data_path_and_name_and_type,
+ raw_inputs: Union[List[Any], bytes, str] = None,
+ output_dir_v2: Optional[str] = None,
+ cache: List[Any] = None,
+ param_dict: dict = None,
+ ):
+ results = []
+ split_size = 10
+
+ if raw_inputs != None:
+ line = raw_inputs.strip()
+ key = "demo"
+ if line == "":
+ item = {'key': key, 'value': ""}
+ results.append(item)
+ return results
+ #import pdb;pdb.set_trace()
+ result, _, cache = text2punc(line, cache)
+ item = {'key': key, 'value': result, 'cache': cache}
+ results.append(item)
+ return results
+
+ for inference_text, _, _ in data_path_and_name_and_type:
+ with open(inference_text, "r", encoding="utf-8") as fin:
+ for line in fin:
+ line = line.strip()
+ segs = line.split("\t")
+ if len(segs) != 2:
+ continue
+ key = segs[0]
+ if len(segs[1]) == 0:
+ continue
+ result, _ = text2punc(segs[1])
+ item = {'key': key, 'value': result}
+ results.append(item)
+ output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+ if output_path != None:
+ output_file_name = "infer.out"
+ Path(output_path).mkdir(parents=True, exist_ok=True)
+ output_file_path = (Path(output_path) / output_file_name).absolute()
+ with open(output_file_path, "w", encoding="utf-8") as fout:
+ for item_i in results:
+ key_out = item_i["key"]
+ value_out = item_i["value"]
+ fout.write(f"{key_out}\t{value_out}\n")
+ return results
+
+ return _forward
+
+
+def get_parser():
+ parser = config_argparse.ArgumentParser(
+ description="Punctuation inference",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+
+ parser.add_argument(
+ "--log_level",
+ type=lambda x: x.upper(),
+ default="INFO",
+ choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
+ help="The verbose level of logging",
+ )
+
+ parser.add_argument("--output_dir", type=str, required=False)
+ parser.add_argument(
+ "--ngpu",
+ type=int,
+ default=0,
+ help="The number of gpus. 0 indicates CPU mode",
+ )
+ parser.add_argument("--seed", type=int, default=0, help="Random seed")
+ parser.add_argument(
+ "--dtype",
+ default="float32",
+ choices=["float16", "float32", "float64"],
+ help="Data type",
+ )
+ parser.add_argument(
+ "--num_workers",
+ type=int,
+ default=1,
+ help="The number of workers used for DataLoader",
+ )
+ parser.add_argument(
+ "--batch_size",
+ type=int,
+ default=1,
+ help="The batch size for inference",
+ )
+
+ group = parser.add_argument_group("Input data related")
+ group.add_argument("--data_path_and_name_and_type", type=str2triple_str, action="append", required=False)
+ group.add_argument("--raw_inputs", type=str, required=False)
+ group.add_argument("--cache", type=list, required=False)
+ group.add_argument("--param_dict", type=dict, required=False)
+ group.add_argument("--key_file", type=str_or_none)
+
+ group = parser.add_argument_group("The model configuration related")
+ group.add_argument("--train_config", type=str)
+ group.add_argument("--model_file", type=str)
+
+ return parser
+
+
+def main(cmd=None):
+ print(get_commandline_args(), file=sys.stderr)
+ parser = get_parser()
+ args = parser.parse_args(cmd)
+ kwargs = vars(args)
+ # kwargs.pop("config", None)
+ inference(**kwargs)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/funasr/bin/sond_inference.py b/funasr/bin/sond_inference.py
index ab6d26f45..5a0a8e28f 100755
--- a/funasr/bin/sond_inference.py
+++ b/funasr/bin/sond_inference.py
@@ -42,7 +42,7 @@ class Speech2Diarization:
Examples:
>>> import soundfile
>>> import numpy as np
- >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
+ >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
>>> profile = np.load("profiles.npy")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2diar(audio, profile)
@@ -54,7 +54,7 @@ class Speech2Diarization:
self,
diar_train_config: Union[Path, str] = None,
diar_model_file: Union[Path, str] = None,
- device: str = "cpu",
+ device: Union[str, torch.device] = "cpu",
batch_size: int = 1,
dtype: str = "float32",
streaming: bool = False,
@@ -114,9 +114,19 @@ class Speech2Diarization:
# little-endian order: lower bit first
return (np.array(list(b)[::-1]) == '1').astype(dtype)
- return np.row_stack([int2vec(int(x), vec_dim) for x in seq])
+ # process oov
+ seq = np.array([int(x) for x in seq])
+ new_seq = []
+ for i, x in enumerate(seq):
+ if x < 2 ** vec_dim:
+ new_seq.append(x)
+ else:
+ idx_list = np.where(seq < 2 ** vec_dim)[0]
+ idx = np.abs(idx_list - i).argmin()
+ new_seq.append(seq[idx_list[idx]])
+ return np.row_stack([int2vec(x, vec_dim) for x in new_seq])
- def post_processing(self, raw_logits: torch.Tensor, spk_num: int):
+ def post_processing(self, raw_logits: torch.Tensor, spk_num: int, output_format: str = "speaker_turn"):
logits_idx = raw_logits.argmax(-1) # B, T, vocab_size -> B, T
# upsampling outputs to match inputs
ut = logits_idx.shape[1] * self.diar_model.encoder.time_ds_ratio
@@ -127,8 +137,14 @@ class Speech2Diarization:
).squeeze(1).long()
logits_idx = logits_idx[0].tolist()
pse_labels = [self.token_list[x] for x in logits_idx]
+ if output_format == "pse_labels":
+ return pse_labels, None
+
multi_labels = self.seq2arr(pse_labels, spk_num)[:, :spk_num] # remove padding speakers
multi_labels = self.smooth_multi_labels(multi_labels)
+ if output_format == "binary_labels":
+ return multi_labels, None
+
spk_list = ["spk{}".format(i + 1) for i in range(spk_num)]
spk_turns = self.calc_spk_turns(multi_labels, spk_list)
results = OrderedDict()
@@ -149,6 +165,7 @@ class Speech2Diarization:
self,
speech: Union[torch.Tensor, np.ndarray],
profile: Union[torch.Tensor, np.ndarray],
+ output_format: str = "speaker_turn"
):
"""Inference
@@ -178,7 +195,7 @@ class Speech2Diarization:
batch = to_device(batch, device=self.device)
logits = self.diar_model.prediction_forward(**batch)
- results, pse_labels = self.post_processing(logits, profile.shape[1])
+ results, pse_labels = self.post_processing(logits, profile.shape[1], output_format)
return results, pse_labels
@@ -367,7 +384,7 @@ def inference_modelscope(
pse_label_writer = open("{}/labels.txt".format(output_path), "w")
logging.info("Start to diarize...")
result_list = []
- for keys, batch in loader:
+ for idx, (keys, batch) in enumerate(loader):
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
@@ -385,6 +402,9 @@ def inference_modelscope(
pse_label_writer.write("{} {}\n".format(key, " ".join(pse_labels)))
pse_label_writer.flush()
+ if idx % 100 == 0:
+ logging.info("Processing {:5d}: {}".format(idx, key))
+
if output_path is not None:
output_writer.close()
pse_label_writer.close()
diff --git a/funasr/bin/sv_inference.py b/funasr/bin/sv_inference.py
index a78bccded..7e63bbd2d 100755
--- a/funasr/bin/sv_inference.py
+++ b/funasr/bin/sv_inference.py
@@ -36,7 +36,7 @@ class Speech2Xvector:
Examples:
>>> import soundfile
- >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pth")
+ >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2xvector(audio)
[(text, token, token_int, hypothesis object), ...]
@@ -169,7 +169,7 @@ def inference_modelscope(
log_level: Union[int, str] = "INFO",
key_file: Optional[str] = None,
sv_train_config: Optional[str] = "sv.yaml",
- sv_model_file: Optional[str] = "sv.pth",
+ sv_model_file: Optional[str] = "sv.pb",
model_tag: Optional[str] = None,
allow_variable_data_keys: bool = True,
streaming: bool = False,
diff --git a/funasr/bin/tp_inference.py b/funasr/bin/tp_inference.py
new file mode 100644
index 000000000..e374a227a
--- /dev/null
+++ b/funasr/bin/tp_inference.py
@@ -0,0 +1,379 @@
+import argparse
+import logging
+from optparse import Option
+import sys
+import json
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+from typing import Dict
+
+import numpy as np
+import torch
+from typeguard import check_argument_types
+
+from funasr.fileio.datadir_writer import DatadirWriter
+from funasr.datasets.preprocessor import LMPreprocessor
+from funasr.tasks.asr import ASRTaskAligner as ASRTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.models.frontend.wav_frontend import WavFrontend
+from funasr.text.token_id_converter import TokenIDConverter
+from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
+
+
+header_colors = '\033[95m'
+end_colors = '\033[0m'
+
+global_asr_language: str = 'zh-cn'
+global_sample_rate: Union[int, Dict[Any, int]] = {
+ 'audio_fs': 16000,
+ 'model_fs': 16000
+}
+
+
+class SpeechText2Timestamp:
+ def __init__(
+ self,
+ timestamp_infer_config: Union[Path, str] = None,
+ timestamp_model_file: Union[Path, str] = None,
+ timestamp_cmvn_file: Union[Path, str] = None,
+ device: str = "cpu",
+ dtype: str = "float32",
+ **kwargs,
+ ):
+ assert check_argument_types()
+ # 1. Build ASR model
+ tp_model, tp_train_args = ASRTask.build_model_from_file(
+ timestamp_infer_config, timestamp_model_file, device
+ )
+ if 'cuda' in device:
+ tp_model = tp_model.cuda() # force model to cuda
+
+ frontend = None
+ if tp_train_args.frontend is not None:
+ frontend = WavFrontend(cmvn_file=timestamp_cmvn_file, **tp_train_args.frontend_conf)
+
+ logging.info("tp_model: {}".format(tp_model))
+ logging.info("tp_train_args: {}".format(tp_train_args))
+ tp_model.to(dtype=getattr(torch, dtype)).eval()
+
+ logging.info(f"Decoding device={device}, dtype={dtype}")
+
+
+ self.tp_model = tp_model
+ self.tp_train_args = tp_train_args
+
+ token_list = self.tp_model.token_list
+ self.converter = TokenIDConverter(token_list=token_list)
+
+ self.device = device
+ self.dtype = dtype
+ self.frontend = frontend
+ self.encoder_downsampling_factor = 1
+ if tp_train_args.encoder_conf["input_layer"] == "conv2d":
+ self.encoder_downsampling_factor = 4
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ speech: Union[torch.Tensor, np.ndarray],
+ speech_lengths: Union[torch.Tensor, np.ndarray] = None,
+ text_lengths: Union[torch.Tensor, np.ndarray] = None
+ ):
+ assert check_argument_types()
+
+ # Input as audio signal
+ if isinstance(speech, np.ndarray):
+ speech = torch.tensor(speech)
+ if self.frontend is not None:
+ feats, feats_len = self.frontend.forward(speech, speech_lengths)
+ feats = to_device(feats, device=self.device)
+ feats_len = feats_len.int()
+ self.tp_model.frontend = None
+ else:
+ feats = speech
+ feats_len = speech_lengths
+
+ # lfr_factor = max(1, (feats.size()[-1]//80)-1)
+ batch = {"speech": feats, "speech_lengths": feats_len}
+
+ # a. To device
+ batch = to_device(batch, device=self.device)
+
+ # b. Forward Encoder
+ enc, enc_len = self.tp_model.encode(**batch)
+ if isinstance(enc, tuple):
+ enc = enc[0]
+
+ # c. Forward Predictor
+ _, _, us_alphas, us_cif_peak = self.tp_model.calc_predictor_timestamp(enc, enc_len, text_lengths.to(self.device)+1)
+ return us_alphas, us_cif_peak
+
+
+def inference(
+ batch_size: int,
+ ngpu: int,
+ log_level: Union[int, str],
+ data_path_and_name_and_type,
+ timestamp_infer_config: Optional[str],
+ timestamp_model_file: Optional[str],
+ timestamp_cmvn_file: Optional[str] = None,
+ raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ key_file: Optional[str] = None,
+ allow_variable_data_keys: bool = False,
+ output_dir: Optional[str] = None,
+ dtype: str = "float32",
+ seed: int = 0,
+ num_workers: int = 1,
+ split_with_space: bool = True,
+ seg_dict_file: Optional[str] = None,
+ **kwargs,
+):
+ inference_pipeline = inference_modelscope(
+ batch_size=batch_size,
+ ngpu=ngpu,
+ log_level=log_level,
+ timestamp_infer_config=timestamp_infer_config,
+ timestamp_model_file=timestamp_model_file,
+ timestamp_cmvn_file=timestamp_cmvn_file,
+ key_file=key_file,
+ allow_variable_data_keys=allow_variable_data_keys,
+ output_dir=output_dir,
+ dtype=dtype,
+ seed=seed,
+ num_workers=num_workers,
+ split_with_space=split_with_space,
+ seg_dict_file=seg_dict_file,
+ **kwargs,
+ )
+ return inference_pipeline(data_path_and_name_and_type, raw_inputs)
+
+
+def inference_modelscope(
+ batch_size: int,
+ ngpu: int,
+ log_level: Union[int, str],
+ # data_path_and_name_and_type,
+ timestamp_infer_config: Optional[str],
+ timestamp_model_file: Optional[str],
+ timestamp_cmvn_file: Optional[str] = None,
+ # raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ key_file: Optional[str] = None,
+ allow_variable_data_keys: bool = False,
+ output_dir: Optional[str] = None,
+ dtype: str = "float32",
+ seed: int = 0,
+ num_workers: int = 1,
+ split_with_space: bool = True,
+ seg_dict_file: Optional[str] = None,
+ **kwargs,
+):
+ assert check_argument_types()
+ if batch_size > 1:
+ raise NotImplementedError("batch decoding is not implemented")
+ if ngpu > 1:
+ raise NotImplementedError("only single GPU decoding is supported")
+
+ logging.basicConfig(
+ level=log_level,
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+ )
+
+ if ngpu >= 1 and torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+ # 1. Set random-seed
+ set_all_random_seed(seed)
+
+ # 2. Build speech2vadsegment
+ speechtext2timestamp_kwargs = dict(
+ timestamp_infer_config=timestamp_infer_config,
+ timestamp_model_file=timestamp_model_file,
+ timestamp_cmvn_file=timestamp_cmvn_file,
+ device=device,
+ dtype=dtype,
+ )
+ logging.info("speechtext2timestamp_kwargs: {}".format(speechtext2timestamp_kwargs))
+ speechtext2timestamp = SpeechText2Timestamp(**speechtext2timestamp_kwargs)
+
+ preprocessor = LMPreprocessor(
+ train=False,
+ token_type=speechtext2timestamp.tp_train_args.token_type,
+ token_list=speechtext2timestamp.tp_train_args.token_list,
+ bpemodel=None,
+ text_cleaner=None,
+ g2p_type=None,
+ text_name="text",
+ non_linguistic_symbols=speechtext2timestamp.tp_train_args.non_linguistic_symbols,
+ split_with_space=split_with_space,
+ seg_dict_file=seg_dict_file,
+ )
+
+ def _forward(
+ data_path_and_name_and_type,
+ raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ output_dir_v2: Optional[str] = None,
+ fs: dict = None,
+ param_dict: dict = None,
+ **kwargs
+ ):
+ # 3. Build data-iterator
+ if data_path_and_name_and_type is None and raw_inputs is not None:
+ if isinstance(raw_inputs, torch.Tensor):
+ raw_inputs = raw_inputs.numpy()
+ data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+
+ loader = ASRTask.build_streaming_iterator(
+ data_path_and_name_and_type,
+ dtype=dtype,
+ batch_size=batch_size,
+ key_file=key_file,
+ num_workers=num_workers,
+ preprocess_fn=preprocessor,
+ collate_fn=ASRTask.build_collate_fn(speechtext2timestamp.tp_train_args, False),
+ allow_variable_data_keys=allow_variable_data_keys,
+ inference=True,
+ )
+
+ tp_result_list = []
+ for keys, batch in loader:
+ assert isinstance(batch, dict), type(batch)
+ assert all(isinstance(s, str) for s in keys), keys
+ _bs = len(next(iter(batch.values())))
+ assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+
+ logging.info("timestamp predicting, utt_id: {}".format(keys))
+ _batch = {'speech':batch['speech'],
+ 'speech_lengths':batch['speech_lengths'],
+ 'text_lengths':batch['text_lengths']}
+ us_alphas, us_cif_peak = speechtext2timestamp(**_batch)
+
+ for batch_id in range(_bs):
+ key = keys[batch_id]
+ token = speechtext2timestamp.converter.ids2tokens(batch['text'][batch_id])
+ ts_str, ts_list = ts_prediction_lfr6_standard(us_alphas[batch_id], us_cif_peak[batch_id], token, force_time_shift=-3.0)
+ logging.warning(ts_str)
+ item = {'key': key, 'value': ts_str, 'timestamp':ts_list}
+ tp_result_list.append(item)
+ return tp_result_list
+
+ return _forward
+
+
+def get_parser():
+ parser = config_argparse.ArgumentParser(
+ description="Timestamp Prediction Inference",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+
+ # Note(kamo): Use '_' instead of '-' as separator.
+ # '-' is confusing if written in yaml.
+ parser.add_argument(
+ "--log_level",
+ type=lambda x: x.upper(),
+ default="INFO",
+ choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
+ help="The verbose level of logging",
+ )
+
+ parser.add_argument("--output_dir", type=str, required=False)
+ parser.add_argument(
+ "--ngpu",
+ type=int,
+ default=0,
+ help="The number of gpus. 0 indicates CPU mode",
+ )
+ parser.add_argument(
+ "--gpuid_list",
+ type=str,
+ default="",
+ help="The visible gpus",
+ )
+ parser.add_argument("--seed", type=int, default=0, help="Random seed")
+ parser.add_argument(
+ "--dtype",
+ default="float32",
+ choices=["float16", "float32", "float64"],
+ help="Data type",
+ )
+ parser.add_argument(
+ "--num_workers",
+ type=int,
+ default=0,
+ help="The number of workers used for DataLoader",
+ )
+
+ group = parser.add_argument_group("Input data related")
+ group.add_argument(
+ "--data_path_and_name_and_type",
+ type=str2triple_str,
+ required=False,
+ action="append",
+ )
+ group.add_argument("--raw_inputs", type=list, default=None)
+ # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
+ group.add_argument("--key_file", type=str_or_none)
+ group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
+
+ group = parser.add_argument_group("The model configuration related")
+ group.add_argument(
+ "--timestamp_infer_config",
+ type=str,
+        help="Timestamp prediction infer configuration",
+ )
+ group.add_argument(
+ "--timestamp_model_file",
+ type=str,
+        help="Timestamp prediction model parameter file",
+ )
+ group.add_argument(
+ "--timestamp_cmvn_file",
+ type=str,
+ help="Global cmvn file",
+ )
+
+ group = parser.add_argument_group("infer related")
+ group.add_argument(
+ "--batch_size",
+ type=int,
+ default=1,
+ help="The batch size for inference",
+ )
+ group.add_argument(
+ "--seg_dict_file",
+ type=str,
+ default=None,
+        help="The seg dict file used for text tokenization",
+ )
+ group.add_argument(
+ "--split_with_space",
+ type=bool,
+ default=False,
+        help="Whether to split the input text with spaces",
+ )
+
+ return parser
+
+
+def main(cmd=None):
+ print(get_commandline_args(), file=sys.stderr)
+ parser = get_parser()
+ args = parser.parse_args(cmd)
+ kwargs = vars(args)
+ kwargs.pop("config", None)
+ inference(**kwargs)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/funasr/bin/tp_inference_launch.py b/funasr/bin/tp_inference_launch.py
new file mode 100644
index 000000000..dd76df61b
--- /dev/null
+++ b/funasr/bin/tp_inference_launch.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+import argparse
+import logging
+import os
+import sys
+from typing import Union, Dict, Any
+
+from funasr.utils import config_argparse
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+
+
+def get_parser():
+ parser = config_argparse.ArgumentParser(
+ description="Timestamp Prediction Inference",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+
+ # Note(kamo): Use '_' instead of '-' as separator.
+ # '-' is confusing if written in yaml.
+ parser.add_argument(
+ "--log_level",
+ type=lambda x: x.upper(),
+ default="INFO",
+ choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
+ help="The verbose level of logging",
+ )
+
+ parser.add_argument("--output_dir", type=str, required=False)
+ parser.add_argument(
+ "--ngpu",
+ type=int,
+ default=0,
+ help="The number of gpus. 0 indicates CPU mode",
+ )
+ parser.add_argument(
+ "--njob",
+ type=int,
+ default=1,
+ help="The number of jobs for each gpu",
+ )
+ parser.add_argument(
+ "--gpuid_list",
+ type=str,
+ default="",
+ help="The visible gpus",
+ )
+ parser.add_argument("--seed", type=int, default=0, help="Random seed")
+ parser.add_argument(
+ "--dtype",
+ default="float32",
+ choices=["float16", "float32", "float64"],
+ help="Data type",
+ )
+ parser.add_argument(
+ "--num_workers",
+ type=int,
+ default=1,
+ help="The number of workers used for DataLoader",
+ )
+
+ group = parser.add_argument_group("Input data related")
+ group.add_argument(
+ "--data_path_and_name_and_type",
+ type=str2triple_str,
+ required=True,
+ action="append",
+ )
+ group.add_argument("--key_file", type=str_or_none)
+ group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
+
+ group = parser.add_argument_group("The model configuration related")
+ group.add_argument(
+ "--timestamp_infer_config",
+ type=str,
+        help="Timestamp prediction infer configuration",
+ )
+ group.add_argument(
+ "--timestamp_model_file",
+ type=str,
+        help="Timestamp prediction model parameter file",
+ )
+ group.add_argument(
+ "--timestamp_cmvn_file",
+ type=str,
+ help="Global CMVN file",
+ )
+
+ group = parser.add_argument_group("The inference configuration related")
+ group.add_argument(
+ "--batch_size",
+ type=int,
+ default=1,
+ help="The batch size for inference",
+ )
+ return parser
+
+
+def inference_launch(mode, **kwargs):
+ if mode == "tp_norm":
+ from funasr.bin.tp_inference import inference_modelscope
+ return inference_modelscope(**kwargs)
+ else:
+ logging.info("Unknown decoding mode: {}".format(mode))
+ return None
+
+def main(cmd=None):
+ print(get_commandline_args(), file=sys.stderr)
+ parser = get_parser()
+ parser.add_argument(
+ "--mode",
+ type=str,
+ default="tp_norm",
+ help="The decoding mode",
+ )
+ args = parser.parse_args(cmd)
+ kwargs = vars(args)
+ kwargs.pop("config", None)
+
+ # set logging messages
+ logging.basicConfig(
+ level=args.log_level,
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+ )
+ logging.info("Decoding args: {}".format(kwargs))
+
+ # gpu setting
+ if args.ngpu > 0:
+ jobid = int(args.output_dir.split(".")[-1])
+ gpuid = args.gpuid_list.split(",")[(jobid - 1) // args.njob]
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+ os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
+
+ inference_launch(**kwargs)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/funasr/bin/vad_inference_launch.py b/funasr/bin/vad_inference_launch.py
index 18eba33fb..42c5c1e12 100644
--- a/funasr/bin/vad_inference_launch.py
+++ b/funasr/bin/vad_inference_launch.py
@@ -110,8 +110,7 @@ def inference_launch(mode, **kwargs):
if mode == "offline":
from funasr.bin.vad_inference import inference_modelscope
return inference_modelscope(**kwargs)
- # elif mode == "online":
- if "param_dict" in kwargs and kwargs["param_dict"]["online"]:
+ elif mode == "online":
from funasr.bin.vad_inference_online import inference_modelscope
return inference_modelscope(**kwargs)
else:
diff --git a/funasr/bin/vad_inference_online.py b/funasr/bin/vad_inference_online.py
new file mode 100644
index 000000000..d18488ed0
--- /dev/null
+++ b/funasr/bin/vad_inference_online.py
@@ -0,0 +1,345 @@
+import argparse
+import logging
+import sys
+import json
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+from typing import Dict
+
+import numpy as np
+import torch
+from typeguard import check_argument_types
+from typeguard import check_return_type
+
+from funasr.fileio.datadir_writer import DatadirWriter
+from funasr.tasks.vad import VADTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.models.frontend.wav_frontend import WavFrontendOnline
+from funasr.models.frontend.wav_frontend import WavFrontend
+from funasr.bin.vad_inference import Speech2VadSegment
+
+header_colors = '\033[95m'
+end_colors = '\033[0m'
+
+global_asr_language: str = 'zh-cn'
+global_sample_rate: Union[int, Dict[Any, int]] = {
+ 'audio_fs': 16000,
+ 'model_fs': 16000
+}
+
+
+class Speech2VadSegmentOnline(Speech2VadSegment):
+ """Speech2VadSegmentOnline class
+
+ Examples:
+ >>> import soundfile
+    >>> speech2segment = Speech2VadSegmentOnline("vad_config.yml", "vad.pb")
+ >>> audio, rate = soundfile.read("speech.wav")
+ >>> speech2segment(audio)
+ [[10, 230], [245, 450], ...]
+
+ """
+ def __init__(self, **kwargs):
+ super(Speech2VadSegmentOnline, self).__init__(**kwargs)
+ vad_cmvn_file = kwargs.get('vad_cmvn_file', None)
+ self.frontend = None
+ if self.vad_infer_args.frontend is not None:
+ self.frontend = WavFrontendOnline(cmvn_file=vad_cmvn_file, **self.vad_infer_args.frontend_conf)
+
+
+ @torch.no_grad()
+ def __call__(
+ self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
+ in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False
+ ) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]:
+ """Inference
+
+ Args:
+ speech: Input speech data
+ Returns:
+ text, token, token_int, hyp
+
+ """
+ assert check_argument_types()
+
+ # Input as audio signal
+ if isinstance(speech, np.ndarray):
+ speech = torch.tensor(speech)
+ batch_size = speech.shape[0]
+ segments = [[]] * batch_size
+ if self.frontend is not None:
+ feats, feats_len = self.frontend.forward(speech, speech_lengths, is_final)
+ fbanks, _ = self.frontend.get_fbank()
+ else:
+ raise Exception("Need to extract feats first, please configure frontend configuration")
+ if feats.shape[0]:
+ feats = to_device(feats, device=self.device)
+ feats_len = feats_len.int()
+ waveforms = self.frontend.get_waveforms()
+
+ batch = {
+ "feats": feats,
+ "waveform": waveforms,
+ "in_cache": in_cache,
+ "is_final": is_final
+ }
+ # a. To device
+ batch = to_device(batch, device=self.device)
+ segments, in_cache = self.vad_model.forward_online(**batch)
+ # in_cache.update(batch['in_cache'])
+ # in_cache = {key: value for key, value in batch['in_cache'].items()}
+ return fbanks, segments, in_cache
+
+
+def inference(
+ batch_size: int,
+ ngpu: int,
+ log_level: Union[int, str],
+ data_path_and_name_and_type,
+ vad_infer_config: Optional[str],
+ vad_model_file: Optional[str],
+ vad_cmvn_file: Optional[str] = None,
+ raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ key_file: Optional[str] = None,
+ allow_variable_data_keys: bool = False,
+ output_dir: Optional[str] = None,
+ dtype: str = "float32",
+ seed: int = 0,
+ num_workers: int = 1,
+ **kwargs,
+):
+ inference_pipeline = inference_modelscope(
+ batch_size=batch_size,
+ ngpu=ngpu,
+ log_level=log_level,
+ vad_infer_config=vad_infer_config,
+ vad_model_file=vad_model_file,
+ vad_cmvn_file=vad_cmvn_file,
+ key_file=key_file,
+ allow_variable_data_keys=allow_variable_data_keys,
+ output_dir=output_dir,
+ dtype=dtype,
+ seed=seed,
+ num_workers=num_workers,
+ **kwargs,
+ )
+ return inference_pipeline(data_path_and_name_and_type, raw_inputs)
+
+
+def inference_modelscope(
+ batch_size: int,
+ ngpu: int,
+ log_level: Union[int, str],
+ # data_path_and_name_and_type,
+ vad_infer_config: Optional[str],
+ vad_model_file: Optional[str],
+ vad_cmvn_file: Optional[str] = None,
+ # raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ key_file: Optional[str] = None,
+ allow_variable_data_keys: bool = False,
+ output_dir: Optional[str] = None,
+ dtype: str = "float32",
+ seed: int = 0,
+ num_workers: int = 1,
+ **kwargs,
+):
+ assert check_argument_types()
+ if batch_size > 1:
+ raise NotImplementedError("batch decoding is not implemented")
+ if ngpu > 1:
+ raise NotImplementedError("only single GPU decoding is supported")
+
+ logging.basicConfig(
+ level=log_level,
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+ )
+
+ if ngpu >= 1 and torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+
+ # 1. Set random-seed
+ set_all_random_seed(seed)
+
+ # 2. Build speech2vadsegment
+ speech2vadsegment_kwargs = dict(
+ vad_infer_config=vad_infer_config,
+ vad_model_file=vad_model_file,
+ vad_cmvn_file=vad_cmvn_file,
+ device=device,
+ dtype=dtype,
+ )
+ logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
+ speech2vadsegment = Speech2VadSegmentOnline(**speech2vadsegment_kwargs)
+
+ def _forward(
+ data_path_and_name_and_type,
+ raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ output_dir_v2: Optional[str] = None,
+ fs: dict = None,
+ param_dict: dict = None,
+ ):
+ # 3. Build data-iterator
+ if data_path_and_name_and_type is None and raw_inputs is not None:
+ if isinstance(raw_inputs, torch.Tensor):
+ raw_inputs = raw_inputs.numpy()
+ data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+ loader = VADTask.build_streaming_iterator(
+ data_path_and_name_and_type,
+ dtype=dtype,
+ batch_size=batch_size,
+ key_file=key_file,
+ num_workers=num_workers,
+ preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
+ collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
+ allow_variable_data_keys=allow_variable_data_keys,
+ inference=True,
+ )
+
+ finish_count = 0
+ file_count = 1
+ # 7 .Start for-loop
+ # FIXME(kamo): The output format should be discussed about
+ output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+ if output_path is not None:
+ writer = DatadirWriter(output_path)
+ ibest_writer = writer[f"1best_recog"]
+ else:
+ writer = None
+ ibest_writer = None
+
+ vad_results = []
+ batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict()
+ is_final = param_dict['is_final'] if param_dict is not None else False
+ for keys, batch in loader:
+ assert isinstance(batch, dict), type(batch)
+ assert all(isinstance(s, str) for s in keys), keys
+ _bs = len(next(iter(batch.values())))
+ assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+ batch['in_cache'] = batch_in_cache
+ batch['is_final'] = is_final
+
+ # do vad segment
+ _, results, param_dict['in_cache'] = speech2vadsegment(**batch)
+ # param_dict['in_cache'] = batch['in_cache']
+ if results:
+ for i, _ in enumerate(keys):
+ if results[i]:
+ results[i] = json.dumps(results[i])
+ item = {'key': keys[i], 'value': results[i]}
+ vad_results.append(item)
+ if writer is not None:
+ results[i] = json.loads(results[i])
+ ibest_writer["text"][keys[i]] = "{}".format(results[i])
+
+ return vad_results
+
+ return _forward
+
+
+def get_parser():
+ parser = config_argparse.ArgumentParser(
+ description="VAD Decoding",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+
+ # Note(kamo): Use '_' instead of '-' as separator.
+ # '-' is confusing if written in yaml.
+ parser.add_argument(
+ "--log_level",
+ type=lambda x: x.upper(),
+ default="INFO",
+ choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
+ help="The verbose level of logging",
+ )
+
+ parser.add_argument("--output_dir", type=str, required=False)
+ parser.add_argument(
+ "--ngpu",
+ type=int,
+ default=0,
+ help="The number of gpus. 0 indicates CPU mode",
+ )
+ parser.add_argument(
+ "--gpuid_list",
+ type=str,
+ default="",
+ help="The visible gpus",
+ )
+ parser.add_argument("--seed", type=int, default=0, help="Random seed")
+ parser.add_argument(
+ "--dtype",
+ default="float32",
+ choices=["float16", "float32", "float64"],
+ help="Data type",
+ )
+ parser.add_argument(
+ "--num_workers",
+ type=int,
+ default=1,
+ help="The number of workers used for DataLoader",
+ )
+
+ group = parser.add_argument_group("Input data related")
+ group.add_argument(
+ "--data_path_and_name_and_type",
+ type=str2triple_str,
+ required=False,
+ action="append",
+ )
+ group.add_argument("--raw_inputs", type=list, default=None)
+ # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
+ group.add_argument("--key_file", type=str_or_none)
+ group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
+
+ group = parser.add_argument_group("The model configuration related")
+ group.add_argument(
+ "--vad_infer_config",
+ type=str,
+ help="VAD infer configuration",
+ )
+ group.add_argument(
+ "--vad_model_file",
+ type=str,
+ help="VAD model parameter file",
+ )
+ group.add_argument(
+ "--vad_cmvn_file",
+ type=str,
+ help="Global cmvn file",
+ )
+
+ group = parser.add_argument_group("infer related")
+ group.add_argument(
+ "--batch_size",
+ type=int,
+ default=1,
+ help="The batch size for inference",
+ )
+
+ return parser
+
+
+def main(cmd=None):
+ print(get_commandline_args(), file=sys.stderr)
+ parser = get_parser()
+ args = parser.parse_args(cmd)
+ kwargs = vars(args)
+ kwargs.pop("config", None)
+ inference(**kwargs)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/funasr/datasets/dataset.py b/funasr/datasets/dataset.py
index 2af93d0bc..1595224fd 100644
--- a/funasr/datasets/dataset.py
+++ b/funasr/datasets/dataset.py
@@ -107,7 +107,7 @@ class H5FileWrapper:
return value[()]
-def sound_loader(path, float_dtype=None):
+def sound_loader(path, dest_sample_rate=16000, float_dtype=None):
# The file is as follows:
# utterance_id_A /some/where/a.wav
# utterance_id_B /some/where/a.flac
@@ -115,7 +115,7 @@ def sound_loader(path, float_dtype=None):
# NOTE(kamo): SoundScpReader doesn't support pipe-fashion
# like Kaldi e.g. "cat a.wav |".
# NOTE(kamo): The audio signal is normalized to [-1,1] range.
- loader = SoundScpReader(path, normalize=True, always_2d=False)
+ loader = SoundScpReader(path, dest_sample_rate, normalize=True, always_2d=False)
# SoundScpReader.__getitem__() returns Tuple[int, ndarray],
# but ndarray is desired, so Adapter class is inserted here
@@ -139,7 +139,7 @@ def rand_int_loader(filepath, loader_type):
DATA_TYPES = {
"sound": dict(
func=sound_loader,
- kwargs=["float_dtype"],
+ kwargs=["dest_sample_rate","float_dtype"],
help="Audio format types which supported by sndfile wav, flac, etc."
"\n\n"
" utterance_id_a a.wav\n"
@@ -282,6 +282,7 @@ class ESPnetDataset(AbsDataset):
int_dtype: str = "long",
max_cache_size: Union[float, int, str] = 0.0,
max_cache_fd: int = 0,
+ dest_sample_rate: int = 16000,
):
assert check_argument_types()
if len(path_name_type_list) == 0:
@@ -295,6 +296,7 @@ class ESPnetDataset(AbsDataset):
self.float_dtype = float_dtype
self.int_dtype = int_dtype
self.max_cache_fd = max_cache_fd
+ self.dest_sample_rate = dest_sample_rate
self.loader_dict = {}
self.debug_info = {}
@@ -335,6 +337,8 @@ class ESPnetDataset(AbsDataset):
for key2 in dic["kwargs"]:
if key2 == "loader_type":
kwargs["loader_type"] = loader_type
+ elif key2 == "dest_sample_rate" and loader_type=="sound":
+ kwargs["dest_sample_rate"] = self.dest_sample_rate
elif key2 == "float_dtype":
kwargs["float_dtype"] = self.float_dtype
elif key2 == "int_dtype":
diff --git a/funasr/datasets/iterable_dataset.py b/funasr/datasets/iterable_dataset.py
index 2f97e78b9..c8c51d458 100644
--- a/funasr/datasets/iterable_dataset.py
+++ b/funasr/datasets/iterable_dataset.py
@@ -8,6 +8,7 @@ from typing import Dict
from typing import Iterator
from typing import Tuple
from typing import Union
+from typing import List
import kaldiio
import numpy as np
@@ -66,7 +67,7 @@ def load_pcm(input):
return load_bytes(bytes)
DATA_TYPES = {
- "sound": lambda x: torchaudio.load(x)[0][0].numpy(),
+ "sound": lambda x: torchaudio.load(x)[0].numpy(),
"pcm": load_pcm,
"kaldi_ark": load_kaldi,
"bytes": load_bytes,
@@ -106,6 +107,7 @@ class IterableESPnetDataset(IterableDataset):
] = None,
float_dtype: str = "float32",
fs: dict = None,
+ mc: bool = False,
int_dtype: str = "long",
key_file: str = None,
):
@@ -122,12 +124,13 @@ class IterableESPnetDataset(IterableDataset):
self.int_dtype = int_dtype
self.key_file = key_file
self.fs = fs
+ self.mc = mc
self.debug_info = {}
non_iterable_list = []
self.path_name_type_list = []
- if not isinstance(path_name_type_list[0], Tuple):
+ if not isinstance(path_name_type_list[0], (Tuple, List)):
path = path_name_type_list[0]
name = path_name_type_list[1]
_type = path_name_type_list[2]
@@ -192,6 +195,7 @@ class IterableESPnetDataset(IterableDataset):
array = torchaudio.transforms.Resample(orig_freq=audio_fs,
new_freq=model_fs)(array)
array = array.squeeze(0).numpy()
+
data[name] = array
if self.preprocess is not None:
@@ -238,11 +242,17 @@ class IterableESPnetDataset(IterableDataset):
model_fs = self.fs["model_fs"]
if audio_fs is not None and model_fs is not None:
array = torch.from_numpy(array)
- array = array.unsqueeze(0)
array = torchaudio.transforms.Resample(orig_freq=audio_fs,
new_freq=model_fs)(array)
- array = array.squeeze(0).numpy()
- data[name] = array
+ array = array.numpy()
+
+ if _type == "sound":
+ if self.mc:
+ data[name] = array.transpose((1, 0))
+ else:
+ data[name] = array[0]
+ else:
+ data[name] = array
if self.preprocess is not None:
data = self.preprocess(uid, data)
@@ -340,11 +350,16 @@ class IterableESPnetDataset(IterableDataset):
model_fs = self.fs["model_fs"]
if audio_fs is not None and model_fs is not None:
array = torch.from_numpy(array)
- array = array.unsqueeze(0)
array = torchaudio.transforms.Resample(orig_freq=audio_fs,
new_freq=model_fs)(array)
- array = array.squeeze(0).numpy()
- data[name] = array
+ array = array.numpy()
+ if _type == "sound":
+ if self.mc:
+ data[name] = array.transpose((1, 0))
+ else:
+ data[name] = array[0]
+ else:
+ data[name] = array
if self.non_iterable_dataset is not None:
# 2.b. Load data from non-iterable dataset
_, from_non_iterable = self.non_iterable_dataset[uid]
diff --git a/funasr/export/README.md b/funasr/export/README.md
index bde1e94a5..b3068d050 100644
--- a/funasr/export/README.md
+++ b/funasr/export/README.md
@@ -55,3 +55,4 @@ python -m funasr.export.export_model --model-name /mnt/workspace/damo/speech_par
## Acknowledge
Torch model quantization is supported by [BladeDISC](https://github.com/alibaba/BladeDISC), an end-to-end DynamIc Shape Compiler project for machine learning workloads. BladeDISC provides general, transparent, and ease of use performance optimization for TensorFlow/PyTorch workloads on GPGPU and CPU backends. If you are interested, please contact us.
+
diff --git a/funasr/fileio/sound_scp.py b/funasr/fileio/sound_scp.py
index 459369efb..dc872b047 100644
--- a/funasr/fileio/sound_scp.py
+++ b/funasr/fileio/sound_scp.py
@@ -4,6 +4,7 @@ from typing import Union
import numpy as np
import soundfile
+import librosa
from typeguard import check_argument_types
from funasr.fileio.read_text import read_2column_text
@@ -30,6 +31,7 @@ class SoundScpReader(collections.abc.Mapping):
dtype=np.int16,
always_2d: bool = False,
normalize: bool = False,
+ dest_sample_rate: int = 16000,
):
assert check_argument_types()
self.fname = fname
@@ -37,15 +39,18 @@ class SoundScpReader(collections.abc.Mapping):
self.always_2d = always_2d
self.normalize = normalize
self.data = read_2column_text(fname)
+ self.dest_sample_rate = dest_sample_rate
def __getitem__(self, key):
wav = self.data[key]
if self.normalize:
# soundfile.read normalizes data to [-1,1] if dtype is not given
- array, rate = soundfile.read(wav, always_2d=self.always_2d)
+ array, rate = librosa.load(
+ wav, sr=self.dest_sample_rate, mono=not self.always_2d
+ )
else:
- array, rate = soundfile.read(
- wav, dtype=self.dtype, always_2d=self.always_2d
+ array, rate = librosa.load(
+ wav, sr=self.dest_sample_rate, mono=not self.always_2d, dtype=self.dtype
)
return rate, array
diff --git a/funasr/main_funcs/average_nbest_models.py b/funasr/main_funcs/average_nbest_models.py
index 53f956800..d8df94985 100644
--- a/funasr/main_funcs/average_nbest_models.py
+++ b/funasr/main_funcs/average_nbest_models.py
@@ -66,13 +66,13 @@ def average_nbest_models(
elif n == 1:
# The averaged model is same as the best model
e, _ = epoch_and_values[0]
- op = output_dir / f"{e}epoch.pth"
- sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pth"
+ op = output_dir / f"{e}epoch.pb"
+ sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pb"
if sym_op.is_symlink() or sym_op.exists():
sym_op.unlink()
sym_op.symlink_to(op.name)
else:
- op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pth"
+ op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pb"
logging.info(
f"Averaging {n}best models: " f'criterion="{ph}.{cr}": {op}'
)
@@ -83,12 +83,12 @@ def average_nbest_models(
if e not in _loaded:
if oss_bucket is None:
_loaded[e] = torch.load(
- output_dir / f"{e}epoch.pth",
+ output_dir / f"{e}epoch.pb",
map_location="cpu",
)
else:
buffer = BytesIO(
- oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pth")).read())
+ oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pb")).read())
_loaded[e] = torch.load(buffer)
states = _loaded[e]
@@ -115,13 +115,13 @@ def average_nbest_models(
else:
buffer = BytesIO()
torch.save(avg, buffer)
- oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pth"),
+ oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pb"),
buffer.getvalue())
- # 3. *.*.ave.pth is a symlink to the max ave model
+ # 3. *.*.ave.pb is a symlink to the max ave model
if oss_bucket is None:
- op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pth"
- sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pth"
+ op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pb"
+ sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pb"
if sym_op.is_symlink() or sym_op.exists():
sym_op.unlink()
sym_op.symlink_to(op.name)
diff --git a/funasr/main_funcs/pack_funcs.py b/funasr/main_funcs/pack_funcs.py
index ffa807e23..fe365d8e7 100644
--- a/funasr/main_funcs/pack_funcs.py
+++ b/funasr/main_funcs/pack_funcs.py
@@ -191,12 +191,12 @@ def unpack(
Examples:
tarfile:
- model.pth
+ model.pb
some1.file
some2.file
>>> unpack("tarfile", "out")
- {'asr_model_file': 'out/model.pth'}
+ {'asr_model_file': 'out/model.pb'}
"""
input_archive = Path(input_archive)
outpath = Path(outpath)
diff --git a/funasr/models/decoder/sanm_decoder.py b/funasr/models/decoder/sanm_decoder.py
index ab03f0b61..3bfcffc3f 100644
--- a/funasr/models/decoder/sanm_decoder.py
+++ b/funasr/models/decoder/sanm_decoder.py
@@ -90,6 +90,47 @@ class DecoderLayerSANM(nn.Module):
tgt = self.norm1(tgt)
tgt = self.feed_forward(tgt)
+ x = tgt
+ if self.self_attn:
+ if self.normalize_before:
+ tgt = self.norm2(tgt)
+ x, _ = self.self_attn(tgt, tgt_mask)
+ x = residual + self.dropout(x)
+
+ if self.src_attn is not None:
+ residual = x
+ if self.normalize_before:
+ x = self.norm3(x)
+
+ x = residual + self.dropout(self.src_attn(x, memory, memory_mask))
+
+
+ return x, tgt_mask, memory, memory_mask, cache
+
+ def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
+ """Compute decoded features.
+
+ Args:
+ tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
+ tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
+ memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
+ memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
+ cache (List[torch.Tensor]): List of cached tensors.
+ Each tensor shape should be (#batch, maxlen_out - 1, size).
+
+ Returns:
+ torch.Tensor: Output tensor(#batch, maxlen_out, size).
+ torch.Tensor: Mask for output tensor (#batch, maxlen_out).
+ torch.Tensor: Encoded memory (#batch, maxlen_in, size).
+ torch.Tensor: Encoded memory mask (#batch, maxlen_in).
+
+ """
+ # tgt = self.dropout(tgt)
+ residual = tgt
+ if self.normalize_before:
+ tgt = self.norm1(tgt)
+ tgt = self.feed_forward(tgt)
+
x = tgt
if self.self_attn:
if self.normalize_before:
@@ -109,7 +150,6 @@ class DecoderLayerSANM(nn.Module):
return x, tgt_mask, memory, memory_mask, cache
-
class FsmnDecoderSCAMAOpt(BaseTransformerDecoder):
"""
author: Speech Lab, Alibaba Group, China
@@ -947,6 +987,65 @@ class ParaformerSANMDecoder(BaseTransformerDecoder):
)
return logp.squeeze(0), state
+ def forward_chunk(
+ self,
+ memory: torch.Tensor,
+ tgt: torch.Tensor,
+ cache: dict = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Forward decoder.
+
+        Args:
+            memory: encoded memory (encoder output), float32
+                (batch, maxlen_in, feat)
+            tgt: acoustic embeddings from the predictor, float32
+                (batch, maxlen_out, feat); used as the decoder
+                input instead of token embeddings
+            cache: dict with key "decode_fsmn" holding per-layer
+                FSMN states; initialized on the first chunk and
+                updated in place on every call
+        Returns:
+            x: decoded token score before softmax
+                (batch, maxlen_out, token) if use_output_layer
+                is True
+
+        """
+ x = tgt
+ if cache["decode_fsmn"] is None:
+ cache_layer_num = len(self.decoders)
+ if self.decoders2 is not None:
+ cache_layer_num += len(self.decoders2)
+ new_cache = [None] * cache_layer_num
+ else:
+ new_cache = cache["decode_fsmn"]
+ for i in range(self.att_layer_num):
+ decoder = self.decoders[i]
+ x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
+ x, None, memory, None, cache=new_cache[i]
+ )
+ new_cache[i] = c_ret
+
+ if self.num_blocks - self.att_layer_num > 1:
+ for i in range(self.num_blocks - self.att_layer_num):
+ j = i + self.att_layer_num
+ decoder = self.decoders2[i]
+ x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
+ x, None, memory, None, cache=new_cache[j]
+ )
+ new_cache[j] = c_ret
+
+ for decoder in self.decoders3:
+
+ x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
+ x, None, memory, None, cache=None
+ )
+ if self.normalize_before:
+ x = self.after_norm(x)
+ if self.output_layer is not None:
+ x = self.output_layer(x)
+ cache["decode_fsmn"] = new_cache
+ return x
+
def forward_one_step(
self,
tgt: torch.Tensor,
diff --git a/funasr/models/e2e_asr_paraformer.py b/funasr/models/e2e_asr_paraformer.py
index 5786bc46e..02f60af22 100644
--- a/funasr/models/e2e_asr_paraformer.py
+++ b/funasr/models/e2e_asr_paraformer.py
@@ -325,6 +325,65 @@ class Paraformer(AbsESPnetModel):
return encoder_out, encoder_out_lens
+ def encode_chunk(
+ self, speech: torch.Tensor, speech_lengths: torch.Tensor, cache: dict = None
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Frontend + Encoder. Note that this method is used by asr_inference.py
+
+ Args:
+ speech: (Batch, Length, ...)
+ speech_lengths: (Batch, )
+ """
+ with autocast(False):
+ # 1. Extract feats
+ feats, feats_lengths = self._extract_feats(speech, speech_lengths)
+
+ # 2. Data augmentation
+ if self.specaug is not None and self.training:
+ feats, feats_lengths = self.specaug(feats, feats_lengths)
+
+ # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
+ if self.normalize is not None:
+ feats, feats_lengths = self.normalize(feats, feats_lengths)
+
+ # Pre-encoder, e.g. used for raw input data
+ if self.preencoder is not None:
+ feats, feats_lengths = self.preencoder(feats, feats_lengths)
+
+ # 4. Forward encoder
+ # feats: (Batch, Length, Dim)
+ # -> encoder_out: (Batch, Length2, Dim2)
+ if self.encoder.interctc_use_conditioning:
+ encoder_out, encoder_out_lens, _ = self.encoder.forward_chunk(
+ feats, feats_lengths, cache=cache["encoder"], ctc=self.ctc
+ )
+ else:
+ encoder_out, encoder_out_lens, _ = self.encoder.forward_chunk(feats, feats_lengths, cache=cache["encoder"])
+ intermediate_outs = None
+ if isinstance(encoder_out, tuple):
+ intermediate_outs = encoder_out[1]
+ encoder_out = encoder_out[0]
+
+ # Post-encoder, e.g. NLU
+ if self.postencoder is not None:
+ encoder_out, encoder_out_lens = self.postencoder(
+ encoder_out, encoder_out_lens
+ )
+
+ assert encoder_out.size(0) == speech.size(0), (
+ encoder_out.size(),
+ speech.size(0),
+ )
+ assert encoder_out.size(1) <= encoder_out_lens.max(), (
+ encoder_out.size(),
+ encoder_out_lens.max(),
+ )
+
+ if intermediate_outs is not None:
+ return (encoder_out, intermediate_outs), encoder_out_lens
+
+ return encoder_out, encoder_out_lens
+
def calc_predictor(self, encoder_out, encoder_out_lens):
encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
@@ -333,6 +392,11 @@ class Paraformer(AbsESPnetModel):
ignore_id=self.ignore_id)
return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index
+ def calc_predictor_chunk(self, encoder_out, cache=None):
+
+ pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor.forward_chunk(encoder_out, cache["encoder"])
+ return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index
+
def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens):
decoder_outs = self.decoder(
@@ -342,6 +406,14 @@ class Paraformer(AbsESPnetModel):
decoder_out = torch.log_softmax(decoder_out, dim=-1)
return decoder_out, ys_pad_lens
+ def cal_decoder_with_predictor_chunk(self, encoder_out, sematic_embeds, cache=None):
+ decoder_outs = self.decoder.forward_chunk(
+ encoder_out, sematic_embeds, cache["decoder"]
+ )
+ decoder_out = decoder_outs
+ decoder_out = torch.log_softmax(decoder_out, dim=-1)
+ return decoder_out
+
def _extract_feats(
self, speech: torch.Tensor, speech_lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -926,10 +998,10 @@ class BiCifParaformer(Paraformer):
def calc_predictor_timestamp(self, encoder_out, encoder_out_lens, token_num):
encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
encoder_out.device)
- ds_alphas, ds_cif_peak, us_alphas, us_cif_peak = self.predictor.get_upsample_timestamp(encoder_out,
+ ds_alphas, ds_cif_peak, us_alphas, us_peaks = self.predictor.get_upsample_timestamp(encoder_out,
encoder_out_mask,
token_num)
- return ds_alphas, ds_cif_peak, us_alphas, us_cif_peak
+ return ds_alphas, ds_cif_peak, us_alphas, us_peaks
def forward(
self,
@@ -978,6 +1050,7 @@ class BiCifParaformer(Paraformer):
loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
return loss, stats, weight
+
class ContextualParaformer(Paraformer):
"""
Paraformer model with contextual hotword
@@ -1458,4 +1531,4 @@ class ContextualParaformer(Paraformer):
"torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_tf,
var_dict_tf[name_tf].shape))
- return var_dict_torch_update
\ No newline at end of file
+ return var_dict_torch_update
diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py
new file mode 100644
index 000000000..097b23a57
--- /dev/null
+++ b/funasr/models/e2e_diar_eend_ola.py
@@ -0,0 +1,253 @@
+# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+from contextlib import contextmanager
+from distutils.version import LooseVersion
+from typing import Dict
+from typing import Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from typeguard import check_argument_types
+
+from funasr.models.frontend.wav_frontend import WavFrontendMel23
+from funasr.modules.eend_ola.encoder import EENDOLATransformerEncoder
+from funasr.modules.eend_ola.encoder_decoder_attractor import EncoderDecoderAttractor
+from funasr.modules.eend_ola.utils.power import generate_mapping_dict
+from funasr.torch_utils.device_funcs import force_gatherable
+from funasr.train.abs_espnet_model import AbsESPnetModel
+
+if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
+ pass
+else:
+ # Nothing to do if torch<1.6.0
+ @contextmanager
+ def autocast(enabled=True):
+ yield
+
+
+def pad_attractor(att, max_n_speakers):
+ C, D = att.shape
+ if C < max_n_speakers:
+ att = torch.cat([att, torch.zeros(max_n_speakers - C, D).to(torch.float32).to(att.device)], dim=0)
+ return att
+
+
+class DiarEENDOLAModel(AbsESPnetModel):
+ """EEND-OLA diarization model"""
+
+ def __init__(
+ self,
+ frontend: WavFrontendMel23,
+ encoder: EENDOLATransformerEncoder,
+ encoder_decoder_attractor: EncoderDecoderAttractor,
+ n_units: int = 256,
+ max_n_speaker: int = 8,
+ attractor_loss_weight: float = 1.0,
+ mapping_dict=None,
+ **kwargs,
+ ):
+ assert check_argument_types()
+
+ super().__init__()
+ self.frontend = frontend
+ self.enc = encoder
+ self.eda = encoder_decoder_attractor
+ self.attractor_loss_weight = attractor_loss_weight
+ self.max_n_speaker = max_n_speaker
+ if mapping_dict is None:
+ mapping_dict = generate_mapping_dict(max_speaker_num=self.max_n_speaker)
+ self.mapping_dict = mapping_dict
+ # PostNet
+ self.postnet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
+ self.output_layer = nn.Linear(n_units, mapping_dict['oov'] + 1)
+
+ def forward_encoder(self, xs, ilens):
+ xs = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=-1)
+ pad_shape = xs.shape
+ xs_mask = [torch.ones(ilen).to(xs.device) for ilen in ilens]
+ xs_mask = torch.nn.utils.rnn.pad_sequence(xs_mask, batch_first=True, padding_value=0).unsqueeze(-2)
+ emb = self.enc(xs, xs_mask)
+ emb = torch.split(emb.view(pad_shape[0], pad_shape[1], -1), 1, dim=0)
+ emb = [e[0][:ilen] for e, ilen in zip(emb, ilens)]
+ return emb
+
+ def forward_post_net(self, logits, ilens):
+ maxlen = torch.max(ilens).to(torch.int).item()
+ logits = nn.utils.rnn.pad_sequence(logits, batch_first=True, padding_value=-1)
+ logits = nn.utils.rnn.pack_padded_sequence(logits, ilens.cpu().to(torch.int64), batch_first=True, enforce_sorted=False)
+ outputs, (_, _) = self.postnet(logits)
+ outputs = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=-1, total_length=maxlen)[0]
+ outputs = [output[:ilens[i].to(torch.int).item()] for i, output in enumerate(outputs)]
+ outputs = [self.output_layer(output) for output in outputs]
+ return outputs
+
+ def forward(
+ self,
+ speech: torch.Tensor,
+ speech_lengths: torch.Tensor,
+ text: torch.Tensor,
+ text_lengths: torch.Tensor,
+ ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
+ """Frontend + Encoder + Decoder + Calc loss
+
+ Args:
+ speech: (Batch, Length, ...)
+ speech_lengths: (Batch, )
+ text: (Batch, Length)
+ text_lengths: (Batch,)
+ """
+ assert text_lengths.dim() == 1, text_lengths.shape
+ # Check that batch_size is unified
+ assert (
+ speech.shape[0]
+ == speech_lengths.shape[0]
+ == text.shape[0]
+ == text_lengths.shape[0]
+ ), (speech.shape, speech_lengths.shape, text.shape, text_lengths.shape)
+ batch_size = speech.shape[0]
+
+ # for data-parallel
+ text = text[:, : text_lengths.max()]
+
+ # 1. Encoder
+ encoder_out, encoder_out_lens = self.enc(speech, speech_lengths)
+ intermediate_outs = None
+ if isinstance(encoder_out, tuple):
+ intermediate_outs = encoder_out[1]
+ encoder_out = encoder_out[0]
+
+ loss_att, acc_att, cer_att, wer_att = None, None, None, None
+ loss_ctc, cer_ctc = None, None
+ stats = dict()
+
+ # 1. CTC branch
+ if self.ctc_weight != 0.0:
+ loss_ctc, cer_ctc = self._calc_ctc_loss(
+ encoder_out, encoder_out_lens, text, text_lengths
+ )
+
+ # Collect CTC branch stats
+ stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
+ stats["cer_ctc"] = cer_ctc
+
+ # Intermediate CTC (optional)
+ loss_interctc = 0.0
+ if self.interctc_weight != 0.0 and intermediate_outs is not None:
+ for layer_idx, intermediate_out in intermediate_outs:
+ # we assume intermediate_out has the same length & padding
+ # as those of encoder_out
+ loss_ic, cer_ic = self._calc_ctc_loss(
+ intermediate_out, encoder_out_lens, text, text_lengths
+ )
+ loss_interctc = loss_interctc + loss_ic
+
+                # Collect Intermediate CTC stats
+ stats["loss_interctc_layer{}".format(layer_idx)] = (
+ loss_ic.detach() if loss_ic is not None else None
+ )
+ stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic
+
+ loss_interctc = loss_interctc / len(intermediate_outs)
+
+ # calculate whole encoder loss
+ loss_ctc = (
+ 1 - self.interctc_weight
+ ) * loss_ctc + self.interctc_weight * loss_interctc
+
+ # 2b. Attention decoder branch
+ if self.ctc_weight != 1.0:
+ loss_att, acc_att, cer_att, wer_att = self._calc_att_loss(
+ encoder_out, encoder_out_lens, text, text_lengths
+ )
+
+ # 3. CTC-Att loss definition
+ if self.ctc_weight == 0.0:
+ loss = loss_att
+ elif self.ctc_weight == 1.0:
+ loss = loss_ctc
+ else:
+ loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att
+
+ # Collect Attn branch stats
+ stats["loss_att"] = loss_att.detach() if loss_att is not None else None
+ stats["acc"] = acc_att
+ stats["cer"] = cer_att
+ stats["wer"] = wer_att
+
+ # Collect total loss stats
+ stats["loss"] = torch.clone(loss.detach())
+
+ # force_gatherable: to-device and to-tensor if scalar for DataParallel
+ loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
+ return loss, stats, weight
+
+ def estimate_sequential(self,
+ speech: torch.Tensor,
+ speech_lengths: torch.Tensor,
+ n_speakers: int = None,
+ shuffle: bool = True,
+ threshold: float = 0.5,
+ **kwargs):
+ speech = [s[:s_len] for s, s_len in zip(speech, speech_lengths)]
+ emb = self.forward_encoder(speech, speech_lengths)
+ if shuffle:
+ orders = [np.arange(e.shape[0]) for e in emb]
+ for order in orders:
+ np.random.shuffle(order)
+ attractors, probs = self.eda.estimate(
+ [e[torch.from_numpy(order).to(torch.long).to(speech[0].device)] for e, order in zip(emb, orders)])
+ else:
+ attractors, probs = self.eda.estimate(emb)
+ attractors_active = []
+ for p, att, e in zip(probs, attractors, emb):
+ if n_speakers and n_speakers >= 0:
+ att = att[:n_speakers, ]
+ attractors_active.append(att)
+ elif threshold is not None:
+ silence = torch.nonzero(p < threshold)[0]
+ n_spk = silence[0] if silence.size else None
+ att = att[:n_spk, ]
+ attractors_active.append(att)
+ else:
+ NotImplementedError('n_speakers or threshold has to be given.')
+ raw_n_speakers = [att.shape[0] for att in attractors_active]
+ attractors = [
+ pad_attractor(att, self.max_n_speaker) if att.shape[0] <= self.max_n_speaker else att[:self.max_n_speaker]
+ for att in attractors_active]
+ ys = [torch.matmul(e, att.permute(1, 0)) for e, att in zip(emb, attractors)]
+ logits = self.forward_post_net(ys, speech_lengths)
+ ys = [self.recover_y_from_powerlabel(logit, raw_n_speaker) for logit, raw_n_speaker in
+ zip(logits, raw_n_speakers)]
+
+ return ys, emb, attractors, raw_n_speakers
+
+ def recover_y_from_powerlabel(self, logit, n_speaker):
+ pred = torch.argmax(torch.softmax(logit, dim=-1), dim=-1)
+ oov_index = torch.where(pred == self.mapping_dict['oov'])[0]
+ for i in oov_index:
+ if i > 0:
+ pred[i] = pred[i - 1]
+ else:
+ pred[i] = 0
+ pred = [self.inv_mapping_func(i) for i in pred]
+ decisions = [bin(num)[2:].zfill(self.max_n_speaker)[::-1] for num in pred]
+ decisions = torch.from_numpy(
+ np.stack([np.array([int(i) for i in dec]) for dec in decisions], axis=0)).to(logit.device).to(
+ torch.float32)
+ decisions = decisions[:, :n_speaker]
+ return decisions
+
+ def inv_mapping_func(self, label):
+
+ if not isinstance(label, int):
+ label = int(label)
+ if label in self.mapping_dict['label2dec'].keys():
+ num = self.mapping_dict['label2dec'][label]
+ else:
+ num = -1
+ return num
+
+ def collect_feats(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]:
+ pass
\ No newline at end of file
diff --git a/funasr/models/e2e_diar_sond.py b/funasr/models/e2e_diar_sond.py
index 419c8133a..de669f2ee 100644
--- a/funasr/models/e2e_diar_sond.py
+++ b/funasr/models/e2e_diar_sond.py
@@ -59,7 +59,8 @@ class DiarSondModel(AbsESPnetModel):
normalize_speech_speaker: bool = False,
ignore_id: int = -1,
speaker_discrimination_loss_weight: float = 1.0,
- inter_score_loss_weight: float = 0.0
+ inter_score_loss_weight: float = 0.0,
+ inputs_type: str = "raw",
):
assert check_argument_types()
@@ -86,14 +87,12 @@ class DiarSondModel(AbsESPnetModel):
)
self.criterion_bce = SequenceBinaryCrossEntropy(normalize_length=length_normalized_loss)
self.pse_embedding = self.generate_pse_embedding()
- # self.register_buffer("pse_embedding", pse_embedding)
self.power_weight = torch.from_numpy(2 ** np.arange(max_spk_num)[np.newaxis, np.newaxis, :]).float()
- # self.register_buffer("power_weight", power_weight)
self.int_token_arr = torch.from_numpy(np.array(self.token_list).astype(int)[np.newaxis, np.newaxis, :]).int()
- # self.register_buffer("int_token_arr", int_token_arr)
self.speaker_discrimination_loss_weight = speaker_discrimination_loss_weight
self.inter_score_loss_weight = inter_score_loss_weight
self.forward_steps = 0
+ self.inputs_type = inputs_type
def generate_pse_embedding(self):
embedding = np.zeros((len(self.token_list), self.max_spk_num), dtype=np.float)
@@ -125,9 +124,14 @@ class DiarSondModel(AbsESPnetModel):
binary_labels: (Batch, frames, max_spk_num)
binary_labels_lengths: (Batch,)
"""
- assert speech.shape[0] == binary_labels.shape[0], (speech.shape, binary_labels.shape)
+ assert speech.shape[0] <= binary_labels.shape[0], (speech.shape, binary_labels.shape)
batch_size = speech.shape[0]
self.forward_steps = self.forward_steps + 1
+ if self.pse_embedding.device != speech.device:
+ self.pse_embedding = self.pse_embedding.to(speech.device)
+ self.power_weight = self.power_weight.to(speech.device)
+ self.int_token_arr = self.int_token_arr.to(speech.device)
+
# 1. Network forward
pred, inter_outputs = self.prediction_forward(
speech, speech_lengths,
@@ -149,9 +153,13 @@ class DiarSondModel(AbsESPnetModel):
# the sequence length of 'pred' might be slightly less than the
# length of 'spk_labels'. Here we force them to be equal.
length_diff_tolerance = 2
- length_diff = pse_labels.shape[1] - pred.shape[1]
- if 0 < length_diff <= length_diff_tolerance:
- pse_labels = pse_labels[:, 0: pred.shape[1]]
+ length_diff = abs(pse_labels.shape[1] - pred.shape[1])
+ if length_diff <= length_diff_tolerance:
+ min_len = min(pred.shape[1], pse_labels.shape[1])
+ pse_labels = pse_labels[:, :min_len]
+ pred = pred[:, :min_len]
+ cd_score = cd_score[:, :min_len]
+ ci_score = ci_score[:, :min_len]
loss_diar = self.classification_loss(pred, pse_labels, binary_labels_lengths)
loss_spk_dis = self.speaker_discrimination_loss(profile, profile_lengths)
@@ -299,7 +307,7 @@ class DiarSondModel(AbsESPnetModel):
speech: torch.Tensor,
speech_lengths: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
- if self.encoder is not None:
+ if self.encoder is not None and self.inputs_type == "raw":
speech, speech_lengths = self.encode(speech, speech_lengths)
speech_mask = ~make_pad_mask(speech_lengths, maxlen=speech.shape[1])
speech_mask = speech_mask.to(speech.device).unsqueeze(-1).float()
@@ -342,6 +350,7 @@ class DiarSondModel(AbsESPnetModel):
if isinstance(self.ci_scorer, AbsEncoder):
ci_simi = self.ci_scorer(ge_in, ge_len)[0]
+ ci_simi = torch.reshape(ci_simi, [bb, self.max_spk_num, tt]).permute([0, 2, 1])
else:
ci_simi = self.ci_scorer(speech_encoder_outputs, speaker_encoder_outputs)
diff --git a/funasr/models/e2e_tp.py b/funasr/models/e2e_tp.py
new file mode 100644
index 000000000..887439c5e
--- /dev/null
+++ b/funasr/models/e2e_tp.py
@@ -0,0 +1,175 @@
+import logging
+from contextlib import contextmanager
+from distutils.version import LooseVersion
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+import torch
+import numpy as np
+from typeguard import check_argument_types
+
+from funasr.models.encoder.abs_encoder import AbsEncoder
+from funasr.models.frontend.abs_frontend import AbsFrontend
+from funasr.models.predictor.cif import mae_loss
+from funasr.modules.add_sos_eos import add_sos_eos
+from funasr.modules.nets_utils import make_pad_mask, pad_list
+from funasr.torch_utils.device_funcs import force_gatherable
+from funasr.train.abs_espnet_model import AbsESPnetModel
+from funasr.models.predictor.cif import CifPredictorV3
+
+
+if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
+ from torch.cuda.amp import autocast
+else:
+ # Nothing to do if torch<1.6.0
+ @contextmanager
+ def autocast(enabled=True):
+ yield
+
+
+class TimestampPredictor(AbsESPnetModel):
+ """
+ Author: Speech Lab, Alibaba Group, China
+ """
+
+ def __init__(
+ self,
+ frontend: Optional[AbsFrontend],
+ encoder: AbsEncoder,
+ predictor: CifPredictorV3,
+ predictor_bias: int = 0,
+ token_list=None,
+ ):
+ assert check_argument_types()
+
+ super().__init__()
+ # note that eos is the same as sos (equivalent ID)
+
+ self.frontend = frontend
+ self.encoder = encoder
+ self.encoder.interctc_use_conditioning = False
+
+ self.predictor = predictor
+ self.predictor_bias = predictor_bias
+ self.criterion_pre = mae_loss()
+ self.token_list = token_list
+
+ def forward(
+ self,
+ speech: torch.Tensor,
+ speech_lengths: torch.Tensor,
+ text: torch.Tensor,
+ text_lengths: torch.Tensor,
+ ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
+ """Frontend + Encoder + Decoder + Calc loss
+
+ Args:
+ speech: (Batch, Length, ...)
+ speech_lengths: (Batch, )
+ text: (Batch, Length)
+ text_lengths: (Batch,)
+ """
+ assert text_lengths.dim() == 1, text_lengths.shape
+ # Check that batch_size is unified
+ assert (
+ speech.shape[0]
+ == speech_lengths.shape[0]
+ == text.shape[0]
+ == text_lengths.shape[0]
+ ), (speech.shape, speech_lengths.shape, text.shape, text_lengths.shape)
+ batch_size = speech.shape[0]
+ # for data-parallel
+ text = text[:, : text_lengths.max()]
+ speech = speech[:, :speech_lengths.max()]
+
+ # 1. Encoder
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+
+ encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
+ encoder_out.device)
+ if self.predictor_bias == 1:
+ _, text = add_sos_eos(text, 1, 2, -1)
+ text_lengths = text_lengths + self.predictor_bias
+ _, _, _, _, pre_token_length2 = self.predictor(encoder_out, text, encoder_out_mask, ignore_id=-1)
+
+ # loss_pre = self.criterion_pre(ys_pad_lens.type_as(pre_token_length), pre_token_length)
+ loss_pre = self.criterion_pre(text_lengths.type_as(pre_token_length2), pre_token_length2)
+
+ loss = loss_pre
+ stats = dict()
+
+ # Collect Attn branch stats
+ stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
+ stats["loss"] = torch.clone(loss.detach())
+
+ # force_gatherable: to-device and to-tensor if scalar for DataParallel
+ loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
+ return loss, stats, weight
+
+ def encode(
+ self, speech: torch.Tensor, speech_lengths: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Frontend + Encoder. Note that this method is used by asr_inference.py
+
+ Args:
+ speech: (Batch, Length, ...)
+ speech_lengths: (Batch, )
+ """
+ with autocast(False):
+ # 1. Extract feats
+ feats, feats_lengths = self._extract_feats(speech, speech_lengths)
+
+ # 4. Forward encoder
+ # feats: (Batch, Length, Dim)
+ # -> encoder_out: (Batch, Length2, Dim2)
+ encoder_out, encoder_out_lens, _ = self.encoder(feats, feats_lengths)
+
+ return encoder_out, encoder_out_lens
+
+ def _extract_feats(
+ self, speech: torch.Tensor, speech_lengths: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ assert speech_lengths.dim() == 1, speech_lengths.shape
+
+ # for data-parallel
+ speech = speech[:, : speech_lengths.max()]
+ if self.frontend is not None:
+ # Frontend
+ # e.g. STFT and Feature extract
+ # data_loader may send time-domain signal in this case
+ # speech (Batch, NSamples) -> feats: (Batch, NFrames, Dim)
+ feats, feats_lengths = self.frontend(speech, speech_lengths)
+ else:
+ # No frontend and no feature extract
+ feats, feats_lengths = speech, speech_lengths
+ return feats, feats_lengths
+
+ def calc_predictor_timestamp(self, encoder_out, encoder_out_lens, token_num):
+ encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
+ encoder_out.device)
+ ds_alphas, ds_cif_peak, us_alphas, us_peaks = self.predictor.get_upsample_timestamp(encoder_out,
+ encoder_out_mask,
+ token_num)
+ return ds_alphas, ds_cif_peak, us_alphas, us_peaks
+
+ def collect_feats(
+ self,
+ speech: torch.Tensor,
+ speech_lengths: torch.Tensor,
+ text: torch.Tensor,
+ text_lengths: torch.Tensor,
+ ) -> Dict[str, torch.Tensor]:
+ if self.extract_feats_in_collect_stats:
+ feats, feats_lengths = self._extract_feats(speech, speech_lengths)
+ else:
+ # Generate dummy stats if extract_feats_in_collect_stats is False
+ logging.warning(
+ "Generating dummy stats for feats and feats_lengths, "
+ "because encoder_conf.extract_feats_in_collect_stats is "
+ f"{self.extract_feats_in_collect_stats}"
+ )
+ feats, feats_lengths = speech, speech_lengths
+ return {"feats": feats, "feats_lengths": feats_lengths}
diff --git a/funasr/models/e2e_vad.py b/funasr/models/e2e_vad.py
index b9be89aaa..2c5673cb1 100755
--- a/funasr/models/e2e_vad.py
+++ b/funasr/models/e2e_vad.py
@@ -215,6 +215,7 @@ class E2EVadModel(nn.Module):
self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
self.noise_average_decibel = -100.0
self.pre_end_silence_detected = False
+ self.next_seg = True
self.output_data_buf = []
self.output_data_buf_offset = 0
@@ -244,6 +245,7 @@ class E2EVadModel(nn.Module):
self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
self.noise_average_decibel = -100.0
self.pre_end_silence_detected = False
+ self.next_seg = True
self.output_data_buf = []
self.output_data_buf_offset = 0
@@ -441,7 +443,7 @@ class E2EVadModel(nn.Module):
- 1)) / self.vad_opts.noise_frame_num_used_for_snr
return frame_state
-
+
def forward(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = dict(),
is_final: bool = False
) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]:
@@ -470,6 +472,42 @@ class E2EVadModel(nn.Module):
self.AllResetDetection()
return segments, in_cache
+ def forward_online(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = dict(),
+ is_final: bool = False
+ ) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]:
+ self.waveform = waveform # compute decibel for each frame
+ self.ComputeDecibel()
+ self.ComputeScores(feats, in_cache)
+ if not is_final:
+ self.DetectCommonFrames()
+ else:
+ self.DetectLastFrames()
+ segments = []
+ for batch_num in range(0, feats.shape[0]): # only support batch_size = 1 now
+ segment_batch = []
+ if len(self.output_data_buf) > 0:
+ for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
+ if not self.output_data_buf[i].contain_seg_start_point:
+ continue
+ if not self.next_seg and not self.output_data_buf[i].contain_seg_end_point:
+ continue
+ start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
+ if self.output_data_buf[i].contain_seg_end_point:
+ end_ms = self.output_data_buf[i].end_ms
+ self.next_seg = True
+ self.output_data_buf_offset += 1
+ else:
+ end_ms = -1
+ self.next_seg = False
+ segment = [start_ms, end_ms]
+ segment_batch.append(segment)
+ if segment_batch:
+ segments.append(segment_batch)
+ if is_final:
+ # reset class variables and clear the dict for the next query
+ self.AllResetDetection()
+ return segments, in_cache
+
def DetectCommonFrames(self) -> int:
if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
return 0
diff --git a/funasr/models/encoder/opennmt_encoders/conv_encoder.py b/funasr/models/encoder/opennmt_encoders/conv_encoder.py
index 40967437b..a33e0b718 100644
--- a/funasr/models/encoder/opennmt_encoders/conv_encoder.py
+++ b/funasr/models/encoder/opennmt_encoders/conv_encoder.py
@@ -137,12 +137,12 @@ class ConvEncoder(AbsEncoder):
self.out_padding = nn.ConstantPad1d((left_padding, right_padding), 0.0)
self.conv_out = nn.Conv1d(
num_units,
- num_units,
+ out_units,
kernel_size,
)
if self.out_norm:
- self.after_norm = LayerNorm(num_units)
+ self.after_norm = LayerNorm(out_units)
def output_size(self) -> int:
return self.num_units
diff --git a/funasr/models/encoder/opennmt_encoders/self_attention_encoder.py b/funasr/models/encoder/opennmt_encoders/self_attention_encoder.py
index 443b37ae3..cf77bce4b 100644
--- a/funasr/models/encoder/opennmt_encoders/self_attention_encoder.py
+++ b/funasr/models/encoder/opennmt_encoders/self_attention_encoder.py
@@ -272,7 +272,7 @@ class SelfAttentionEncoder(AbsEncoder):
position embedded tensor and mask
"""
masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device)
- xs_pad *= self.output_size()**0.5
+ xs_pad = xs_pad * self.output_size()**0.5
if self.embed is None:
xs_pad = xs_pad
elif (
diff --git a/funasr/models/encoder/resnet34_encoder.py b/funasr/models/encoder/resnet34_encoder.py
index 952ce1597..7d7179a00 100644
--- a/funasr/models/encoder/resnet34_encoder.py
+++ b/funasr/models/encoder/resnet34_encoder.py
@@ -387,7 +387,6 @@ class ResNet34_SP_L2Reg(AbsEncoder):
return var_dict_torch_update
-
class ResNet34Diar(ResNet34):
def __init__(
self,
@@ -613,3 +612,230 @@ class ResNet34Diar(ResNet34):
logging.warning("{} is missed from tf checkpoint".format(name))
return var_dict_torch_update
+
+
+class ResNet34SpL2RegDiar(ResNet34_SP_L2Reg):
+ def __init__(
+ self,
+ input_size,
+ embedding_node="resnet1_dense",
+ use_head_conv=True,
+ batchnorm_momentum=0.5,
+ use_head_maxpool=False,
+ num_nodes_pooling_layer=256,
+ layers_in_block=(3, 4, 6, 3),
+ filters_in_block=(32, 64, 128, 256),
+ num_nodes_resnet1=256,
+ num_nodes_last_layer=256,
+ pooling_type="window_shift",
+ pool_size=20,
+ stride=1,
+ tf2torch_tensor_name_prefix_torch="encoder",
+ tf2torch_tensor_name_prefix_tf="seq2seq/speech_encoder"
+ ):
+ super(ResNet34SpL2RegDiar, self).__init__(
+ input_size,
+ use_head_conv=use_head_conv,
+ batchnorm_momentum=batchnorm_momentum,
+ use_head_maxpool=use_head_maxpool,
+ num_nodes_pooling_layer=num_nodes_pooling_layer,
+ layers_in_block=layers_in_block,
+ filters_in_block=filters_in_block,
+ )
+
+ self.embedding_node = embedding_node
+ self.num_nodes_resnet1 = num_nodes_resnet1
+ self.num_nodes_last_layer = num_nodes_last_layer
+ self.pooling_type = pooling_type
+ self.pool_size = pool_size
+ self.stride = stride
+ self.tf2torch_tensor_name_prefix_torch = tf2torch_tensor_name_prefix_torch
+ self.tf2torch_tensor_name_prefix_tf = tf2torch_tensor_name_prefix_tf
+
+ self.resnet1_dense = torch.nn.Linear(num_nodes_pooling_layer * 2, num_nodes_resnet1)
+ self.resnet1_bn = torch.nn.BatchNorm1d(num_nodes_resnet1, eps=1e-3, momentum=batchnorm_momentum)
+
+ self.resnet2_dense = torch.nn.Linear(num_nodes_resnet1, num_nodes_last_layer)
+ self.resnet2_bn = torch.nn.BatchNorm1d(num_nodes_last_layer, eps=1e-3, momentum=batchnorm_momentum)
+
+ def output_size(self) -> int:
+ if self.embedding_node.startswith("resnet1"):
+ return self.num_nodes_resnet1
+ elif self.embedding_node.startswith("resnet2"):
+ return self.num_nodes_last_layer
+
+ return self.num_nodes_pooling_layer
+
+ def forward(
+ self,
+ xs_pad: torch.Tensor,
+ ilens: torch.Tensor,
+ prev_states: torch.Tensor = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+
+ endpoints = OrderedDict()
+ res_out, ilens = super().forward(xs_pad, ilens)
+ endpoints["resnet0_bn"] = res_out
+ if self.pooling_type == "frame_gsp":
+ features = statistic_pooling(res_out, ilens, (2, ))
+ else:
+ features, ilens = windowed_statistic_pooling(res_out, ilens, (2, ), self.pool_size, self.stride)
+ features = features.transpose(1, 2)
+ endpoints["pooling"] = features
+
+ features = self.resnet1_dense(features)
+ endpoints["resnet1_dense"] = features
+ features = F.relu(features)
+ endpoints["resnet1_relu"] = features
+ features = self.resnet1_bn(features.transpose(1, 2)).transpose(1, 2)
+ endpoints["resnet1_bn"] = features
+
+ features = self.resnet2_dense(features)
+ endpoints["resnet2_dense"] = features
+ features = F.relu(features)
+ endpoints["resnet2_relu"] = features
+ features = self.resnet2_bn(features.transpose(1, 2)).transpose(1, 2)
+ endpoints["resnet2_bn"] = features
+
+ return endpoints[self.embedding_node], ilens, None
+
+ def gen_tf2torch_map_dict(self):
+ tensor_name_prefix_torch = self.tf2torch_tensor_name_prefix_torch
+ tensor_name_prefix_tf = self.tf2torch_tensor_name_prefix_tf
+ train_steps = 720000
+ map_dict_local = {
+ # torch: conv1d.weight in "out_channel in_channel kernel_size"
+ # tf : conv1d.weight in "kernel_size in_channel out_channel"
+ # torch: linear.weight in "out_channel in_channel"
+ # tf : dense.weight in "in_channel out_channel"
+ "{}.pre_conv.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/pre_conv/kernel".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": (3, 2, 0, 1),
+ },
+ "{}.pre_conv_bn.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/pre_conv_bn/beta".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.pre_conv_bn.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/pre_conv_bn/gamma".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.pre_conv_bn.running_mean".format(tensor_name_prefix_torch):
+ {"name": "{}/pre_conv_bn/moving_mean".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.pre_conv_bn.running_var".format(tensor_name_prefix_torch):
+ {"name": "{}/pre_conv_bn/moving_variance".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.pre_conv_bn.num_batches_tracked".format(tensor_name_prefix_torch): train_steps
+ }
+ for layer_idx in range(3):
+ map_dict_local.update({
+ "{}.resnet{}_dense.weight".format(tensor_name_prefix_torch, layer_idx):
+ {"name": "{}/resnet{}_dense/kernel".format(tensor_name_prefix_tf, layer_idx),
+ "squeeze": None,
+ "transpose": (2, 1, 0) if layer_idx == 0 else (1, 0),
+ },
+ "{}.resnet{}_dense.bias".format(tensor_name_prefix_torch, layer_idx):
+ {"name": "{}/resnet{}_dense/bias".format(tensor_name_prefix_tf, layer_idx),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.resnet{}_bn.weight".format(tensor_name_prefix_torch, layer_idx):
+ {"name": "{}/resnet{}_bn/gamma".format(tensor_name_prefix_tf, layer_idx),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.resnet{}_bn.bias".format(tensor_name_prefix_torch, layer_idx):
+ {"name": "{}/resnet{}_bn/beta".format(tensor_name_prefix_tf, layer_idx),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.resnet{}_bn.running_mean".format(tensor_name_prefix_torch, layer_idx):
+ {"name": "{}/resnet{}_bn/moving_mean".format(tensor_name_prefix_tf, layer_idx),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.resnet{}_bn.running_var".format(tensor_name_prefix_torch, layer_idx):
+ {"name": "{}/resnet{}_bn/moving_variance".format(tensor_name_prefix_tf, layer_idx),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.resnet{}_bn.num_batches_tracked".format(tensor_name_prefix_torch, layer_idx): train_steps
+ })
+
+ for block_idx in range(len(self.layers_in_block)):
+ for layer_idx in range(self.layers_in_block[block_idx]):
+ for i in ["1", "2", "_sc"]:
+ map_dict_local.update({
+ "{}.block_{}.layer_{}.conv{}.weight".format(tensor_name_prefix_torch, block_idx, layer_idx, i):
+ {"name": "{}/block_{}/layer_{}/conv{}/kernel".format(tensor_name_prefix_tf, block_idx, layer_idx, i),
+ "squeeze": None,
+ "transpose": (3, 2, 0, 1),
+ },
+ "{}.block_{}.layer_{}.bn{}.weight".format(tensor_name_prefix_torch, block_idx, layer_idx, i):
+ {"name": "{}/block_{}/layer_{}/bn{}/gamma".format(tensor_name_prefix_tf, block_idx, layer_idx, i),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.block_{}.layer_{}.bn{}.bias".format(tensor_name_prefix_torch, block_idx, layer_idx, i):
+ {"name": "{}/block_{}/layer_{}/bn{}/beta".format(tensor_name_prefix_tf, block_idx, layer_idx, i),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.block_{}.layer_{}.bn{}.running_mean".format(tensor_name_prefix_torch, block_idx, layer_idx, i):
+ {"name": "{}/block_{}/layer_{}/bn{}/moving_mean".format(tensor_name_prefix_tf, block_idx, layer_idx, i),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.block_{}.layer_{}.bn{}.running_var".format(tensor_name_prefix_torch, block_idx, layer_idx, i):
+ {"name": "{}/block_{}/layer_{}/bn{}/moving_variance".format(tensor_name_prefix_tf, block_idx, layer_idx, i),
+ "squeeze": None,
+ "transpose": None,
+ },
+ "{}.block_{}.layer_{}.bn{}.num_batches_tracked".format(tensor_name_prefix_torch, block_idx, layer_idx, i): train_steps,
+ })
+
+ return map_dict_local
+
+ def convert_tf2torch(self,
+ var_dict_tf,
+ var_dict_torch,
+ ):
+
+ map_dict = self.gen_tf2torch_map_dict()
+
+ var_dict_torch_update = dict()
+ for name in sorted(var_dict_torch.keys(), reverse=False):
+ if name.startswith(self.tf2torch_tensor_name_prefix_torch):
+ if name in map_dict:
+ if "num_batches_tracked" not in name:
+ name_tf = map_dict[name]["name"]
+ data_tf = var_dict_tf[name_tf]
+ if map_dict[name]["squeeze"] is not None:
+ data_tf = np.squeeze(data_tf, axis=map_dict[name]["squeeze"])
+ if map_dict[name]["transpose"] is not None:
+ data_tf = np.transpose(data_tf, map_dict[name]["transpose"])
+ data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
+ assert var_dict_torch[name].size() == data_tf.size(), \
+ "{}, {}, {} != {}".format(name, name_tf,
+ var_dict_torch[name].size(), data_tf.size())
+ var_dict_torch_update[name] = data_tf
+ logging.info("torch tensor: {}, {}, loading from tf tensor: {}, {}".format(
+ name, data_tf.size(), name_tf, var_dict_tf[name_tf].shape
+ ))
+ else:
+ var_dict_torch_update[name] = torch.from_numpy(np.array(map_dict[name])).type(torch.int64).to("cpu")
+ logging.info("torch tensor: {}, manually assigning to: {}".format(
+ name, map_dict[name]
+ ))
+ else:
+ logging.warning("{} is missed from tf checkpoint".format(name))
+
+ return var_dict_torch_update
diff --git a/funasr/models/encoder/sanm_encoder.py b/funasr/models/encoder/sanm_encoder.py
index 0751a1020..57890efe6 100644
--- a/funasr/models/encoder/sanm_encoder.py
+++ b/funasr/models/encoder/sanm_encoder.py
@@ -347,6 +347,48 @@ class SANMEncoder(AbsEncoder):
return (xs_pad, intermediate_outs), olens, None
return xs_pad, olens, None
+ def forward_chunk(self,
+ xs_pad: torch.Tensor,
+ ilens: torch.Tensor,
+ cache: dict = None,
+ ctc: CTC = None,
+ ):
+ xs_pad *= self.output_size() ** 0.5
+ if self.embed is None:
+ xs_pad = xs_pad
+ else:
+ xs_pad = self.embed.forward_chunk(xs_pad, cache)
+
+ encoder_outs = self.encoders0(xs_pad, None, None, None, None)
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
+ intermediate_outs = []
+ if len(self.interctc_layer_idx) == 0:
+ encoder_outs = self.encoders(xs_pad, None, None, None, None)
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
+ else:
+ for layer_idx, encoder_layer in enumerate(self.encoders):
+ encoder_outs = encoder_layer(xs_pad, None, None, None, None)
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
+ if layer_idx + 1 in self.interctc_layer_idx:
+ encoder_out = xs_pad
+
+ # intermediate outputs are also normalized
+ if self.normalize_before:
+ encoder_out = self.after_norm(encoder_out)
+
+ intermediate_outs.append((layer_idx + 1, encoder_out))
+
+ if self.interctc_use_conditioning:
+ ctc_out = ctc.softmax(encoder_out)
+ xs_pad = xs_pad + self.conditioning_layer(ctc_out)
+
+ if self.normalize_before:
+ xs_pad = self.after_norm(xs_pad)
+
+ if len(intermediate_outs) > 0:
+ return (xs_pad, intermediate_outs), None, None
+ return xs_pad, ilens, None
+
def gen_tf2torch_map_dict(self):
tensor_name_prefix_torch = self.tf2torch_tensor_name_prefix_torch
tensor_name_prefix_tf = self.tf2torch_tensor_name_prefix_tf
diff --git a/funasr/models/frontend/eend_ola_feature.py b/funasr/models/frontend/eend_ola_feature.py
new file mode 100644
index 000000000..e15b71c25
--- /dev/null
+++ b/funasr/models/frontend/eend_ola_feature.py
@@ -0,0 +1,51 @@
+# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
+# Licensed under the MIT license.
+#
+# This module is for computing audio features
+
+import librosa
+import numpy as np
+
+
+def transform(Y, dtype=np.float32):
+ Y = np.abs(Y)
+ n_fft = 2 * (Y.shape[1] - 1)
+ sr = 8000
+ n_mels = 23
+ mel_basis = librosa.filters.mel(sr, n_fft, n_mels)
+ Y = np.dot(Y ** 2, mel_basis.T)
+ Y = np.log10(np.maximum(Y, 1e-10))
+ mean = np.mean(Y, axis=0)
+ Y = Y - mean
+ return Y.astype(dtype)
+
+
+def subsample(Y, T, subsampling=1):
+ Y_ss = Y[::subsampling]
+ T_ss = T[::subsampling]
+ return Y_ss, T_ss
+
+
+def splice(Y, context_size=0):
+ Y_pad = np.pad(
+ Y,
+ [(context_size, context_size), (0, 0)],
+ 'constant')
+ Y_spliced = np.lib.stride_tricks.as_strided(
+ np.ascontiguousarray(Y_pad),
+ (Y.shape[0], Y.shape[1] * (2 * context_size + 1)),
+ (Y.itemsize * Y.shape[1], Y.itemsize), writeable=False)
+ return Y_spliced
+
+
+def stft(
+ data,
+ frame_size=1024,
+ frame_shift=256):
+ fft_size = 1 << (frame_size - 1).bit_length()
+ if len(data) % frame_shift == 0:
+ return librosa.stft(data, n_fft=fft_size, win_length=frame_size,
+ hop_length=frame_shift).T[:-1]
+ else:
+ return librosa.stft(data, n_fft=fft_size, win_length=frame_size,
+ hop_length=frame_shift).T
\ No newline at end of file
diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py
index ed8cb3646..475a9398a 100644
--- a/funasr/models/frontend/wav_frontend.py
+++ b/funasr/models/frontend/wav_frontend.py
@@ -1,14 +1,15 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Part of the implementation is borrowed from espnet/espnet.
-
from typing import Tuple
import numpy as np
import torch
import torchaudio.compliance.kaldi as kaldi
-from funasr.models.frontend.abs_frontend import AbsFrontend
-from typeguard import check_argument_types
from torch.nn.utils.rnn import pad_sequence
+from typeguard import check_argument_types
+
+import funasr.models.frontend.eend_ola_feature as eend_ola_feature
+from funasr.models.frontend.abs_frontend import AbsFrontend
def load_cmvn(cmvn_file):
@@ -33,9 +34,9 @@ def load_cmvn(cmvn_file):
means = np.array(means_list).astype(np.float)
vars = np.array(vars_list).astype(np.float)
cmvn = np.array([means, vars])
- cmvn = torch.as_tensor(cmvn)
- return cmvn
-
+ cmvn = torch.as_tensor(cmvn)
+ return cmvn
+
def apply_cmvn(inputs, cmvn_file): # noqa
"""
@@ -78,21 +79,22 @@ def apply_lfr(inputs, lfr_m, lfr_n):
class WavFrontend(AbsFrontend):
"""Conventional frontend structure for ASR.
"""
+
def __init__(
- self,
- cmvn_file: str = None,
- fs: int = 16000,
- window: str = 'hamming',
- n_mels: int = 80,
- frame_length: int = 25,
- frame_shift: int = 10,
- filter_length_min: int = -1,
- filter_length_max: int = -1,
- lfr_m: int = 1,
- lfr_n: int = 1,
- dither: float = 1.0,
- snip_edges: bool = True,
- upsacle_samples: bool = True,
+ self,
+ cmvn_file: str = None,
+ fs: int = 16000,
+ window: str = 'hamming',
+ n_mels: int = 80,
+ frame_length: int = 25,
+ frame_shift: int = 10,
+ filter_length_min: int = -1,
+ filter_length_max: int = -1,
+ lfr_m: int = 1,
+ lfr_n: int = 1,
+ dither: float = 1.0,
+ snip_edges: bool = True,
+ upsacle_samples: bool = True,
):
assert check_argument_types()
super().__init__()
@@ -135,11 +137,11 @@ class WavFrontend(AbsFrontend):
window_type=self.window,
sample_frequency=self.fs,
snip_edges=self.snip_edges)
-
+
if self.lfr_m != 1 or self.lfr_n != 1:
mat = apply_lfr(mat, self.lfr_m, self.lfr_n)
if self.cmvn_file is not None:
- mat = apply_cmvn(mat, self.cmvn_file)
+ mat = apply_cmvn(mat, self.cmvn_file)
feat_length = mat.size(0)
feats.append(mat)
feats_lens.append(feat_length)
@@ -171,7 +173,6 @@ class WavFrontend(AbsFrontend):
window_type=self.window,
sample_frequency=self.fs)
-
feat_length = mat.size(0)
feats.append(mat)
feats_lens.append(feat_length)
@@ -204,3 +205,299 @@ class WavFrontend(AbsFrontend):
batch_first=True,
padding_value=0.0)
return feats_pad, feats_lens
+
+
+class WavFrontendOnline(AbsFrontend):
+ """Conventional frontend structure for streaming ASR/VAD.
+ """
+
+ def __init__(
+ self,
+ cmvn_file: str = None,
+ fs: int = 16000,
+ window: str = 'hamming',
+ n_mels: int = 80,
+ frame_length: int = 25,
+ frame_shift: int = 10,
+ filter_length_min: int = -1,
+ filter_length_max: int = -1,
+ lfr_m: int = 1,
+ lfr_n: int = 1,
+ dither: float = 1.0,
+ snip_edges: bool = True,
+ upsacle_samples: bool = True,
+ ):
+ assert check_argument_types()
+ super().__init__()
+ self.fs = fs
+ self.window = window
+ self.n_mels = n_mels
+ self.frame_length = frame_length
+ self.frame_shift = frame_shift
+ self.frame_sample_length = int(self.frame_length * self.fs / 1000)
+ self.frame_shift_sample_length = int(self.frame_shift * self.fs / 1000)
+ self.filter_length_min = filter_length_min
+ self.filter_length_max = filter_length_max
+ self.lfr_m = lfr_m
+ self.lfr_n = lfr_n
+ self.cmvn_file = cmvn_file
+ self.dither = dither
+ self.snip_edges = snip_edges
+ self.upsacle_samples = upsacle_samples
+ self.waveforms = None
+ self.reserve_waveforms = None
+ self.fbanks = None
+ self.fbanks_lens = None
+ self.cmvn = None if self.cmvn_file is None else load_cmvn(self.cmvn_file)
+ self.input_cache = None
+ self.lfr_splice_cache = []
+
+ def output_size(self) -> int:
+ return self.n_mels * self.lfr_m
+
+ @staticmethod
+ def apply_cmvn(inputs: torch.Tensor, cmvn: torch.Tensor) -> torch.Tensor:
+ """
+ Apply CMVN with mvn data
+ """
+
+ device = inputs.device
+ dtype = inputs.dtype
+ frame, dim = inputs.shape
+
+ means = np.tile(cmvn[0:1, :dim], (frame, 1))
+ vars = np.tile(cmvn[1:2, :dim], (frame, 1))
+ inputs += torch.from_numpy(means).type(dtype).to(device)
+ inputs *= torch.from_numpy(vars).type(dtype).to(device)
+
+ return inputs.type(torch.float32)
+
+ @staticmethod
+ # inputs tensor has catted the cache tensor
+ # def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, inputs_lfr_cache: torch.Tensor = None,
+ # is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
+ def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[
+ torch.Tensor, torch.Tensor, int]:
+ """
+ Apply lfr with data
+ """
+
+ LFR_inputs = []
+ # inputs = torch.vstack((inputs_lfr_cache, inputs))
+ T = inputs.shape[0] # include the right context
+ T_lfr = int(np.ceil((T - (lfr_m - 1) // 2) / lfr_n)) # minus the right context: (lfr_m - 1) // 2
+ splice_idx = T_lfr
+ for i in range(T_lfr):
+ if lfr_m <= T - i * lfr_n:
+ LFR_inputs.append((inputs[i * lfr_n:i * lfr_n + lfr_m]).view(1, -1))
+ else: # process last LFR frame
+ if is_final:
+ num_padding = lfr_m - (T - i * lfr_n)
+ frame = (inputs[i * lfr_n:]).view(-1)
+ for _ in range(num_padding):
+ frame = torch.hstack((frame, inputs[-1]))
+ LFR_inputs.append(frame)
+ else:
+                    # update splice_idx and break out of the loop
+ splice_idx = i
+ break
+ splice_idx = min(T - 1, splice_idx * lfr_n)
+ lfr_splice_cache = inputs[splice_idx:, :]
+ LFR_outputs = torch.vstack(LFR_inputs)
+ return LFR_outputs.type(torch.float32), lfr_splice_cache, splice_idx
+
+ @staticmethod
+ def compute_frame_num(sample_length: int, frame_sample_length: int, frame_shift_sample_length: int) -> int:
+ frame_num = int((sample_length - frame_sample_length) / frame_shift_sample_length + 1)
+ return frame_num if frame_num >= 1 and sample_length >= frame_sample_length else 0
+
+ def forward_fbank(
+ self,
+ input: torch.Tensor,
+ input_lengths: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ batch_size = input.size(0)
+ if self.input_cache is None:
+ self.input_cache = torch.empty(0)
+ input = torch.cat((self.input_cache, input), dim=1)
+ frame_num = self.compute_frame_num(input.shape[-1], self.frame_sample_length, self.frame_shift_sample_length)
+        # update self.input_cache
+ self.input_cache = input[:, -(input.shape[-1] - frame_num * self.frame_shift_sample_length):]
+ waveforms = torch.empty(0)
+ feats_pad = torch.empty(0)
+ feats_lens = torch.empty(0)
+ if frame_num:
+ waveforms = []
+ feats = []
+ feats_lens = []
+ for i in range(batch_size):
+ waveform = input[i]
+                # we need the exact wave samples that were used for fbank extraction
+ waveforms.append(
+ waveform[:((frame_num - 1) * self.frame_shift_sample_length + self.frame_sample_length)])
+ waveform = waveform * (1 << 15)
+ waveform = waveform.unsqueeze(0)
+ mat = kaldi.fbank(waveform,
+ num_mel_bins=self.n_mels,
+ frame_length=self.frame_length,
+ frame_shift=self.frame_shift,
+ dither=self.dither,
+ energy_floor=0.0,
+ window_type=self.window,
+ sample_frequency=self.fs)
+
+ feat_length = mat.size(0)
+ feats.append(mat)
+ feats_lens.append(feat_length)
+
+ waveforms = torch.stack(waveforms)
+ feats_lens = torch.as_tensor(feats_lens)
+ feats_pad = pad_sequence(feats,
+ batch_first=True,
+ padding_value=0.0)
+ self.fbanks = feats_pad
+ import copy
+ self.fbanks_lens = copy.deepcopy(feats_lens)
+ return waveforms, feats_pad, feats_lens
+
+ def get_fbank(self) -> Tuple[torch.Tensor, torch.Tensor]:
+ return self.fbanks, self.fbanks_lens
+
+ def forward_lfr_cmvn(
+ self,
+ input: torch.Tensor,
+ input_lengths: torch.Tensor,
+ is_final: bool = False
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ batch_size = input.size(0)
+ feats = []
+ feats_lens = []
+ lfr_splice_frame_idxs = []
+ for i in range(batch_size):
+ mat = input[i, :input_lengths[i], :]
+ if self.lfr_m != 1 or self.lfr_n != 1:
+ # update self.lfr_splice_cache in self.apply_lfr
+ # mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, self.lfr_splice_cache[i],
+ mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n,
+ is_final)
+ if self.cmvn_file is not None:
+ mat = self.apply_cmvn(mat, self.cmvn)
+ feat_length = mat.size(0)
+ feats.append(mat)
+ feats_lens.append(feat_length)
+ lfr_splice_frame_idxs.append(lfr_splice_frame_idx)
+
+ feats_lens = torch.as_tensor(feats_lens)
+ feats_pad = pad_sequence(feats,
+ batch_first=True,
+ padding_value=0.0)
+ lfr_splice_frame_idxs = torch.as_tensor(lfr_splice_frame_idxs)
+ return feats_pad, feats_lens, lfr_splice_frame_idxs
+
+ def forward(
+ self, input: torch.Tensor, input_lengths: torch.Tensor, is_final: bool = False
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ batch_size = input.shape[0]
+ assert batch_size == 1, 'we support to extract feature online only when the batch size is equal to 1 now'
+ waveforms, feats, feats_lengths = self.forward_fbank(input, input_lengths) # input shape: B T D
+ if feats.shape[0]:
+ # if self.reserve_waveforms is None and self.lfr_m > 1:
+ # self.reserve_waveforms = waveforms[:, :(self.lfr_m - 1) // 2 * self.frame_shift_sample_length]
+ self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat(
+ (self.reserve_waveforms, waveforms), dim=1)
+            if not self.lfr_splice_cache: # initialize the splice cache
+ for i in range(batch_size):
+ self.lfr_splice_cache.append(feats[i][0, :].unsqueeze(dim=0).repeat((self.lfr_m - 1) // 2, 1))
+            # proceed only when the number of input frames plus self.lfr_splice_cache[0].shape[0] is at least self.lfr_m
+ if feats_lengths[0] + self.lfr_splice_cache[0].shape[0] >= self.lfr_m:
+ lfr_splice_cache_tensor = torch.stack(self.lfr_splice_cache) # B T D
+ feats = torch.cat((lfr_splice_cache_tensor, feats), dim=1)
+ feats_lengths += lfr_splice_cache_tensor[0].shape[0]
+ frame_from_waveforms = int(
+ (self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
+ minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
+ feats, feats_lengths, lfr_splice_frame_idxs = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
+ if self.lfr_m == 1:
+ self.reserve_waveforms = None
+ else:
+ reserve_frame_idx = lfr_splice_frame_idxs[0] - minus_frame
+ # print('reserve_frame_idx: ' + str(reserve_frame_idx))
+ # print('frame_frame: ' + str(frame_from_waveforms))
+ self.reserve_waveforms = self.waveforms[:, reserve_frame_idx * self.frame_shift_sample_length:frame_from_waveforms * self.frame_shift_sample_length]
+ sample_length = (frame_from_waveforms - 1) * self.frame_shift_sample_length + self.frame_sample_length
+ self.waveforms = self.waveforms[:, :sample_length]
+ else:
+ # update self.reserve_waveforms and self.lfr_splice_cache
+ self.reserve_waveforms = self.waveforms[:,
+ :-(self.frame_sample_length - self.frame_shift_sample_length)]
+ for i in range(batch_size):
+ self.lfr_splice_cache[i] = torch.cat((self.lfr_splice_cache[i], feats[i]), dim=0)
+ return torch.empty(0), feats_lengths
+ else:
+ if is_final:
+ self.waveforms = waveforms if self.reserve_waveforms is None else self.reserve_waveforms
+ feats = torch.stack(self.lfr_splice_cache)
+ feats_lengths = torch.zeros(batch_size, dtype=torch.int) + feats.shape[1]
+ feats, feats_lengths, _ = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
+ if is_final:
+ self.cache_reset()
+ return feats, feats_lengths
+
+ def get_waveforms(self):
+ return self.waveforms
+
+ def cache_reset(self):
+ self.reserve_waveforms = None
+ self.input_cache = None
+ self.lfr_splice_cache = []
+
+
+class WavFrontendMel23(AbsFrontend):
+ """Conventional frontend structure for ASR.
+ """
+
+ def __init__(
+ self,
+ fs: int = 16000,
+ frame_length: int = 25,
+ frame_shift: int = 10,
+ lfr_m: int = 1,
+ lfr_n: int = 1,
+ ):
+ assert check_argument_types()
+ super().__init__()
+ self.fs = fs
+ self.frame_length = frame_length
+ self.frame_shift = frame_shift
+ self.lfr_m = lfr_m
+ self.lfr_n = lfr_n
+ self.n_mels = 23
+
+ def output_size(self) -> int:
+ return self.n_mels * (2 * self.lfr_m + 1)
+
+ def forward(
+ self,
+ input: torch.Tensor,
+ input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ batch_size = input.size(0)
+ feats = []
+ feats_lens = []
+ for i in range(batch_size):
+ waveform_length = input_lengths[i]
+ waveform = input[i][:waveform_length]
+ waveform = waveform.numpy()
+ mat = eend_ola_feature.stft(waveform, self.frame_length, self.frame_shift)
+ mat = eend_ola_feature.transform(mat)
+ mat = eend_ola_feature.splice(mat, context_size=self.lfr_m)
+ mat = mat[::self.lfr_n]
+ mat = torch.from_numpy(mat)
+ feat_length = mat.size(0)
+ feats.append(mat)
+ feats_lens.append(feat_length)
+
+ feats_lens = torch.as_tensor(feats_lens)
+ feats_pad = pad_sequence(feats,
+ batch_first=True,
+ padding_value=0.0)
+ return feats_pad, feats_lens
diff --git a/funasr/models/pooling/statistic_pooling.py b/funasr/models/pooling/statistic_pooling.py
index dc8c98f0d..8f85de99d 100644
--- a/funasr/models/pooling/statistic_pooling.py
+++ b/funasr/models/pooling/statistic_pooling.py
@@ -82,13 +82,16 @@ def windowed_statistic_pooling(
tt = xs_pad.shape[2]
num_chunk = int(math.ceil(tt / pooling_stride))
pad = pooling_size // 2
- features = F.pad(xs_pad, (0, 0, pad, pad), "reflect")
+ if len(xs_pad.shape) == 4:
+ features = F.pad(xs_pad, (0, 0, pad, pad), "reflect")
+ else:
+ features = F.pad(xs_pad, (pad, pad), "reflect")
stat_list = []
for i in range(num_chunk):
# B x C
st, ed = i*pooling_stride, i*pooling_stride+pooling_size
- stat = statistic_pooling(features[:, :, st: ed, :], pooling_dim=pooling_dim)
+ stat = statistic_pooling(features[:, :, st: ed], pooling_dim=pooling_dim)
stat_list.append(stat.unsqueeze(2))
# B x C x T
diff --git a/funasr/models/predictor/cif.py b/funasr/models/predictor/cif.py
index 561537323..74f3e68a9 100644
--- a/funasr/models/predictor/cif.py
+++ b/funasr/models/predictor/cif.py
@@ -199,6 +199,63 @@ class CifPredictorV2(nn.Module):
return acoustic_embeds, token_num, alphas, cif_peak
+ def forward_chunk(self, hidden, cache=None):
+ h = hidden
+ context = h.transpose(1, 2)
+ queries = self.pad(context)
+ output = torch.relu(self.cif_conv1d(queries))
+ output = output.transpose(1, 2)
+ output = self.cif_output(output)
+ alphas = torch.sigmoid(output)
+ alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)
+
+ alphas = alphas.squeeze(-1)
+ mask_chunk_predictor = None
+ if cache is not None:
+ mask_chunk_predictor = None
+ mask_chunk_predictor = torch.zeros_like(alphas)
+ mask_chunk_predictor[:, cache["pad_left"]:cache["stride"] + cache["pad_left"]] = 1.0
+
+ if mask_chunk_predictor is not None:
+ alphas = alphas * mask_chunk_predictor
+
+ if cache is not None:
+ if cache["cif_hidden"] is not None:
+ hidden = torch.cat((cache["cif_hidden"], hidden), 1)
+ if cache["cif_alphas"] is not None:
+ alphas = torch.cat((cache["cif_alphas"], alphas), -1)
+
+ token_num = alphas.sum(-1)
+ acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
+ len_time = alphas.size(-1)
+ last_fire_place = len_time - 1
+ last_fire_remainds = 0.0
+ pre_alphas_length = 0
+
+ mask_chunk_peak_predictor = None
+ if cache is not None:
+ mask_chunk_peak_predictor = None
+ mask_chunk_peak_predictor = torch.zeros_like(cif_peak)
+ if cache["cif_alphas"] is not None:
+ pre_alphas_length = cache["cif_alphas"].size(-1)
+ mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0
+ mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0
+
+
+ if mask_chunk_peak_predictor is not None:
+ cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1)
+
+ for i in range(len_time):
+ if cif_peak[0][len_time - 1 - i] > self.threshold or cif_peak[0][len_time - 1 - i] == self.threshold:
+ last_fire_place = len_time - 1 - i
+ last_fire_remainds = cif_peak[0][len_time - 1 - i] - self.threshold
+ break
+ last_fire_remainds = torch.tensor([last_fire_remainds], dtype=alphas.dtype).to(alphas.device)
+ cache["cif_hidden"] = hidden[:, last_fire_place:, :]
+ cache["cif_alphas"] = torch.cat((last_fire_remainds.unsqueeze(0), alphas[:, last_fire_place+1:]), -1)
+ token_num_int = token_num.floor().type(torch.int32).item()
+ return acoustic_embeds[:, 0:token_num_int, :], token_num, alphas, cif_peak
+
def tail_process_fn(self, hidden, alphas, token_num=None, mask=None):
b, t, d = hidden.size()
tail_threshold = self.tail_threshold
diff --git a/funasr/modules/attention.py b/funasr/modules/attention.py
index 627700524..31d5a8775 100644
--- a/funasr/modules/attention.py
+++ b/funasr/modules/attention.py
@@ -347,15 +347,17 @@ class MultiHeadedAttentionSANM(nn.Module):
mask = torch.reshape(mask, (b, -1, 1))
if mask_shfit_chunk is not None:
mask = mask * mask_shfit_chunk
+ inputs = inputs * mask
- inputs = inputs * mask
x = inputs.transpose(1, 2)
x = self.pad_fn(x)
x = self.fsmn_block(x)
x = x.transpose(1, 2)
x += inputs
x = self.dropout(x)
- return x * mask
+ if mask is not None:
+ x = x * mask
+ return x
def forward_qkv(self, x):
"""Transform query, key and value.
@@ -505,7 +507,7 @@ class MultiHeadedAttentionSANMDecoder(nn.Module):
# print("in fsmn, cache is None, x", x.size())
x = self.pad_fn(x)
- if not self.training and t <= 1:
+ if not self.training:
cache = x
else:
# print("in fsmn, cache is not None, x", x.size())
@@ -513,7 +515,7 @@ class MultiHeadedAttentionSANMDecoder(nn.Module):
# if t < self.kernel_size:
# x = self.pad_fn(x)
x = torch.cat((cache[:, :, 1:], x), dim=2)
- x = x[:, :, -self.kernel_size:]
+ x = x[:, :, -(self.kernel_size+t-1):]
# print("in fsmn, cache is not None, x_cat", x.size())
cache = x
x = self.fsmn_block(x)
diff --git a/funasr/modules/eend_ola/__init__.py b/funasr/modules/eend_ola/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/funasr/modules/eend_ola/encoder.py b/funasr/modules/eend_ola/encoder.py
new file mode 100644
index 000000000..90a63f369
--- /dev/null
+++ b/funasr/modules/eend_ola/encoder.py
@@ -0,0 +1,133 @@
+import math
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class MultiHeadSelfAttention(nn.Module):
+ def __init__(self, n_units, h=8, dropout_rate=0.1):
+ super(MultiHeadSelfAttention, self).__init__()
+ self.linearQ = nn.Linear(n_units, n_units)
+ self.linearK = nn.Linear(n_units, n_units)
+ self.linearV = nn.Linear(n_units, n_units)
+ self.linearO = nn.Linear(n_units, n_units)
+ self.d_k = n_units // h
+ self.h = h
+ self.dropout = nn.Dropout(dropout_rate)
+
+ def __call__(self, x, batch_size, x_mask):
+ q = self.linearQ(x).view(batch_size, -1, self.h, self.d_k)
+ k = self.linearK(x).view(batch_size, -1, self.h, self.d_k)
+ v = self.linearV(x).view(batch_size, -1, self.h, self.d_k)
+ scores = torch.matmul(
+ q.permute(0, 2, 1, 3), k.permute(0, 2, 3, 1)) / math.sqrt(self.d_k)
+ if x_mask is not None:
+ x_mask = x_mask.unsqueeze(1)
+ scores = scores.masked_fill(x_mask == 0, -1e9)
+ self.att = F.softmax(scores, dim=3)
+ p_att = self.dropout(self.att)
+ x = torch.matmul(p_att, v.permute(0, 2, 1, 3))
+ x = x.permute(0, 2, 1, 3).contiguous().view(-1, self.h * self.d_k)
+ return self.linearO(x)
+
+
+class PositionwiseFeedForward(nn.Module):
+ def __init__(self, n_units, d_units, dropout_rate):
+ super(PositionwiseFeedForward, self).__init__()
+ self.linear1 = nn.Linear(n_units, d_units)
+ self.linear2 = nn.Linear(d_units, n_units)
+ self.dropout = nn.Dropout(dropout_rate)
+
+ def __call__(self, x):
+ return self.linear2(self.dropout(F.relu(self.linear1(x))))
+
+
+class PositionalEncoding(torch.nn.Module):
+ def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
+ super(PositionalEncoding, self).__init__()
+ self.d_model = d_model
+ self.reverse = reverse
+ self.xscale = math.sqrt(self.d_model)
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
+ self.pe = None
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
+
+ def extend_pe(self, x):
+ if self.pe is not None:
+ if self.pe.size(1) >= x.size(1):
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+ return
+ pe = torch.zeros(x.size(1), self.d_model)
+ if self.reverse:
+ position = torch.arange(
+ x.size(1) - 1, -1, -1.0, dtype=torch.float32
+ ).unsqueeze(1)
+ else:
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
+ div_term = torch.exp(
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
+ * -(math.log(10000.0) / self.d_model)
+ )
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+ pe = pe.unsqueeze(0)
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
+
+ def forward(self, x: torch.Tensor):
+ self.extend_pe(x)
+ x = x * self.xscale + self.pe[:, : x.size(1)]
+ return self.dropout(x)
+
+
+class EENDOLATransformerEncoder(nn.Module):
+ def __init__(self,
+ idim: int,
+ n_layers: int,
+ n_units: int,
+ e_units: int = 2048,
+ h: int = 4,
+ dropout_rate: float = 0.1,
+ use_pos_emb: bool = False):
+ super(EENDOLATransformerEncoder, self).__init__()
+ self.lnorm_in = nn.LayerNorm(n_units)
+ self.n_layers = n_layers
+ self.dropout = nn.Dropout(dropout_rate)
+ for i in range(n_layers):
+ setattr(self, '{}{:d}'.format("lnorm1_", i),
+ nn.LayerNorm(n_units))
+ setattr(self, '{}{:d}'.format("self_att_", i),
+ MultiHeadSelfAttention(n_units, h))
+ setattr(self, '{}{:d}'.format("lnorm2_", i),
+ nn.LayerNorm(n_units))
+ setattr(self, '{}{:d}'.format("ff_", i),
+ PositionwiseFeedForward(n_units, e_units, dropout_rate))
+ self.lnorm_out = nn.LayerNorm(n_units)
+ if use_pos_emb:
+ self.pos_enc = torch.nn.Sequential(
+ torch.nn.Linear(idim, n_units),
+ torch.nn.LayerNorm(n_units),
+ torch.nn.Dropout(dropout_rate),
+ torch.nn.ReLU(),
+ PositionalEncoding(n_units, dropout_rate),
+ )
+ else:
+ self.linear_in = nn.Linear(idim, n_units)
+ self.pos_enc = None
+
+ def __call__(self, x, x_mask=None):
+ BT_size = x.shape[0] * x.shape[1]
+ if self.pos_enc is not None:
+ e = self.pos_enc(x)
+ e = e.view(BT_size, -1)
+ else:
+ e = self.linear_in(x.reshape(BT_size, -1))
+ for i in range(self.n_layers):
+ e = getattr(self, '{}{:d}'.format("lnorm1_", i))(e)
+ s = getattr(self, '{}{:d}'.format("self_att_", i))(e, x.shape[0], x_mask)
+ e = e + self.dropout(s)
+ e = getattr(self, '{}{:d}'.format("lnorm2_", i))(e)
+ s = getattr(self, '{}{:d}'.format("ff_", i))(e)
+ e = e + self.dropout(s)
+ return self.lnorm_out(e)
diff --git a/funasr/modules/eend_ola/encoder_decoder_attractor.py b/funasr/modules/eend_ola/encoder_decoder_attractor.py
new file mode 100644
index 000000000..45ac98219
--- /dev/null
+++ b/funasr/modules/eend_ola/encoder_decoder_attractor.py
@@ -0,0 +1,50 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class EncoderDecoderAttractor(nn.Module):
+
+ def __init__(self, n_units, encoder_dropout=0.1, decoder_dropout=0.1):
+ super(EncoderDecoderAttractor, self).__init__()
+ self.enc0_dropout = nn.Dropout(encoder_dropout)
+ self.encoder = nn.LSTM(n_units, n_units, 1, batch_first=True, dropout=encoder_dropout)
+ self.dec0_dropout = nn.Dropout(decoder_dropout)
+ self.decoder = nn.LSTM(n_units, n_units, 1, batch_first=True, dropout=decoder_dropout)
+ self.counter = nn.Linear(n_units, 1)
+ self.n_units = n_units
+
+ def forward_core(self, xs, zeros):
+ ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.int64)
+ xs = [self.enc0_dropout(x) for x in xs]
+ xs = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=-1)
+ xs = nn.utils.rnn.pack_padded_sequence(xs, ilens, batch_first=True, enforce_sorted=False)
+ _, (hx, cx) = self.encoder(xs)
+ zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.int64)
+ max_zlen = torch.max(zlens).to(torch.int).item()
+ zeros = [self.enc0_dropout(z) for z in zeros]
+ zeros = nn.utils.rnn.pad_sequence(zeros, batch_first=True, padding_value=-1)
+ zeros = nn.utils.rnn.pack_padded_sequence(zeros, zlens, batch_first=True, enforce_sorted=False)
+ attractors, (_, _) = self.decoder(zeros, (hx, cx))
+ attractors = nn.utils.rnn.pad_packed_sequence(attractors, batch_first=True, padding_value=-1,
+ total_length=max_zlen)[0]
+ attractors = [att[:zlens[i].to(torch.int).item()] for i, att in enumerate(attractors)]
+ return attractors
+
+ def forward(self, xs, n_speakers):
+ zeros = [torch.zeros(n_spk + 1, self.n_units).to(torch.float32).to(xs[0].device) for n_spk in n_speakers]
+ attractors = self.forward_core(xs, zeros)
+ labels = torch.cat([torch.from_numpy(np.array([[1] * n_spk + [0]], np.float32)) for n_spk in n_speakers], dim=1)
+ labels = labels.to(xs[0].device)
+ logit = torch.cat([self.counter(att).view(-1, n_spk + 1) for att, n_spk in zip(attractors, n_speakers)], dim=1)
+ loss = F.binary_cross_entropy(torch.sigmoid(logit), labels)
+
+ attractors = [att[slice(0, att.shape[0] - 1)] for att in attractors]
+ return loss, attractors
+
+ def estimate(self, xs, max_n_speakers=15):
+ zeros = [torch.zeros(max_n_speakers, self.n_units).to(torch.float32).to(xs[0].device) for _ in xs]
+ attractors = self.forward_core(xs, zeros)
+ probs = [torch.sigmoid(torch.flatten(self.counter(att))) for att in attractors]
+ return attractors, probs
diff --git a/funasr/modules/eend_ola/utils/losses.py b/funasr/modules/eend_ola/utils/losses.py
new file mode 100644
index 000000000..af0181dda
--- /dev/null
+++ b/funasr/modules/eend_ola/utils/losses.py
@@ -0,0 +1,67 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from itertools import permutations
+from torch import nn
+
+
+def standard_loss(ys, ts, label_delay=0):
+ losses = [F.binary_cross_entropy(torch.sigmoid(y), t) * len(y) for y, t in zip(ys, ts)]
+ loss = torch.sum(torch.stack(losses))
+ n_frames = torch.from_numpy(np.array(np.sum([t.shape[0] for t in ts]))).to(torch.float32).to(ys[0].device)
+ loss = loss / n_frames
+ return loss
+
+
+def batch_pit_n_speaker_loss(ys, ts, n_speakers_list):
+ max_n_speakers = ts[0].shape[1]
+ olens = [y.shape[0] for y in ys]
+ ys = nn.utils.rnn.pad_sequence(ys, batch_first=True, padding_value=-1)
+ ys_mask = [torch.ones(olen).to(ys.device) for olen in olens]
+ ys_mask = torch.nn.utils.rnn.pad_sequence(ys_mask, batch_first=True, padding_value=0).unsqueeze(-1)
+
+ losses = []
+ for shift in range(max_n_speakers):
+ ts_roll = [torch.roll(t, -shift, dims=1) for t in ts]
+ ts_roll = nn.utils.rnn.pad_sequence(ts_roll, batch_first=True, padding_value=-1)
+ loss = F.binary_cross_entropy(torch.sigmoid(ys), ts_roll, reduction='none')
+ if ys_mask is not None:
+ loss = loss * ys_mask
+ loss = torch.sum(loss, dim=1)
+ losses.append(loss)
+ losses = torch.stack(losses, dim=2)
+
+ perms = np.array(list(permutations(range(max_n_speakers)))).astype(np.float32)
+ perms = torch.from_numpy(perms).to(losses.device)
+ y_ind = torch.arange(max_n_speakers, dtype=torch.float32, device=losses.device)
+ t_inds = torch.fmod(perms - y_ind, max_n_speakers).to(torch.long)
+
+ losses_perm = []
+ for t_ind in t_inds:
+ losses_perm.append(
+ torch.mean(losses[:, y_ind.to(torch.long), t_ind], dim=1))
+ losses_perm = torch.stack(losses_perm, dim=1)
+
+ def select_perm_indices(num, max_num):
+ perms = list(permutations(range(max_num)))
+ sub_perms = list(permutations(range(num)))
+ return [
+ [x[:num] for x in perms].index(perm)
+ for perm in sub_perms]
+
+ masks = torch.full_like(losses_perm, device=losses.device, fill_value=float('inf'))
+ for i, t in enumerate(ts):
+ n_speakers = n_speakers_list[i]
+ indices = select_perm_indices(n_speakers, max_n_speakers)
+ masks[i, indices] = 0
+ losses_perm += masks
+
+ min_loss = torch.sum(torch.min(losses_perm, dim=1)[0])
+ n_frames = torch.from_numpy(np.array(np.sum([t.shape[0] for t in ts]))).to(losses.device)
+ min_loss = min_loss / n_frames
+
+ min_indices = torch.argmin(losses_perm, dim=1)
+ labels_perm = [t[:, perms[idx].to(torch.long)] for t, idx in zip(ts, min_indices)]
+ labels_perm = [t[:, :n_speakers] for t, n_speakers in zip(labels_perm, n_speakers_list)]
+
+ return min_loss, labels_perm
diff --git a/funasr/modules/eend_ola/utils/power.py b/funasr/modules/eend_ola/utils/power.py
new file mode 100644
index 000000000..7144e24da
--- /dev/null
+++ b/funasr/modules/eend_ola/utils/power.py
@@ -0,0 +1,95 @@
+import numpy as np
+import torch
+import torch.multiprocessing
+import torch.nn.functional as F
+from itertools import combinations
+from itertools import permutations
+
+
+def generate_mapping_dict(max_speaker_num=6, max_olp_speaker_num=3):
+ all_kinds = []
+ all_kinds.append(0)
+ for i in range(max_olp_speaker_num):
+ selected_num = i + 1
+ coms = np.array(list(combinations(np.arange(max_speaker_num), selected_num)))
+ for com in coms:
+ tmp = np.zeros(max_speaker_num)
+ tmp[com] = 1
+ item = int(raw_dec_trans(tmp.reshape(1, -1), max_speaker_num)[0])
+ all_kinds.append(item)
+ all_kinds_order = sorted(all_kinds)
+
+ mapping_dict = {}
+ mapping_dict['dec2label'] = {}
+ mapping_dict['label2dec'] = {}
+ for i in range(len(all_kinds_order)):
+ dec = all_kinds_order[i]
+ mapping_dict['dec2label'][dec] = i
+ mapping_dict['label2dec'][i] = dec
+ oov_id = len(all_kinds_order)
+ mapping_dict['oov'] = oov_id
+ return mapping_dict
+
+
+def raw_dec_trans(x, max_speaker_num):
+ num_list = []
+ for i in range(max_speaker_num):
+ num_list.append(x[:, i])
+ base = 1
+ T = x.shape[0]
+ res = np.zeros((T))
+ for num in num_list:
+ res += num * base
+ base = base * 2
+ return res
+
+
+def mapping_func(num, mapping_dict):
+ if num in mapping_dict['dec2label'].keys():
+ label = mapping_dict['dec2label'][num]
+ else:
+ label = mapping_dict['oov']
+ return label
+
+
+def dec_trans(x, max_speaker_num, mapping_dict):
+ num_list = []
+ for i in range(max_speaker_num):
+ num_list.append(x[:, i])
+ base = 1
+ T = x.shape[0]
+ res = np.zeros((T))
+ for num in num_list:
+ res += num * base
+ base = base * 2
+ res = np.array([mapping_func(i, mapping_dict) for i in res])
+ return res
+
+
+def create_powerlabel(label, mapping_dict, max_speaker_num=6, max_olp_speaker_num=3):
+ T, C = label.shape
+ padding_label = np.zeros((T, max_speaker_num))
+ padding_label[:, :C] = label
+ out_label = dec_trans(padding_label, max_speaker_num, mapping_dict)
+ out_label = torch.from_numpy(out_label)
+ return out_label
+
+
+def generate_perm_pse(label, n_speaker, mapping_dict, max_speaker_num, max_olp_speaker_num=3):
+ perms = np.array(list(permutations(range(n_speaker)))).astype(np.float32)
+ perms = torch.from_numpy(perms).to(label.device).to(torch.int64)
+ perm_labels = [label[:, perm] for perm in perms]
+ perm_pse_labels = [create_powerlabel(perm_label.cpu().numpy(), mapping_dict, max_speaker_num).
+ to(perm_label.device, non_blocking=True) for perm_label in perm_labels]
+ return perm_labels, perm_pse_labels
+
+
+def generate_min_pse(label, n_speaker, mapping_dict, max_speaker_num, pse_logit, max_olp_speaker_num=3):
+ perm_labels, perm_pse_labels = generate_perm_pse(label, n_speaker, mapping_dict, max_speaker_num,
+ max_olp_speaker_num=max_olp_speaker_num)
+ losses = [F.cross_entropy(input=pse_logit, target=perm_pse_label.to(torch.long)) * len(pse_logit)
+ for perm_pse_label in perm_pse_labels]
+ loss = torch.stack(losses)
+ min_index = torch.argmin(loss)
+ selected_perm_label, selected_pse_label = perm_labels[min_index], perm_pse_labels[min_index]
+ return selected_perm_label, selected_pse_label
diff --git a/funasr/modules/eend_ola/utils/report.py b/funasr/modules/eend_ola/utils/report.py
new file mode 100644
index 000000000..bfccedfe0
--- /dev/null
+++ b/funasr/modules/eend_ola/utils/report.py
@@ -0,0 +1,159 @@
+import copy
+import numpy as np
+import time
+import torch
+from eend.utils.power import create_powerlabel
+from itertools import combinations
+
+metrics = [
+ ('diarization_error', 'speaker_scored', 'DER'),
+ ('speech_miss', 'speech_scored', 'SAD_MR'),
+ ('speech_falarm', 'speech_scored', 'SAD_FR'),
+ ('speaker_miss', 'speaker_scored', 'MI'),
+ ('speaker_falarm', 'speaker_scored', 'FA'),
+ ('speaker_error', 'speaker_scored', 'CF'),
+ ('correct', 'frames', 'accuracy')
+]
+
+
+def recover_prediction(y, n_speaker):
+ if n_speaker <= 1:
+ return y
+ elif n_speaker == 2:
+ com_index = torch.from_numpy(
+ np.array(list(combinations(np.arange(n_speaker), 2)))).to(
+ y.dtype)
+ num_coms = com_index.shape[0]
+ y_single = y[:, :-num_coms]
+ y_olp = y[:, -num_coms:]
+ olp_map_index = torch.where(y_olp > 0.5)
+ olp_map_index = torch.stack(olp_map_index, dim=1)
+ com_map_index = com_index[olp_map_index[:, -1]]
+ speaker_map_index = torch.from_numpy(np.array(com_map_index)).view(-1).to(torch.int64)
+ frame_map_index = olp_map_index[:, 0][:, None].repeat([1, 2]).view(-1).to(
+ torch.int64)
+ y_single[frame_map_index] = 0
+ y_single[frame_map_index, speaker_map_index] = 1
+ return y_single
+ else:
+ olp2_com_index = torch.from_numpy(np.array(list(combinations(np.arange(n_speaker), 2)))).to(y.dtype)
+ olp2_num_coms = olp2_com_index.shape[0]
+ olp3_com_index = torch.from_numpy(np.array(list(combinations(np.arange(n_speaker), 3)))).to(y.dtype)
+ olp3_num_coms = olp3_com_index.shape[0]
+ y_single = y[:, :n_speaker]
+ y_olp2 = y[:, n_speaker:n_speaker + olp2_num_coms]
+ y_olp3 = y[:, -olp3_num_coms:]
+
+ olp3_map_index = torch.where(y_olp3 > 0.5)
+ olp3_map_index = torch.stack(olp3_map_index, dim=1)
+ olp3_com_map_index = olp3_com_index[olp3_map_index[:, -1]]
+ olp3_speaker_map_index = torch.from_numpy(np.array(olp3_com_map_index)).view(-1).to(torch.int64)
+ olp3_frame_map_index = olp3_map_index[:, 0][:, None].repeat([1, 3]).view(-1).to(torch.int64)
+ y_single[olp3_frame_map_index] = 0
+ y_single[olp3_frame_map_index, olp3_speaker_map_index] = 1
+ y_olp2[olp3_frame_map_index] = 0
+
+ olp2_map_index = torch.where(y_olp2 > 0.5)
+ olp2_map_index = torch.stack(olp2_map_index, dim=1)
+ olp2_com_map_index = olp2_com_index[olp2_map_index[:, -1]]
+ olp2_speaker_map_index = torch.from_numpy(np.array(olp2_com_map_index)).view(-1).to(torch.int64)
+ olp2_frame_map_index = olp2_map_index[:, 0][:, None].repeat([1, 2]).view(-1).to(torch.int64)
+ y_single[olp2_frame_map_index] = 0
+ y_single[olp2_frame_map_index, olp2_speaker_map_index] = 1
+ return y_single
+
+
+class PowerReporter():
+ def __init__(self, valid_data_loader, mapping_dict, max_n_speaker):
+ valid_data_loader_cp = copy.deepcopy(valid_data_loader)
+ self.valid_data_loader = valid_data_loader_cp
+ del valid_data_loader
+ self.mapping_dict = mapping_dict
+ self.max_n_speaker = max_n_speaker
+
+ def report(self, model, eidx, device):
+ self.report_val(model, eidx, device)
+
+ def report_val(self, model, eidx, device):
+ model.eval()
+ ud_valid_start = time.time()
+ valid_res, valid_loss, stats_keys, vad_valid_accuracy = self.report_core(model, self.valid_data_loader, device)
+
+ # Epoch Display
+ valid_der = valid_res['diarization_error'] / valid_res['speaker_scored']
+ valid_accuracy = valid_res['correct'].to(torch.float32) / valid_res['frames'] * 100
+ vad_valid_accuracy = vad_valid_accuracy * 100
+ print('Epoch ', eidx + 1, 'Valid Loss ', valid_loss, 'Valid_DER %.5f' % valid_der,
+ 'Valid_Accuracy %.5f%% ' % valid_accuracy, 'VAD_Valid_Accuracy %.5f%% ' % vad_valid_accuracy)
+ ud_valid = (time.time() - ud_valid_start) / 60.
+ print('Valid cost time ... ', ud_valid)
+
+ def inv_mapping_func(self, label, mapping_dict):
+ if not isinstance(label, int):
+ label = int(label)
+ if label in mapping_dict['label2dec'].keys():
+ num = mapping_dict['label2dec'][label]
+ else:
+ num = -1
+ return num
+
+ def report_core(self, model, data_loader, device):
+ res = {}
+ for item in metrics:
+ res[item[0]] = 0.
+ res[item[1]] = 0.
+ with torch.no_grad():
+ loss_s = 0.
+ uidx = 0
+ for xs, ts, orders in data_loader:
+ xs = [x.to(device) for x in xs]
+ ts = [t.to(device) for t in ts]
+ orders = [o.to(device) for o in orders]
+ loss, pit_loss, mpit_loss, att_loss, ys, logits, labels, attractors = model(xs, ts, orders)
+ loss_s += loss.item()
+ uidx += 1
+
+ for logit, t, att in zip(logits, labels, attractors):
+ pred = torch.argmax(torch.softmax(logit, dim=-1), dim=-1) # (T, )
+ oov_index = torch.where(pred == self.mapping_dict['oov'])[0]
+ for i in oov_index:
+ if i > 0:
+ pred[i] = pred[i - 1]
+ else:
+ pred[i] = 0
+ pred = [self.inv_mapping_func(i, self.mapping_dict) for i in pred]
+ decisions = [bin(num)[2:].zfill(self.max_n_speaker)[::-1] for num in pred]
+ decisions = torch.from_numpy(
+ np.stack([np.array([int(i) for i in dec]) for dec in decisions], axis=0)).to(att.device).to(
+ torch.float32)
+ decisions = decisions[:, :att.shape[0]]
+
+ stats = self.calc_diarization_error(decisions, t)
+ res['speaker_scored'] += stats['speaker_scored']
+ res['speech_scored'] += stats['speech_scored']
+ res['frames'] += stats['frames']
+ for item in metrics:
+ res[item[0]] += stats[item[0]]
+ loss_s /= uidx
+ vad_acc = 0
+
+ return res, loss_s, stats.keys(), vad_acc
+
+ def calc_diarization_error(self, decisions, label, label_delay=0):
+ label = label[:len(label) - label_delay, ...]
+ n_ref = torch.sum(label, dim=-1)
+ n_sys = torch.sum(decisions, dim=-1)
+ res = {}
+ res['speech_scored'] = torch.sum(n_ref > 0)
+ res['speech_miss'] = torch.sum((n_ref > 0) & (n_sys == 0))
+ res['speech_falarm'] = torch.sum((n_ref == 0) & (n_sys > 0))
+ res['speaker_scored'] = torch.sum(n_ref)
+ res['speaker_miss'] = torch.sum(torch.max(n_ref - n_sys, torch.zeros_like(n_ref)))
+ res['speaker_falarm'] = torch.sum(torch.max(n_sys - n_ref, torch.zeros_like(n_ref)))
+ n_map = torch.sum(((label == 1) & (decisions == 1)), dim=-1).to(torch.float32)
+ res['speaker_error'] = torch.sum(torch.min(n_ref, n_sys) - n_map)
+ res['correct'] = torch.sum(label == decisions) / label.shape[1]
+ res['diarization_error'] = (
+ res['speaker_miss'] + res['speaker_falarm'] + res['speaker_error'])
+ res['frames'] = len(label)
+ return res
diff --git a/funasr/modules/embedding.py b/funasr/modules/embedding.py
index b61a61a88..e4f9bff03 100644
--- a/funasr/modules/embedding.py
+++ b/funasr/modules/embedding.py
@@ -405,4 +405,13 @@ class SinusoidalPositionEncoder(torch.nn.Module):
positions = torch.arange(1, timesteps+1)[None, :]
position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
- return x + position_encoding
\ No newline at end of file
+ return x + position_encoding
+
+ def forward_chunk(self, x, cache=None):
+ start_idx = 0
+ batch_size, timesteps, input_dim = x.size()
+ if cache is not None:
+ start_idx = cache["start_idx"]
+ positions = torch.arange(1, timesteps+start_idx+1)[None, :]
+ position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
+ return x + position_encoding[:, start_idx: start_idx + timesteps]
diff --git a/funasr/runtime/grpc/CMakeLists.txt b/funasr/runtime/grpc/CMakeLists.txt
new file mode 100644
index 000000000..56e307482
--- /dev/null
+++ b/funasr/runtime/grpc/CMakeLists.txt
@@ -0,0 +1,83 @@
+# Copyright 2018 gRPC authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# cmake build file for C++ paraformer example.
+# Assumes protobuf and gRPC have been installed using cmake.
+# See cmake_externalproject/CMakeLists.txt for all-in-one cmake build
+# that automatically builds all the dependencies before building paraformer.
+
+cmake_minimum_required(VERSION 3.10)
+
+project(ASR C CXX)
+
+include(common.cmake)
+
+# Proto file
+get_filename_component(rg_proto "../python/grpc/proto/paraformer.proto" ABSOLUTE)
+get_filename_component(rg_proto_path "${rg_proto}" PATH)
+
+# Generated sources
+set(rg_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.pb.cc")
+set(rg_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.pb.h")
+set(rg_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.grpc.pb.cc")
+set(rg_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.grpc.pb.h")
+add_custom_command(
+ OUTPUT "${rg_proto_srcs}" "${rg_proto_hdrs}" "${rg_grpc_srcs}" "${rg_grpc_hdrs}"
+ COMMAND ${_PROTOBUF_PROTOC}
+ ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+ --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+ -I "${rg_proto_path}"
+ --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+ "${rg_proto}"
+ DEPENDS "${rg_proto}")
+
+
+# Include generated *.pb.h files
+include_directories("${CMAKE_CURRENT_BINARY_DIR}")
+
+include_directories(../onnxruntime/include/)
+link_directories(../onnxruntime/build/src/)
+link_directories(../onnxruntime/build/third_party/webrtc/)
+
+link_directories(${ONNXRUNTIME_DIR}/lib)
+add_subdirectory("../onnxruntime/src" onnx_src)
+
+# rg_grpc_proto
+add_library(rg_grpc_proto
+ ${rg_grpc_srcs}
+ ${rg_grpc_hdrs}
+ ${rg_proto_srcs}
+ ${rg_proto_hdrs})
+
+
+
+target_link_libraries(rg_grpc_proto
+ ${_REFLECTION}
+ ${_GRPC_GRPCPP}
+ ${_PROTOBUF_LIBPROTOBUF})
+
+# Targets paraformer_(server)
+foreach(_target
+ paraformer_server)
+ add_executable(${_target}
+ "${_target}.cc")
+ target_link_libraries(${_target}
+ rg_grpc_proto
+ rapidasr
+ webrtcvad
+ ${EXTRA_LIBS}
+ ${_REFLECTION}
+ ${_GRPC_GRPCPP}
+ ${_PROTOBUF_LIBPROTOBUF})
+endforeach()
diff --git a/funasr/runtime/grpc/Readme.md b/funasr/runtime/grpc/Readme.md
new file mode 100644
index 000000000..80e55aab2
--- /dev/null
+++ b/funasr/runtime/grpc/Readme.md
@@ -0,0 +1,57 @@
+## paraformer grpc onnx server in c++
+
+
+#### Step 1. Build ../onnxruntime as its documentation describes
+```
+#put the onnx lib & onnx asr model & vocab.txt into /path/to/asrmodel (e.g. /data/asrmodel)
+ls /data/asrmodel/
+onnxruntime-linux-x64-1.14.0 speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
+
+file /data/asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/vocab.txt
+UTF-8 Unicode text
+```
+
+#### Step 2. Compile and install grpc v1.52.0 to avoid known bugs in older grpc versions
+```
+export GRPC_INSTALL_DIR=/data/soft/grpc
+export PKG_CONFIG_PATH=$GRPC_INSTALL_DIR/lib/pkgconfig
+
+git clone -b v1.52.0 --depth=1 https://github.com/grpc/grpc.git
+cd grpc
+git submodule update --init --recursive
+
+mkdir -p cmake/build
+pushd cmake/build
+cmake -DgRPC_INSTALL=ON \
+ -DgRPC_BUILD_TESTS=OFF \
+ -DCMAKE_INSTALL_PREFIX=$GRPC_INSTALL_DIR \
+ ../..
+make
+make install
+popd
+
+echo "export GRPC_INSTALL_DIR=/data/soft/grpc" >> ~/.bashrc
+echo "export PKG_CONFIG_PATH=\$GRPC_INSTALL_DIR/lib/pkgconfig" >> ~/.bashrc
+echo "export PATH=\$GRPC_INSTALL_DIR/bin/:\$PKG_CONFIG_PATH:\$PATH" >> ~/.bashrc
+source ~/.bashrc
+```
+
+#### Step 3. Compile and start grpc onnx paraformer server
+```
+# set -DONNXRUNTIME_DIR=/path/to/asrmodel/onnxruntime-linux-x64-1.14.0
+./rebuild.sh
+```
+
+#### Step 4. Start grpc paraformer server
+```
+Usage: ./cmake/build/paraformer_server port thread_num /path/to/model_file
+./cmake/build/paraformer_server 10108 4 /data/asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
+```
+
+
+
+#### Step 5. Start grpc python paraformer client on PC with MIC
+```
+cd ../python/grpc
+python grpc_main_client_mic.py --host $server_ip --port 10108
+```
diff --git a/funasr/runtime/grpc/common.cmake b/funasr/runtime/grpc/common.cmake
new file mode 100644
index 000000000..1326a5be3
--- /dev/null
+++ b/funasr/runtime/grpc/common.cmake
@@ -0,0 +1,125 @@
+# Copyright 2018 gRPC authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# cmake build file for C++ route_guide example.
+# Assumes protobuf and gRPC have been installed using cmake.
+# See cmake_externalproject/CMakeLists.txt for all-in-one cmake build
+# that automatically builds all the dependencies before building route_guide.
+
+cmake_minimum_required(VERSION 3.5.1)
+
+if (NOT DEFINED CMAKE_CXX_STANDARD)
+ set (CMAKE_CXX_STANDARD 14)
+endif()
+
+if(MSVC)
+ add_definitions(-D_WIN32_WINNT=0x600)
+endif()
+
+find_package(Threads REQUIRED)
+
+if(GRPC_AS_SUBMODULE)
+ # One way to build a projects that uses gRPC is to just include the
+ # entire gRPC project tree via "add_subdirectory".
+  # This approach is very simple to use, but there are some potential
+ # disadvantages:
+ # * it includes gRPC's CMakeLists.txt directly into your build script
+  # without modification, and that can make gRPC's internal settings interfere with your
+ # own build.
+ # * depending on what's installed on your system, the contents of submodules
+ # in gRPC's third_party/* might need to be available (and there might be
+ # additional prerequisites required to build them). Consider using
+ # the gRPC_*_PROVIDER options to fine-tune the expected behavior.
+ #
+ # A more robust approach to add dependency on gRPC is using
+ # cmake's ExternalProject_Add (see cmake_externalproject/CMakeLists.txt).
+
+ # Include the gRPC's cmake build (normally grpc source code would live
+ # in a git submodule called "third_party/grpc", but this example lives in
+ # the same repository as gRPC sources, so we just look a few directories up)
+ add_subdirectory(../../.. ${CMAKE_CURRENT_BINARY_DIR}/grpc EXCLUDE_FROM_ALL)
+ message(STATUS "Using gRPC via add_subdirectory.")
+
+ # After using add_subdirectory, we can now use the grpc targets directly from
+ # this build.
+ set(_PROTOBUF_LIBPROTOBUF libprotobuf)
+ set(_REFLECTION grpc++_reflection)
+ if(CMAKE_CROSSCOMPILING)
+ find_program(_PROTOBUF_PROTOC protoc)
+ else()
+ set(_PROTOBUF_PROTOC $)
+ endif()
+ set(_GRPC_GRPCPP grpc++)
+ if(CMAKE_CROSSCOMPILING)
+ find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+ else()
+ set(_GRPC_CPP_PLUGIN_EXECUTABLE $)
+ endif()
+elseif(GRPC_FETCHCONTENT)
+ # Another way is to use CMake's FetchContent module to clone gRPC at
+ # configure time. This makes gRPC's source code available to your project,
+ # similar to a git submodule.
+ message(STATUS "Using gRPC via add_subdirectory (FetchContent).")
+ include(FetchContent)
+ FetchContent_Declare(
+ grpc
+ GIT_REPOSITORY https://github.com/grpc/grpc.git
+ # when using gRPC, you will actually set this to an existing tag, such as
+ # v1.25.0, v1.26.0 etc..
+ # For the purpose of testing, we override the tag used to the commit
+ # that's currently under test.
+ GIT_TAG vGRPC_TAG_VERSION_OF_YOUR_CHOICE)
+ FetchContent_MakeAvailable(grpc)
+
+ # Since FetchContent uses add_subdirectory under the hood, we can use
+ # the grpc targets directly from this build.
+ set(_PROTOBUF_LIBPROTOBUF libprotobuf)
+ set(_REFLECTION grpc++_reflection)
+ set(_PROTOBUF_PROTOC $)
+ set(_GRPC_GRPCPP grpc++)
+ if(CMAKE_CROSSCOMPILING)
+ find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+ else()
+ set(_GRPC_CPP_PLUGIN_EXECUTABLE $)
+ endif()
+else()
+ # This branch assumes that gRPC and all its dependencies are already installed
+ # on this system, so they can be located by find_package().
+
+ # Find Protobuf installation
+ # Looks for protobuf-config.cmake file installed by Protobuf's cmake installation.
+ set(protobuf_MODULE_COMPATIBLE TRUE)
+ find_package(Protobuf CONFIG REQUIRED)
+ message(STATUS "Using protobuf ${Protobuf_VERSION}")
+
+ set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf)
+ set(_REFLECTION gRPC::grpc++_reflection)
+ if(CMAKE_CROSSCOMPILING)
+ find_program(_PROTOBUF_PROTOC protoc)
+ else()
+ set(_PROTOBUF_PROTOC $)
+ endif()
+
+ # Find gRPC installation
+ # Looks for gRPCConfig.cmake file installed by gRPC's cmake installation.
+ find_package(gRPC CONFIG REQUIRED)
+ message(STATUS "Using gRPC ${gRPC_VERSION}")
+
+ set(_GRPC_GRPCPP gRPC::grpc++)
+ if(CMAKE_CROSSCOMPILING)
+ find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+ else()
+ set(_GRPC_CPP_PLUGIN_EXECUTABLE $)
+ endif()
+endif()
diff --git a/funasr/runtime/grpc/paraformer_server.cc b/funasr/runtime/grpc/paraformer_server.cc
new file mode 100644
index 000000000..e5814a56c
--- /dev/null
+++ b/funasr/runtime/grpc/paraformer_server.cc
@@ -0,0 +1,195 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "paraformer.grpc.pb.h"
+#include "paraformer_server.h"
+
+
+using grpc::Server;
+using grpc::ServerBuilder;
+using grpc::ServerContext;
+using grpc::ServerReader;
+using grpc::ServerReaderWriter;
+using grpc::ServerWriter;
+using grpc::Status;
+
+
+using paraformer::Request;
+using paraformer::Response;
+using paraformer::ASR;
+
+ASRServicer::ASRServicer(const char* model_path, int thread_num) {
+ AsrHanlde=RapidAsrInit(model_path, thread_num);
+ std::cout << "ASRServicer init" << std::endl;
+ init_flag = 0;
+}
+
+void ASRServicer::clear_states(const std::string& user) {
+ clear_buffers(user);
+ clear_transcriptions(user);
+}
+
+void ASRServicer::clear_buffers(const std::string& user) {
+ if (client_buffers.count(user)) {
+ client_buffers.erase(user);
+ }
+}
+
+void ASRServicer::clear_transcriptions(const std::string& user) {
+ if (client_transcription.count(user)) {
+ client_transcription.erase(user);
+ }
+}
+
+void ASRServicer::disconnect(const std::string& user) {
+ clear_states(user);
+ std::cout << "Disconnecting user: " << user << std::endl;
+}
+
+grpc::Status ASRServicer::Recognize(
+ grpc::ServerContext* context,
+ grpc::ServerReaderWriter* stream) {
+
+ Request req;
+ while (stream->Read(&req)) {
+ if (req.isend()) {
+ std::cout << "asr end" << std::endl;
+ disconnect(req.user());
+ Response res;
+ res.set_sentence(
+ R"({"success": true, "detail": "asr end"})"
+ );
+ res.set_user(req.user());
+ res.set_action("terminate");
+ res.set_language(req.language());
+ stream->Write(res);
+ } else if (req.speaking()) {
+ if (req.audio_data().size() > 0) {
+ auto& buf = client_buffers[req.user()];
+ buf.insert(buf.end(), req.audio_data().begin(), req.audio_data().end());
+ }
+ Response res;
+ res.set_sentence(
+ R"({"success": true, "detail": "speaking"})"
+ );
+ res.set_user(req.user());
+ res.set_action("speaking");
+ res.set_language(req.language());
+ stream->Write(res);
+ } else if (!req.speaking()) {
+ if (client_buffers.count(req.user()) == 0) {
+ Response res;
+ res.set_sentence(
+ R"({"success": true, "detail": "waiting_for_voice"})"
+ );
+ res.set_user(req.user());
+ res.set_action("waiting");
+ res.set_language(req.language());
+ stream->Write(res);
+ }else {
+ auto begin_time = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count();
+ std::string tmp_data = this->client_buffers[req.user()];
+ this->clear_states(req.user());
+
+ Response res;
+ res.set_sentence(
+ R"({"success": true, "detail": "decoding data: " + std::to_string(tmp_data.length()) + " bytes"})"
+ );
+ int data_len_int = tmp_data.length();
+ std::string data_len = std::to_string(data_len_int);
+ std::stringstream ss;
+ ss << R"({"success": true, "detail": "decoding data: )" << data_len << R"( bytes")" << R"("})";
+ std::string result = ss.str();
+ res.set_sentence(result);
+ res.set_user(req.user());
+ res.set_action("decoding");
+ res.set_language(req.language());
+ stream->Write(res);
+ if (tmp_data.length() < 800) { //min input_len for asr model
+ auto end_time = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count();
+ std::string delay_str = std::to_string(end_time - begin_time);
+ std::cout << "user: " << req.user() << " , delay(ms): " << delay_str << ", error: data_is_not_long_enough" << std::endl;
+ Response res;
+ std::stringstream ss;
+ std::string asr_result = "";
+ ss << R"({"success": true, "detail": "finish_sentence","server_delay_ms":)" << delay_str << R"(,"text":")" << asr_result << R"("})";
+ std::string result = ss.str();
+ res.set_sentence(result);
+ res.set_user(req.user());
+ res.set_action("finish");
+ res.set_language(req.language());
+
+
+
+ stream->Write(res);
+ }
+ else {
+ RPASR_RESULT Result= RapidAsrRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL);
+ std::string asr_result = ((RPASR_RECOG_RESULT*)Result)->msg;
+
+ auto end_time = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count();
+ std::string delay_str = std::to_string(end_time - begin_time);
+
+ std::cout << "user: " << req.user() << " , delay(ms): " << delay_str << ", text: " << asr_result << std::endl;
+ Response res;
+ std::stringstream ss;
+ ss << R"({"success": true, "detail": "finish_sentence","server_delay_ms":)" << delay_str << R"(,"text":")" << asr_result << R"("})";
+ std::string result = ss.str();
+ res.set_sentence(result);
+ res.set_user(req.user());
+ res.set_action("finish");
+ res.set_language(req.language());
+
+
+ stream->Write(res);
+ }
+ }
+ }else {
+ Response res;
+ res.set_sentence(
+ R"({"success": false, "detail": "error, no condition matched! Unknown reason."})"
+ );
+ res.set_user(req.user());
+ res.set_action("terminate");
+ res.set_language(req.language());
+ stream->Write(res);
+ }
+ }
+ return Status::OK;
+}
+
+
+void RunServer(const std::string& port, int thread_num, const char* model_path) {
+ std::string server_address;
+ server_address = "0.0.0.0:" + port;
+ ASRServicer service(model_path, thread_num);
+
+ ServerBuilder builder;
+ builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
+ builder.RegisterService(&service);
+ std::unique_ptr server(builder.BuildAndStart());
+ std::cout << "Server listening on " << server_address << std::endl;
+ server->Wait();
+}
+
+int main(int argc, char* argv[]) {
+ if (argc < 3)
+ {
+ printf("Usage: %s port thread_num /path/to/model_file\n", argv[0]);
+ exit(-1);
+ }
+
+ RunServer(argv[1], atoi(argv[2]), argv[3]);
+ return 0;
+}
diff --git a/funasr/runtime/grpc/paraformer_server.h b/funasr/runtime/grpc/paraformer_server.h
new file mode 100644
index 000000000..f356d9413
--- /dev/null
+++ b/funasr/runtime/grpc/paraformer_server.h
@@ -0,0 +1,56 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "paraformer.grpc.pb.h"
+#include "librapidasrapi.h"
+
+
+using grpc::Server;
+using grpc::ServerBuilder;
+using grpc::ServerContext;
+using grpc::ServerReader;
+using grpc::ServerReaderWriter;
+using grpc::ServerWriter;
+using grpc::Status;
+
+
+using paraformer::Request;
+using paraformer::Response;
+using paraformer::ASR;
+
+typedef struct
+{
+ std::string msg;
+ float snippet_time;
+}RPASR_RECOG_RESULT;
+
+
+class ASRServicer final : public ASR::Service {
+ private:
+ int init_flag;
+ std::unordered_map client_buffers;
+ std::unordered_map client_transcription;
+
+ public:
+ ASRServicer(const char* model_path, int thread_num);
+ void clear_states(const std::string& user);
+ void clear_buffers(const std::string& user);
+ void clear_transcriptions(const std::string& user);
+ void disconnect(const std::string& user);
+ grpc::Status Recognize(grpc::ServerContext* context, grpc::ServerReaderWriter* stream);
+ RPASR_HANDLE AsrHanlde;
+
+};
diff --git a/funasr/runtime/grpc/rebuild.sh b/funasr/runtime/grpc/rebuild.sh
new file mode 100644
index 000000000..9b41ed6d1
--- /dev/null
+++ b/funasr/runtime/grpc/rebuild.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+rm cmake -rf
+mkdir -p cmake/build
+
+cd cmake/build
+
+cmake -DCMAKE_BUILD_TYPE=release ../.. -DONNXRUNTIME_DIR=/data/asrmodel/onnxruntime-linux-x64-1.14.0
+make
+
+
+echo "Build cmake/build/paraformer_server successfully!"
diff --git a/funasr/runtime/onnxruntime/CMakeLists.txt b/funasr/runtime/onnxruntime/CMakeLists.txt
new file mode 100644
index 000000000..8d502c4be
--- /dev/null
+++ b/funasr/runtime/onnxruntime/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 3.10)
+
+#-DONNXRUNTIME_DIR=D:\thirdpart\onnxruntime
+project(FastASR)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+# for onnxruntime
+
+IF(WIN32)
+
+
+ if(CMAKE_CL_64)
+ link_directories(${ONNXRUNTIME_DIR}\\lib)
+ else()
+ add_definitions(-D_WIN_X86)
+ endif()
+ELSE()
+
+
+link_directories(${ONNXRUNTIME_DIR}/lib)
+
+endif()
+
+#option(FASTASR_BUILD_PYTHON_MODULE "build python module, using FastASR in Python" OFF)
+
+add_subdirectory("./third_party/webrtc")
+add_subdirectory(src)
+add_subdirectory(tester)
diff --git a/funasr/runtime/onnxruntime/CMakeSettings.json b/funasr/runtime/onnxruntime/CMakeSettings.json
new file mode 100644
index 000000000..2eb6c5a26
--- /dev/null
+++ b/funasr/runtime/onnxruntime/CMakeSettings.json
@@ -0,0 +1,44 @@
+{
+ "configurations": [
+ {
+ "name": "x64-Debug",
+ "generator": "Ninja",
+ "configurationType": "Debug",
+ "inheritEnvironments": [ "msvc_x64_x64" ],
+ "buildRoot": "${projectDir}\\out\\build\\${name}",
+ "installRoot": "${projectDir}\\out\\install\\${name}",
+ "buildCommandArgs": "",
+ "ctestCommandArgs": ""
+ },
+ {
+ "name": "x64-Release",
+ "generator": "Ninja",
+ "configurationType": "RelWithDebInfo",
+ "buildRoot": "${projectDir}\\out\\build\\${name}",
+ "installRoot": "${projectDir}\\out\\install\\${name}",
+ "cmakeCommandArgs": "",
+ "buildCommandArgs": "",
+ "ctestCommandArgs": "",
+ "inheritEnvironments": [ "msvc_x64_x64" ]
+ },
+ {
+ "name": "Linux-GCC-Debug",
+ "generator": "Unix Makefiles",
+ "configurationType": "Debug",
+ "cmakeExecutable": "cmake",
+ "remoteCopySourcesExclusionList": [ ".vs", ".git", "out" ],
+ "cmakeCommandArgs": "-DONNXRUNTIME_DIR=/data/linux/thirdpart/onnxruntime-linux-x64-1.14.1",
+ "buildCommandArgs": "",
+ "ctestCommandArgs": "",
+ "inheritEnvironments": [ "linux_x64" ],
+ "remoteMachineName": "${defaultRemoteMachineName}",
+ "remoteCMakeListsRoot": "$HOME/.vs/${projectDirName}/${workspaceHash}/src",
+ "remoteBuildRoot": "$HOME/.vs/${projectDirName}/${workspaceHash}/out/build/${name}",
+ "remoteInstallRoot": "$HOME/.vs/${projectDirName}/${workspaceHash}/out/install/${name}",
+ "remoteCopySources": true,
+ "rsyncCommandArgs": "-t --delete",
+ "remoteCopyBuildOutput": false,
+ "remoteCopySourcesMethod": "rsync"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/images/demo.png b/funasr/runtime/onnxruntime/images/demo.png
new file mode 100644
index 000000000..03171b26b
Binary files /dev/null and b/funasr/runtime/onnxruntime/images/demo.png differ
diff --git a/funasr/runtime/onnxruntime/images/threadnum.png b/funasr/runtime/onnxruntime/images/threadnum.png
new file mode 100644
index 000000000..dd70cc840
Binary files /dev/null and b/funasr/runtime/onnxruntime/images/threadnum.png differ
diff --git a/funasr/runtime/onnxruntime/include/Audio.h b/funasr/runtime/onnxruntime/include/Audio.h
new file mode 100644
index 000000000..da5e82cc7
--- /dev/null
+++ b/funasr/runtime/onnxruntime/include/Audio.h
@@ -0,0 +1,59 @@
+
+#ifndef AUDIO_H
+#define AUDIO_H
+
+#include
+#include
+#include
+
+using namespace std;
+
+class AudioFrame {
+ private:
+ int start;
+ int end;
+ int len;
+
+ public:
+ AudioFrame();
+ AudioFrame(int len);
+
+ ~AudioFrame();
+ int set_start(int val);
+ int set_end(int val, int max_len);
+ int get_start();
+ int get_len();
+ int disp();
+};
+
+class Audio {
+ private:
+ float *speech_data;
+ int16_t *speech_buff;
+ int speech_len;
+ int speech_align_len;
+ int16_t sample_rate;
+ int offset;
+ float align_size;
+ int data_type;
+ queue frame_queue;
+
+ public:
+ Audio(int data_type);
+ Audio(int data_type, int size);
+ ~Audio();
+ void disp();
+ bool loadwav(const char* filename);
+ bool loadwav(const char* buf, int nLen);
+ bool loadpcmwav(const char* buf, int nFileLen);
+ bool loadpcmwav(const char* filename);
+ int fetch_chunck(float *&dout, int len);
+ int fetch(float *&dout, int &len, int &flag);
+ void padding();
+ void split();
+ float get_time_len();
+
+ int get_queue_size() { return (int)frame_queue.size(); }
+};
+
+#endif
diff --git a/funasr/runtime/onnxruntime/include/ComDefine.h b/funasr/runtime/onnxruntime/include/ComDefine.h
new file mode 100644
index 000000000..f131e5ec3
--- /dev/null
+++ b/funasr/runtime/onnxruntime/include/ComDefine.h
@@ -0,0 +1,11 @@
+
+#ifndef COMDEFINE_H
+#define COMDEFINE_H
+
+#define S_BEGIN 0
+#define S_MIDDLE 1
+#define S_END 2
+#define S_ALL 3
+#define S_ERR 4
+
+#endif
diff --git a/funasr/runtime/onnxruntime/include/Model.h b/funasr/runtime/onnxruntime/include/Model.h
new file mode 100644
index 000000000..06267cb30
--- /dev/null
+++ b/funasr/runtime/onnxruntime/include/Model.h
@@ -0,0 +1,17 @@
+
+#ifndef MODEL_H
+#define MODEL_H
+
+#include
+
+class Model {
+ public:
+ virtual ~Model(){};
+ virtual void reset() = 0;
+ virtual std::string forward_chunk(float *din, int len, int flag) = 0;
+ virtual std::string forward(float *din, int len, int flag) = 0;
+ virtual std::string rescoring() = 0;
+};
+
+Model *create_model(const char *path,int nThread=0);
+#endif
diff --git a/funasr/runtime/onnxruntime/include/librapidasrapi.h b/funasr/runtime/onnxruntime/include/librapidasrapi.h
new file mode 100644
index 000000000..a83098f93
--- /dev/null
+++ b/funasr/runtime/onnxruntime/include/librapidasrapi.h
@@ -0,0 +1,96 @@
+#pragma once
+
+
+#ifdef WIN32
+
+
+#ifdef _RPASR_API_EXPORT
+
+#define _RAPIDASRAPI __declspec(dllexport)
+#else
+#define _RAPIDASRAPI __declspec(dllimport)
+#endif
+
+
+#else
+#define _RAPIDASRAPI
+#endif
+
+
+
+
+
+#ifndef _WIN32
+
+#define RPASR_CALLBCK_PREFIX __attribute__((__stdcall__))
+
+#else
+#define RPASR_CALLBCK_PREFIX __stdcall
+#endif
+
+
+#ifdef __cplusplus
+
+extern "C" {
+#endif
+
+typedef void* RPASR_HANDLE;
+
+typedef void* RPASR_RESULT;
+
+typedef unsigned char RPASR_BOOL;
+
+#define RPASR_TRUE 1
+#define RPASR_FALSE 0
+#define QM_DEFAULT_THREAD_NUM 4
+
+
+typedef enum
+{
+ RASR_NONE=-1,
+ RASRM_CTC_GREEDY_SEARCH=0,
+ RASRM_CTC_RPEFIX_BEAM_SEARCH = 1,
+ RASRM_ATTENSION_RESCORING = 2,
+
+}RPASR_MODE;
+
+typedef enum {
+
+ RPASR_MODEL_PADDLE = 0,
+ RPASR_MODEL_PADDLE_2 = 1,
+ RPASR_MODEL_K2 = 2,
+ RPASR_MODEL_PARAFORMER = 3,
+
+}RPASR_MODEL_TYPE;
+
+
+typedef void (* QM_CALLBACK)(int nCurStep, int nTotal); // nTotal: total steps; nCurStep: Current Step.
+
+ // APIs for qmasr
+
+_RAPIDASRAPI RPASR_HANDLE RapidAsrInit(const char* szModelDir, int nThread);
+
+
+
+// if not give a fnCallback ,it should be NULL
+_RAPIDASRAPI RPASR_RESULT RapidAsrRecogBuffer(RPASR_HANDLE handle, const char* szBuf, int nLen, RPASR_MODE Mode, QM_CALLBACK fnCallback);
+_RAPIDASRAPI RPASR_RESULT RapidAsrRecogPCMBuffer(RPASR_HANDLE handle, const char* szBuf, int nLen, RPASR_MODE Mode, QM_CALLBACK fnCallback);
+
+_RAPIDASRAPI RPASR_RESULT RapidAsrRecogPCMFile(RPASR_HANDLE handle, const char* szFileName, RPASR_MODE Mode, QM_CALLBACK fnCallback);
+
+_RAPIDASRAPI RPASR_RESULT RapidAsrRecogFile(RPASR_HANDLE handle, const char* szWavfile, RPASR_MODE Mode, QM_CALLBACK fnCallback);
+
+_RAPIDASRAPI const char* RapidAsrGetResult(RPASR_RESULT Result,int nIndex);
+
+_RAPIDASRAPI const int RapidAsrGetRetNumber(RPASR_RESULT Result);
+_RAPIDASRAPI void RapidAsrFreeResult(RPASR_RESULT Result);
+
+
+_RAPIDASRAPI void RapidAsrUninit(RPASR_HANDLE Handle);
+
+_RAPIDASRAPI const float RapidAsrGetRetSnippetTime(RPASR_RESULT Result);
+
+#ifdef __cplusplus
+
+}
+#endif
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/include/webrtc_vad.h b/funasr/runtime/onnxruntime/include/webrtc_vad.h
new file mode 100644
index 000000000..f5bbadf5b
--- /dev/null
+++ b/funasr/runtime/onnxruntime/include/webrtc_vad.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * This header file includes the VAD API calls. Specific function calls are
+ * given below.
+ */
+
+#ifndef COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_ // NOLINT
+#define COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_
+
+#include
+#include
+
+typedef struct WebRtcVadInst VadInst;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Creates an instance to the VAD structure.
+VadInst* WebRtcVad_Create(void);
+
+// Frees the dynamic memory of a specified VAD instance.
+//
+// - handle [i] : Pointer to VAD instance that should be freed.
+void WebRtcVad_Free(VadInst* handle);
+
+// Initializes a VAD instance.
+//
+// - handle [i/o] : Instance that should be initialized.
+//
+// returns : 0 - (OK),
+// -1 - (null pointer or Default mode could not be set).
+int WebRtcVad_Init(VadInst* handle);
+
+// Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
+// restrictive in reporting speech. Put in other words the probability of being
+// speech when the VAD returns 1 is increased with increasing mode. As a
+// consequence also the missed detection rate goes up.
+//
+// - handle [i/o] : VAD instance.
+// - mode [i] : Aggressiveness mode (0, 1, 2, or 3).
+//
+// returns : 0 - (OK),
+// -1 - (null pointer, mode could not be set or the VAD instance
+// has not been initialized).
+int WebRtcVad_set_mode(VadInst* handle, int mode);
+
+// Calculates a VAD decision for the |audio_frame|. For valid sampling rates
+// frame lengths, see the description of WebRtcVad_ValidRatesAndFrameLengths().
+//
+// - handle [i/o] : VAD Instance. Needs to be initialized by
+// WebRtcVad_Init() before call.
+// - fs [i] : Sampling frequency (Hz): 8000, 16000, or 32000
+// - audio_frame [i] : Audio frame buffer.
+// - frame_length [i] : Length of audio frame buffer in number of samples.
+//
+// returns : 1 - (Active Voice),
+// 0 - (Non-active Voice),
+// -1 - (Error)
+int WebRtcVad_Process(VadInst* handle,
+ int fs,
+ const int16_t* audio_frame,
+ size_t frame_length);
+
+// Checks for valid combinations of |rate| and |frame_length|. We support 10,
+// 20 and 30 ms frames and the rates 8000, 16000 and 32000 Hz.
+//
+// - rate [i] : Sampling frequency (Hz).
+// - frame_length [i] : Speech frame buffer length in number of samples.
+//
+// returns : 0 - (valid combination), -1 - (invalid combination)
+int WebRtcVad_ValidRateAndFrameLength(int rate, size_t frame_length);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_ // NOLINT
diff --git a/funasr/runtime/onnxruntime/include/win_func.h b/funasr/runtime/onnxruntime/include/win_func.h
new file mode 100644
index 000000000..1baaae53f
--- /dev/null
+++ b/funasr/runtime/onnxruntime/include/win_func.h
@@ -0,0 +1,28 @@
+#include
+#ifdef WIN32
+#include
+#else
+#include
+#endif
+#ifdef WIN32
+int gettimeofday(struct timeval* tp, void* tzp)
+{
+ time_t clock;
+ struct tm tm;
+ SYSTEMTIME wtm;
+
+ GetLocalTime(&wtm);
+ tm.tm_year = wtm.wYear - 1900;
+ tm.tm_mon = wtm.wMonth - 1;
+ tm.tm_mday = wtm.wDay;
+ tm.tm_hour = wtm.wHour;
+ tm.tm_min = wtm.wMinute;
+ tm.tm_sec = wtm.wSecond;
+ tm.tm_isdst = -1;
+
+ clock = mktime(&tm);
+ tp->tv_sec = clock;
+ tp->tv_usec = wtm.wMilliseconds * 1000;
+ return (0);
+}
+#endif
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/models/readme.md b/funasr/runtime/onnxruntime/models/readme.md
new file mode 100644
index 000000000..732ef0124
--- /dev/null
+++ b/funasr/runtime/onnxruntime/models/readme.md
@@ -0,0 +1 @@
+Place model.onnx here!
diff --git a/funasr/runtime/onnxruntime/models/vocab.txt b/funasr/runtime/onnxruntime/models/vocab.txt
new file mode 100644
index 000000000..61cb04ea8
--- /dev/null
+++ b/funasr/runtime/onnxruntime/models/vocab.txt
@@ -0,0 +1,8404 @@
+
+
+
+and@@
+筑
+陨
+眺
+塘
+檩
+衷
+氧
+孔
+阖
+邠
+坎
+喵
+曰
+鼠
+隐
+腊
+族
+矧
+敉
+俜
+似
+怫
+塔
+price
+春
+罍
+娅
+棉
+弃
+茱
+应
+汈
+擦
+贺
+鹇
+these
+迅
+诬
+do@@
+盍
+秕
+啃
+颟
+辑
+彘
+ps
+斜
+瞭
+铟
+漭
+蹇
+旆
+窳
+臊
+览
+嘿
+淖
+尴
+袆
+斧
+筹
+媵
+挞
+臧
+齐
+璨
+笥
+滂
+即
+愔
+思
+gr@@
+幅
+祛
+箬
+礁
+茅
+北
+澡
+俭
+蘅
+ing
+肺
+肢
+巢
+九
+蠓
+路
+藻
+沱
+ness
+璐
+积
+寞
+栳
+舆
+医
+眷
+岳
+勘
+璃
+黔
+犇
+哎
+罡
+k
+丝
+de
+跣
+梦
+需
+毅
+峡
+竞
+砦
+研
+眙
+滋
+鹳
+肝
+阼
+per
+忱
+乏
+废
+邦
+輶
+驯
+夫
+寳
+忪
+崑
+睾
+逗
+峰
+越
+狗
+蟒
+笆
+适
+洇
+缶
+ore
+辎
+粤
+蹴
+黄
+浞
+comp@@
+犀
+藏
+本
+嗖
+黻
+這
+绰
+鉏
+麼
+喆
+袪
+刚
+侦
+ic@@
+骧
+瓫
+柤
+桃
+鲴
+褙
+韵
+妓
+甍
+to@@
+轱
+塑
+坯
+貌
+n@@
+蕞
+疡
+伏
+酉
+暇
+霖
+了
+萸
+嘶
+nu@@
+挼
+by
+聩
+袓
+嶓
+桎
+抖
+攥
+鞬
+毳
+旗
+庸
+呋
+诲
+tting
+狭
+魄
+伎
+喋
+樱
+翎
+怯
+场
+睽
+盒
+times
+鲳
+爆
+绵
+皋
+尢
+嘎
+渝
+迂
+嘁
+袷
+始
+奚
+台
+禄
+挢
+座
+绀
+漱
+龃
+tu@@
+榫
+诛
+minu@@
+萼
+裛
+玠
+谵
+亳
+副
+any@@
+诊
+唑
+頫
+斿
+赁
+骇
+训
+母
+床
+微
+椰
+迢
+埃
+辏
+汩
+叟
+辔
+隙
+遵
+pp@@
+翕
+佬
+栖
+踉
+皕
+苏
+痂
+奋
+阄
+悌
+点
+碡
+茎
+睫
+闫
+few
+籁
+孰
+拥
+for
+曾
+疲
+辞
+赖
+姆
+与
+诰
+怨
+沙
+with@@
+睱
+谓
+晰
+嗌
+id
+aga@@
+実
+魏
+鲟
+寅
+滗
+珲
+腑
+冠
+夺
+娶
+宇
+侩
+筘
+an
+磺
+邛
+着
+踵
+ite
+狝
+ele@@
+蓼
+猿
+豆
+蔷
+沽
+去
+铥
+癀
+站
+甪
+璧
+范
+哓
+菏
+龠
+岷
+嫉
+拧
+札
+戒
+琏
+绪
+澌
+楠
+莱
+cer@@
+here
+别
+帻
+嗉
+假
+拽
+髭
+穰
+勋
+栓
+塬
+ou@@
+橫
+刻
+侣
+鎉
+bre@@
+趸
+稳
+岌
+拎
+落
+岙
+氨
+桴
+鬶
+clu@@
+蚣
+肌
+讹
+骼
+忧
+雹
+0@@
+算
+腔
+璇
+酣
+锭
+蟾
+逦
+椟
+频
+ts
+矫
+拨
+珅
+侨
+蚨
+皯
+翛
+儋
+恳
+瀍
+敌
+砬
+奁
+耵
+烂
+绿
+缆
+辣
+time
+蔗
+too
+圾
+骞
+慥
+啶
+帔
+楢
+bi@@
+蚤
+浛
+ine
+綦
+old
+肩
+擎
+ling
+瘆
+娲
+prob@@
+暑
+鲨
+焚
+剽
+玚
+乔
+纪
+bo@@
+熏
+毗
+鳙
+鞥
+絜
+糠
+菔
+廛
+谪
+冬
+遐
+衽
+ich
+柿
+峭
+渴
+亓
+荠
+蝠
+扆
+鄞
+诎
+尙
+摸
+牙
+薤
+da@@
+w
+呜
+陂
+磻
+匹
+while
+迁
+良
+郤
+ct@@
+蚕
+浥
+鲋
+腱
+pres@@
+氕
+颉
+夥
+ter@@
+左
+侉
+妍
+嚒
+殖
+私
+bas@@
+锱
+篪
+吻
+鄅
+鳗
+疳
+cor@@
+毛
+歼
+邵
+圪
+inn
+舛
+埗
+貉
+帐
+妮
+ged
+窒
+put
+诉
+堌
+气
+國
+摩
+沫
+谁
+转
+语
+琵
+羊
+檠
+慎
+踮
+啾
+瞻
+山
+播
+筌
+财
+飚
+苟
+扣
+mis@@
+桌
+侑
+jo@@
+ke@@
+冶
+滔
+蠹
+呓
+捷
+证
+崞
+阵
+掐
+劳
+皆
+巧
+肏
+肆
+葆
+檄
+画
+狨
+谧
+傑
+诱
+纭
+荥
+厥
+bri@@
+绁
+睇
+布
+江
+噎
+灞
+鄏
+煊
+蒹
+厮
+馓
+狴
+碍
+穷
+lar@@
+食
+迩
+meeting
+tter
+趣
+辜
+椁
+汽
+燠
+ate
+礴
+骈
+will
+not
+煽
+嗳
+秆
+勤
+陆
+键
+墎
+官
+蘧
+酤
+唐
+颓
+仝
+iting
+田
+楫
+瑁
+鄢
+our
+诘
+venue
+霜
+镏
+痫
+娟
+婢
+埋
+汕
+铋
+徭
+隰
+猊
+卞
+慑
+said
+裘
+bus@@
+召
+粱
+返
+缨
+纻
+磋
+炴
+凊
+曝
+兀
+洼
+杓
+榆
+hotels
+睹
+糯
+窘
+葩
+帮
+荷
+塌
+矍
+圯
+er@@
+蚪
+篼
+咳
+吸
+喃
+岩
+嚟
+谱
+崆
+蟭
+wal@@
+姗
+谛
+东
+菱
+ction
+肫
+祥
+述
+璋
+仙
+唇
+硕
+嘉
+醒
+兔
+恒
+银
+収
+cre@@
+弭
+曌
+螯
+苡
+疾
+鸱
+权
+搔
+途
+茂
+皑
+补
+肃
+ns
+market
+讠
+阈
+机
+苒
+髙
+is@@
+幢
+郐
+萎
+帛
+烖
+襟
+崟
+溶
+螺
+铲
+仉
+舌
+敖
+倍
+锏
+鸦
+夌
+埌
+腰
+雠
+ol@@
+涙
+翳
+夹
+疏
+el
+驭
+鹯
+瑜
+薛
+奇
+娴
+靳
+镥
+伍
+暨
+what
+萧
+皴
+烦
+敷
+样
+ri@@
+睆
+伦
+後
+祺
+贾
+汹
+箸
+掂
+械
+ob@@
+赜
+黎
+潵
+鲽
+牒
+罐
+inter@@
+冰
+成
+薳
+if
+堡
+搜
+漪
+赕
+症
+参
+鲫
+克
+浴
+薹
+梫
+调
+劢
+茹
+飐
+r
+her
+鲊
+妪
+惟
+榜
+谒
+梃
+twenty
+ni@@
+猥
+兴
+氓
+肘
+饕
+陵
+within
+轲
+cted
+単
+嚗
+唼
+inte@@
+钋
+抒
+郓
+牖
+蔻
+邂
+懦
+邻
+茸
+辈
+种
+黹
+hard
+悦
+综
+ved
+oo@@
+used
+溱
+藁
+士
+猝
+芹
+char@@
+we
+呒
+ss@@
+岐
+桖
+迟
+荞
+哏
+仵
+抟
+淇
+il
+电
+褀
+果
+bir@@
+ges
+暻
+椊
+吴
+sal@@
+獐
+煲
+箥
+爰
+呣
+邶
+尻
+水
+sh@@
+fore
+mat@@
+浅
+枘
+稷
+艨
+唧
+can@@
+eng
+know
+桠
+亵
+tou@@
+伥
+虏
+綮
+sha@@
+腌
+蔓
+莶
+戬
+鋆
+酥
+璜
+阐
+贶
+远
+验
+dre@@
+赎
+惭
+裴
+晌
+铡
+愆
+隧
+祐
+貔
+片
+al
+抾
+柰
+探
+喈
+蟮
+潎
+批
+胴
+郁
+蓿
+稿
+傲
+垒
+甙
+靼
+chic@@
+效
+栗
+齁
+仲
+担
+har@@
+仟
+砂
+i
+舰
+吝
+慆
+芸
+why
+tic
+哥
+佩
+铀
+妊
+x
+刍
+殄
+躬
+莙
+客
+拚
+葬
+哼
+婵
+ance
+棺
+没
+崎
+曛
+宛
+斌
+掠
+醇
+庾
+黥
+乗
+旉
+铰
+寨
+钨
+沥
+gh@@
+吇
+嬖
+厨
+箔
+ah
+譞
+鲖
+蒗
+銶
+角
+脏
+夬
+who
+毂
+赓
+嘢
+蹶
+駃
+avenue
+呃
+that's
+訾
+蒜
+就
+抿
+霫
+que@@
+牁
+叻
+绌
+捣
+埭
+蛩
+迤
+ir
+最
+不
+鹚
+钧
+晃
+钍
+岍
+烯
+授
+笨
+馔
+甸
+啰
+赌
+蠼
+荆
+濉
+摹
+剔
+浪
+瓦
+涤
+阬
+eng@@
+墨
+鲢
+老
+拮
+轿
+弈
+秸
+ken
+省
+穹
+跨
+芤
+剰
+湍
+吥
+喧
+借
+伯
+咋
+噢
+剩
+略
+图
+毕
+爻
+箭
+ans@@
+no
+缣
+fic@@
+必
+礻
+视
+侔
+乸
+缎
+比
+殉
+禅
+蹈
+茶
+沔
+腹
+更
+倢
+骑
+俦
+一
+巉
+糌
+there
+笾
+泺
+虫
+随
+室
+谙
+淞
+even
+嘌
+掉
+进
+栈
+隋
+钳
+饲
+裾
+搞
+朽
+嚏
+垱
+倘
+sy@@
+蒂
+訚
+火
+葱
+踹
+only
+den@@
+胰
+曦
+汨
+奴
+院
+晶
+臇
+赭
+蚵
+便
+藜
+鍪
+穆
+尿
+find
+偾
+项
+嬅
+济
+area
+皿
+蹽
+af@@
+曈
+ger
+袭
+温
+包
+惎
+枝
+槁
+跑
+汇
+嫦
+崒
+颇
+丐
+丛
+哠
+鲲
+佯
+疱
+來
+彝
+件
+鸫
+张
+缋
+檎
+港
+尸
+comm@@
+瘘
+囍
+锅
+惫
+衔
+蔚
+龚
+酱
+ina
+尚
+孪
+蔵
+帧
+弯
+迄
+訇
+恕
+紡
+吱
+觐
+印
+need
+叭
+茫
+汶
+邢
+磅
+焜
+蜣
+米
+俎
+ath
+蛔
+组
+壹
+诈
+ing@@
+希
+茨
+砧
+has
+蝶
+矛
+拖
+乍
+浇
+another
+输
+朗
+殡
+壶
+灿
+礌
+钡
+瓤
+序
+误
+毖
+静
+鸾
+墚
+璟
+咱
+惘
+化
+腾
+苍
+苼
+七
+芾
+囝
+淄
+馆
+榉
+荸
+摧
+醋
+缦
+帘
+蛋
+曙
+萩
+莉
+犸
+拜
+特
+蕊
+并
+冼
+埝
+茴
+佶
+噶
+ked
+port
+柠
+吶
+竿
+鞧
+糙
+栻
+褂
+杉
+陛
+shi@@
+朋
+升
+钛
+拭
+walk
+钱
+岸
+衲
+若
+燕
+墩
+戛
+ations
+诳
+冨
+强
+掌
+腺
+淤
+鼍
+妥
+亥
+俵
+鹩
+占
+佤
+棋
+does
+tes
+拒
+劼
+绩
+ren@@
+货
+g@@
+深
+钯
+棬
+墟
+疼
+骊
+摅
+祧
+兊
+坠
+int
+use
+泞
+赦
+甾
+葺
+辘
+炆
+旭
+鸯
+茆
+融
+艄
+晖
+钺
+勉
+嘘
+龛
+蕙
+渀
+钞
+写
+弋
+颦
+灌
+埚
+鲷
+亡
+矩
+轰
+a
+单
+觚
+呯
+祏
+rec@@
+逢
+憧
+蒽
+內
+乡
+鸠
+卜
+庄
+仰
+how
+铓
+踝
+隆
+避
+豌
+low@@
+ak
+劣
+哺
+头
+proble@@
+es
+说
+哇
+折
+祝
+偻
+揆
+的
+盎
+初
+骝
+荻
+饷
+耽
+莸
+just
+簰
+现
+pl@@
+籍
+珉
+蕲
+臌
+闪
+崮
+gra@@
+琯
+圆
+瓴
+赬
+镧
+被
+共
+芯
+蚧
+stu@@
+mee@@
+沧
+伲
+觌
+筏
+庑
+still
+题
+wat@@
+4
+绱
+入
+亚
+sho@@
+珫
+饴
+點
+than
+good
+l@@
+梁
+忿
+荐
+躺
+蹡
+呕
+圩
+唷
+陌
+ue
+鲭
+碗
+怪
+飘
+country
+粑
+怹
+飕
+烨
+吹
+嵇
+驺
+纰
+in@@
+间
+馈
+榑
+窜
+泗
+硪
+躏
+th
+耸
+贞
+wom@@
+排
+箩
+绽
+舵
+焉
+振
+镶
+thirty
+闲
+摁
+堰
+牵
+栋
+堤
+馀
+盟
+t
+旄
+凇
+洣
+録
+韭
+por@@
+孑
+茄
+闺
+淀
+坡
+烟
+洺
+gre@@
+敦
+哉
+到
+ding
+遑
+钒
+壳
+lo
+纾
+砲
+灶
+lee
+玘
+up
+梵
+旖
+佗
+竽
+绋
+砩
+酒
+苯
+焕
+祚
+苁
+嗓
+ail@@
+殽
+om@@
+棨
+翼
+墼
+萄
+垭
+碱
+cts
+渲
+矱
+掇
+best
+锃
+谶
+喜
+雌
+辊
+啀
+嗞
+谢
+疹
+玎
+唤
+兆
+彳
+溧
+丕
+棒
+桁
+樓
+跟
+蝼
+哭
+啭
+替
+乩
+箪
+城
+朾
+ear@@
+鲌
+ship
+吕
+粉
+舜
+伛
+觏
+燮
+铊
+硝
+撤
+瘝
+thanks
+锵
+圣
+contin@@
+侬
+浮
+棵
+歭
+ici@@
+珞
+褔
+券
+演
+箫
+缵
+篾
+鲮
+砒
+含
+郡
+快
+栏
+瘟
+饤
+tw@@
+拃
+盹
+壕
+桯
+嗪
+鞨
+甏
+锫
+涕
+冕
+鄣
+淌
+辰
+唿
+暲
+蚀
+跋
+郸
+镀
+ku@@
+赔
+姺
+课
+础
+耷
+涪
+day
+笳
+away
+稞
+鹈
+珍
+毯
+酮
+汀
+梆
+嫫
+准
+ces
+巷
+晋
+肉
+莆
+痢
+缗
+怜
+鄙
+搠
+fri@@
+仳
+该
+宓
+珂
+圉
+弨
+悬
+buil@@
+绸
+太
+外
+祢
+蓍
+圹
+侓
+跸
+谊
+获
+髈
+迮
+鹤
+卦
+嗻
+佐
+愠
+媲
+殍
+齉
+妹
+残
+嗄
+钾
+court
+踞
+脯
+菖
+琼
+傻
+三
+虿
+唪
+逶
+鲤
+镌
+肇
+弘
+李
+履
+恩
+蒌
+夙
+环
+坒
+gar@@
+ans
+嘣
+嵯
+命
+酢
+屏
+鈇
+麟
+旨
+旼
+疮
+with
+解
+屈
+趴
+蠛
+密
+瞩
+屎
+显
+魁
+衯
+钇
+酩
+鳌
+戆
+芈
+十
+gu@@
+陪
+黑
+缌
+ch@@
+摇
+梨
+胼
+撷
+疤
+砟
+el@@
+唛
+芪
+速
+ol
+细
+馥
+犰
+bal@@
+鲇
+韂
+焰
+胗
+粹
+枌
+嵬
+古
+she
+through
+筛
+翀
+协
+se
+魃
+格
+mes
+晥
+跱
+掺
+阕
+智
+松
+st@@
+靠
+斟
+粒
+舞
+瀣
+棅
+茭
+韫
+鐎
+灵
+龢
+卷
+lion
+曹
+哒
+皝
+哲
+pe@@
+患
+逸
+涠
+蛰
+佣
+猇
+狈
+nine
+囫
+风
+态
+慈
+慜
+俨
+汲
+肛
+隶
+坩
+赍
+海
+癫
+my
+委
+ill
+胤
+覩
+臬
+矶
+炷
+衬
+前
+馊
+伽
+艳
+妗
+肠
+檗
+soon
+氙
+琅
+谏
+light
+変
+seven@@
+旸
+芭
+en@@
+烧
+诃
+攘
+陧
+觅
+铑
+氐
+余
+night
+hou@@
+鹀
+膜
+炙
+抨
+珊
+ses
+漩
+both
+桉
+笺
+鎛
+led
+披
+膛
+蜻
+菽
+娼
+団
+揽
+测
+f@@
+芎
+吅
+sion
+遹
+瓘
+慕
+他
+鄩
+矽
+thou@@
+沒
+唁
+匿
+设
+嵖
+髹
+ine@@
+恸
+窣
+-@@
+街
+膝
+碑
+national
+it@@
+瘢
+ci@@
+侮
+l
+陉
+照
+原
+厐
+悚
+答
+犷
+罔
+绘
+敞
+ys
+捆
+殚
+填
+挟
+tal@@
+萌
+卑
+甃
+吉
+蜮
+帑
+笖
+new
+昺
+诤
+襜
+矗
+藠
+苓
+th@@
+哝
+its
+蚋
+ran@@
+澳
+eight
+贱
+傕
+亦
+续
+槐
+筚
+追
+醺
+錾
+蹒
+玟
+look
+圃
+颗
+旎
+圮
+绷
+op@@
+咙
+槃
+冫
+乳
+鸣
+柴
+蚴
+擞
+锴
+姣
+惯
+管
+奎
+ra@@
+瞠
+侍
+恵
+岬
+喎
+摭
+卺
+wee@@
+羧
+cep@@
+fron@@
+妁
+很
+禹
+巯
+夼
+鄫
+re@@
+动
+迓
+狐
+瑕
+棹
+屹
+皈
+陔
+殛
+仿
+蝥
+缘
+镫
+品
+ase
+row
+缜
+stance
+予
+custom@@
+抬
+鞶
+蛘
+埏
+漂
+凝
+虻
+姒
+痒
+邝
+ss
+战
+悴
+spe@@
+羝
+吮
+锗
+湘
+端
+淸
+孢
+3
+郴
+卬
+fif@@
+濯
+射
+簏
+锌
+啖
+懑
+霪
+棻
+簺
+怅
+g
+毎
+犳
+ffe@@
+镉
+閦
+吲
+驹
+are@@
+埇
+心
+漴
+娃
+侯
+蔽
+值
+鲧
+fr@@
+ful
+嘹
+滥
+騠
+ility
+喹
+悉
+嗽
+些
+硁
+mer@@
+磊
+霆
+麝
+曲
+蜃
+police
+镩
+笪
+苾
+靑
+凼
+多
+质
+缇
+嗫
+沏
+ened
+花
+诹
+尉
+珥
+崩
+ld
+x@@
+揎
+纷
+缂
+轹
+庙
+渚
+鸪
+乒
+惧
+peop@@
+歌
+唾
+樘
+膺
+fro@@
+哂
+腼
+霄
+坞
+霰
+掎
+娿
+镬
+巨
+碇
+藩
+活
+荤
+团
+缪
+钵
+飞
+儡
+苤
+貊
+柄
+蓠
+防
+贮
+碾
+狞
+艏
+喏
+稚
+映
+i'm
+谤
+蜿
+车
+乂
+寕
+啧
+虔
+ster@@
+垣
+嗛
+讪
+ves
+again
+隗
+帜
+嗾
+绂
+公
+卮
+抱
+仕
+以
+栘
+拊
+萤
+him
+荪
+淬
+7
+鋹
+敢
+颖
+ment
+嫩
+棕
+show
+跩
+out@@
+汤
+迕
+榨
+暗
+糍
+晡
+9
+稂
+曼
+蒺
+ture
+鬄
+逅
+岚
+芟
+昶
+埤
+幺
+猖
+伙
+pub@@
+南
+荨
+趁
+淑
+嘲
+悔
+藉
+争
+渔
+pool
+簟
+谀
+噘
+窀
+祟
+阜
+涸
+掖
+癃
+疑
+搢
+漏
+锉
+钹
+耱
+踢
+骎
+稣
+锲
+繇
+缊
+劈
+啻
+蕴
+仔
+昝
+且
+滚
+柢
+镊
+响
+凰
+噗
+瑴
+嗔
+简
+蜇
+有
+豢
+ap@@
+啓
+翅
+愤
+peri@@
+蚶
+弄
+禨
+蚡
+坝
+换
+纨
+蹑
+for@@
+草
+荛
+懈
+奉
+鳊
+疗
+搂
+串
+幸
+岽
+牍
+蝰
+絶
+秣
+缴
+at
+网
+嗑
+岗
+绊
+圳
+恁
+反
+方
+癞
+煞
+雪
+尤
+鐧
+麒
+黡
+殷
+都
+则
+剃
+揄
+毐
+噱
+fi
+氹
+泠
+樾
+迳
+嫚
+齿
+殳
+墒
+役
+晟
+咔
+芃
+睁
+柽
+戍
+屺
+虱
+韦
+涅
+姚
+鋈
+sure
+既
+涯
+甯
+嘤
+硞
+som@@
+惴
+狻
+堑
+屉
+愿
+li@@
+行
+谲
+嶂
+峣
+碜
+暂
+h@@
+鏖
+瘊
+蜈
+浈
+萦
+职
+蚊
+汴
+people
+妱
+鸰
+易
+芜
+挪
+影
+竹
+洸
+烀
+鹘
+胜
+兵
+咧
+楷
+币
+妖
+ant
+臣
+桩
+创
+囹
+na@@
+鞑
+楂
+逡
+惆
+卿
+闱
+耀
+那
+童
+钰
+玮
+郄
+昏
+乘
+钩
+晳
+笼
+核
+芙
+小
+忋
+区
+as@@
+颢
+our@@
+that
+稻
+销
+韶
+刑
+延
+k@@
+teen
+幄
+pic@@
+叱
+骷
+棰
+羁
+垝
+犴
+媱
+兄
+尓
+乞
+鲦
+划
+壬
+芡
+hotel
+佃
+氯
+您
+颌
+汝
+缫
+幂
+竣
+喾
+疥
+long
+广
+镂
+酫
+ings
+ood
+柊
+唣
+辽
+稀
+襞
+讼
+篱
+坻
+袂
+华
+自
+歧
+昂
+摺
+gh
+聿
+犟
+敛
+牺
+旳
+锥
+玛
+低
+鄮
+漳
+叠
+川
+呼
+where
+戳
+嗮
+琦
+厓
+窠
+cas@@
+舷
+甦
+凛
+谖
+旷
+沌
+狒
+溉
+绍
+劲
+滟
+in
+褊
+fam@@
+楽
+金
+磕
+see
+斩
+佛
+壅
+境
+诂
+around
+羑
+浆
+矜
+铈
+provi@@
+藐
+伉
+阶
+哀
+潼
+精
+像
+凶
+琇
+秧
+涂
+豫
+镒
+蒟
+叹
+颜
+莫
+阀
+痕
+爬
+嬲
+滓
+牮
+沐
+璈
+窸
+湮
+喊
+徘
+而
+仞
+蛆
+吵
+栟
+郯
+谄
+膑
+垯
+恰
+筠
+淝
+剌
+vie@@
+估
+first
+渊
+鶗
+缬
+踺
+呦
+宄
+颎
+蔼
+挒
+亹
+墉
+倧
+梪
+猱
+顼
+泫
+鸳
+赠
+聋
+鬲
+隽
+胚
+驱
+丶
+邪
+鲚
+韩
+婆
+sed
+it
+审
+屠
+众
+翩
+铺
+磨
+醲
+瘼
+佑
+霹
+臀
+坮
+俯
+舸
+辍
+谗
+甥
+祭
+tell
+商
+ace
+宾
+骡
+浍
+冉
+肾
+im@@
+win@@
+甬
+蹚
+粕
+脖
+遽
+next
+expe@@
+榕
+蹂
+邹
+stru@@
+沁
+宸
+旮
+锁
+侂
+拢
+辫
+仁
+be
+洱
+摘
+律
+预
+徕
+鬣
+挠
+戟
+嘴
+杖
+骍
+劵
+哮
+雁
+擀
+鴐
+衎
+芮
+据
+霭
+com@@
+俗
+伝
+ory
+轭
+博
+谐
+孺
+te@@
+锹
+瞥
+导
+糜
+堙
+乾
+搌
+鏐
+你
+con@@
+琍
+art
+徇
+塞
+讽
+瞄
+rence
+溢
+卉
+逞
+阮
+阊
+婊
+mil@@
+专
+姜
+浉
+府
+sing@@
+嗵
+哨
+砺
+吋
+闹
+败
+居
+娓
+ce
+囟
+楼
+元
+鲥
+嗙
+tely
+帷
+還
+懋
+欷
+ong
+郝
+丨
+breakfast
+崴
+橼
+停
+沾
+under@@
+tion@@
+非
+堵
+仆
+铗
+难
+蛑
+狙
+找
+熠
+over@@
+檀
+鸩
+檐
+彀
+蟋
+腚
+槟
+泄
+舅
+痼
+秤
+氆
+罄
+啼
+啡
+冽
+疎
+嵨
+吿
+航
+采
+mb@@
+裟
+檿
+辆
+眍
+溃
+can
+唻
+媖
+佺
+狰
+仪
+rent
+沓
+话
+霾
+婷
+雨
+eigh@@
+白
+瞧
+澎
+洞
+阔
+ta@@
+侧
+躇
+莘
+骏
+宰
+縠
+birth
+萃
+men@@
+秉
+轮
+刹
+fl@@
+鸮
+忾
+胖
+攫
+磁
+飧
+鲈
+邙
+阌
+皂
+危
+搹
+靺
+唔
+撴
+柝
+垮
+膈
+辋
+榷
+邘
+锂
+戚
+蔹
+粝
+翊
+攵
+悯
+涝
+媞
+俤
+镲
+梳
+蓥
+艾
+guest
+顿
+譬
+兹
+囚
+倌
+遣
+朔
+such
+篷
+囷
+宫
+戊
+嵎
+娉
+箅
+檫
+玷
+please
+彰
+蜓
+怃
+癯
+怛
+镇
+还
+诿
+庞
+开
+节
+卒
+逵
+颔
+杰
+蘸
+楚
+颍
+吐
+堃
+澜
+弧
+流
+堍
+严
+焱
+on
+纬
+巽
+确
+子
+紙
+沭
+戞
+屙
+胭
+劫
+珧
+信
+樵
+讴
+豺
+叽
+钎
+霁
+瀛
+糟
+噌
+豝
+湜
+洎
+菌
+悆
+ree
+凯
+徜
+郏
+today
+勾
+嬉
+螵
+戕
+璞
+忝
+俞
+言
+庵
+贼
+费
+kind
+扁
+骁
+咪
+凿
+讳
+掊
+ated
+苄
+鳝
+噍
+茧
+govern@@
+筼
+颋
+愛
+渭
+踟
+罪
+汔
+踩
+陽
+疽
+闵
+我
+蒡
+缠
+曺
+婪
+农
+露
+染
+sent
+氽
+et
+咷
+圧
+咀
+site
+sti@@
+梗
+water
+舔
+嚣
+蜉
+逖
+湄
+栅
+刳
+薢
+ally
+诸
+藕
+钔
+伋
+莜
+硬
+窟
+sa@@
+愚
+蟪
+秩
+雯
+褚
+鹎
+泃
+ner
+ast
+菜
+晦
+枨
+偲
+嚩
+遴
+su@@
+掸
+千
+馄
+功
+胺
+rep@@
+涡
+ther
+孩
+液
+狲
+业
+巡
+脍
+甚
+珜
+郜
+蔑
+疔
+庚
+硌
+裉
+骘
+sequ@@
+迎
+盖
+噪
+尺
+咒
+蜕
+店
+镐
+蝉
+宝
+卍
+弩
+学
+猁
+犊
+妄
+葭
+every@@
+螋
+馃
+ating
+壮
+熟
+rela@@
+嗬
+约
+锞
+呫
+护
+磒
+疙
+羞
+绦
+铳
+掕
+宗
+荀
+玢
+ser@@
+啦
+氪
+盯
+疸
+鬐
+绚
+锡
+鬻
+瓮
+麸
+旱
+娱
+敕
+跄
+烘
+蠕
+te
+诽
+重
+翠
+珑
+慰
+鲍
+勣
+袱
+瑙
+tly
+庆
+government
+荦
+阗
+烫
+倓
+俏
+鸹
+倦
+ound
+co@@
+竟
+腋
+昙
+濂
+啋
+揶
+泣
+郾
+垍
+轳
+某
+酎
+板
+晤
+廑
+奶
+醴
+镑
+讣
+缤
+龅
+畿
+脁
+ma@@
+醢
+嗟
+丗
+殿
+魅
+熨
+wr@@
+嚷
+彤
+栎
+americ@@
+谡
+泽
+柬
+髎
+盆
+诅
+瘁
+萘
+喁
+媒
+忸
+阍
+曡
+裰
+锦
+something
+犋
+爱
+煺
+揉
+苇
+嘈
+胪
+铁
+屁
+颂
+锩
+骅
+渌
+邱
+脬
+滏
+罹
+散
+鼹
+父
+摔
+边
+申
+苛
+敝
+冈
+蕈
+郕
+耨
+闳
+逄
+拔
+将
+fe
+鲂
+颙
+0
+room
+胙
+澍
+媚
+廪
+量
+贩
+镕
+nine@@
+恚
+鹾
+旃
+铵
+堺
+剑
+ket
+支
+墙
+洒
+俳
+ors
+诐
+黟
+珩
+跻
+浸
+孟
+mp@@
+狷
+踌
+渎
+逝
+颁
+务
+羹
+羖
+阽
+跪
+褒
+乜
+择
+盩
+鸭
+抗
+递
+褪
+怂
+cou@@
+蟠
+cen@@
+傜
+砍
+鲔
+蹙
+restaurant
+杆
+茵
+尊
+耻
+淮
+躜
+蜡
+嘱
+谩
+蝗
+堞
+姨
+happ@@
+铿
+楝
+park
+力
+殊
+畹
+say
+练
+纱
+溪
+虮
+篑
+蜱
+惜
+跌
+啁
+溜
+饼
+裥
+勇
+柱
+惨
+陟
+殪
+安
+徼
+纛
+痍
+谨
+取
+犒
+鄄
+粟
+眨
+坤
+妤
+剟
+雉
+傩
+嫄
+嘧
+咽
+愀
+簸
+赏
+箾
+龈
+放
+菅
+坚
+奠
+黢
+琴
+潞
+朝
+tually
+铞
+乇
+醪
+潍
+槊
+纯
+瀹
+诋
+慢
+奸
+嘭
+揠
+昃
+革
+司
+ce@@
+趵
+醮
+碘
+器
+澶
+知
+&
+厢
+啷
+晔
+炜
+ook
+斐
+盏
+妻
+娣
+燋
+窈
+法
+also
+菸
+炽
+选
+埕
+击
+滤
+铖
+觎
+莩
+le@@
+匏
+踅
+酸
+鹑
+enjoy
+闰
+毡
+祆
+身
+郊
+笄
+乖
+甩
+like
+否
+厖
+爹
+牦
+起
+僔
+钓
+浚
+忙
+get
+冯
+樊
+识
+蹉
+鑫
+畚
+滕
+掮
+尾
+war@@
+桶
+瓢
+毫
+膊
+髌
+钗
+桼
+碲
+辨
+唉
+竲
+痰
+膻
+锖
+嘡
+雩
+版
+昧
+敬
+蕨
+伴
+徍
+襀
+盘
+憷
+涑
+different
+after
+虽
+何
+煎
+宽
+ori@@
+攀
+冢
+零
+樯
+哌
+瓣
+馍
+唰
+炪
+旰
+厔
+葵
+痞
+己
+靓
+凄
+服
+烤
+仮
+恨
+喳
+贯
+郞
+饪
+铄
+滠
+蠢
+薅
+齑
+褓
+黼
+涿
+n't
+磬
+匾
+沂
+镳
+nice
+浯
+悝
+淫
+捅
+箨
+瞀
+勠
+屐
+蹁
+蹦
+槿
+类
+栉
+脘
+页
+桕
+脊
+欲
+蝽
+勃
+坷
+酶
+售
+縻
+欺
+膏
+词
+兢
+楸
+娡
+娩
+陡
+ple
+尧
+幷
+豉
+桫
+滁
+麋
+罘
+朕
+耗
+汉
+登
+较
+逾
+蔫
+赳
+秭
+咫
+斑
+跚
+舒
+莞
+闾
+氤
+骸
+槩
+瓶
+餐
+瞪
+沉
+朱
+had
+鲣
+嵪
+nothing
+嫡
+恍
+衢
+轴
+杈
+赂
+津
+red
+拤
+狠
+卣
+蚱
+疆
+捞
+婉
+固
+梶
+垸
+逋
+髡
+晓
+骶
+季
+炀
+喷
+垛
+蜂
+sts
+阝
+未
+熬
+绲
+坟
+苈
+are
+寮
+吧
+皖
+捌
+炤
+b
+the
+哈
+鞍
+ir@@
+掘
+tation
+噙
+酊
+忄
+硼
+耕
+偎
+雎
+磴
+锺
+over
+侃
+婚
+吗
+竺
+man
+也
+苣
+绛
+冤
+呈
+孱
+缭
+埼
+犬
+麦
+蓐
+技
+夜
+tri@@
+杭
+佝
+莨
+di@@
+毙
+贿
+猎
+桡
+so
+氖
+叼
+哚
+濠
+湾
+全
+读
+盐
+钚
+鬘
+万
+鲠
+貂
+鋐
+堼
+茺
+拟
+牡
+蝮
+镢
+嚯
+束
+喱
+彿
+col@@
+仃
+涌
+蚯
+妩
+箢
+隹
+亰
+疃
+嬗
+喇
+攒
+am
+听
+fe@@
+zero
+痛
+诗
+干
+疫
+嬴
+降
+签
+丈
+sm@@
+髑
+劬
+萏
+诀
+镵
+坭
+咤
+池
+榼
+岕
+崃
+ined
+chu@@
+糸
+祸
+猜
+婿
+搋
+咛
+箴
+辚
+悄
+荒
+挑
+托
+drive
+撇
+莒
+鄜
+冒
+稹
+户
+ould
+铤
+翃
+try
+圊
+艿
+桐
+兒
+揿
+堋
+档
+ink
+email
+犍
+铍
+招
+鳇
+敲
+雳
+奌
+裝
+戏
+哗
+栱
+哆
+ds
+浙
+岂
+挝
+莲
+腩
+杬
+促
+斥
+蛱
+诓
+炖
+璘
+怵
+礽
+咬
+珐
+韡
+邺
+祀
+皇
+渑
+困
+潜
+添
+ter
+che@@
+如
+萹
+熙
+扇
+亭
+亍
+瘗
+舍
+皌
+诌
+虢
+欣
+掰
+棘
+岞
+毽
+卵
+罴
+疋
+ali@@
+整
+i@@
+紫
+mu@@
+涘
+携
+奏
+cri@@
+馘
+翘
+抚
+筝
+玄
+霎
+铮
+澹
+嫰
+夏
+咿
+围
+従
+猾
+pre@@
+逊
+糖
+漫
+聍
+及
+瘵
+绢
+棱
+笈
+铯
+釭
+恼
+癎
+苢
+斁
+醣
+植
+鱼
+涓
+z
+目
+par@@
+撬
+戎
+偓
+已
+打
+busine@@
+桦
+庇
+坪
+problem
+舾
+centr@@
+fifty
+梧
+𫖯
+restaurants
+beau@@
+fac@@
+谈
+腙
+阉
+孛
+秘
+庶
+畴
+例
+ang
+幛
+下
+溅
+彗
+魟
+诏
+鸢
+邽
+瘪
+ay
+胱
+勒
+槲
+橹
+男
+ide@@
+治
+锒
+祯
+慌
+佥
+苔
+暴
+时
+撩
+俶
+屋
+抠
+嫘
+浔
+鲻
+暌
+窿
+炒
+溇
+髯
+簿
+嶖
+峒
+si@@
+觥
+午
+联
+ty
+猫
+腭
+晻
+袝
+戾
+波
+啐
+戈
+蒎
+麓
+汾
+茉
+埔
+蛳
+徳
+鸶
+艇
+顶
+髻
+徒
+豨
+碳
+欻
+薰
+hi
+呲
+舳
+劾
+形
+'
+渥
+羔
+枇
+模
+弁
+坏
+徊
+馗
+刨
+璀
+锤
+钤
+囔
+拦
+剐
+揸
+滈
+缚
+ach@@
+斯
+诒
+寂
+裒
+同
+should
+蒴
+窑
+示
+戗
+录
+恶
+培
+迦
+邮
+熜
+泊
+乌
+篌
+center
+癸
+昔
+牠
+濛
+噻
+苋
+ang@@
+give
+姥
+祖
+搬
+悠
+瓒
+嫱
+wit@@
+畏
+神
+湟
+扃
+桀
+醭
+谘
+虾
+玳
+斡
+寥
+攻
+忑
+man@@
+僻
+money
+飊
+国
+吠
+扦
+j@@
+幹
+泛
+摈
+匼
+廉
+通
+袍
+楹
+搦
+缮
+ft
+岱
+scho@@
+丹
+蜗
+蓓
+卟
+倥
+喽
+蛾
+殭
+绣
+chan@@
+震
+棍
+潋
+葫
+嚎
+ed
+漉
+阻
+俐
+德
+様
+酝
+倒
+橱
+send
+谯
+嗐
+署
+贴
+搭
+坦
+蹯
+緛
+队
+死
+閟
+岭
+倪
+诜
+偱
+醐
+痊
+社
+凡
+畯
+摒
+迭
+措
+挌
+媛
+same
+蚓
+擤
+澈
+眈
+剪
+嫣
+鹱
+娆
+凸
+厅
+臼
+枭
+炎
+烊
+掀
+洄
+僳
+velo@@
+觉
+逼
+have
+皎
+酯
+怦
+叶
+统
+thir@@
+从
+纡
+les
+术
+径
+征
+come
+躅
+此
+甜
+践
+br@@
+鎸
+讲
+玲
+衣
+麾
+枥
+拈
+腮
+reas@@
+芩
+鹖
+more
+儆
+愕
+淏
+躄
+玕
+届
+永
+哟
+虓
+sit@@
+宴
+夷
+梭
+紧
+瘰
+recei@@
+匐
+号
+from
+况
+畋
+谜
+莽
+锣
+pas@@
+驽
+昱
+喟
+v@@
+呀
+'re
+阒
+踬
+认
+订
+黝
+筻
+seven
+们
+舱
+揭
+妾
+礼
+高
+传
+香
+谬
+篆
+逮
+玑
+眄
+驿
+all@@
+嗅
+五
+淹
+裨
+咕
+焖
+氚
+ari@@
+崤
+接
+沈
+寰
+轻
+旒
+维
+test
+麴
+枱
+挚
+句
+驾
+筷
+坂
+须
+鼩
+炳
+推
+姿
+溴
+庠
+箻
+燹
+拂
+呱
+愫
+袤
+睑
+眩
+冷
+葸
+but
+睢
+掣
+唠
+陈
+喉
+晬
+溯
+艺
+苦
+腐
+蟹
+燃
+候
+伸
+萝
+汭
+6@@
+僭
+蘖
+瘫
+书
+控
+乪
+溘
+痦
+lo@@
+蝣
+啂
+缩
+part@@
+只
+施
+鸨
+鎏
+悕
+诵
+孬
+獾
+玩
+汪
+ary
+吁
+拄
+诟
+扰
+鼎
+珪
+橇
+隍
+close
+姑
+ty@@
+迹
+骆
+崐
+go@@
+勺
+倜
+ble
+盥
+剂
+qu@@
+圻
+荧
+荣
+疵
+酪
+稗
+幔
+井
+蛭
+泔
+定
+q
+楱
+刈
+使
+潟
+螅
+铒
+巇
+呖
+垂
+啉
+戮
+佉
+寓
+翚
+啪
+fa@@
+w@@
+绐
+抛
+謷
+忐
+趿
+位
+螨
+last
+迫
+爷
+碎
+廊
+啮
+柯
+酰
+烜
+煳
+粲
+求
+楞
+考
+挲
+触
+荽
+荫
+疴
+遨
+仫
+瑭
+扩
+恭
+藦
+羣
+味
+缀
+享
+枣
+宠
+浊
+back
+扪
+篥
+sou@@
+涉
+son
+翔
+讫
+理
+玙
+栩
+闩
+龄
+亢
+桄
+sil
+幡
+婀
+脚
+疠
+歔
+兰
+査
+镗
+涎
+霏
+待
+忻
+警
+科
+脐
+琚
+真
+own
+醚
+绡
+ices
+嶷
+胶
+u@@
+峦
+箜
+丰
+痨
+衡
+锋
+宦
+皦
+感
+矮
+爨
+亟
+装
+鹁
+房
+捡
+乐
+胫
+彧
+kil@@
+葛
+氏
+捋
+戢
+牢
+5
+dis@@
+园
+孓
+柃
+榄
+喺
+悛
+俢
+c
+缢
+富
+je@@
+竭
+way
+拙
+牧
+阋
+究
+钘
+濮
+皙
+问
+告
+鞅
+燚
+肴
+螭
+篁
+当
+洮
+渠
+码
+辗
+慨
+崇
+诫
+意
+姤
+飓
+噬
+铐
+寡
+咨
+伺
+殓
+容
+蚁
+柏
+枧
+瑗
+挛
+绔
+月
+绾
+隘
+僖
+纇
+握
+耜
+舐
+tom@@
+旅
+翮
+products
+缑
+肄
+闿
+硫
+砚
+蜚
+bu@@
+鍉
+晗
+阃
+衾
+蝴
+丽
+髅
+煜
+习
+窃
+芑
+櫈
+ru@@
+deta@@
+谌
+俄
+桨
+芦
+泵
+企
+粽
+揣
+领
+qui@@
+鄚
+肱
+尼
+滦
+椽
+癜
+甄
+local
+fol@@
+偿
+丸
+涞
+铜
+du@@
+噜
+倬
+珣
+圜
+墅
+car@@
+撂
+栀
+评
+螽
+蘑
+扬
+挨
+轵
+殇
+酺
+high
+ach
+鲛
+啥
+think
+don't
+表
+cho@@
+盈
+啊
+怔
+滉
+艮
+徽
+撸
+淯
+茈
+硚
+桢
+魍
+潸
+at@@
+仄
+鹭
+沅
+操
+炭
+砝
+之
+讷
+吔
+疰
+葚
+夭
+跶
+蘼
+鲩
+滴
+菘
+滨
+加
+倾
+体
+茕
+髀
+雅
+ile
+欠
+崧
+囤
+僦
+守
+辂
+棂
+se@@
+捶
+扛
+铠
+铩
+京
+熘
+洛
+娄
+the@@
+痿
+箱
+驮
+礅
+郃
+谣
+ack@@
+晴
+res
+lot
+纲
+诖
+ways
+谋
+煌
+绗
+炼
+卤
+屄
+界
+拶
+ar
+遠
+周
+驩
+肖
+跖
+莳
+级
+咣
+hu@@
+啕
+歀
+忤
+临
+many
+募
+ned
+their
+睿
+冥
+久
+巍
+堇
+洽
+郎
+董
+this
+毒
+醉
+ton
+诶
+旺
+蛉
+规
+琶
+騑
+捉
+imp@@
+尥
+拗
+e@@
+歆
+刀
+跆
+three
+锢
+侏
+拳
+扤
+饨
+剥
+潡
+俊
+嗤
+穑
+地
+情
+憔
+惹
+奢
+inclu@@
+mo@@
+铭
+至
+浐
+祎
+really
+帆
+dri@@
+楮
+邬
+弛
+篮
+兮
+up@@
+慧
+腆
+碴
+挂
+计
+豚
+滑
+indi@@
+into
+褭
+酞
+t@@
+符
+彼
+涔
+body
+ins
+漶
+峪
+down
+缓
+剀
+福
+叮
+wa@@
+失
+隅
+怄
+扳
+great
+泰
+娌
+孙
+彬
+毁
+蛲
+萱
+泻
+舀
+dge
+瞅
+vo@@
+咆
+祜
+枋
+憩
+文
+gro@@
+倮
+鹄
+all
+槎
+忆
+絯
+裢
+two
+兑
+明
+遥
+窍
+吃
+it's
+启
+妽
+郅
+ies
+centre
+吒
+亿
+雲
+ask
+叩
+媜
+蚰
+奄
+垡
+椿
+篡
+聘
+窖
+垩
+diffe@@
+购
+褥
+豸
+腠
+咦
+眦
+敏
+鼐
+昉
+顽
+琊
+砌
+僮
+乱
+瞒
+撑
+羲
+纳
+赞
+焐
+橛
+嵩
+陀
+楔
+牛
+鹊
+帽
+may
+偬
+丫
+兜
+砰
+濡
+獬
+好
+腿
+靶
+木
+唸
+纹
+裹
+cl@@
+hund@@
+优
+猲
+焯
+岘
+辱
+丞
+查
+秽
+胀
+鳖
+岈
+喒
+默
+见
+forty
+纥
+泚
+驳
+铃
+萋
+筱
+蛏
+琲
+鸵
+这
+链
+read@@
+垵
+踦
+奭
+別
+软
+盲
+黛
+纤
+chil@@
+扱
+狡
+贽
+贻
+鳏
+篓
+der@@
+滢
+嵌
+妺
+臭
+谥
+夯
+韬
+惺
+检
+峨
+ms
+胛
+恫
+构
+鹂
+刺
+讦
+唬
+梈
+捯
+由
+志
+达
+梢
+歉
+al@@
+淳
+般
+恐
+獗
+央
+喔
+蟀
+伞
+sh
+'s
+姮
+痈
+ck
+俾
+髫
+蜒
+愧
+呆
+mer
+桤
+鬏
+型
+四
+哖
+龋
+鳔
+骗
+砷
+泷
+猷
+茯
+茁
+cha@@
+small
+倨
+乎
+奕
+鹅
+痉
+奥
+辙
+sting
+冲
+赅
+泅
+羰
+悲
+焘
+瞵
+紊
+卖
+take
+鳐
+菊
+姸
+辐
+嘻
+fast
+囿
+凫
+纵
+咝
+箧
+well
+钬
+he@@
+辕
+鹜
+unk@@
+诔
+泐
+世
+镖
+骄
+ve
+寸
+pay
+鴂
+睐
+氲
+牟
+洏
+尔
+quo@@
+椭
+芨
+险
+耳
+柁
+牾
+蚂
+much
+most
+now
+singapore
+丙
+洋
+燊
+啵
+胬
+媾
+碟
+濆
+陇
+扥
+势
+慷
+笋
+泥
+鄘
+hote@@
+罕
+沤
+葡
+蠡
+产
+鲵
+凉
+萆
+籽
+胞
+哦
+侠
+晚
+咵
+杯
+邗
+甑
+胁
+历
+嘏
+喘
+is
+霉
+仇
+鳀
+鉴
+猛
+纂
+决
+阏
+饯
+宣
+象
+儿
+猗
+瀑
+荚
+满
+茳
+酷
+鼬
+旁
+屡
+榇
+s@@
+呷
+驻
+薮
+鹏
+馋
+鹪
+纠
+thank
+轶
+渺
+镁
+cost
+汐
+谅
+甫
+publi@@
+储
+抹
+杨
+裳
+歙
+鏊
+犼
+蹐
+愦
+断
+able
+ind
+狂
+雒
+妧
+巴
+嘟
+筵
+喤
+舫
+刃
+fi@@
+company
+郿
+瞢
+褰
+清
+巿
+苕
+内
+暾
+垠
+戌
+溟
+剋
+葶
+猡
+蔺
+岖
+邕
+any
+pri@@
+卸
+基
+馒
+出
+柞
+溍
+吨
+蜥
+炟
+徂
+inve@@
+桂
+frien@@
+铪
+庖
+廋
+誉
+嗡
+忘
+i'@@
+螃
+胯
+un@@
+铨
+酹
+鷃
+wi
+眚
+跤
+汁
+针
+钏
+雀
+限
+面
+蛐
+黃
+脩
+叛
+枷
+故
+sk@@
+编
+菥
+勐
+汗
+胃
+洗
+洹
+ers
+髦
+宏
+聊
+嵫
+囎
+淠
+垧
+欧
+oms
+揵
+before
+畈
+饬
+秦
+莓
+努
+炔
+匠
+际
+蕉
+蝇
+po@@
+匽
+year
+橥
+pping
+注
+承
+琨
+又
+玉
+彖
+缸
+怒
+遏
+坍
+sta@@
+痹
+埆
+叵
+氟
+脂
+昭
+ments
+宕
+盅
+惩
+馁
+绕
+遢
+扫
+蔡
+崾
+阎
+葳
+殂
+潢
+每
+ro@@
+靖
+主
+tional
+闸
+涖
+第
+n
+镅
+蜜
+能
+衄
+猹
+鹰
+俘
+豪
+人
+痱
+蓑
+骒
+所
+绨
+粦
+望
+减
+魔
+释
+ars
+逑
+clo@@
+帏
+遘
+许
+誓
+牤
+刷
+扒
+峻
+9@@
+昕
+轩
+嬷
+them
+窗
+蹄
+骰
+湓
+堨
+巂
+婧
+骖
+螬
+酋
+ag@@
+闷
+滞
+郧
+朴
+supp@@
+倡
+腴
+褴
+极
+漷
+佾
+煨
+沚
+肓
+experi@@
+亮
+堕
+对
+秋
+every
+铅
+蛴
+擅
+a@@
+鳃
+保
+ze
+弓
+趄
+翦
+涛
+盗
+道
+遮
+屿
+绥
+cont@@
+部
+den
+付
+诮
+粘
+镔
+溏
+啬
+贫
+轼
+猩
+薯
+胳
+抻
+than@@
+king
+燎
+笫
+鸂
+蓊
+峋
+ent
+搡
+磡
+昌
+妈
+玺
+叫
+済
+彩
+lim
+孀
+囧
+we@@
+绒
+驸
+架
+斛
+资
+碣
+濞
+冿
+吡
+作
+because
+晏
+餍
+瞾
+栊
+蹿
+荩
+尖
+依
+氍
+炝
+抡
+中
+潏
+潦
+mon@@
+域
+杩
+兼
+捎
+怩
+正
+f
+near
+刮
+搪
+鸿
+俺
+耪
+牯
+瞎
+鬼
+惠
+bro@@
+沛
+浃
+pe
+釜
+vi@@
+jal@@
+拴
+冇
+暎
+魉
+痘
+盔
+枚
+秾
+杲
+郛
+僚
+昞
+足
+sen@@
+family
+岿
+榖
+蛊
+绯
+村
+鹃
+钼
+拓
+dly
+苫
+锄
+跎
+碰
+钝
+淆
+鳟
+浓
+柇
+桷
+愬
+裔
+邃
+恂
+楙
+沨
+ban@@
+ssi@@
+鳞
+褛
+钌
+y
+搀
+骺
+坊
+娘
+鄯
+袈
+匡
+厦
+艘
+障
+舂
+蹼
+材
+胡
+痄
+凖
+憋
+ll
+睥
+窦
+砕
+雇
+個
+籀
+饥
+价
+笞
+鞴
+筊
+树
+揖
+派
+垄
+檬
+眠
+偌
+闯
+姘
+少
+破
+怿
+扼
+恽
+岔
+亲
+垕
+踱
+泳
+截
+轫
+烽
+够
+武
+差
+声
+可
+绫
+巅
+邯
+椎
+硭
+畀
+常
+沸
+榧
+忡
+妫
+皮
+讯
+铣
+轾
+衙
+蒙
+蕃
+things
+腓
+坐
+鞘
+榻
+霈
+垆
+ous
+粳
+枹
+薨
+胆
+龙
+绺
+ning
+med
+挹
+诨
+译
+但
+半
+辩
+芊
+变
+翻
+尹
+靸
+酬
+咶
+暧
+姝
+貘
+墓
+匙
+been
+墀
+芗
+sc@@
+妲
+狮
+岫
+ants
+谔
+鞭
+婶
+one
+踏
+讵
+land
+幼
+軎
+晕
+wi@@
+瑞
+供
+啯
+朵
+枓
+奘
+咩
+訢
+臆
+硗
+帖
+dent
+戽
+瘿
+段
+傅
+oun@@
+枞
+属
+野
+芫
+诣
+钐
+堀
+uring
+试
+衍
+馏
+龇
+讧
+stre@@
+妒
+驴
+著
+槭
+洧
+side
+氾
+孵
+狎
+歩
+囯
+愎
+过
+pu@@
+right
+赋
+阳
+mber
+谕
+town
+盾
+救
+饱
+loc@@
+spa
+橡
+剅
+蒲
+姐
+厕
+铚
+套
+弗
+栴
+鹨
+糨
+騕
+雷
+裁
+拌
+鼻
+some
+搐
+均
+蓣
+ice
+卢
+ad@@
+瘌
+湶
+氩
+谰
+缙
+ard
+栾
+迨
+浣
+秏
+萨
+always
+污
+踔
+杏
+ded
+髽
+秀
+湉
+粪
+展
+槻
+垓
+赫
+惮
+lar
+薷
+迈
+p
+塾
+ga@@
+ey
+ho@@
+恺
+晩
+泸
+茚
+嗰
+绳
+lit@@
+渍
+遇
+洑
+晞
+溽
+ying
+枸
+无
+逍
+珰
+纣
+罩
+ons
+嶃
+瑚
+裙
+搛
+6
+咾
+唱
+瑠
+睃
+於
+din@@
+伟
+骓
+函
+瘤
+僆
+鞣
+兕
+伄
+柚
+席
+峄
+鄱
+抔
+an@@
+物
+loca@@
+穸
+懊
+2
+amer@@
+稼
+叉
+bra@@
+拇
+str@@
+res@@
+沆
+饰
+pen@@
+厘
+傈
+ality
+廷
+式
+夔
+督
+雍
+晨
+剿
+赶
+疖
+砜
+归
+min@@
+徙
+阚
+tions
+女
+营
+邅
+做
+伊
+er
+筜
+蛎
+business
+顾
+瘸
+洪
+钕
+觯
+政
+劝
+撮
+break@@
+氛
+漾
+sor@@
+址
+帼
+增
+ation
+瞋
+稊
+慊
+肯
+躁
+糗
+美
+豕
+us
+炸
+饽
+挎
+挫
+枢
+dr@@
+铉
+奖
+槌
+绮
+擂
+虬
+钽
+胍
+凌
+辟
+虹
+铂
+杂
+砗
+赪
+潇
+合
+鰕
+吭
+迷
+滘
+弹
+ff
+痤
+蛄
+觱
+杳
+嘬
+偕
+赘
+烩
+孖
+钉
+龉
+钪
+睬
+撵
+big
+笠
+俅
+锯
+呛
+欹
+cle@@
+珈
+浑
+墕
+hel@@
+彪
+姊
+拆
+雄
+hi@@
+钴
+壑
+幇
+卧
+痧
+澴
+长
+船
+六
+继
+棷
+尞
+窨
+燥
+膦
+镝
+裸
+办
+resta@@
+廆
+宅
+蓉
+娑
+猪
+阁
+蟆
+恢
+讥
+蜾
+of@@
+怏
+笕
+瞽
+赤
+岁
+禺
+昀
+她
+钫
+慝
+滩
+ack
+跃
+贰
+巳
+寿
+铙
+pro@@
+朊
+mor@@
+箦
+丧
+ready
+止
+跺
+缉
+桅
+论
+摽
+状
+驷
+咂
+铢
+绎
+叨
+遁
+锨
+里
+笊
+缰
+嵊
+膘
+帕
+泱
+鳜
+嫒
+仅
+捕
+徵
+each
+vil@@
+媪
+袁
+裕
+佻
+赙
+末
+笔
+啲
+轺
+掞
+彭
+氢
+got
+缁
+蠊
+攉
+寻
+骋
+噤
+ar@@
+棣
+疣
+豳
+祠
+衩
+匀
+王
+觫
+庭
+亶
+篇
+铎
+咖
+禛
+艹
+哐
+hope
+系
+娇
+窄
+髓
+阱
+懿
+右
+离
+穴
+渣
+邨
+肸
+杞
+肚
+垗
+幕
+ct
+苊
+隼
+wh@@
+宋
+煤
+柸
+镋
+耘
+kes
+激
+痣
+work
+吽
+彟
+株
+筐
+贸
+尕
+ere
+潬
+矣
+縢
+瑄
+油
+満
+漆
+rooms
+霞
+咏
+e
+珮
+仡
+眶
+醛
+票
+鼓
+突
+ia
+hundred
+瓯
+尬
+no@@
+西
+潴
+楪
+傍
+蜘
+yes
+蛀
+sig@@
+gs
+筮
+因
+谳
+bur@@
+跹
+鲑
+惑
+肋
+dy
+螟
+霓
+眼
+郇
+宙
+畲
+刬
+翟
+诧
+跫
+or
+颅
+跷
+淅
+臑
+楦
+屯
+走
+羸
+you
+咘
+als
+斤
+畔
+晧
+忠
+肥
+实
+凋
+褶
+ba@@
+摆
+熊
+篝
+乃
+绑
+沟
+瞿
+哜
+舶
+甘
+事
+瑛
+踧
+黯
+鲞
+cal@@
+璩
+enty
+剧
+洿
+孳
+祓
+天
+岑
+拾
+nineteen
+cause
+愇
+翁
+瞟
+昆
+弢
+摛
+义
+咸
+勮
+伪
+瑱
+橄
+丘
+铧
+怖
+终
+膨
+5@@
+扮
+to
+猕
+琬
+群
+叁
+狄
+眵
+烹
+蚝
+儒
+援
+ber@@
+1
+掾
+裂
+藓
+r@@
+squ@@
+鲁
+潺
+晷
+消
+鸸
+觊
+诡
+却
+as
+弼
+脰
+铷
+册
+懵
+啜
+涣
+扈
+垚
+森
+赵
+垫
+镪
+怼
+峁
+惚
+缄
+ass
+诼
+tive
+never
+昇
+ence
+耍
+异
+朐
+枕
+襻
+契
+祈
+骜
+羯
+豁
+瘕
+贲
+禥
+哔
+祁
+螫
+锆
+孕
+st
+忏
+掏
+昵
+哪
+鹞
+疍
+遄
+蠲
+恹
+狩
+憾
+悫
+弦
+脾
+soci@@
+爸
+home
+奔
+懂
+爿
+帙
+襦
+除
+ters
+ted
+景
+昚
+eas@@
+ven@@
+蹩
+ac@@
+歇
+fas@@
+抄
+牌
+浦
+榭
+lan@@
+巺
+刁
+secon@@
+顒
+个
+鲒
+嗨
+out
+凤
+褫
+瑮
+珏
+惦
+斝
+监
+土
+did
+date
+陋
+眉
+湴
+谎
+撖
+礞
+抑
+旻
+倩
+蟑
+produ@@
+忍
+free
+瓜
+康
+晾
+偏
+麇
+缱
+唏
+杪
+戥
+坳
+皲
+拘
+簋
+怆
+负
+陬
+埽
+皞
+素
+锐
+苘
+菀
+嚜
+免
+嫌
+靡
+d
+尪
+颧
+苷
+de@@
+inc@@
+策
+浠
+窾
+么
+壁
+pi@@
+荟
+愣
+砫
+巻
+ke
+旌
+咉
+矢
+婺
+绝
+线
+奂
+袋
+喻
+贡
+鍒
+逛
+days
+疟
+筇
+髃
+锛
+俱
+缅
+锊
+骟
+belie@@
+跏
+岜
+睚
+蜊
+铝
+闭
+麈
+趺
+锜
+彻
+缟
+甭
+苞
+钥
+ph@@
+ef@@
+灼
+銮
+塍
+淙
+鸷
+撼
+浒
+though
+或
+av@@
+荇
+要
+road
+wn
+纺
+师
+裇
+颚
+end
+汜
+驰
+8
+椹
+牂
+ties
+侪
+樉
+扎
+吞
+蕤
+垦
+茜
+泌
+wor@@
+艋
+次
+珙
+建
+切
+盱
+簌
+bus
+ten@@
+check
+硅
+概
+袒
+厚
+馐
+橐
+涵
+抢
+兖
+颠
+扑
+渐
+底
+獒
+瘴
+喝
+期
+踪
+镭
+糕
+翡
+养
+苺
+gi@@
+顷
+蚜
+琛
+县
+槽
+滹
+钙
+ple@@
+叙
+囵
+煮
+杵
+铛
+丢
+任
+咹
+莎
+请
+喂
+旬
+枫
+颊
+圄
+霍
+耆
+阅
+嵴
+鱿
+倭
+me
+咎
+sing
+眸
+羽
+二
+橦
+菉
+巩
+柑
+赧
+暮
+d@@
+损
+world
+隻
+鑹
+捧
+轨
+按
+啄
+聒
+炅
+湣
+詹
+饧
+柒
+恃
+掬
+爽
+料
+窎
+歹
+刿
+完
+餮
+鵀
+是
+榈
+透
+粗
+俛
+音
+份
+置
+砊
+蓬
+蘘
+缛
+going
+遯
+your
+ever@@
+制
+淼
+冏
+妆
+钣
+怡
+vern@@
+z@@
+弇
+砥
+蜴
+饸
+冮
+衫
+韘
+寖
+虐
+玹
+3@@
+tho@@
+闼
+旋
+煿
+嶝
+涟
+龌
+ko@@
+ge@@
+遛
+郢
+褡
+薪
+邓
+嫁
+叔
+嫤
+漠
+gue@@
+藿
+臾
+屣
+ily
+早
+痴
+埴
+囊
+俩
+ours
+蓦
+渗
+锟
+朦
+附
+赴
+make
+灾
+汛
+埸
+吾
+栲
+翰
+磔
+汰
+椋
+锸
+搴
+蒐
+韧
+撒
+纽
+笙
+order
+坼
+伷
+val@@
+烬
+梡
+瑶
+陷
+go
+绤
+琉
+湖
+硷
+熹
+觇
+侥
+往
+牲
+租
+矾
+祉
+跂
+age
+刘
+缃
+复
+阂
+刊
+am@@
+苻
+步
+cess
+蓖
+he
+喙
+畜
+鄠
+让
+雱
+噫
+sel@@
+钜
+胥
+蹬
+help
+捻
+芽
+肷
+掳
+旧
+謦
+甲
+ort
+搽
+哧
+骢
+吼
+口
+倚
+肽
+憨
+竦
+product
+棚
+黍
+砾
+愈
+昊
+阑
+魂
+持
+献
+pa@@
+凳
+厩
+佚
+鲱
+粮
+鹋
+姬
+抉
+籇
+part
+found
+嗍
+提
+蛇
+籓
+盼
+裱
+暄
+妞
+喑
+惶
+wha@@
+窥
+ks
+橘
+绞
+菁
+饭
+ste@@
+猞
+芘
+搧
+tain
+street
+呔
+懆
+袼
+肼
+on@@
+亨
+谑
+椅
+莹
+校
+躔
+颈
+圬
+匕
+mail
+税
+年
+脆
+郦
+拿
+fo@@
+鑱
+改
+la
+轸
+掩
+芒
+can't
+彺
+觑
+邴
+埵
+莴
+伢
+腕
+阴
+tre@@
+眇
+责
+青
+钠
+point
+僵
+庹
+具
+迠
+穗
+逃
+源
+咻
+今
+绻
+发
+嗝
+逆
+侈
+抓
+羌
+款
+耧
+锰
+罢
+跗
+汞
+赚
+累
+掷
+啫
+膳
+捭
+禀
+崂
+栝
+掴
+讶
+y@@
+瑾
+账
+挈
+鹬
+儁
+鳎
+骛
+釉
+矬
+潮
+ort@@
+病
+日
+o
+憍
+镍
+蜛
+壤
+葑
+聚
+灭
+立
+怙
+m@@
+诙
+躲
+泡
+would
+缺
+横
+违
+蛛
+ls
+鹮
+鲐
+茌
+罚
+ques@@
+畦
+厍
+缳
+劭
+拐
+萍
+债
+偈
+镛
+踽
+趋
+扭
+椒
+痔
+史
+称
+thing
+q@@
+fre@@
+剁
+beach
+舢
+黠
+执
+饵
+欤
+骥
+蓄
+邈
+疚
+'@@
+吓
+撄
+苌
+蜢
+mar@@
+桓
+made
+煦
+襁
+垃
+姓
+送
+柜
+埪
+怠
+蚍
+little
+殴
+炕
+蛙
+伱
+禁
+睺
+捍
+颀
+猬
+蟊
+岢
+o@@
+纩
+妨
+阡
+率
+蒋
+寐
+孚
+想
+洁
+沮
+铦
+镟
+潲
+杮
+tal
+镯
+虞
+鶒
+椐
+簦
+锈
+顗
+仑
+额
+掼
+耿
+荑
+撙
+谍
+沃
+惊
+碉
+皓
+峯
+炮
+厶
+拼
+得
+帅
+蜎
+莠
+近
+诩
+喓
+sequence
+愉
+ure
+枯
+thous@@
+槅
+移
+菡
+菪
+ten
+癍
+珀
+湎
+瓿
+名
+妇
+再
+傺
+蚬
+荡
+孽
+匜
+堠
+荜
+绉
+刽
+des
+1@@
+留
+裎
+赐
+ving
+光
+ly
+睟
+遂
+琮
+俟
+岛
+蛹
+蒿
+nor@@
+钦
+麻
+纫
+coun@@
+杷
+宿
+吣
+蝎
+湃
+惬
+enjo@@
+卯
+牸
+诠
+谷
+摞
+螈
+然
+盂
+mple
+萁
+灰
+畊
+when
+烈
+俸
+衿
+龟
+谭
+唳
+楀
+痪
+曳
+羿
+罂
+脱
+廿
+sur@@
+拝
+洵
+湿
+鸽
+冂
+跛
+school
+尡
+挡
+疭
+议
+self
+拍
+溺
+垅
+瞳
+笑
+thousand
+乓
+噼
+lie@@
+沿
+俣
+灏
+赈
+砭
+造
+标
+loo@@
+罅
+鳣
+翥
+铬
+诞
+酌
+佟
+歘
+瑀
+眬
+璁
+拣
+u
+颤
+笏
+ur@@
+蝌
+蛟
+number
+馕
+place
+伐
+热
+ver@@
+淜
+嚼
+鼢
+謩
+趟
+恬
+搅
+伫
+杌
+琐
+崽
+苗
+其
+熄
+在
+泾
+毓
+very
+琥
+孤
+扞
+恪
+t's
+埙
+do
+攮
+昴
+靥
+feel
+酗
+蕫
+der
+焙
+gen@@
+辛
+觞
+平
+椤
+耦
+黜
+寇
+醍
+耶
+撰
+蔌
+曜
+谂
+烛
+旦
+kno@@
+爪
+絺
+欢
+圈
+臂
+ch
+谆
+荏
+巫
+擐
+嵋
+霸
+ames
+wel@@
+叆
+傧
+tra@@
+隳
+唆
+澧
+焦
+窭
+剜
+菇
+锇
+锑
+枳
+娈
+侗
+橙
+愊
+泯
+茼
+荙
+遭
+蟥
+捱
+挤
+蹊
+暹
+ity
+珠
+驼
+嗷
+贪
+扔
+硒
+沣
+鸡
+蕖
+仓
+looking
+冦
+邳
+赝
+嵛
+chi@@
+典
+桧
+about
+茏
+es@@
+缒
+蕻
+闽
+独
+骨
+尰
+鲼
+und@@
+详
+姩
+脉
+腥
+拯
+婕
+杜
+俑
+经
+they
+距
+僧
+辅
+骂
+睛
+夤
+door
+捊
+靴
+庅
+劓
+舣
+饿
+幻
+ry
+悱
+畎
+先
+畠
+ye@@
+those
+菲
+榔
+邑
+city
+泖
+夕
+better
+吏
+酡
+蕾
+犯
+磲
+锝
+鹌
+鹫
+you@@
+耐
+梯
+鸬
+榛
+笛
+睡
+歪
+遍
+飒
+笃
+蓝
+于
+欸
+为
+琎
+宁
+ki@@
+tle
+直
+漕
+years
+烙
+捏
+馅
+re
+诺
+tur@@
+deci@@
+励
+佘
+bl@@
+of
+噩
+阪
+梿
+孃
+杻
+覃
+瞆
+瑷
+观
+循
+泪
+six@@
+集
+cust@@
+瘠
+尽
+鱀
+虎
+来
+other
+衮
+芷
+卓
+擒
+锶
+玻
+lu@@
+缝
+瀬
+缯
+嫖
+偭
+运
+厣
+顺
+璝
+锬
+悼
+off
+苑
+恤
+囡
+锻
+舁
+粼
+丑
+字
+酽
+鲜
+豹
+厝
+琤
+谞
+簖
+癣
+蚌
+退
+戡
+云
+颞
+cur@@
+腻
+呗
+沼
+骤
+崭
+扯
+怍
+app@@
+存
+害
+色
+踣
+淘
+嗦
+獠
+怗
+砸
+镜
+毹
+table
+@
+鲀
+寤
+桥
+嘞
+髁
+哙
+织
+嗒
+麂
+let
+偁
+苎
+燧
+rou@@
+哃
+鄂
+翌
+酚
+咄
+嗥
+蒸
+虚
+囗
+条
+鹕
+挖
+per@@
+矿
+珦
+囨
+冻
+佰
+洲
+漓
+菹
+镱
+尝
+low
+苜
+剞
+could
+埒
+铼
+结
+侵
+星
+then
+埠
+峤
+晒
+睦
+搓
+時
+铫
+偃
+might
+氦
+呻
+淦
+ge
+缧
+脑
+膴
+英
+pla@@
+案
+生
+顔
+记
+艰
+tic@@
+砖
+药
+宪
+涩
+鞠
+潆
+郭
+宵
+砀
+蟛
+岀
+赃
+蛤
+粢
+驶
+膂
+垤
+nee@@
+逐
+芳
+螂
+超
+change
+j
+锔
+箓
+笤
+弱
+畸
+分
+攸
+蛸
+烷
+脒
+昨
+咴
+羚
+芍
+卲
+噔
+飖
+蜍
+some@@
+崦
+嬛
+仍
+柳
+鹗
+増
+乙
+嫂
+骚
+巾
+憬
+刭
+顸
+磙
+伃
+孜
+碌
+堐
+祗
+鳉
+汊
+ab@@
+袄
+诚
+剡
+锷
+锕
+益
+箕
+瘾
+restaur@@
+鹣
+瘩
+矸
+his
+灸
+勖
+括
+呢
+漯
+罟
+嚓
+焗
+tru@@
+赛
+伶
+垢
+擘
+镠
+惰
+贬
+惙
+捺
+sto@@
+犁
+were
+艉
+佞
+配
+耙
+猢
+遗
+豊
+急
+蓟
+泓
+屑
+焊
+筲
+缈
+柔
+蜀
+珺
+怕
+淡
+.
+靛
+载
+堎
+潥
+懒
+邸
+惇
+粥
+斫
+漼
+re's
+簇
+修
+梏
+tr@@
+肟
+葙
+鲸
+呐
+游
+廖
+收
+鳅
+覆
+琳
+熵
+翱
+民
+飨
+螳
+媳
+辖
+旯
+榴
+瑰
+嗯
+渤
+ic
+熔
+暝
+柘
+鬟
+斋
+夸
+晢
+氰
+炫
+dn't
+押
+撺
+烁
+偶
+貅
+冀
+胎
+糊
+润
+咭
+各
+摐
+錞
+垌
+插
+舯
+嶙
+筋
+住
+酆
+镰
+工
+嗲
+铱
+暖
+off@@
+茔
+髂
+挺
+瞑
+伤
+忌
+僬
+萜
+普
+that@@
+爵
+稍
+瓠
+崛
+um
+磐
+投
+畅
+缕
+楯
+悒
+判
+筢
+馇
+芰
+遒
+黩
+迪
+蝾
+腈
+才
+峙
+悟
+禩
+惋
+five
+局
+砘
+ver
+恋
+斓
+蹀
+犹
+绶
+ep
+喛
+亩
+嵘
+汆
+狸
+迸
+盉
+line
+妣
+躐
+琢
+赣
+le
+疬
+穿
+焓
+跳
+匪
+feat@@
+鹦
+璠
+蔸
+百
+愁
+芋
+吙
+谚
+熳
+哩
+ad
+轧
+祇
+泬
+瘳
+列
+簃
+谦
+lea@@
+垴
+啤
+滇
+饮
+瞰
+獭
+倏
+肿
+宜
+涮
+礓
+数
+屌
+濒
+蟜
+上
+詝
+褐
+眭
+菟
+瘅
+蝙
+眜
+羡
+硖
+背
+芣
+歃
+红
+血
+混
+橞
+樽
+纶
+圭
+stay
+冗
+竑
+玥
+它
+怎
+薜
+殆
+看
+剖
+棠
+瘙
+be@@
+裤
+篙
+带
+助
+骀
+锍
+廓
+库
+绅
+劂
+蝻
+锘
+濩
+等
+魑
+酿
+阆
+弑
+悖
+姹
+傀
+which
+股
+湲
+贝
+相
+茑
+荔
+钲
+致
+沦
+另
+桑
+佳
+班
+昼
+埂
+剕
+箐
+岵
+涨
+柩
+两
+勰
+氮
+稽
+匝
+ting
+瘀
+缔
+鹉
+雕
+蛮
+镞
+魈
+謇
+me@@
+set
+豇
+h
+秫
+荬
+赢
+沩
+向
+石
+踖
+迥
+受
+抅
+镘
+嚭
+双
+锚
+ex@@
+辉
+谠
+綉
+ws
+吟
+湔
+ll@@
+市
+娥
+堢
+跽
+处
+厂
+朓
+笸
+悭
+孥
+嘀
+恻
+皱
+塄
+who@@
+妙
+栽
+瓷
+稔
+碧
+哽
+pol@@
+徨
+s
+邋
+息
+樨
+鼾
+拱
+谟
+墡
+匆
+speci@@
+ha@@
+庐
+鹿
+ak@@
+镣
+m
+唯
+州
+筒
+性
+髋
+哄
+镡
+贇
+and
+菩
+姻
+聪
+tion
+催
+醌
+唢
+雾
+堂
+芝
+擢
+匈
+悸
+钢
+gy
+徉
+渡
+誊
+ti@@
+军
+铸
+和
+沄
+兽
+短
+育
+韪
+邀
+谴
+耋
+浩
+public
+胝
+猴
+six
+黧
+being
+弊
+捂
+ical
+癖
+堆
+伧
+割
+引
+la@@
+鼙
+逻
+堪
+cap@@
+首
+谝
+瘦
+硐
+郑
+凑
+fu@@
+b@@
+睨
+骠
+佼
+篦
+贵
+铌
+act
+呑
+竖
+莅
+陕
+飙
+黏
+镆
+屦
+仨
+敫
+马
+埜
+垇
+阇
+耒
+笩
+蓂
+妃
+讨
+see@@
+吊
+踊
+弥
+梅
+坑
+渫
+au@@
+赡
+坛
+苹
+don@@
+察
+俚
+fir@@
+濑
+块
+浏
+唵
+婴
+恙
+隈
+麽
+ca@@
+莼
+御
+鞋
+禧
+衰
+潩
+茬
+ven
+瞌
+鸥
+凭
+咯
+饺
+耄
+uni@@
+瑟
+煟
+舟
+尅
+帚
+鼋
+嫔
+婄
+赟
+棼
+沪
+秃
+粧
+roo@@
+蚳
+亏
+备
+压
+悍
+嚅
+fin@@
+交
+癔
+跞
+莪
+闻
+罾
+炁
+钻
+鹆
+友
+傣
+糁
+service
+邾
+窝
+扉
+鲡
+盛
+玖
+卫
+ne@@
+眛
+鸟
+党
+杼
+碓
+篚
+裆
+钟
+大
+介
+揩
+砻
+卅
+弟
+颛
+嫪
+代
+会
+悻
+者
+箍
+番
+疯
+sp@@
+聃
+瀚
+挥
+拷
+氡
+璎
+肪
+tan
+ul@@
+蠃
+龊
+家
+挽
+must
+蹭
+樟
+令
+饶
+buy
+忖
+娠
+手
+house
+贷
+嘚
+莺
+茛
+宥
+刼
+傥
+rest
+厄
+洙
+牝
+林
+铆
+觳
+嬢
+芥
+充
+絮
+陲
+zer@@
+藤
+air@@
+贤
+趾
+孝
+袖
+氇
+章
+柷
+恓
+麹
+胧
+came
+休
+舲
+籼
+惕
+泼
+馨
+町
+wer
+鹧
+咚
+崔
+嵂
+蝈
+ils
+湛
+报
+蜷
+削
+根
+恣
+呙
+郫
+把
+窕
+癌
+xt
+榘
+傫
+用
+撕
+蚩
+菰
+崖
+酐
+襄
+cu@@
+呤
+帯
+炊
+揲
+啸
+鲶
+ve@@
+艟
+嗜
+ili@@
+凹
+球
+鬓
+荼
+给
+亘
+醯
+燔
+鳕
+侄
+簪
+洳
+嘛
+稠
+'t
+氘
+暍
+指
+泆
+总
+offer
+糅
+膀
+毋
+蹲
+煅
+桔
+什
+浜
+八
+涧
+跬
+祷
+湫
+脸
+描
+pr@@
+狱
+赉
+砼
+袜
+馑
+枪
+碁
+娜
+辇
+颐
+捩
+峥
+聆
+善
+询
+疝
+坜
+碚
+缥
+鼯
+钮
+腘
+溥
+脔
+殁
+肮
+泮
+捐
+槛
+吆
+河
+溆
+度
+殒
+骕
+饹
+cour@@
+臜
+空
+2@@
+允
+枉
+蒯
+or@@
+趔
+菠
+栌
+view
+possi@@
+仗
+鳄
+摊
+寝
+躯
+湝
+后
+删
+揪
+徐
+寒
+坨
+挣
+举
+氅
+臻
+饔
+丁
+彦
+泉
+曷
+衅
+ei@@
+呸
+狍
+甓
+禽
+薄
+shipping
+ght
+慵
+幽
+匣
+炬
+访
+four
+秒
+樗
+陶
+脓
+枰
+磷
+哕
+君
+脲
+灯
+回
+索
+璺
+邡
+愍
+薏
+辄
+珽
+離
+鬃
+赊
+枲
+邰
+佷
+颏
+寺
+抵
+鲅
+刎
+給
+戴
+溲
+袢
+less
+逯
+封
+food
+匮
+关
+尘
+皤
+鳍
+搏
+幌
+连
+杠
+蹋
+彷
+虺
+怀
+雏
+钿
+澄
+扶
+错
+樋
+黉
+胄
+薇
+阿
+墁
+觜
+利
+汵
+羟
+杀
+淋
+蓁
+腧
+哞
+was
+钊
+胸
+艽
+撞
+虑
+煋
+飗
+throu@@
+门
+玦
+琪
+煸
+酵
+婌
+serv@@
+硿
+净
+夐
+撅
+頠
+want
+犄
+厉
+p@@
+en
+斗
+层
+玫
+life
+潘
+骐
+臃
+谮
+殃
+厌
+摄
+磾
+v
+dress
+络
+囱
+眯
+忽
+壸
+咐
+搁
+肤
+魇
+芬
+窡
+拉
+纮
+楣
+蔬
+寄
+妯
+教
+俪
+颡
+碛
+互
+奈
+憎
+炉
+蹰
+聂
+员
+呶
+瞬
+il@@
+恿
+阙
+卡
+mi@@
+禾
+椴
+yo@@
+帀
+醵
+帝
+隔
+忒
+哑
+効
+楗
+鼱
+塽
+苴
+蜞
+健
+醅
+ju@@
+新
+程
+茗
+琰
+几
+揍
+匍
+砣
+禳
+罗
+勿
+擗
+畛
+框
+泒
+析
+沢
+偷
+繁
+嗣
+呵
+念
+so@@
+溷
+曩
+spon@@
+狼
+倔
+威
+潭
+踯
+晁
+吩
+袅
+喀
+洌
+炯
+纸
+抽
+簧
+c@@
+买
+吖
+俬
+梓
+叡
+祼
+烃
+荃
+眀
+
diff --git a/funasr/runtime/onnxruntime/readme.md b/funasr/runtime/onnxruntime/readme.md
new file mode 100644
index 000000000..fa2f276cb
--- /dev/null
+++ b/funasr/runtime/onnxruntime/readme.md
@@ -0,0 +1,114 @@
+
+
+
+## 快速使用
+
+### Windows
+
+ 安装 Visual Studio 2022,打开 cpp_onnx 目录下的 CMake 工程,直接 build 即可。本仓库已经准备好所有相关依赖库。
+
+ Windows下已经预置fftw3及onnxruntime库
+
+
+### Linux
+See the bottom of this page: Building Guidance
+
+
+### 运行程序
+
+tester /path/to/models/dir /path/to/wave/file
+
+ 例如: tester /data/models /data/test.wav
+
+/data/models 需要包括如下两个文件: model.onnx 和vocab.txt
+
+
+## 支持平台
+- Windows
+- Linux/Unix
+
+## 依赖
+- fftw3
+- openblas
+- onnxruntime
+
+## 导出onnx格式模型文件
+安装 modelscope与FunASR,依赖:torch,torchaudio,安装过程[详细参考文档](https://github.com/alibaba-damo-academy/FunASR/wiki)
+```shell
+pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+pip install --editable ./
+```
+导出onnx模型,[详见](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export),参考示例,从modelscope中模型导出:
+
+```
+python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+```
+
+## Building Guidance for Linux/Unix
+
+```
+git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR/funasr/runtime/onnxruntime
+mkdir build
+cd build
+# download an appropriate onnxruntime from https://github.com/microsoft/onnxruntime/releases/tag/v1.14.0
+# here we get a copy of onnxruntime for linux 64
+wget https://github.com/microsoft/onnxruntime/releases/download/v1.14.0/onnxruntime-linux-x64-1.14.0.tgz
+tar -zxvf onnxruntime-linux-x64-1.14.0.tgz
+# ls
+# onnxruntime-linux-x64-1.14.0 onnxruntime-linux-x64-1.14.0.tgz
+
+#install fftw3-dev
+ubuntu: apt install libfftw3-dev
+centos: yum install fftw fftw-devel
+
+#install openblas
+bash ./third_party/install_openblas.sh
+
+# build
+ cmake -DCMAKE_BUILD_TYPE=release .. -DONNXRUNTIME_DIR=$(pwd)/onnxruntime-linux-x64-1.14.0
+ make
+
+ # then in the subfolder tester of the current directory, you will see a program, tester
+
+```
+
+### The structure of a qualified onnxruntime package.
+```
+onnxruntime_xxx
+├───include
+└───lib
+```
+
+## 线程数与性能关系
+
+测试环境Rocky Linux 8,仅测试cpp版本结果(未测python版本),@acely
+
+简述:
+在3台配置不同的机器上分别编译并测试,在fftw和onnxruntime版本都相同的前提下,识别同一个30分钟的音频文件,分别测试不同onnx线程数量的表现。
+
+
+
+目前可以总结出大致规律:
+
+- 并非onnx线程数越多越好
+- 2线程比1线程提升显著,线程再多则提升较小
+- 线程数等于CPU物理核心数时效率最好
+实操建议:
+
+- 大部分场景用3-4线程性价比最高
+- 低配机器用2线程合适
+
+
+
+## 演示
+
+
+
+## 注意
+本程序只支持 采样率16000hz, 位深16bit的 **单声道** 音频。
+
+
+## Acknowledge
+1. We acknowledge [mayong](https://github.com/RapidAI/RapidASR/tree/main/cpp_onnx) for contributing the onnxruntime(cpp api).
+2. We borrowed a lot of code from [FastASR](https://github.com/chenkui164/FastASR) for audio frontend and text-postprocess.
diff --git a/funasr/runtime/onnxruntime/src/Audio.cpp b/funasr/runtime/onnxruntime/src/Audio.cpp
new file mode 100644
index 000000000..53bf9d02a
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/Audio.cpp
@@ -0,0 +1,474 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "Audio.h"
+
+using namespace std;
+
+class AudioWindow {
+ private:
+ int *window;
+ int in_idx;
+ int out_idx;
+ int sum;
+ int window_size = 0;
+
+ public:
+ AudioWindow(int window_size) : window_size(window_size)
+ {
+ window = (int *)calloc(sizeof(int), window_size + 1);
+ in_idx = 0;
+ out_idx = 1;
+ sum = 0;
+ };
+ ~AudioWindow(){
+ free(window);
+ };
+ int put(int val)
+ {
+ sum = sum + val - window[out_idx];
+ window[in_idx] = val;
+ in_idx = in_idx == window_size ? 0 : in_idx + 1;
+ out_idx = out_idx == window_size ? 0 : out_idx + 1;
+ return sum;
+ };
+};
+
+AudioFrame::AudioFrame(){};
+AudioFrame::AudioFrame(int len) : len(len)
+{
+ start = 0;
+};
+AudioFrame::~AudioFrame(){};
+int AudioFrame::set_start(int val)
+{
+ start = val < 0 ? 0 : val;
+ return start;
+};
+
+int AudioFrame::set_end(int val, int max_len)
+{
+
+ float num_samples = val - start;
+ float frame_length = 400;
+ float frame_shift = 160;
+ float num_new_samples =
+ ceil((num_samples - 400) / frame_shift) * frame_shift + frame_length;
+
+ end = start + num_new_samples;
+ len = (int)num_new_samples;
+ if (end > max_len)
+ printf("frame end > max_len!!!!!!!\n");
+ return end;
+};
+
+int AudioFrame::get_start()
+{
+ return start;
+};
+
+int AudioFrame::get_len()
+{
+ return len;
+};
+
+int AudioFrame::disp()
+{
+ printf("not imp!!!!\n");
+
+ return 0;
+};
+
+Audio::Audio(int data_type) : data_type(data_type)
+{
+ speech_buff = NULL;
+ speech_data = NULL;
+ align_size = 1360;
+}
+
+Audio::Audio(int data_type, int size) : data_type(data_type)
+{
+ speech_buff = NULL;
+ speech_data = NULL;
+ align_size = (float)size;
+}
+
+Audio::~Audio()
+{
+ if (speech_buff != NULL) {
+ free(speech_buff);
+
+ }
+
+ if (speech_data != NULL) {
+
+ free(speech_data);
+ }
+}
+
+void Audio::disp()
+{
+ printf("Audio time is %f s. len is %d\n", (float)speech_len / 16000,
+ speech_len);
+}
+
+float Audio::get_time_len()
+{
+ return (float)speech_len / 16000;
+ //speech_len);
+}
+
+bool Audio::loadwav(const char *filename)
+{
+
+ if (speech_data != NULL) {
+ free(speech_data);
+ }
+ if (speech_buff != NULL) {
+ free(speech_buff);
+ }
+
+ offset = 0;
+
+ FILE *fp;
+ fp = fopen(filename, "rb");
+ if (fp == nullptr)
+ return false;
+ fseek(fp, 0, SEEK_END);
+ uint32_t nFileLen = ftell(fp);
+ fseek(fp, 44, SEEK_SET);
+
+ speech_len = (nFileLen - 44) / 2;
+ speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
+ speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_align_len);
+
+ if (speech_buff)
+ {
+ memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);
+ int ret = fread(speech_buff, sizeof(int16_t), speech_len, fp);
+ fclose(fp);
+
+ speech_data = (float*)malloc(sizeof(float) * speech_align_len);
+ memset(speech_data, 0, sizeof(float) * speech_align_len);
+ int i;
+ float scale = 1;
+
+ if (data_type == 1) {
+ scale = 32768;
+ }
+
+ for (i = 0; i < speech_len; i++) {
+ speech_data[i] = (float)speech_buff[i] / scale;
+ }
+
+ AudioFrame* frame = new AudioFrame(speech_len);
+ frame_queue.push(frame);
+
+
+ return true;
+ }
+ else
+ return false;
+}
+
+
+bool Audio::loadwav(const char* buf, int nFileLen)
+{
+
+
+
+ if (speech_data != NULL) {
+ free(speech_data);
+ }
+ if (speech_buff != NULL) {
+ free(speech_buff);
+ }
+
+ offset = 0;
+
+ size_t nOffset = 0;
+
+#define WAV_HEADER_SIZE 44
+
+ speech_len = (nFileLen - WAV_HEADER_SIZE) / 2;
+ speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
+ speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_align_len);
+ if (speech_buff)
+ {
+ memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);
+ memcpy((void*)speech_buff, (const void*)(buf + WAV_HEADER_SIZE), speech_len * sizeof(int16_t));
+
+
+ speech_data = (float*)malloc(sizeof(float) * speech_align_len);
+ memset(speech_data, 0, sizeof(float) * speech_align_len);
+ int i;
+ float scale = 1;
+
+ if (data_type == 1) {
+ scale = 32768;
+ }
+
+ for (i = 0; i < speech_len; i++) {
+ speech_data[i] = (float)speech_buff[i] / scale;
+ }
+
+
+ return true;
+ }
+ else
+ return false;
+
+}
+
+
+bool Audio::loadpcmwav(const char* buf, int nBufLen)
+{
+ if (speech_data != NULL) {
+ free(speech_data);
+ }
+ if (speech_buff != NULL) {
+ free(speech_buff);
+ }
+ offset = 0;
+
+ size_t nOffset = 0;
+
+
+
+ speech_len = nBufLen / 2;
+ speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
+ speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_align_len);
+ if (speech_buff)
+ {
+ memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);
+ memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t));
+
+
+ speech_data = (float*)malloc(sizeof(float) * speech_align_len);
+ memset(speech_data, 0, sizeof(float) * speech_align_len);
+
+
+ int i;
+ float scale = 1;
+
+ if (data_type == 1) {
+ scale = 32768;
+ }
+
+ for (i = 0; i < speech_len; i++) {
+ speech_data[i] = (float)speech_buff[i] / scale;
+ }
+
+ AudioFrame* frame = new AudioFrame(speech_len);
+ frame_queue.push(frame);
+ return true;
+
+ }
+ else
+ return false;
+
+
+}
+
+bool Audio::loadpcmwav(const char* filename)
+{
+
+ if (speech_data != NULL) {
+ free(speech_data);
+ }
+ if (speech_buff != NULL) {
+ free(speech_buff);
+ }
+ offset = 0;
+
+ FILE* fp;
+ fp = fopen(filename, "rb");
+ if (fp == nullptr)
+ return false;
+ fseek(fp, 0, SEEK_END);
+ uint32_t nFileLen = ftell(fp);
+ fseek(fp, 0, SEEK_SET);
+
+ speech_len = (nFileLen) / 2;
+ speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
+ speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_align_len);
+ if (speech_buff)
+ {
+ memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);
+ int ret = fread(speech_buff, sizeof(int16_t), speech_len, fp);
+ fclose(fp);
+
+ speech_data = (float*)malloc(sizeof(float) * speech_align_len);
+ memset(speech_data, 0, sizeof(float) * speech_align_len);
+
+
+
+ int i;
+ float scale = 1;
+
+ if (data_type == 1) {
+ scale = 32768;
+ }
+
+ for (i = 0; i < speech_len; i++) {
+ speech_data[i] = (float)speech_buff[i] / scale;
+ }
+
+
+ AudioFrame* frame = new AudioFrame(speech_len);
+ frame_queue.push(frame);
+
+
+ return true;
+ }
+ else
+ return false;
+
+}
+
+
+int Audio::fetch_chunck(float *&dout, int len)
+{
+ if (offset >= speech_align_len) {
+ dout = NULL;
+ return S_ERR;
+ } else if (offset == speech_align_len - len) {
+ dout = speech_data + offset;
+ offset = speech_align_len;
+ // 临时解决
+ AudioFrame *frame = frame_queue.front();
+ frame_queue.pop();
+ delete frame;
+
+ return S_END;
+ } else {
+ dout = speech_data + offset;
+ offset += len;
+ return S_MIDDLE;
+ }
+}
+
+int Audio::fetch(float *&dout, int &len, int &flag)
+{
+ if (frame_queue.size() > 0) {
+ AudioFrame *frame = frame_queue.front();
+ frame_queue.pop();
+
+ dout = speech_data + frame->get_start();
+ len = frame->get_len();
+ delete frame;
+ flag = S_END;
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+void Audio::padding()
+{
+
+ float num_samples = speech_len;
+ float frame_length = 400;
+ float frame_shift = 160;
+ float num_frames = floor((num_samples + (frame_shift / 2)) / frame_shift);
+ float num_new_samples = (num_frames - 1) * frame_shift + frame_length;
+ float num_padding = num_new_samples - num_samples;
+ float num_left_padding = (frame_length - frame_shift) / 2;
+ float num_right_padding = num_padding - num_left_padding;
+
+ float *new_data = (float *)malloc(num_new_samples * sizeof(float));
+ int i;
+ int tmp_off = 0;
+ for (i = 0; i < num_left_padding; i++) {
+ int ii = num_left_padding - i - 1;
+ new_data[i] = speech_data[ii];
+ }
+ tmp_off = num_left_padding;
+ memcpy(new_data + tmp_off, speech_data, speech_len * sizeof(float));
+ tmp_off += speech_len;
+
+ for (i = 0; i < num_right_padding; i++) {
+ int ii = speech_len - i - 1;
+ new_data[tmp_off + i] = speech_data[ii];
+ }
+ free(speech_data);
+ speech_data = new_data;
+ speech_len = num_new_samples;
+
+ AudioFrame *frame = new AudioFrame(num_new_samples);
+ frame_queue.push(frame);
+ frame = frame_queue.front();
+ frame_queue.pop();
+ delete frame;
+}
+
+#define UNTRIGGERED 0
+#define TRIGGERED 1
+
+#define SPEECH_LEN_5S (16000 * 5)
+#define SPEECH_LEN_10S (16000 * 10)
+#define SPEECH_LEN_20S (16000 * 20)
+#define SPEECH_LEN_30S (16000 * 30)
+
+void Audio::split()
+{
+ VadInst *handle = WebRtcVad_Create();
+ WebRtcVad_Init(handle);
+ WebRtcVad_set_mode(handle, 2);
+ int window_size = 10;
+ AudioWindow audiowindow(window_size);
+ int status = UNTRIGGERED;
+ int offset = 0;
+ int fs = 16000;
+ int step = 480;
+
+ AudioFrame *frame;
+
+ frame = frame_queue.front();
+ frame_queue.pop();
+ delete frame;
+ frame = NULL;
+
+ while (offset < speech_len - step) {
+ int n = WebRtcVad_Process(handle, fs, speech_buff + offset, step);
+ if (status == UNTRIGGERED && audiowindow.put(n) >= window_size - 1) {
+ frame = new AudioFrame();
+ int start = offset - step * (window_size - 1);
+ frame->set_start(start);
+ status = TRIGGERED;
+ } else if (status == TRIGGERED) {
+ int win_weight = audiowindow.put(n);
+ int voice_len = (offset - frame->get_start());
+ int gap = 0;
+ if (voice_len < SPEECH_LEN_5S) {
+ offset += step;
+ continue;
+ } else if (voice_len < SPEECH_LEN_10S) {
+ gap = 1;
+ } else if (voice_len < SPEECH_LEN_20S) {
+ gap = window_size / 5;
+ } else {
+ gap = window_size / 2;
+ }
+
+ if (win_weight < gap) {
+ status = UNTRIGGERED;
+ offset = frame->set_end(offset, speech_align_len);
+ frame_queue.push(frame);
+ frame = NULL;
+ }
+ }
+ offset += step;
+ }
+
+ if (frame != NULL) {
+ frame->set_end(speech_len, speech_align_len);
+ frame_queue.push(frame);
+ frame = NULL;
+ }
+ WebRtcVad_Free(handle);
+}
diff --git a/funasr/runtime/onnxruntime/src/CMakeLists.txt b/funasr/runtime/onnxruntime/src/CMakeLists.txt
new file mode 100644
index 000000000..aea222b92
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/CMakeLists.txt
@@ -0,0 +1,43 @@
+
+file(GLOB files1 "*.cpp")
+file(GLOB files4 "paraformer/*.cpp")
+
+set(files ${files1} ${files2} ${files3} ${files4})
+
+# message("${files}")
+
+add_library(rapidasr ${files})
+
+if(WIN32)
+
+ set(EXTRA_LIBS libfftw3f-3 webrtcvad)
+ if(CMAKE_CL_64)
+ target_link_directories(rapidasr PUBLIC ${CMAKE_SOURCE_DIR}/win/lib/x64)
+ else()
+ target_link_directories(rapidasr PUBLIC ${CMAKE_SOURCE_DIR}/win/lib/x86)
+ endif()
+ target_include_directories(rapidasr PUBLIC ${CMAKE_SOURCE_DIR}/win/include )
+
+ target_compile_definitions(rapidasr PUBLIC -D_RPASR_API_EXPORT)
+else()
+
+ set(EXTRA_LIBS fftw3f webrtcvad pthread)
+ target_include_directories(rapidasr PUBLIC "/usr/local/opt/fftw/include")
+ target_link_directories(rapidasr PUBLIC "/usr/local/opt/fftw/lib")
+
+ target_include_directories(rapidasr PUBLIC "/usr/local/opt/openblas/include")
+ target_link_directories(rapidasr PUBLIC "/usr/local/opt/openblas/lib")
+
+ target_include_directories(rapidasr PUBLIC "/usr/include")
+ target_link_directories(rapidasr PUBLIC "/usr/lib64")
+
+ target_include_directories(rapidasr PUBLIC ${FFTW3F_INCLUDE_DIR})
+ target_link_directories(rapidasr PUBLIC ${FFTW3F_LIBRARY_DIR})
+ include_directories(${ONNXRUNTIME_DIR}/include)
+endif()
+
+include_directories(${CMAKE_SOURCE_DIR}/include)
+target_link_libraries(rapidasr PUBLIC onnxruntime ${EXTRA_LIBS})
+
+
+
diff --git a/funasr/runtime/onnxruntime/src/CommonStruct.h b/funasr/runtime/onnxruntime/src/CommonStruct.h
new file mode 100644
index 000000000..538d38b66
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/CommonStruct.h
@@ -0,0 +1,6 @@
+
+#ifndef COMMONSTRUCT_H
+#define COMMONSTRUCT_H
+
+
+#endif
diff --git a/funasr/runtime/onnxruntime/src/FeatureExtract.cpp b/funasr/runtime/onnxruntime/src/FeatureExtract.cpp
new file mode 100644
index 000000000..1b0c3c4a8
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/FeatureExtract.cpp
@@ -0,0 +1,408 @@
+
+#include "precomp.h"
+
+using namespace std;
+
+FeatureExtract::FeatureExtract(int mode) : mode(mode)
+{
+ fftw_init();
+}
+
+FeatureExtract::~FeatureExtract()
+{
+ fftwf_free(fft_input);
+ fftwf_free(fft_out);
+ fftwf_destroy_plan(p);
+}
+
+void FeatureExtract::reset()
+{
+ speech.reset();
+ fqueue.reset();
+}
+
+int FeatureExtract::size()
+{
+ return fqueue.size();
+}
+
+void FeatureExtract::fftw_init()
+{
+ int fft_size = 512;
+ fft_input = (float *)fftwf_malloc(sizeof(float) * fft_size);
+ fft_out = (fftwf_complex *)fftwf_malloc(sizeof(fftwf_complex) * fft_size);
+ memset(fft_input, 0, sizeof(float) * fft_size);
+ p = fftwf_plan_dft_r2c_1d(fft_size, fft_input, fft_out, FFTW_ESTIMATE);
+}
+
+void FeatureExtract::insert(float *din, int len, int flag)
+{
+ const float *window = (const float *)&window_hex;
+ if (mode == 3)
+ window = (const float *)&window_hamm_hex;
+
+ int window_size = 400;
+ int fft_size = 512;
+ int window_shift = 160;
+
+ speech.load(din, len);
+ int i, j;
+ float tmp_feature[80];
+ if (mode == 0 || mode == 2 || mode == 3) {
+ int ll = (speech.size() - 400) / 160 + 1;
+ fqueue.reinit(ll);
+ }
+
+ for (i = 0; i <= speech.size() - 400; i = i + window_shift) {
+ float tmp_mean = 0;
+ for (j = 0; j < window_size; j++) {
+ tmp_mean += speech[i + j];
+ }
+
+ tmp_mean = tmp_mean / window_size;
+
+ float pre_val = (float)speech[i] - tmp_mean;
+
+ for (j = 0; j < window_size; j++) {
+ float win = window[j];
+ float cur_val = (float)speech[i + j] - tmp_mean;
+ fft_input[j] = win * (cur_val - 0.97 * pre_val);
+ pre_val = cur_val;
+ }
+
+ fftwf_execute(p);
+
+ melspect((float *)fft_out, tmp_feature);
+ int tmp_flag = S_MIDDLE;
+ if (flag == S_END && i > speech.size() - 560)
+ tmp_flag = S_END;
+
+ fqueue.push(tmp_feature, tmp_flag);
+ }
+ speech.update(i);
+}
+
+bool FeatureExtract::fetch(Tensor<float> *&dout)
+{
+ if (fqueue.size() < 1) {
+ return false;
+ } else {
+ dout = fqueue.pop();
+ return true;
+ }
+}
+
+void FeatureExtract::global_cmvn(float *din)
+{
+ const float *std;
+ const float *mean;
+
+ if (mode < 2) {
+ if (mode == 0) {
+ std = (const float *)global_cmvn_std_hex;
+ mean = (const float *)global_cmvn_mean_hex;
+ } else {
+ std = (const float *)global_cmvn_std_online_hex;
+ mean = (const float *)global_cmvn_mean_online_hex;
+ }
+
+ int i;
+ for (i = 0; i < 80; i++) {
+ float tmp = din[i] < 1e-7 ? 1e-7 : din[i];
+ tmp = log(tmp);
+ din[i] = (tmp - mean[i]) / std[i];
+ }
+ } else {
+ int i;
+
+ int val = 0x34000000;
+ float min_resol = *((float *)&val);
+
+ for (i = 0; i < 80; i++) {
+ float tmp = din[i] < min_resol ? min_resol : din[i];
+ din[i] = log(tmp);
+ }
+ }
+}
+
+void FeatureExtract::melspect(float *din, float *dout)
+{
+ float fftmag[256];
+// float tmp;
+ const float *melcoe = (const float *)melcoe_hex;
+ int i;
+ for (i = 0; i < 256; i++) {
+ float real = din[2 * i];
+ float imag = din[2 * i + 1];
+ fftmag[i] = real * real + imag * imag;
+ }
+ dout[0] = melcoe[0] * fftmag[1] + melcoe[1] * fftmag[2];
+ dout[1] = melcoe[2] * fftmag[2];
+ dout[2] = melcoe[3] * fftmag[3];
+ dout[3] = melcoe[4] * fftmag[3] + melcoe[5] * fftmag[4];
+ dout[4] = melcoe[6] * fftmag[4] + melcoe[7] * fftmag[5];
+ dout[5] = melcoe[8] * fftmag[5] + melcoe[9] * fftmag[6];
+ dout[6] = melcoe[10] * fftmag[6] + melcoe[11] * fftmag[7];
+ dout[7] = melcoe[12] * fftmag[7];
+ dout[8] = melcoe[13] * fftmag[8];
+ dout[9] = melcoe[14] * fftmag[8] + melcoe[15] * fftmag[9];
+ dout[10] = melcoe[16] * fftmag[9] + melcoe[17] * fftmag[10];
+ dout[11] = melcoe[18] * fftmag[10] + melcoe[19] * fftmag[11];
+ dout[12] = melcoe[20] * fftmag[11] + melcoe[21] * fftmag[12] +
+ melcoe[22] * fftmag[13];
+ dout[13] = melcoe[23] * fftmag[12] + melcoe[24] * fftmag[13] +
+ melcoe[25] * fftmag[14];
+ dout[14] = melcoe[26] * fftmag[14] + melcoe[27] * fftmag[15];
+ dout[15] = melcoe[28] * fftmag[15] + melcoe[29] * fftmag[16];
+ dout[16] = melcoe[30] * fftmag[16] + melcoe[31] * fftmag[17];
+ dout[17] = melcoe[32] * fftmag[17] + melcoe[33] * fftmag[18];
+ dout[18] = melcoe[34] * fftmag[18] + melcoe[35] * fftmag[19] +
+ melcoe[36] * fftmag[20];
+ dout[19] = melcoe[37] * fftmag[19] + melcoe[38] * fftmag[20] +
+ melcoe[39] * fftmag[21];
+ dout[20] = melcoe[40] * fftmag[21] + melcoe[41] * fftmag[22];
+ dout[21] = melcoe[42] * fftmag[22] + melcoe[43] * fftmag[23] +
+ melcoe[44] * fftmag[24];
+ dout[22] = melcoe[45] * fftmag[23] + melcoe[46] * fftmag[24] +
+ melcoe[47] * fftmag[25];
+ dout[23] = melcoe[48] * fftmag[25] + melcoe[49] * fftmag[26] +
+ melcoe[50] * fftmag[27];
+ dout[24] = melcoe[51] * fftmag[26] + melcoe[52] * fftmag[27] +
+ melcoe[53] * fftmag[28];
+ dout[25] = melcoe[54] * fftmag[28] + melcoe[55] * fftmag[29] +
+ melcoe[56] * fftmag[30];
+ dout[26] = melcoe[57] * fftmag[29] + melcoe[58] * fftmag[30] +
+ melcoe[59] * fftmag[31] + melcoe[60] * fftmag[32];
+ dout[27] = melcoe[61] * fftmag[31] + melcoe[62] * fftmag[32] +
+ melcoe[63] * fftmag[33];
+ dout[28] = melcoe[64] * fftmag[33] + melcoe[65] * fftmag[34] +
+ melcoe[66] * fftmag[35];
+ dout[29] = melcoe[67] * fftmag[34] + melcoe[68] * fftmag[35] +
+ melcoe[69] * fftmag[36] + melcoe[70] * fftmag[37];
+ dout[30] = melcoe[71] * fftmag[36] + melcoe[72] * fftmag[37] +
+ melcoe[73] * fftmag[38] + melcoe[74] * fftmag[39];
+ dout[31] = melcoe[75] * fftmag[38] + melcoe[76] * fftmag[39] +
+ melcoe[77] * fftmag[40] + melcoe[78] * fftmag[41];
+ dout[32] = melcoe[79] * fftmag[40] + melcoe[80] * fftmag[41] +
+ melcoe[81] * fftmag[42] + melcoe[82] * fftmag[43];
+ dout[33] = melcoe[83] * fftmag[42] + melcoe[84] * fftmag[43] +
+ melcoe[85] * fftmag[44] + melcoe[86] * fftmag[45];
+ dout[34] = melcoe[87] * fftmag[44] + melcoe[88] * fftmag[45] +
+ melcoe[89] * fftmag[46] + melcoe[90] * fftmag[47];
+ dout[35] = melcoe[91] * fftmag[46] + melcoe[92] * fftmag[47] +
+ melcoe[93] * fftmag[48] + melcoe[94] * fftmag[49];
+ dout[36] = melcoe[95] * fftmag[48] + melcoe[96] * fftmag[49] +
+ melcoe[97] * fftmag[50] + melcoe[98] * fftmag[51];
+ dout[37] = melcoe[99] * fftmag[50] + melcoe[100] * fftmag[51] +
+ melcoe[101] * fftmag[52] + melcoe[102] * fftmag[53] +
+ melcoe[103] * fftmag[54];
+ dout[38] = melcoe[104] * fftmag[52] + melcoe[105] * fftmag[53] +
+ melcoe[106] * fftmag[54] + melcoe[107] * fftmag[55] +
+ melcoe[108] * fftmag[56];
+ dout[39] = melcoe[109] * fftmag[55] + melcoe[110] * fftmag[56] +
+ melcoe[111] * fftmag[57] + melcoe[112] * fftmag[58];
+ dout[40] = melcoe[113] * fftmag[57] + melcoe[114] * fftmag[58] +
+ melcoe[115] * fftmag[59] + melcoe[116] * fftmag[60] +
+ melcoe[117] * fftmag[61];
+ dout[41] = melcoe[118] * fftmag[59] + melcoe[119] * fftmag[60] +
+ melcoe[120] * fftmag[61] + melcoe[121] * fftmag[62] +
+ melcoe[122] * fftmag[63] + melcoe[123] * fftmag[64];
+ dout[42] = melcoe[124] * fftmag[62] + melcoe[125] * fftmag[63] +
+ melcoe[126] * fftmag[64] + melcoe[127] * fftmag[65] +
+ melcoe[128] * fftmag[66];
+ dout[43] = melcoe[129] * fftmag[65] + melcoe[130] * fftmag[66] +
+ melcoe[131] * fftmag[67] + melcoe[132] * fftmag[68] +
+ melcoe[133] * fftmag[69];
+ dout[44] = melcoe[134] * fftmag[67] + melcoe[135] * fftmag[68] +
+ melcoe[136] * fftmag[69] + melcoe[137] * fftmag[70] +
+ melcoe[138] * fftmag[71] + melcoe[139] * fftmag[72];
+ dout[45] = melcoe[140] * fftmag[70] + melcoe[141] * fftmag[71] +
+ melcoe[142] * fftmag[72] + melcoe[143] * fftmag[73] +
+ melcoe[144] * fftmag[74] + melcoe[145] * fftmag[75];
+ dout[46] = melcoe[146] * fftmag[73] + melcoe[147] * fftmag[74] +
+ melcoe[148] * fftmag[75] + melcoe[149] * fftmag[76] +
+ melcoe[150] * fftmag[77] + melcoe[151] * fftmag[78];
+ dout[47] = melcoe[152] * fftmag[76] + melcoe[153] * fftmag[77] +
+ melcoe[154] * fftmag[78] + melcoe[155] * fftmag[79] +
+ melcoe[156] * fftmag[80] + melcoe[157] * fftmag[81];
+ dout[48] = melcoe[158] * fftmag[79] + melcoe[159] * fftmag[80] +
+ melcoe[160] * fftmag[81] + melcoe[161] * fftmag[82] +
+ melcoe[162] * fftmag[83] + melcoe[163] * fftmag[84];
+ dout[49] = melcoe[164] * fftmag[82] + melcoe[165] * fftmag[83] +
+ melcoe[166] * fftmag[84] + melcoe[167] * fftmag[85] +
+ melcoe[168] * fftmag[86] + melcoe[169] * fftmag[87] +
+ melcoe[170] * fftmag[88];
+ dout[50] = melcoe[171] * fftmag[85] + melcoe[172] * fftmag[86] +
+ melcoe[173] * fftmag[87] + melcoe[174] * fftmag[88] +
+ melcoe[175] * fftmag[89] + melcoe[176] * fftmag[90] +
+ melcoe[177] * fftmag[91];
+ dout[51] = melcoe[178] * fftmag[89] + melcoe[179] * fftmag[90] +
+ melcoe[180] * fftmag[91] + melcoe[181] * fftmag[92] +
+ melcoe[182] * fftmag[93] + melcoe[183] * fftmag[94] +
+ melcoe[184] * fftmag[95];
+ dout[52] = melcoe[185] * fftmag[92] + melcoe[186] * fftmag[93] +
+ melcoe[187] * fftmag[94] + melcoe[188] * fftmag[95] +
+ melcoe[189] * fftmag[96] + melcoe[190] * fftmag[97] +
+ melcoe[191] * fftmag[98];
+ dout[53] = melcoe[192] * fftmag[96] + melcoe[193] * fftmag[97] +
+ melcoe[194] * fftmag[98] + melcoe[195] * fftmag[99] +
+ melcoe[196] * fftmag[100] + melcoe[197] * fftmag[101] +
+ melcoe[198] * fftmag[102];
+ dout[54] = melcoe[199] * fftmag[99] + melcoe[200] * fftmag[100] +
+ melcoe[201] * fftmag[101] + melcoe[202] * fftmag[102] +
+ melcoe[203] * fftmag[103] + melcoe[204] * fftmag[104] +
+ melcoe[205] * fftmag[105] + melcoe[206] * fftmag[106];
+ dout[55] = melcoe[207] * fftmag[103] + melcoe[208] * fftmag[104] +
+ melcoe[209] * fftmag[105] + melcoe[210] * fftmag[106] +
+ melcoe[211] * fftmag[107] + melcoe[212] * fftmag[108] +
+ melcoe[213] * fftmag[109] + melcoe[214] * fftmag[110];
+ dout[56] = melcoe[215] * fftmag[107] + melcoe[216] * fftmag[108] +
+ melcoe[217] * fftmag[109] + melcoe[218] * fftmag[110] +
+ melcoe[219] * fftmag[111] + melcoe[220] * fftmag[112] +
+ melcoe[221] * fftmag[113] + melcoe[222] * fftmag[114];
+ dout[57] = melcoe[223] * fftmag[111] + melcoe[224] * fftmag[112] +
+ melcoe[225] * fftmag[113] + melcoe[226] * fftmag[114] +
+ melcoe[227] * fftmag[115] + melcoe[228] * fftmag[116] +
+ melcoe[229] * fftmag[117] + melcoe[230] * fftmag[118] +
+ melcoe[231] * fftmag[119];
+ dout[58] = melcoe[232] * fftmag[115] + melcoe[233] * fftmag[116] +
+ melcoe[234] * fftmag[117] + melcoe[235] * fftmag[118] +
+ melcoe[236] * fftmag[119] + melcoe[237] * fftmag[120] +
+ melcoe[238] * fftmag[121] + melcoe[239] * fftmag[122] +
+ melcoe[240] * fftmag[123];
+ dout[59] = melcoe[241] * fftmag[120] + melcoe[242] * fftmag[121] +
+ melcoe[243] * fftmag[122] + melcoe[244] * fftmag[123] +
+ melcoe[245] * fftmag[124] + melcoe[246] * fftmag[125] +
+ melcoe[247] * fftmag[126] + melcoe[248] * fftmag[127] +
+ melcoe[249] * fftmag[128];
+ dout[60] = melcoe[250] * fftmag[124] + melcoe[251] * fftmag[125] +
+ melcoe[252] * fftmag[126] + melcoe[253] * fftmag[127] +
+ melcoe[254] * fftmag[128] + melcoe[255] * fftmag[129] +
+ melcoe[256] * fftmag[130] + melcoe[257] * fftmag[131] +
+ melcoe[258] * fftmag[132];
+ dout[61] = melcoe[259] * fftmag[129] + melcoe[260] * fftmag[130] +
+ melcoe[261] * fftmag[131] + melcoe[262] * fftmag[132] +
+ melcoe[263] * fftmag[133] + melcoe[264] * fftmag[134] +
+ melcoe[265] * fftmag[135] + melcoe[266] * fftmag[136] +
+ melcoe[267] * fftmag[137];
+ dout[62] = melcoe[268] * fftmag[133] + melcoe[269] * fftmag[134] +
+ melcoe[270] * fftmag[135] + melcoe[271] * fftmag[136] +
+ melcoe[272] * fftmag[137] + melcoe[273] * fftmag[138] +
+ melcoe[274] * fftmag[139] + melcoe[275] * fftmag[140] +
+ melcoe[276] * fftmag[141] + melcoe[277] * fftmag[142];
+ dout[63] = melcoe[278] * fftmag[138] + melcoe[279] * fftmag[139] +
+ melcoe[280] * fftmag[140] + melcoe[281] * fftmag[141] +
+ melcoe[282] * fftmag[142] + melcoe[283] * fftmag[143] +
+ melcoe[284] * fftmag[144] + melcoe[285] * fftmag[145] +
+ melcoe[286] * fftmag[146] + melcoe[287] * fftmag[147];
+ dout[64] = melcoe[288] * fftmag[143] + melcoe[289] * fftmag[144] +
+ melcoe[290] * fftmag[145] + melcoe[291] * fftmag[146] +
+ melcoe[292] * fftmag[147] + melcoe[293] * fftmag[148] +
+ melcoe[294] * fftmag[149] + melcoe[295] * fftmag[150] +
+ melcoe[296] * fftmag[151] + melcoe[297] * fftmag[152] +
+ melcoe[298] * fftmag[153];
+ dout[65] = melcoe[299] * fftmag[148] + melcoe[300] * fftmag[149] +
+ melcoe[301] * fftmag[150] + melcoe[302] * fftmag[151] +
+ melcoe[303] * fftmag[152] + melcoe[304] * fftmag[153] +
+ melcoe[305] * fftmag[154] + melcoe[306] * fftmag[155] +
+ melcoe[307] * fftmag[156] + melcoe[308] * fftmag[157] +
+ melcoe[309] * fftmag[158];
+ dout[66] = melcoe[310] * fftmag[154] + melcoe[311] * fftmag[155] +
+ melcoe[312] * fftmag[156] + melcoe[313] * fftmag[157] +
+ melcoe[314] * fftmag[158] + melcoe[315] * fftmag[159] +
+ melcoe[316] * fftmag[160] + melcoe[317] * fftmag[161] +
+ melcoe[318] * fftmag[162] + melcoe[319] * fftmag[163] +
+ melcoe[320] * fftmag[164];
+ dout[67] = melcoe[321] * fftmag[159] + melcoe[322] * fftmag[160] +
+ melcoe[323] * fftmag[161] + melcoe[324] * fftmag[162] +
+ melcoe[325] * fftmag[163] + melcoe[326] * fftmag[164] +
+ melcoe[327] * fftmag[165] + melcoe[328] * fftmag[166] +
+ melcoe[329] * fftmag[167] + melcoe[330] * fftmag[168] +
+ melcoe[331] * fftmag[169] + melcoe[332] * fftmag[170];
+ dout[68] = melcoe[333] * fftmag[165] + melcoe[334] * fftmag[166] +
+ melcoe[335] * fftmag[167] + melcoe[336] * fftmag[168] +
+ melcoe[337] * fftmag[169] + melcoe[338] * fftmag[170] +
+ melcoe[339] * fftmag[171] + melcoe[340] * fftmag[172] +
+ melcoe[341] * fftmag[173] + melcoe[342] * fftmag[174] +
+ melcoe[343] * fftmag[175] + melcoe[344] * fftmag[176];
+ dout[69] = melcoe[345] * fftmag[171] + melcoe[346] * fftmag[172] +
+ melcoe[347] * fftmag[173] + melcoe[348] * fftmag[174] +
+ melcoe[349] * fftmag[175] + melcoe[350] * fftmag[176] +
+ melcoe[351] * fftmag[177] + melcoe[352] * fftmag[178] +
+ melcoe[353] * fftmag[179] + melcoe[354] * fftmag[180] +
+ melcoe[355] * fftmag[181] + melcoe[356] * fftmag[182];
+ dout[70] = melcoe[357] * fftmag[177] + melcoe[358] * fftmag[178] +
+ melcoe[359] * fftmag[179] + melcoe[360] * fftmag[180] +
+ melcoe[361] * fftmag[181] + melcoe[362] * fftmag[182] +
+ melcoe[363] * fftmag[183] + melcoe[364] * fftmag[184] +
+ melcoe[365] * fftmag[185] + melcoe[366] * fftmag[186] +
+ melcoe[367] * fftmag[187] + melcoe[368] * fftmag[188];
+ dout[71] = melcoe[369] * fftmag[183] + melcoe[370] * fftmag[184] +
+ melcoe[371] * fftmag[185] + melcoe[372] * fftmag[186] +
+ melcoe[373] * fftmag[187] + melcoe[374] * fftmag[188] +
+ melcoe[375] * fftmag[189] + melcoe[376] * fftmag[190] +
+ melcoe[377] * fftmag[191] + melcoe[378] * fftmag[192] +
+ melcoe[379] * fftmag[193] + melcoe[380] * fftmag[194] +
+ melcoe[381] * fftmag[195];
+ dout[72] = melcoe[382] * fftmag[189] + melcoe[383] * fftmag[190] +
+ melcoe[384] * fftmag[191] + melcoe[385] * fftmag[192] +
+ melcoe[386] * fftmag[193] + melcoe[387] * fftmag[194] +
+ melcoe[388] * fftmag[195] + melcoe[389] * fftmag[196] +
+ melcoe[390] * fftmag[197] + melcoe[391] * fftmag[198] +
+ melcoe[392] * fftmag[199] + melcoe[393] * fftmag[200] +
+ melcoe[394] * fftmag[201] + melcoe[395] * fftmag[202];
+ dout[73] = melcoe[396] * fftmag[196] + melcoe[397] * fftmag[197] +
+ melcoe[398] * fftmag[198] + melcoe[399] * fftmag[199] +
+ melcoe[400] * fftmag[200] + melcoe[401] * fftmag[201] +
+ melcoe[402] * fftmag[202] + melcoe[403] * fftmag[203] +
+ melcoe[404] * fftmag[204] + melcoe[405] * fftmag[205] +
+ melcoe[406] * fftmag[206] + melcoe[407] * fftmag[207] +
+ melcoe[408] * fftmag[208] + melcoe[409] * fftmag[209];
+ dout[74] = melcoe[410] * fftmag[203] + melcoe[411] * fftmag[204] +
+ melcoe[412] * fftmag[205] + melcoe[413] * fftmag[206] +
+ melcoe[414] * fftmag[207] + melcoe[415] * fftmag[208] +
+ melcoe[416] * fftmag[209] + melcoe[417] * fftmag[210] +
+ melcoe[418] * fftmag[211] + melcoe[419] * fftmag[212] +
+ melcoe[420] * fftmag[213] + melcoe[421] * fftmag[214] +
+ melcoe[422] * fftmag[215] + melcoe[423] * fftmag[216];
+ dout[75] = melcoe[424] * fftmag[210] + melcoe[425] * fftmag[211] +
+ melcoe[426] * fftmag[212] + melcoe[427] * fftmag[213] +
+ melcoe[428] * fftmag[214] + melcoe[429] * fftmag[215] +
+ melcoe[430] * fftmag[216] + melcoe[431] * fftmag[217] +
+ melcoe[432] * fftmag[218] + melcoe[433] * fftmag[219] +
+ melcoe[434] * fftmag[220] + melcoe[435] * fftmag[221] +
+ melcoe[436] * fftmag[222] + melcoe[437] * fftmag[223];
+ dout[76] = melcoe[438] * fftmag[217] + melcoe[439] * fftmag[218] +
+ melcoe[440] * fftmag[219] + melcoe[441] * fftmag[220] +
+ melcoe[442] * fftmag[221] + melcoe[443] * fftmag[222] +
+ melcoe[444] * fftmag[223] + melcoe[445] * fftmag[224] +
+ melcoe[446] * fftmag[225] + melcoe[447] * fftmag[226] +
+ melcoe[448] * fftmag[227] + melcoe[449] * fftmag[228] +
+ melcoe[450] * fftmag[229] + melcoe[451] * fftmag[230] +
+ melcoe[452] * fftmag[231];
+ dout[77] = melcoe[453] * fftmag[224] + melcoe[454] * fftmag[225] +
+ melcoe[455] * fftmag[226] + melcoe[456] * fftmag[227] +
+ melcoe[457] * fftmag[228] + melcoe[458] * fftmag[229] +
+ melcoe[459] * fftmag[230] + melcoe[460] * fftmag[231] +
+ melcoe[461] * fftmag[232] + melcoe[462] * fftmag[233] +
+ melcoe[463] * fftmag[234] + melcoe[464] * fftmag[235] +
+ melcoe[465] * fftmag[236] + melcoe[466] * fftmag[237] +
+ melcoe[467] * fftmag[238] + melcoe[468] * fftmag[239];
+ dout[78] = melcoe[469] * fftmag[232] + melcoe[470] * fftmag[233] +
+ melcoe[471] * fftmag[234] + melcoe[472] * fftmag[235] +
+ melcoe[473] * fftmag[236] + melcoe[474] * fftmag[237] +
+ melcoe[475] * fftmag[238] + melcoe[476] * fftmag[239] +
+ melcoe[477] * fftmag[240] + melcoe[478] * fftmag[241] +
+ melcoe[479] * fftmag[242] + melcoe[480] * fftmag[243] +
+ melcoe[481] * fftmag[244] + melcoe[482] * fftmag[245] +
+ melcoe[483] * fftmag[246] + melcoe[484] * fftmag[247];
+ dout[79] = melcoe[485] * fftmag[240] + melcoe[486] * fftmag[241] +
+ melcoe[487] * fftmag[242] + melcoe[488] * fftmag[243] +
+ melcoe[489] * fftmag[244] + melcoe[490] * fftmag[245] +
+ melcoe[491] * fftmag[246] + melcoe[492] * fftmag[247] +
+ melcoe[493] * fftmag[248] + melcoe[494] * fftmag[249] +
+ melcoe[495] * fftmag[250] + melcoe[496] * fftmag[251] +
+ melcoe[497] * fftmag[252] + melcoe[498] * fftmag[253] +
+ melcoe[499] * fftmag[254] + melcoe[500] * fftmag[255];
+ global_cmvn(dout);
+}
diff --git a/funasr/runtime/onnxruntime/src/FeatureExtract.h b/funasr/runtime/onnxruntime/src/FeatureExtract.h
new file mode 100644
index 000000000..f16ea3a97
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/FeatureExtract.h
@@ -0,0 +1,36 @@
+
+#ifndef FEATUREEXTRACT_H
+#define FEATUREEXTRACT_H
+
+#include <fftw3.h>
+#include <stdint.h>
+
+#include "FeatureQueue.h"
+#include "SpeechWrap.h"
+#include "Tensor.h"
+
+class FeatureExtract {
+ private:
+ SpeechWrap speech;
+ FeatureQueue fqueue;
+ int mode;
+
+ float *fft_input;
+ fftwf_complex *fft_out;
+ fftwf_plan p;
+
+ void fftw_init();
+ void melspect(float *din, float *dout);
+ void global_cmvn(float *din);
+
+ public:
+ FeatureExtract(int mode);
+ ~FeatureExtract();
+ int size();
+ int status();
+ void reset();
+ void insert(float *din, int len, int flag);
+    bool fetch(Tensor<float> *&dout);
+};
+
+#endif
diff --git a/funasr/runtime/onnxruntime/src/FeatureQueue.cpp b/funasr/runtime/onnxruntime/src/FeatureQueue.cpp
new file mode 100644
index 000000000..f07633b42
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/FeatureQueue.cpp
@@ -0,0 +1,59 @@
+#include "precomp.h"
+FeatureQueue::FeatureQueue()
+{
+    buff = new Tensor<float>(67, 80);
+ window_size = 67;
+ buff_idx = 0;
+}
+
+FeatureQueue::~FeatureQueue()
+{
+ delete buff;
+}
+
+void FeatureQueue::reinit(int size)
+{
+ delete buff;
+ buff = new Tensor(size, 80);
+ buff_idx = 0;
+ window_size = size;
+}
+
+void FeatureQueue::reset()
+{
+ buff_idx = 0;
+}
+
+void FeatureQueue::push(float *din, int flag)
+{
+ int offset = buff_idx * 80;
+ memcpy(buff->buff + offset, din, 80 * sizeof(float));
+ buff_idx++;
+
+ if (flag == S_END) {
+        Tensor<float> *tmp = new Tensor<float>(buff_idx, 80);
+ memcpy(tmp->buff, buff->buff, buff_idx * 80 * sizeof(float));
+ feature_queue.push(tmp);
+ buff_idx = 0;
+ } else if (buff_idx == window_size) {
+ feature_queue.push(buff);
+        Tensor<float> *tmp = new Tensor<float>(window_size, 80);
+ memcpy(tmp->buff, buff->buff + (window_size - 3) * 80,
+ 3 * 80 * sizeof(float));
+ buff_idx = 3;
+ buff = tmp;
+ }
+}
+
+Tensor<float> *FeatureQueue::pop()
+{
+
+ Tensor *tmp = feature_queue.front();
+ feature_queue.pop();
+ return tmp;
+}
+
+int FeatureQueue::size()
+{
+ return feature_queue.size();
+}
diff --git a/funasr/runtime/onnxruntime/src/FeatureQueue.h b/funasr/runtime/onnxruntime/src/FeatureQueue.h
new file mode 100644
index 000000000..be3360b49
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/FeatureQueue.h
@@ -0,0 +1,28 @@
+
+#ifndef FEATUREQUEUE_H
+#define FEATUREQUEUE_H
+
+#include "Tensor.h"
+#include <queue>
+#include <stdint.h>
+using namespace std;
+
+
+class FeatureQueue {
+ private:
+    queue<Tensor<float> *> feature_queue;
+    Tensor<float> *buff;
+ int buff_idx;
+ int window_size;
+
+ public:
+ FeatureQueue();
+ ~FeatureQueue();
+ void reinit(int size);
+ void reset();
+ void push(float *din, int flag);
+    Tensor<float> *pop();
+ int size();
+};
+
+#endif
diff --git a/funasr/runtime/onnxruntime/src/Model.cpp b/funasr/runtime/onnxruntime/src/Model.cpp
new file mode 100644
index 000000000..ddd4fd0b4
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/Model.cpp
@@ -0,0 +1,11 @@
+#include "precomp.h"
+
+Model *create_model(const char *path,int nThread)
+{
+ Model *mm;
+
+
+ mm = new paraformer::ModelImp(path, nThread);
+
+ return mm;
+}
diff --git a/funasr/runtime/onnxruntime/src/SpeechWrap.cpp b/funasr/runtime/onnxruntime/src/SpeechWrap.cpp
new file mode 100644
index 000000000..60d0a2b70
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/SpeechWrap.cpp
@@ -0,0 +1,39 @@
+#include "precomp.h"
+
+SpeechWrap::SpeechWrap()
+{
+ cache_size = 0;
+}
+
+SpeechWrap::~SpeechWrap()
+{
+}
+
+void SpeechWrap::reset()
+{
+ cache_size = 0;
+}
+
+void SpeechWrap::load(float *din, int len)
+{
+ in = din;
+ in_size = len;
+ total_size = cache_size + in_size;
+}
+
+int SpeechWrap::size()
+{
+ return total_size;
+}
+
+void SpeechWrap::update(int offset)
+{
+ int in_offset = offset - cache_size;
+ cache_size = (total_size - offset);
+ memcpy(cache, in + in_offset, cache_size * sizeof(float));
+}
+
+float &SpeechWrap::operator[](int i)
+{
+ return i < cache_size ? cache[i] : in[i - cache_size];
+}
diff --git a/funasr/runtime/onnxruntime/src/SpeechWrap.h b/funasr/runtime/onnxruntime/src/SpeechWrap.h
new file mode 100644
index 000000000..5d3ee4087
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/SpeechWrap.h
@@ -0,0 +1,26 @@
+
+#ifndef SPEECHWRAP_H
+#define SPEECHWRAP_H
+
+#include <stdint.h>
+
+class SpeechWrap {
+ private:
+ float cache[400];
+ int cache_size;
+ float *in;
+ int in_size;
+ int total_size;
+ int next_cache_size;
+
+ public:
+ SpeechWrap();
+ ~SpeechWrap();
+ void load(float *din, int len);
+ void update(int offset);
+ void reset();
+ int size();
+ float &operator[](int i);
+};
+
+#endif
diff --git a/funasr/runtime/onnxruntime/src/Tensor.h b/funasr/runtime/onnxruntime/src/Tensor.h
new file mode 100644
index 000000000..68ac9aa68
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/Tensor.h
@@ -0,0 +1,155 @@
+#ifndef TENSOR_H
+#define TENSOR_H
+
+#include "alignedmem.h"
+
+using namespace std;
+
+template <class T> class Tensor {
+ private:
+ void alloc_buff();
+ void free_buff();
+ int mem_size;
+
+ public:
+ T *buff;
+ int size[4];
+ int buff_size;
+    Tensor(Tensor<T> *in);
+ Tensor(int a);
+ Tensor(int a, int b);
+ Tensor(int a, int b, int c);
+ Tensor(int a, int b, int c, int d);
+ ~Tensor();
+ void zeros();
+ void shape();
+ void disp();
+ void dump(const char *mode);
+    void concat(Tensor<T> *din, int dim);
+ void resize(int a, int b, int c, int d);
+    void add(float coe, Tensor<T> *in);
+    void add(Tensor<T> *in);
+    void add(Tensor<T> *in1, Tensor<T> *in2);
+    void reload(Tensor<T> *in);
+};
+
+template <class T> Tensor<T>::Tensor(int a) : size{1, 1, 1, a}
+{
+ alloc_buff();
+}
+
+template <class T> Tensor<T>::Tensor(int a, int b) : size{1, 1, a, b}
+{
+ alloc_buff();
+}
+
+template <class T> Tensor<T>::Tensor(int a, int b, int c) : size{1, a, b, c}
+{
+
+ alloc_buff();
+}
+
+template <class T>
+Tensor<T>::Tensor(int a, int b, int c, int d) : size{a, b, c, d}
+{
+ alloc_buff();
+}
+
+template <class T> Tensor<T>::Tensor(Tensor<T> *in)
+{
+ memcpy(size, in->size, 4 * sizeof(int));
+ alloc_buff();
+ memcpy(buff, in->buff, in->buff_size * sizeof(T));
+}
+
+template <class T> Tensor<T>::~Tensor()
+{
+ free_buff();
+}
+
+template <class T> void Tensor<T>::alloc_buff()
+{
+ buff_size = size[0] * size[1] * size[2] * size[3];
+ mem_size = buff_size;
+ buff = (T *)aligned_malloc(32, buff_size * sizeof(T));
+}
+
+template <class T> void Tensor<T>::free_buff()
+{
+ aligned_free(buff);
+}
+
+template <class T> void Tensor<T>::zeros()
+{
+ memset(buff, 0, buff_size * sizeof(T));
+}
+
+template <class T> void Tensor<T>::shape()
+{
+ printf("(%d,%d,%d,%d)\n", size[0], size[1], size[2], size[3]);
+}
+
+// TODO:: fix it!!!!
+template <class T> void Tensor<T>::concat(Tensor<T> *din, int dim)
+{
+ memcpy(buff + buff_size, din->buff, din->buff_size * sizeof(T));
+ buff_size += din->buff_size;
+ size[dim] += din->size[dim];
+}
+
+// TODO:: fix it!!!!
+template <class T> void Tensor<T>::resize(int a, int b, int c, int d)
+{
+ size[0] = a;
+ size[1] = b;
+ size[2] = c;
+ size[3] = d;
+ buff_size = size[0] * size[1] * size[2] * size[3];
+}
+
+template <class T> void Tensor<T>::add(float coe, Tensor<T> *in)
+{
+ int i;
+ for (i = 0; i < buff_size; i++) {
+ buff[i] = buff[i] + coe * in->buff[i];
+ }
+}
+
+template <class T> void Tensor<T>::add(Tensor<T> *in)
+{
+ int i;
+ for (i = 0; i < buff_size; i++) {
+ buff[i] = buff[i] + in->buff[i];
+ }
+}
+
+template <class T> void Tensor<T>::add(Tensor<T> *in1, Tensor<T> *in2)
+{
+ int i;
+ for (i = 0; i < buff_size; i++) {
+ buff[i] = buff[i] + in1->buff[i] + in2->buff[i];
+ }
+}
+
+template <class T> void Tensor<T>::reload(Tensor<T> *in)
+{
+ memcpy(buff, in->buff, in->buff_size * sizeof(T));
+}
+
+template <class T> void Tensor<T>::disp()
+{
+ int i;
+ for (i = 0; i < buff_size; i++) {
+ cout << buff[i] << " ";
+ }
+ cout << endl;
+}
+
+template <class T> void Tensor<T>::dump(const char *mode)
+{
+ FILE *fp;
+ fp = fopen("tmp.bin", mode);
+ fwrite(buff, 1, buff_size * sizeof(T), fp);
+ fclose(fp);
+}
+#endif
diff --git a/funasr/runtime/onnxruntime/src/Vocab.cpp b/funasr/runtime/onnxruntime/src/Vocab.cpp
new file mode 100644
index 000000000..d2d034181
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/Vocab.cpp
@@ -0,0 +1,170 @@
+#include "Vocab.h"
+
+#include <fstream>
+#include <iostream>
+#include <list>
+#include <sstream>
+#include <string>
+
+using namespace std;
+
+Vocab::Vocab(const char *filename)
+{
+ ifstream in(filename);
+ string line;
+
+ if (in) // 有该文件
+ {
+ while (getline(in, line)) // line中不包括每行的换行符
+ {
+ vocab.push_back(line);
+ }
+ // cout << vocab[1719] << endl;
+ }
+ // else // 没有该文件
+ //{
+ // cout << "no such file" << endl;
+ // }
+}
+Vocab::~Vocab()
+{
+}
+
+string Vocab::vector2string(vector<int> in)
+{
+ int i;
+ stringstream ss;
+ for (auto it = in.begin(); it != in.end(); it++) {
+ ss << vocab[*it];
+ }
+
+ return ss.str();
+}
+
+int str2int(string str)
+{
+ const char *ch_array = str.c_str();
+ if (((ch_array[0] & 0xf0) != 0xe0) || ((ch_array[1] & 0xc0) != 0x80) ||
+ ((ch_array[2] & 0xc0) != 0x80))
+ return 0;
+
+ int val = ((ch_array[0] & 0x0f) << 12) | ((ch_array[1] & 0x3f) << 6) |
+ (ch_array[2] & 0x3f);
+ return val;
+}
+
+bool Vocab::isChinese(string ch)
+{
+ if (ch.size() != 3) {
+ return false;
+ }
+
+ int unicode = str2int(ch);
+ if (unicode >= 19968 && unicode <= 40959) {
+ return true;
+ }
+
+ return false;
+}
+
+
+string Vocab::vector2stringV2(vector<int> in)
+{
+ int i;
+    list<string> words;
+
+ int is_pre_english = false;
+ int pre_english_len = 0;
+
+ int is_combining = false;
+ string combine = "";
+
+ for (auto it = in.begin(); it != in.end(); it++) {
+ string word = vocab[*it];
+
+ // step1 space character skips
+        if (word == "<s>" || word == "</s>" || word == "<unk>")
+ continue;
+
+ // step2 combie phoneme to full word
+ {
+ int sub_word = !(word.find("@@") == string::npos);
+
+ // process word start and middle part
+ if (sub_word) {
+ combine += word.erase(word.length() - 2);
+ is_combining = true;
+ continue;
+ }
+ // process word end part
+ else if (is_combining) {
+ combine += word;
+ is_combining = false;
+ word = combine;
+ combine = "";
+ }
+ }
+
+ // step3 process english word deal with space , turn abbreviation to upper case
+ {
+
+ // input word is chinese, not need process
+ if (isChinese(word)) {
+ words.push_back(word);
+ is_pre_english = false;
+ }
+ // input word is english word
+ else {
+
+ // pre word is chinese
+ if (!is_pre_english) {
+ word[0] = word[0] - 32;
+ words.push_back(word);
+ pre_english_len = word.size();
+
+ }
+
+ // pre word is english word
+ else {
+
+ // single letter turn to upper case
+ if (word.size() == 1) {
+ word[0] = word[0] - 32;
+ }
+
+ if (pre_english_len > 1) {
+ words.push_back(" ");
+ words.push_back(word);
+ pre_english_len = word.size();
+ }
+ else {
+ if (word.size() > 1) {
+ words.push_back(" ");
+ }
+ words.push_back(word);
+ pre_english_len = word.size();
+ }
+ }
+
+ is_pre_english = true;
+
+ }
+ }
+ }
+
+ // for (auto it = words.begin(); it != words.end(); it++) {
+ // cout << *it << endl;
+ // }
+
+ stringstream ss;
+ for (auto it = words.begin(); it != words.end(); it++) {
+ ss << *it;
+ }
+
+ return ss.str();
+}
+
+int Vocab::size()
+{
+ return vocab.size();
+}
diff --git a/funasr/runtime/onnxruntime/src/Vocab.h b/funasr/runtime/onnxruntime/src/Vocab.h
new file mode 100644
index 000000000..328a2a18f
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/Vocab.h
@@ -0,0 +1,24 @@
+
+#ifndef VOCAB_H
+#define VOCAB_H
+
+#include <cstring>
+#include <string>
+#include <vector>
+using namespace std;
+
+class Vocab {
+ private:
+    vector<string> vocab;
+ bool isChinese(string ch);
+ bool isEnglish(string ch);
+
+ public:
+ Vocab(const char *filename);
+ ~Vocab();
+ int size();
+    string vector2string(vector<int> in);
+    string vector2stringV2(vector<int> in);
+};
+
+#endif
diff --git a/funasr/runtime/onnxruntime/src/alignedmem.cpp b/funasr/runtime/onnxruntime/src/alignedmem.cpp
new file mode 100644
index 000000000..e174afe03
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/alignedmem.cpp
@@ -0,0 +1,18 @@
+#include "precomp.h"
+void *aligned_malloc(size_t alignment, size_t required_bytes)
+{
+ void *p1; // original block
+ void **p2; // aligned block
+ int offset = alignment - 1 + sizeof(void *);
+ if ((p1 = (void *)malloc(required_bytes + offset)) == NULL) {
+ return NULL;
+ }
+ p2 = (void **)(((size_t)(p1) + offset) & ~(alignment - 1));
+ p2[-1] = p1;
+ return p2;
+}
+
+void aligned_free(void *p)
+{
+ free(((void **)p)[-1]);
+}
diff --git a/funasr/runtime/onnxruntime/src/alignedmem.h b/funasr/runtime/onnxruntime/src/alignedmem.h
new file mode 100644
index 000000000..dca68f4c5
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/alignedmem.h
@@ -0,0 +1,10 @@
+
+#ifndef ALIGNEDMEM_H
+#define ALIGNEDMEM_H
+
+
+
+extern void *aligned_malloc(size_t alignment, size_t required_bytes);
+extern void aligned_free(void *p);
+
+#endif
diff --git a/funasr/runtime/onnxruntime/src/commonfunc.h b/funasr/runtime/onnxruntime/src/commonfunc.h
new file mode 100644
index 000000000..11c234e77
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/commonfunc.h
@@ -0,0 +1,56 @@
+#pragma once
+
+
+typedef struct
+{
+ std::string msg;
+ float snippet_time;
+}RPASR_RECOG_RESULT;
+
+
+#ifdef _WIN32
+#include <codecvt>
+
+
+
+inline std::wstring string2wstring(const std::string& str, const std::string& locale)
+{
+    typedef std::codecvt_byname<wchar_t, char, std::mbstate_t> F;
+    std::wstring_convert<F> strCnv(new F(locale));
+ return strCnv.from_bytes(str);
+}
+
+inline std::wstring strToWstr(std::string str) {
+ if (str.length() == 0)
+ return L"";
+ return string2wstring(str, "zh-CN");
+
+}
+
+#endif
+
+
+
+inline void getInputName(Ort::Session* session, string& inputName,int nIndex=0) {
+ size_t numInputNodes = session->GetInputCount();
+ if (numInputNodes > 0) {
+ Ort::AllocatorWithDefaultOptions allocator;
+ {
+ auto t = session->GetInputNameAllocated(nIndex, allocator);
+ inputName = t.get();
+
+ }
+ }
+}
+
+inline void getOutputName(Ort::Session* session, string& outputName, int nIndex = 0) {
+ size_t numOutputNodes = session->GetOutputCount();
+ if (numOutputNodes > 0) {
+ Ort::AllocatorWithDefaultOptions allocator;
+ {
+ auto t = session->GetOutputNameAllocated(nIndex, allocator);
+ outputName = t.get();
+
+ }
+ }
+}
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/librapidasrapi.cpp b/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
new file mode 100644
index 000000000..f5f9d66be
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
@@ -0,0 +1,213 @@
+#include "precomp.h"
+#ifdef __cplusplus
+
+extern "C" {
+#endif
+
+
+ // APIs for qmasr
+ _RAPIDASRAPI RPASR_HANDLE RapidAsrInit(const char* szModelDir, int nThreadNum)
+ {
+
+
+ Model* mm = create_model(szModelDir, nThreadNum);
+
+ return mm;
+ }
+
+
+ _RAPIDASRAPI RPASR_RESULT RapidAsrRecogBuffer(RPASR_HANDLE handle, const char* szBuf, int nLen, RPASR_MODE Mode, QM_CALLBACK fnCallback)
+ {
+
+
+ Model* pRecogObj = (Model*)handle;
+
+ if (!pRecogObj)
+ return nullptr;
+
+ Audio audio(1);
+ if (!audio.loadwav(szBuf, nLen))
+ return nullptr;
+ //audio.split();
+
+ float* buff;
+ int len;
+ int flag=0;
+ RPASR_RECOG_RESULT* pResult = new RPASR_RECOG_RESULT;
+ pResult->snippet_time = audio.get_time_len();
+ int nStep = 0;
+ int nTotal = audio.get_queue_size();
+ while (audio.fetch(buff, len, flag) > 0) {
+ pRecogObj->reset();
+ string msg = pRecogObj->forward(buff, len, flag);
+ pResult->msg += msg;
+ nStep++;
+ if (fnCallback)
+ fnCallback(nStep, nTotal);
+ }
+
+
+ return pResult;
+ }
+
+ _RAPIDASRAPI RPASR_RESULT RapidAsrRecogPCMBuffer(RPASR_HANDLE handle, const char* szBuf, int nLen, RPASR_MODE Mode, QM_CALLBACK fnCallback)
+ {
+
+ Model* pRecogObj = (Model*)handle;
+
+ if (!pRecogObj)
+ return nullptr;
+
+ Audio audio(1);
+ if (!audio.loadpcmwav(szBuf, nLen))
+ return nullptr;
+ //audio.split();
+
+ float* buff;
+ int len;
+ int flag = 0;
+ RPASR_RECOG_RESULT* pResult = new RPASR_RECOG_RESULT;
+ pResult->snippet_time = audio.get_time_len();
+ int nStep = 0;
+ int nTotal = audio.get_queue_size();
+ while (audio.fetch(buff, len, flag) > 0) {
+ pRecogObj->reset();
+ string msg = pRecogObj->forward(buff, len, flag);
+ pResult->msg += msg;
+ nStep++;
+ if (fnCallback)
+ fnCallback(nStep, nTotal);
+ }
+
+
+ return pResult;
+
+ }
+
+ _RAPIDASRAPI RPASR_RESULT RapidAsrRecogPCMFile(RPASR_HANDLE handle, const char* szFileName, RPASR_MODE Mode, QM_CALLBACK fnCallback)
+ {
+
+ Model* pRecogObj = (Model*)handle;
+
+ if (!pRecogObj)
+ return nullptr;
+
+ Audio audio(1);
+ if (!audio.loadpcmwav(szFileName))
+ return nullptr;
+ //audio.split();
+
+ float* buff;
+ int len;
+ int flag = 0;
+ RPASR_RECOG_RESULT* pResult = new RPASR_RECOG_RESULT;
+ pResult->snippet_time = audio.get_time_len();
+ int nStep = 0;
+ int nTotal = audio.get_queue_size();
+ while (audio.fetch(buff, len, flag) > 0) {
+ pRecogObj->reset();
+ string msg = pRecogObj->forward(buff, len, flag);
+ pResult->msg += msg;
+ nStep++;
+ if (fnCallback)
+ fnCallback(nStep, nTotal);
+ }
+
+
+ return pResult;
+
+ }
+
+ _RAPIDASRAPI RPASR_RESULT RapidAsrRecogFile(RPASR_HANDLE handle, const char* szWavfile, RPASR_MODE Mode, QM_CALLBACK fnCallback)
+ {
+ Model* pRecogObj = (Model*)handle;
+
+ if (!pRecogObj)
+ return nullptr;
+
+ Audio audio(1);
+ if(!audio.loadwav(szWavfile))
+ return nullptr;
+ //audio.split();
+
+ float* buff;
+ int len;
+ int flag = 0;
+ int nStep = 0;
+ int nTotal = audio.get_queue_size();
+ RPASR_RECOG_RESULT* pResult = new RPASR_RECOG_RESULT;
+ pResult->snippet_time = audio.get_time_len();
+ while (audio.fetch(buff, len, flag) > 0) {
+ pRecogObj->reset();
+ string msg = pRecogObj->forward(buff, len, flag);
+ pResult->msg+= msg;
+ nStep++;
+ if (fnCallback)
+ fnCallback(nStep, nTotal);
+ }
+
+
+
+
+ return pResult;
+ }
+
+ _RAPIDASRAPI const int RapidAsrGetRetNumber(RPASR_RESULT Result)
+ {
+ if (!Result)
+ return 0;
+
+ return 1;
+
+ }
+
+
+ _RAPIDASRAPI const float RapidAsrGetRetSnippetTime(RPASR_RESULT Result)
+ {
+ if (!Result)
+ return 0.0f;
+
+ return ((RPASR_RECOG_RESULT*)Result)->snippet_time;
+
+ }
+
+ _RAPIDASRAPI const char* RapidAsrGetResult(RPASR_RESULT Result,int nIndex)
+ {
+ RPASR_RECOG_RESULT * pResult = (RPASR_RECOG_RESULT*)Result;
+ if(!pResult)
+ return nullptr;
+
+ return pResult->msg.c_str();
+
+ }
+
+ _RAPIDASRAPI void RapidAsrFreeResult(RPASR_RESULT Result)
+ {
+
+ if (Result)
+ {
+ delete (RPASR_RECOG_RESULT*)Result;
+
+ }
+ }
+
+ _RAPIDASRAPI void RapidAsrUninit(RPASR_HANDLE handle)
+ {
+
+ Model* pRecogObj = (Model*)handle;
+
+
+ if (!pRecogObj)
+ return;
+
+ delete pRecogObj;
+
+ }
+
+
+
+#ifdef __cplusplus
+
+}
+#endif
+
diff --git a/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp b/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp
new file mode 100644
index 000000000..46b521153
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp
@@ -0,0 +1,179 @@
+#include "precomp.h"
+
+using namespace std;
+using namespace paraformer;
+
+ModelImp::ModelImp(const char* path,int nNumThread)
+{
+ string model_path = pathAppend(path, "model.onnx");
+ string vocab_path = pathAppend(path, "vocab.txt");
+
+ fe = new FeatureExtract(3);
+
+ sessionOptions.SetInterOpNumThreads(nNumThread);
+ sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
+
+#ifdef _WIN32
+ wstring wstrPath = strToWstr(model_path);
+ m_session = new Ort::Session(env, wstrPath.c_str(), sessionOptions);
+#else
+ m_session = new Ort::Session(env, model_path.c_str(), sessionOptions);
+#endif
+
+ string strName;
+ getInputName(m_session, strName);
+ m_strInputNames.push_back(strName.c_str());
+ getInputName(m_session, strName,1);
+ m_strInputNames.push_back(strName);
+
+ getOutputName(m_session, strName);
+ m_strOutputNames.push_back(strName);
+ getOutputName(m_session, strName,1);
+ m_strOutputNames.push_back(strName);
+
+ for (auto& item : m_strInputNames)
+ m_szInputNames.push_back(item.c_str());
+ for (auto& item : m_strOutputNames)
+ m_szOutputNames.push_back(item.c_str());
+ vocab = new Vocab(vocab_path.c_str());
+}
+
+ModelImp::~ModelImp()
+{
+ if(fe)
+ delete fe;
+ if (m_session)
+ {
+ delete m_session;
+ m_session = nullptr;
+ }
+ if(vocab)
+ delete vocab;
+}
+
// Clear the feature extractor's internal state so the next utterance is
// processed independently of the previous one.
void ModelImp::reset()
{
    fe->reset();
}
+
// Low-frame-rate (LFR) stacking: for every group of 6 input frames (80 dims
// each) emit one 560-dim output frame built from a 7-frame window at offsets
// -3..+3 around the group start, clamping indices at both sequence edges.
// Replaces `din` with the stacked tensor (the input tensor is freed).
// NOTE(review): assumes din->size[2] is the frame count and rows are 80
// floats — confirm against the Tensor layout used by FeatureExtract.
void ModelImp::apply_lfr(Tensor*& din)
{
    int mm = din->size[2];
    int ll = ceil(mm / 6.0);   // one output frame per 6 inputs, rounded up
    Tensor* tmp = new Tensor(ll, 560);
    int out_offset = 0;
    for (int i = 0; i < ll; i++) {
        for (int j = 0; j < 7; j++) {
            int idx = i * 6 + j - 3;   // window position (3 frames of left context)
            if (idx < 0) {
                idx = 0;               // clamp at sequence start
            }
            if (idx >= mm) {
                idx = mm - 1;          // clamp at sequence end
            }
            // copy one 80-dim frame into the stacked row
            memcpy(tmp->buff + out_offset, din->buff + idx * 80,
                sizeof(float) * 80);
            out_offset += 80;
        }
    }
    delete din;   // hand ownership of the stacked tensor back to the caller
    din = tmp;
}
+
// In-place CMVN normalization: each value becomes (x + mean[j]) * var[j]
// per feature dimension j. The tables are baked-in float bit patterns
// (presumably negated means and inverse std-devs — TODO confirm against the
// training-side CMVN export).
// NOTE(review): casting the int hex arrays to const float* is a strict-
// aliasing violation in principle; confirm build flags or switch to memcpy.
void ModelImp::apply_cmvn(Tensor* din)
{
    const float* var;
    const float* mean;
    float scale = 22.6274169979695;   // NOTE(review): unused in this function — confirm intended
    int m = din->size[2];   // frames
    int n = din->size[3];   // feature dimension

    var = (const float*)paraformer_cmvn_var_hex;
    mean = (const float*)paraformer_cmvn_mean_hex;
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            int idx = i * n + j;
            din->buff[idx] = (din->buff[idx] + mean[j]) * var[j];
        }
    }
}
+
+string ModelImp::greedy_search(float * in, int nLen )
+{
+ vector hyps;
+ int Tmax = nLen;
+ for (int i = 0; i < Tmax; i++) {
+ int max_idx;
+ float max_val;
+ findmax(in + i * 8404, 8404, max_val, max_idx);
+ hyps.push_back(max_idx);
+ }
+
+ return vocab->vector2stringV2(hyps);
+}
+
+string ModelImp::forward(float* din, int len, int flag)
+{
+
+ Tensor* in;
+ fe->insert(din, len, flag);
+ fe->fetch(in);
+ apply_lfr(in);
+ apply_cmvn(in);
+ Ort::RunOptions run_option;
+
+ std::array input_shape_{ in->size[0],in->size[2],in->size[3] };
+ Ort::Value onnx_feats = Ort::Value::CreateTensor(m_memoryInfo,
+ in->buff,
+ in->buff_size,
+ input_shape_.data(),
+ input_shape_.size());
+
+ std::vector feats_len{ in->size[2] };
+ std::vector feats_len_dim{ 1 };
+ Ort::Value onnx_feats_len = Ort::Value::CreateTensor(
+ m_memoryInfo,
+ feats_len.data(),
+ feats_len.size() * sizeof(int32_t),
+ feats_len_dim.data(),
+ feats_len_dim.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32);
+ std::vector input_onnx;
+ input_onnx.emplace_back(std::move(onnx_feats));
+ input_onnx.emplace_back(std::move(onnx_feats_len));
+
+ string result;
+ try {
+
+ auto outputTensor = m_session->Run(run_option, m_szInputNames.data(), input_onnx.data(), m_szInputNames.size(), m_szOutputNames.data(), m_szOutputNames.size());
+ std::vector outputShape = outputTensor[0].GetTensorTypeAndShapeInfo().GetShape();
+
+
+ int64_t outputCount = std::accumulate(outputShape.begin(), outputShape.end(), 1, std::multiplies());
+ float* floatData = outputTensor[0].GetTensorMutableData();
+ auto encoder_out_lens = outputTensor[1].GetTensorMutableData();
+ result = greedy_search(floatData, *encoder_out_lens);
+ }
+ catch (...)
+ {
+ result = "";
+ }
+
+
+ if(in)
+ delete in;
+
+ return result;
+}
+
// Streaming/chunked inference — NOT implemented for this ONNX backend.
// Prints a warning and returns a placeholder string.
string ModelImp::forward_chunk(float* din, int len, int flag)
{

    printf("Not Imp!!!!!!\n");
    return "Hello";
}
+
// N-best rescoring — NOT implemented for this ONNX backend.
// Prints a warning and returns a placeholder string.
string ModelImp::rescoring()
{
    printf("Not Imp!!!!!!\n");
    return "Hello";
}
diff --git a/funasr/runtime/onnxruntime/src/paraformer_onnx.h b/funasr/runtime/onnxruntime/src/paraformer_onnx.h
new file mode 100644
index 000000000..ebbbb5152
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/paraformer_onnx.h
@@ -0,0 +1,52 @@
+#pragma once
+
+
+#ifndef PARAFORMER_MODELIMP_H
+#define PARAFORMER_MODELIMP_H
+
+
+
+
+
+namespace paraformer {
+
+ class ModelImp : public Model {
+ private:
+ FeatureExtract* fe;
+
+ Vocab* vocab;
+
+ void apply_lfr(Tensor*& din);
+ void apply_cmvn(Tensor* din);
+
+
+ string greedy_search( float* in, int nLen);
+
+#ifdef _WIN_X86
+ Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
+#else
+ Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+#endif
+
+ Ort::Session* m_session = nullptr;
+ Ort::Env env = Ort::Env(ORT_LOGGING_LEVEL_ERROR, "paraformer");
+ Ort::SessionOptions sessionOptions = Ort::SessionOptions();
+
+ vector m_strInputNames, m_strOutputNames;
+ vector m_szInputNames;
+ vector m_szOutputNames;
+ //string m_strInputName, m_strInputNameLen;
+ //string m_strOutputName, m_strOutputNameLen;
+
+ public:
+ ModelImp(const char* path, int nNumThread=0);
+ ~ModelImp();
+ void reset();
+ string forward_chunk(float* din, int len, int flag);
+ string forward(float* din, int len, int flag);
+ string rescoring();
+
+ };
+
+} // namespace paraformer
+#endif
diff --git a/funasr/runtime/onnxruntime/src/precomp.h b/funasr/runtime/onnxruntime/src/precomp.h
new file mode 100644
index 000000000..358844baf
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/precomp.h
@@ -0,0 +1,50 @@
+#pragma once
+// system
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#include
+
+using namespace std;
+// third part
+
+#include
+#include "onnxruntime_run_options_config_keys.h"
+#include "onnxruntime_cxx_api.h"
+
+
+// mine
+
+#include "commonfunc.h"
+#include
+#include "predefine_coe.h"
+
+#include
+//#include "alignedmem.h"
+#include "Vocab.h"
+#include "Tensor.h"
+#include "util.h"
+#include "CommonStruct.h"
+#include "FeatureExtract.h"
+#include "FeatureQueue.h"
+#include "SpeechWrap.h"
+#include
+#include "Model.h"
+#include "paraformer_onnx.h"
+#include "librapidasrapi.h"
+
+
+using namespace paraformer;
diff --git a/funasr/runtime/onnxruntime/src/predefine_coe.h b/funasr/runtime/onnxruntime/src/predefine_coe.h
new file mode 100644
index 000000000..93012d857
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/predefine_coe.h
@@ -0,0 +1,592 @@
+#ifndef PREDEFINE_COE_H
+#define PREDEFINE_COE_H
+
+#include
+
+const int32_t melcoe_hex[] = {
+
+ 0x3f01050c, 0x3e0afb11, 0x3f5d413c, 0x3f547fd0, 0x3e2e00c1, 0x3f132970,
+ 0x3ed9ad21, 0x3ebb8bb9, 0x3f223a24, 0x3e4de6f8, 0x3f4c8642, 0x3d9c0424,
+ 0x3f6c7f7c, 0x3f7d295a, 0x3c35a961, 0x3f6fd497, 0x3d815b45, 0x3f6af197,
+ 0x3da87344, 0x3f6dfce9, 0x3d9018b9, 0x3f787ebc, 0x3d2098fe, 0x3cf02873,
+ 0x3f75f670, 0x3e08e423, 0x3f5dc6f7, 0x3e8161eb, 0x3f3f4f0b, 0x3eca38e2,
+ 0x3f1ae38f, 0x3f0f2d23, 0x3ee1a5ba, 0x3f3e9a98, 0x3e82cad1, 0x3f7321ac,
+ 0x3e321028, 0x3d4de548, 0x3f537bf6, 0x3ed50f76, 0x3f157845, 0x3f2cf6bc,
+ 0x3ea61288, 0x3f739ea7, 0x3e794186, 0x3d461590, 0x3f41af9f, 0x3f0cdfd4,
+ 0x3ee64058, 0x3f5f23aa, 0x3e53d467, 0x3e037156, 0x3f4b0ae6, 0x3f0e2fac,
+ 0x3ee3a0a8, 0x3f6ab111, 0x3e94b1ed, 0x3daa7774, 0x3f35a70a, 0x3f2d08dc,
+ 0x3d951fb4, 0x3ea5ee48, 0x3f6d5c09, 0x3ef61e1a, 0x3f04f0f3, 0x3f66305c,
+ 0x3ea7def9, 0x3dce7d20, 0x3f2c1083, 0x3f44354b, 0x3e5baf49, 0x3e6f2ad2,
+ 0x3f49142e, 0x3f2bfe35, 0x3e0d627b, 0x3ea80396, 0x3f5ca761, 0x3f1ce830,
+ 0x3dc4d786, 0x3ec62fa0, 0x3f67650f, 0x3f165fc0, 0x3db1323f, 0x3ed34080,
+ 0x3f69d9b8, 0x3f17def1, 0x3ddbd6b6, 0x3ed0421e, 0x3f648529, 0x3f20ebbd,
+ 0x3e20901a, 0x3ebe2886, 0x3f57dbf9, 0x3f3116ac, 0x3e6edcc6, 0x3e9dd2a9,
+ 0x3f4448ce, 0x3f47f9a3, 0x3eaba511, 0x3e601974, 0x3f2a2d77, 0x3f6536e2,
+ 0x3eec3842, 0x3d0781f6, 0x3dd648ed, 0x3f09e3df, 0x3f7787e1, 0x3f1c411f,
+ 0x3e45b702, 0x3ec77dc2, 0x3f4e9240, 0x3f47f500, 0x3ebf9c61, 0x3e602c00,
+ 0x3f2031d0, 0x3f78f0f7, 0x3f135547, 0x3e3bcd78, 0x3ce1e12a, 0x3ed95573,
+ 0x3f510ca2, 0x3f4bc3c2, 0x3ed37e77, 0x3d0ded37, 0x3e50f0f8, 0x3f1640c5,
+ 0x3f77212d, 0x3f291bd1, 0x3e94df6c, 0x3eadc85e, 0x3f35904a, 0x3f6cd43b,
+ 0x3f104351, 0x3e52dc63, 0x3d995e26, 0x3edf795f, 0x3f4b48e7, 0x3f5a29e7,
+ 0x3f00963d, 0x3e1fdb2f, 0x3e175865, 0x3efed385, 0x3f580934, 0x3f50466d,
+ 0x3ef30046, 0x3e0e7c6b, 0x3e3ee64e, 0x3f067fdd, 0x3f5c60e5, 0x3f4e9ea4,
+ 0x3ef4f46a, 0x3e1cb596, 0x3e45856f, 0x3f0585cb, 0x3f58d29b, 0x3f54b3ef,
+ 0x3f0309ad, 0x3e48aa5b, 0x3e2d3042, 0x3ef9eca6, 0x3f4dd569, 0x3f6212c4,
+ 0x3f12be68, 0x3e8853a3, 0x3def69e0, 0x3eda8330, 0x3f3bd62e, 0x3f76516a,
+ 0x3f2931b5, 0x3eb98e9b, 0x3d88773c, 0x3d1ae95c, 0x3ead9c96, 0x3f2338b2,
+ 0x3f6ef119, 0x3f46054d, 0x3ef74eba, 0x3e47c83a, 0x3e67eace, 0x3f0458a3,
+ 0x3f4e0df1, 0x3f68e26b, 0x3f207590, 0x3eb1515d, 0x3d8bc852, 0x3db8eca9,
+ 0x3ebf14e0, 0x3f275751, 0x3f6e86f6, 0x3f4ae3f8, 0x3f04e6de, 0x3e7dfcce,
+ 0x3e547020, 0x3ef63244, 0x3f4080cd, 0x3f7aaa80, 0x3f366659, 0x3ee560cb,
+ 0x3e3e1967, 0x3caab00e, 0x3e93334e, 0x3f0d4f9a, 0x3f5079a6, 0x3f6ce5f8,
+ 0x3f2acd10, 0x3ed272ff, 0x3e20a4c5, 0x3d98d042, 0x3eaa65e0, 0x3f16c680,
+ 0x3f57d6cf, 0x3f679a1b, 0x3f278a40, 0x3ecfef5c, 0x3e2381fd, 0x3dc32f28,
+ 0x3eb0eb80, 0x3f180852, 0x3f571f81, 0x3f6a42d8, 0x3f2c1ce8, 0x3edcd9d1,
+ 0x3e44c475, 0x3dade93f, 0x3ea7c630, 0x3f119318, 0x3f4ecee3, 0x3f7467d4,
+ 0x3f380f62, 0x3ef84c54, 0x3e815525, 0x3cb361d7, 0x3d3982c8, 0x3e8fe13b,
+ 0x3f03d9d6, 0x3f3f556d, 0x3f7a64f1, 0x3f4af618, 0x3f10ba30, 0x3eadcbc5,
+ 0x3debbe02, 0x3e5427a0, 0x3ede8b9f, 0x3f291a1d, 0x3f628840, 0x3f646e63,
+ 0x3f2bc86b, 0x3ee70902, 0x3e6e854e, 0x3c83b300, 0x3ddc8cea, 0x3ea86f2a,
+ 0x3f0c7b7f, 0x3f445eac, 0x3f7be268, 0x3f4cf80b, 0x3f162f6e, 0x3ebf8516,
+ 0x3e26c0c2, 0x3e4c1fd5, 0x3ed3a124, 0x3f203d75, 0x3f564fd0, 0x3f73f733,
+ 0x3f3e966d, 0x3f098cbf, 0x3ea9b21c, 0x3e01e917, 0x3d408cd1, 0x3e82d326,
+ 0x3eece682, 0x3f2b26f2, 0x3f5f85ba, 0x3f6c6f56, 0x3f38b733, 0x3f0550d9,
+ 0x3ea47689, 0x3dfbabd7, 0x3d9c8552, 0x3e8e919a, 0x3ef55e4f, 0x3f2dc4bb,
+ 0x3f608a85, 0x3f6cfe84, 0x3f3ad56c, 0x3f08f945, 0x3eaed247, 0x3e189086,
+ 0x3d980be2, 0x3e8a5528, 0x3eee0d76, 0x3f2896dc, 0x3f59dbde, 0x3f75295d,
+ 0x3f4477f6, 0x3f140f14, 0x3ec7dbbd, 0x3e504e0f, 0x3c8fe67e, 0x3d2d6a38,
+ 0x3e6e2028, 0x3ed7e1d9, 0x3f1c1221, 0x3f4bec7c, 0x3f7b80cc, 0x3f553023,
+ 0x3f262589, 0x3eeebd40, 0x3e91b54d, 0x3dd4c6f8, 0x3e2b3f74, 0x3eb3b4ef,
+ 0x3f08a160, 0x3f372559, 0x3f656721, 0x3f6c988d, 0x3f3ed8f9, 0x3f11596d,
+ 0x3ec83270, 0x3e5c5ea4, 0x3d254149, 0x3d9b3b97, 0x3e824e0e, 0x3edd4d25,
+ 0x3f1be6c8, 0x3f48e857, 0x3f75abeb, 0x3f5dcdd1, 0x3f318436, 0x3f0576a0,
+ 0x3eb348db, 0x3e3833fb, 0x3c2bedc9, 0x3e08c8be, 0x3e9cf794, 0x3ef512c0,
+ 0x3f265b92, 0x3f51f301, 0x3f7d5049, 0x3f578bfc, 0x3f2ca136, 0x3f01eecf,
+ 0x3eaee867, 0x3e34c34c, 0x3c490794, 0x3e21d00f, 0x3ea6bd94, 0x3efc2262,
+ 0x3f288bcc, 0x3f52cf2d, 0x3f7cdbe2, 0x3f594d89, 0x3f2fac87, 0x3f064092,
+ 0x3eba1245, 0x3e5016cd, 0x3d335c27, 0x3e1ac9dd, 0x3ea0a6f1, 0x3ef37edc,
+ 0x3f22f6de, 0x3f4bfa4d, 0x3f74ca3e, 0x3f6298cf, 0x3f3a2e5b, 0x3f11f5e8,
+ 0x3ed3ddf9, 0x3e84323c, 0x3dd39eaa, 0x3deb3986, 0x3e8ba34a, 0x3edc142f,
+ 0x3f161103, 0x3f3de6e2, 0x3f658c2b, 0x3f72feac, 0x3f4bb92e, 0x3f24a2e9,
+ 0x3efb76d9, 0x3eae048f, 0x3e41dc34, 0x3d219509, 0x3d50153e, 0x3e511b46,
+ 0x3eb6ba2d, 0x3f024494, 0x3f28fdb9, 0x3f4f88f3, 0x3f75e6af, 0x3f63e8a7,
+ 0x3f3de4a8, 0x3f180cea, 0x3ee4c20e, 0x3e99c134, 0x3e1e2cfc, 0x3c1824f4,
+ 0x3de0bac6, 0x3e8436b1, 0x3ecfe62d, 0x3f0d9ef9, 0x3f331f66, 0x3f5874c1,
+ 0x3f7d9f6c, 0x3f5d6037, 0x3f3889c9, 0x3f13dcea, 0x3edeb27d, 0x3e95fcd3,
+ 0x3e1b303e, 0x3c3075cb, 0x3e0a7f24, 0x3e8eec6e, 0x3ed8462b, 0x3f10a6c1,
+ 0x3f350197, 0x3f5933f1, 0x3f7d3e29, 0x3f5edf68, 0x3f3b246a, 0x3f179088,
+ 0x3ee846d8, 0x3ea1b983, 0x3e36f0d8, 0x3d2c1773, 0x3e048260, 0x3e89b72b,
+ 0x3ed0def0, 0x3f0bdc94, 0x3f2f233e, 0x3f5243ca, 0x3f753e89, 0x3f67ec34,
+ 0x3f453c1d, 0x3f22b0e2, 0x3f004a36, 0x3ebc0f98, 0x3e6fa55d, 0x3dcf7467,
+ 0x3dc09e5f, 0x3e6b0f8d, 0x3eba9e3c, 0x3eff6b94, 0x3f21f834, 0x3f4416a9,
+ 0x3f661173, 0x3f781723, 0x3f5662cf, 0x3f34d14a, 0x3f13624c, 0x3ee42b1a,
+ 0x3ea1d591, 0x3e3f86e3, 0x3d6fa1a1, 0x3cfd1ba9, 0x3e2674c3, 0x3e965d6c,
+ 0x3ed93b69, 0x3f0dea73, 0x3f2f1538, 0x3f501e47, 0x3f7105e6, 0x3f6e33a9,
+ 0x3f4d8e22, 0x3f2d0944, 0x3f0ca4cd, 0x3ed8c0fd, 0x3e98782f, 0x3e30dd66,
+ 0x3d452061, 0x3d8e62bc, 0x3e49c779, 0x3ea5ed78, 0x3ee6b665, 0x3f139f81,
+ 0x3f33c3e8, 0x3f53c8a7, 0x3f73adfa, 0x3f6c8be0, 0x3f4ce4ab, 0x3f2d5c2a,
+ 0x3f0df223, 0x3edd4cb5, 0x3e9ef12d, 0x3e41a276, 0x3d8bb1ba, 0x3d9ba0ff,
+ 0x3e4c6d54, 0x3ea547ab, 0x3ee41bba, 0x3f1159a6, 0x3f30876a, 0x3f4f9762,
+ 0x3f6e89c9, 0x3f72a12b, 0x3f53e942, 0x3f354e46, 0x3f16cffe, 0x3ef0dc6f,
+ 0x3eb45177, 0x3e6ffd59, 0x3def8e9c
+
+};
+
+const int32_t window_hex[] = {
+ 0x00000000, 0x398b03f6, 0x3a61d1c5, 0x3ae0ee32, 0x3b37623a, 0x3b85f871,
+ 0x3bb69d19, 0x3bed453b, 0x3c14d40b, 0x3c35c45b, 0x3c59595d, 0x3c7f7c1d,
+ 0x3c940c13, 0x3ca98d81, 0x3cc039eb, 0x3cd8098d, 0x3cf0f52e, 0x3d057b06,
+ 0x3d1302e6, 0x3d210f33, 0x3d2f9d0e, 0x3d3ea9ba, 0x3d4e3293, 0x3d5e3510,
+ 0x3d6eaebd, 0x3d7f9d38, 0x3d887f19, 0x3d9167b5, 0x3d9a8756, 0x3da3dce9,
+ 0x3dad675d, 0x3db725ab, 0x3dc116cc, 0x3dcb39bf, 0x3dd58d86, 0x3de01126,
+ 0x3deac3a7, 0x3df5a413, 0x3e0058bb, 0x3e05f571, 0x3e0ba7b2, 0x3e116f08,
+ 0x3e174afe, 0x3e1d3b1c, 0x3e233ef0, 0x3e295605, 0x3e2f7fe7, 0x3e35bc23,
+ 0x3e3c0a46, 0x3e4269de, 0x3e48da79, 0x3e4f5ba5, 0x3e55ecf2, 0x3e5c8ded,
+ 0x3e633e26, 0x3e69fd2c, 0x3e70ca8f, 0x3e77a5de, 0x3e7e8eaa, 0x3e82c241,
+ 0x3e86437c, 0x3e89cacd, 0x3e8d57fc, 0x3e90ead3, 0x3e948319, 0x3e982097,
+ 0x3e9bc316, 0x3e9f6a5d, 0x3ea31636, 0x3ea6c66a, 0x3eaa7ac0, 0x3eae3303,
+ 0x3eb1eefa, 0x3eb5ae6f, 0x3eb9712a, 0x3ebd36f6, 0x3ec0ff9b, 0x3ec4cae2,
+ 0x3ec89895, 0x3ecc687d, 0x3ed03a64, 0x3ed40e13, 0x3ed7e354, 0x3edbb9f2,
+ 0x3edf91b5, 0x3ee36a69, 0x3ee743d7, 0x3eeb1dca, 0x3eeef80c, 0x3ef2d267,
+ 0x3ef6aca8, 0x3efa8698, 0x3efe6002, 0x3f011c59, 0x3f03083a, 0x3f04f389,
+ 0x3f06de2d, 0x3f08c80b, 0x3f0ab10a, 0x3f0c990f, 0x3f0e8001, 0x3f1065c6,
+ 0x3f124a45, 0x3f142d65, 0x3f160f0c, 0x3f17ef21, 0x3f19cd8b, 0x3f1baa32,
+ 0x3f1d84fb, 0x3f1f5dd0, 0x3f213498, 0x3f230939, 0x3f24db9d, 0x3f26abaa,
+ 0x3f28794a, 0x3f2a4464, 0x3f2c0ce1, 0x3f2dd2a9, 0x3f2f95a6, 0x3f3155bf,
+ 0x3f3312e0, 0x3f34ccef, 0x3f3683d8, 0x3f383784, 0x3f39e7dd, 0x3f3b94cc,
+ 0x3f3d3e3c, 0x3f3ee418, 0x3f40864a, 0x3f4224bd, 0x3f43bf5c, 0x3f455613,
+ 0x3f46e8cc, 0x3f487774, 0x3f4a01f6, 0x3f4b883f, 0x3f4d0a3b, 0x3f4e87d6,
+ 0x3f5000fe, 0x3f5175a0, 0x3f52e5a9, 0x3f545106, 0x3f55b7a5, 0x3f571975,
+ 0x3f587664, 0x3f59ce60, 0x3f5b2158, 0x3f5c6f3b, 0x3f5db7f9, 0x3f5efb80,
+ 0x3f6039c2, 0x3f6172af, 0x3f62a636, 0x3f63d448, 0x3f64fcd6, 0x3f661fd3,
+ 0x3f673d2e, 0x3f6854db, 0x3f6966ca, 0x3f6a72ef, 0x3f6b793d, 0x3f6c79a5,
+ 0x3f6d741d, 0x3f6e6896, 0x3f6f5706, 0x3f703f5f, 0x3f712198, 0x3f71fda4,
+ 0x3f72d379, 0x3f73a30c, 0x3f746c52, 0x3f752f43, 0x3f75ebd4, 0x3f76a1fc,
+ 0x3f7751b2, 0x3f77faee, 0x3f789da6, 0x3f7939d4, 0x3f79cf6e, 0x3f7a5e6f,
+ 0x3f7ae6cf, 0x3f7b6886, 0x3f7be38f, 0x3f7c57e4, 0x3f7cc57f, 0x3f7d2c5b,
+ 0x3f7d8c72, 0x3f7de5bf, 0x3f7e3840, 0x3f7e83ee, 0x3f7ec8c7, 0x3f7f06c7,
+ 0x3f7f3deb, 0x3f7f6e31, 0x3f7f9795, 0x3f7fba17, 0x3f7fd5b4, 0x3f7fea6b,
+ 0x3f7ff83b, 0x3f7fff23, 0x3f7fff23, 0x3f7ff83b, 0x3f7fea6b, 0x3f7fd5b4,
+ 0x3f7fba17, 0x3f7f9795, 0x3f7f6e31, 0x3f7f3deb, 0x3f7f06c7, 0x3f7ec8c7,
+ 0x3f7e83ee, 0x3f7e3840, 0x3f7de5bf, 0x3f7d8c72, 0x3f7d2c5b, 0x3f7cc57f,
+ 0x3f7c57e4, 0x3f7be38f, 0x3f7b6886, 0x3f7ae6cf, 0x3f7a5e6f, 0x3f79cf6e,
+ 0x3f7939d4, 0x3f789da6, 0x3f77faee, 0x3f7751b2, 0x3f76a1fc, 0x3f75ebd4,
+ 0x3f752f43, 0x3f746c52, 0x3f73a30c, 0x3f72d379, 0x3f71fda4, 0x3f712198,
+ 0x3f703f5f, 0x3f6f5706, 0x3f6e6896, 0x3f6d741d, 0x3f6c79a5, 0x3f6b793d,
+ 0x3f6a72ef, 0x3f6966ca, 0x3f6854db, 0x3f673d2e, 0x3f661fd3, 0x3f64fcd6,
+ 0x3f63d448, 0x3f62a636, 0x3f6172af, 0x3f6039c2, 0x3f5efb80, 0x3f5db7f9,
+ 0x3f5c6f3b, 0x3f5b2158, 0x3f59ce60, 0x3f587664, 0x3f571975, 0x3f55b7a5,
+ 0x3f545106, 0x3f52e5a9, 0x3f5175a0, 0x3f5000fe, 0x3f4e87d6, 0x3f4d0a3b,
+ 0x3f4b883f, 0x3f4a01f6, 0x3f487774, 0x3f46e8cc, 0x3f455613, 0x3f43bf5c,
+ 0x3f4224bd, 0x3f40864a, 0x3f3ee418, 0x3f3d3e3c, 0x3f3b94cc, 0x3f39e7dd,
+ 0x3f383784, 0x3f3683d8, 0x3f34ccef, 0x3f3312e0, 0x3f3155bf, 0x3f2f95a6,
+ 0x3f2dd2a9, 0x3f2c0ce1, 0x3f2a4464, 0x3f28794a, 0x3f26abaa, 0x3f24db9d,
+ 0x3f230939, 0x3f213498, 0x3f1f5dd0, 0x3f1d84fb, 0x3f1baa32, 0x3f19cd8b,
+ 0x3f17ef21, 0x3f160f0c, 0x3f142d65, 0x3f124a45, 0x3f1065c6, 0x3f0e8001,
+ 0x3f0c990f, 0x3f0ab10a, 0x3f08c80b, 0x3f06de2d, 0x3f04f389, 0x3f03083a,
+ 0x3f011c59, 0x3efe6002, 0x3efa8698, 0x3ef6aca8, 0x3ef2d267, 0x3eeef80c,
+ 0x3eeb1dca, 0x3ee743d7, 0x3ee36a69, 0x3edf91b5, 0x3edbb9f2, 0x3ed7e354,
+ 0x3ed40e13, 0x3ed03a64, 0x3ecc687d, 0x3ec89895, 0x3ec4cae2, 0x3ec0ff9b,
+ 0x3ebd36f6, 0x3eb9712a, 0x3eb5ae6f, 0x3eb1eefa, 0x3eae3303, 0x3eaa7ac0,
+ 0x3ea6c66a, 0x3ea31636, 0x3e9f6a5d, 0x3e9bc316, 0x3e982097, 0x3e948319,
+ 0x3e90ead3, 0x3e8d57fc, 0x3e89cacd, 0x3e86437c, 0x3e82c241, 0x3e7e8eaa,
+ 0x3e77a5de, 0x3e70ca8f, 0x3e69fd2c, 0x3e633e26, 0x3e5c8ded, 0x3e55ecf2,
+ 0x3e4f5ba5, 0x3e48da79, 0x3e4269de, 0x3e3c0a46, 0x3e35bc23, 0x3e2f7fe7,
+ 0x3e295605, 0x3e233ef0, 0x3e1d3b1c, 0x3e174afe, 0x3e116f08, 0x3e0ba7b2,
+ 0x3e05f571, 0x3e0058bb, 0x3df5a413, 0x3deac3a7, 0x3de01126, 0x3dd58d86,
+ 0x3dcb39bf, 0x3dc116cc, 0x3db725ab, 0x3dad675d, 0x3da3dce9, 0x3d9a8756,
+ 0x3d9167b5, 0x3d887f19, 0x3d7f9d38, 0x3d6eaebd, 0x3d5e3510, 0x3d4e3293,
+ 0x3d3ea9ba, 0x3d2f9d0e, 0x3d210f33, 0x3d1302e6, 0x3d057b06, 0x3cf0f52e,
+ 0x3cd8098d, 0x3cc039eb, 0x3ca98d81, 0x3c940c13, 0x3c7f7c1d, 0x3c59595d,
+ 0x3c35c45b, 0x3c14d40b, 0x3bed453b, 0x3bb69d19, 0x3b85f871, 0x3b37623a,
+ 0x3ae0ee32, 0x3a61d1c5, 0x398b03f6, 0x00000000
+
+};
+
+const int32_t window_hamm_hex[] = {
+ 0x3da3d70a, 0x3da3f4f1, 0x3da44ea4, 0x3da4e41d, 0x3da5b554, 0x3da6c239,
+ 0x3da80abd, 0x3da98ecb, 0x3dab4e4a, 0x3dad491d, 0x3daf7f25, 0x3db1f03d,
+ 0x3db49c3e, 0x3db782fd, 0x3dbaa449, 0x3dbdfff1, 0x3dc195be, 0x3dc56575,
+ 0x3dc96ed9, 0x3dcdb1a8, 0x3dd22d9d, 0x3dd6e26e, 0x3ddbcfd0, 0x3de0f572,
+ 0x3de65301, 0x3debe825, 0x3df1b484, 0x3df7b7c0, 0x3dfdf176, 0x3e0230a1,
+ 0x3e05835d, 0x3e08f0ba, 0x3e0c7880, 0x3e101a75, 0x3e13d65f, 0x3e17ac00,
+ 0x3e1b9b1b, 0x3e1fa36f, 0x3e23c4bc, 0x3e27febd, 0x3e2c512e, 0x3e30bbc9,
+ 0x3e353e46, 0x3e39d85c, 0x3e3e89c0, 0x3e435226, 0x3e483140, 0x3e4d26be,
+ 0x3e523251, 0x3e5753a7, 0x3e5c8a6b, 0x3e61d64a, 0x3e6736ec, 0x3e6cabfc,
+ 0x3e72351f, 0x3e77d1fd, 0x3e7d8239, 0x3e81a2bc, 0x3e848dae, 0x3e8781c3,
+ 0x3e8a7eca, 0x3e8d8495, 0x3e9092f0, 0x3e93a9ab, 0x3e96c894, 0x3e99ef77,
+ 0x3e9d1e22, 0x3ea05460, 0x3ea391ff, 0x3ea6d6c8, 0x3eaa2286, 0x3ead7505,
+ 0x3eb0ce0f, 0x3eb42d6c, 0x3eb792e6, 0x3ebafe46, 0x3ebe6f54, 0x3ec1e5d9,
+ 0x3ec5619c, 0x3ec8e264, 0x3ecc67f8, 0x3ecff220, 0x3ed380a2, 0x3ed71344,
+ 0x3edaa9cb, 0x3ede43fe, 0x3ee1e1a3, 0x3ee5827d, 0x3ee92653, 0x3eeccce9,
+ 0x3ef07604, 0x3ef42168, 0x3ef7ceda, 0x3efb7e1d, 0x3eff2ef7, 0x3f017096,
+ 0x3f034a3f, 0x3f052459, 0x3f06fec5, 0x3f08d967, 0x3f0ab41f, 0x3f0c8ed0,
+ 0x3f0e695b, 0x3f1043a2, 0x3f121d87, 0x3f13f6ec, 0x3f15cfb4, 0x3f17a7bf,
+ 0x3f197ef0, 0x3f1b5529, 0x3f1d2a4d, 0x3f1efe3d, 0x3f20d0db, 0x3f22a20b,
+ 0x3f2471ae, 0x3f263fa8, 0x3f280bda, 0x3f29d628, 0x3f2b9e74, 0x3f2d64a2,
+ 0x3f2f2895, 0x3f30ea30, 0x3f32a956, 0x3f3465ec, 0x3f361fd4, 0x3f37d6f3,
+ 0x3f398b2d, 0x3f3b3c66, 0x3f3cea83, 0x3f3e9569, 0x3f403cfb, 0x3f41e121,
+ 0x3f4381be, 0x3f451eb8, 0x3f46b7f6, 0x3f484d5d, 0x3f49ded3, 0x3f4b6c3f,
+ 0x3f4cf588, 0x3f4e7a94, 0x3f4ffb4c, 0x3f517796, 0x3f52ef5a, 0x3f546282,
+ 0x3f55d0f4, 0x3f573a9a, 0x3f589f5d, 0x3f59ff26, 0x3f5b59df, 0x3f5caf72,
+ 0x3f5dffc9, 0x3f5f4acf, 0x3f60906f, 0x3f61d093, 0x3f630b29, 0x3f64401b,
+ 0x3f656f57, 0x3f6698c9, 0x3f67bc5d, 0x3f68da03, 0x3f69f1a6, 0x3f6b0337,
+ 0x3f6c0ea3, 0x3f6d13d9, 0x3f6e12c9, 0x3f6f0b62, 0x3f6ffd95, 0x3f70e953,
+ 0x3f71ce8c, 0x3f72ad32, 0x3f738537, 0x3f74568d, 0x3f752127, 0x3f75e4f8,
+ 0x3f76a1f3, 0x3f77580d, 0x3f780739, 0x3f78af6e, 0x3f79509f, 0x3f79eac3,
+ 0x3f7a7dd1, 0x3f7b09be, 0x3f7b8e83, 0x3f7c0c15, 0x3f7c826e, 0x3f7cf187,
+ 0x3f7d5957, 0x3f7db9d8, 0x3f7e1305, 0x3f7e64d7, 0x3f7eaf4a, 0x3f7ef258,
+ 0x3f7f2dfe, 0x3f7f6237, 0x3f7f8f00, 0x3f7fb457, 0x3f7fd239, 0x3f7fe8a4,
+ 0x3f7ff797, 0x3f7fff11, 0x3f7fff11, 0x3f7ff797, 0x3f7fe8a4, 0x3f7fd239,
+ 0x3f7fb457, 0x3f7f8f00, 0x3f7f6237, 0x3f7f2dfe, 0x3f7ef258, 0x3f7eaf4a,
+ 0x3f7e64d7, 0x3f7e1305, 0x3f7db9d8, 0x3f7d5957, 0x3f7cf187, 0x3f7c826e,
+ 0x3f7c0c15, 0x3f7b8e83, 0x3f7b09be, 0x3f7a7dd1, 0x3f79eac3, 0x3f79509f,
+ 0x3f78af6e, 0x3f780739, 0x3f77580d, 0x3f76a1f3, 0x3f75e4f8, 0x3f752127,
+ 0x3f74568d, 0x3f738537, 0x3f72ad32, 0x3f71ce8c, 0x3f70e953, 0x3f6ffd95,
+ 0x3f6f0b62, 0x3f6e12c9, 0x3f6d13d9, 0x3f6c0ea3, 0x3f6b0337, 0x3f69f1a6,
+ 0x3f68da03, 0x3f67bc5d, 0x3f6698c9, 0x3f656f57, 0x3f64401b, 0x3f630b29,
+ 0x3f61d093, 0x3f60906f, 0x3f5f4acf, 0x3f5dffc9, 0x3f5caf72, 0x3f5b59df,
+ 0x3f59ff26, 0x3f589f5d, 0x3f573a9a, 0x3f55d0f4, 0x3f546282, 0x3f52ef5a,
+ 0x3f517796, 0x3f4ffb4c, 0x3f4e7a94, 0x3f4cf588, 0x3f4b6c3f, 0x3f49ded3,
+ 0x3f484d5d, 0x3f46b7f6, 0x3f451eb8, 0x3f4381be, 0x3f41e121, 0x3f403cfb,
+ 0x3f3e9569, 0x3f3cea83, 0x3f3b3c66, 0x3f398b2d, 0x3f37d6f3, 0x3f361fd4,
+ 0x3f3465ec, 0x3f32a956, 0x3f30ea30, 0x3f2f2895, 0x3f2d64a2, 0x3f2b9e74,
+ 0x3f29d628, 0x3f280bda, 0x3f263fa8, 0x3f2471ae, 0x3f22a20b, 0x3f20d0db,
+ 0x3f1efe3d, 0x3f1d2a4d, 0x3f1b5529, 0x3f197ef0, 0x3f17a7bf, 0x3f15cfb4,
+ 0x3f13f6ec, 0x3f121d87, 0x3f1043a2, 0x3f0e695b, 0x3f0c8ed0, 0x3f0ab41f,
+ 0x3f08d967, 0x3f06fec5, 0x3f052459, 0x3f034a3f, 0x3f017096, 0x3eff2ef7,
+ 0x3efb7e1d, 0x3ef7ceda, 0x3ef42168, 0x3ef07604, 0x3eeccce9, 0x3ee92653,
+ 0x3ee5827d, 0x3ee1e1a3, 0x3ede43fe, 0x3edaa9cb, 0x3ed71344, 0x3ed380a2,
+ 0x3ecff220, 0x3ecc67f8, 0x3ec8e264, 0x3ec5619c, 0x3ec1e5d9, 0x3ebe6f54,
+ 0x3ebafe46, 0x3eb792e6, 0x3eb42d6c, 0x3eb0ce0f, 0x3ead7505, 0x3eaa2286,
+ 0x3ea6d6c8, 0x3ea391ff, 0x3ea05460, 0x3e9d1e22, 0x3e99ef77, 0x3e96c894,
+ 0x3e93a9ab, 0x3e9092f0, 0x3e8d8495, 0x3e8a7eca, 0x3e8781c3, 0x3e848dae,
+ 0x3e81a2bc, 0x3e7d8239, 0x3e77d1fd, 0x3e72351f, 0x3e6cabfc, 0x3e6736ec,
+ 0x3e61d64a, 0x3e5c8a6b, 0x3e5753a7, 0x3e523251, 0x3e4d26be, 0x3e483140,
+ 0x3e435226, 0x3e3e89c0, 0x3e39d85c, 0x3e353e46, 0x3e30bbc9, 0x3e2c512e,
+ 0x3e27febd, 0x3e23c4bc, 0x3e1fa36f, 0x3e1b9b1b, 0x3e17ac00, 0x3e13d65f,
+ 0x3e101a75, 0x3e0c7880, 0x3e08f0ba, 0x3e05835d, 0x3e0230a1, 0x3dfdf176,
+ 0x3df7b7c0, 0x3df1b484, 0x3debe825, 0x3de65301, 0x3de0f572, 0x3ddbcfd0,
+ 0x3dd6e26e, 0x3dd22d9d, 0x3dcdb1a8, 0x3dc96ed9, 0x3dc56575, 0x3dc195be,
+ 0x3dbdfff1, 0x3dbaa449, 0x3db782fd, 0x3db49c3e, 0x3db1f03d, 0x3daf7f25,
+ 0x3dad491d, 0x3dab4e4a, 0x3da98ecb, 0x3da80abd, 0x3da6c239, 0x3da5b554,
+ 0x3da4e41d, 0x3da44ea4, 0x3da3f4f1, 0x3da3d70a
+
+};
+
// Global CMVN mean vector (80 mel bins), stored as IEEE-754 float bit
// patterns in int storage; consumers reinterpret as float. Machine-generated
// — do not hand-edit.
const int global_cmvn_mean_hex[] = {
    0x413d6566, 0x4147923f, 0x4156ab15, 0x41613d12, 0x416b155b, 0x41722783,
    0x4176cd05, 0x4178532a, 0x417aa3c3, 0x417aed19, 0x417d4d2c, 0x417e6abb,
    0x41805848, 0x418122ab, 0x41812b23, 0x418161a8, 0x41810ef9, 0x4180863a,
    0x41815d8f, 0x417ff8b2, 0x417de2aa, 0x4180a5f2, 0x417e8bd1, 0x418041ac,
    0x417f2d60, 0x4180487f, 0x417eb835, 0x418018d8, 0x417ef8c1, 0x417ea302,
    0x417f30cf, 0x417ea0bb, 0x417ebac2, 0x417faab6, 0x417fca4d, 0x41805e45,
    0x4180e308, 0x4180ef3e, 0x418109fc, 0x4180afa3, 0x418113e2, 0x4180c915,
    0x41819f86, 0x418190bf, 0x418220bd, 0x4182f2e5, 0x4183e1c7, 0x41843eec,
    0x4184b066, 0x418574db, 0x41852611, 0x4184fc81, 0x41851b2a, 0x4185a1c7,
    0x41861152, 0x41868c28, 0x41871930, 0x41871f83, 0x41868893, 0x4185d919,
    0x4185664b, 0x418480a6, 0x41840e3a, 0x41836ace, 0x4182b217, 0x4181cb79,
    0x4180fb13, 0x418098b9, 0x41805ded, 0x417ff69a, 0x417f49bd, 0x417ecef8,
    0x417e286c, 0x417d9135, 0x417cfff4, 0x417ca8f7, 0x417b2e8f, 0x41773788,
    0x4170b095, 0x4167417f};
+
// Global CMVN scale vector (80 mel bins), float bit patterns in int storage
// (presumably inverse standard deviations — TODO confirm against the
// consumer). Machine-generated — do not hand-edit.
const int global_cmvn_std_hex[] = {
    0x4040335e, 0x405235d3, 0x40589be4, 0x4054261f, 0x40544ba2, 0x40575418,
    0x405b6528, 0x40617999, 0x40605fcf, 0x405c9c6d, 0x40590796, 0x405899fc,
    0x405810b8, 0x40587c40, 0x40592b5e, 0x4057fb12, 0x4057028b, 0x405515d7,
    0x4053d714, 0x405418c7, 0x405536bc, 0x4052f54e, 0x4052d382, 0x4051201d,
    0x4050a8d2, 0x4050857f, 0x404ffe85, 0x4050a0da, 0x40517a8a, 0x40508862,
    0x40504f68, 0x404f3159, 0x404f0930, 0x404e8a2e, 0x404e7383, 0x404eb185,
    0x404edaa9, 0x404efed2, 0x404ea8f4, 0x404f6d0d, 0x404ee9d9, 0x404f4cca,
    0x404fb13f, 0x405051c5, 0x40503f5e, 0x4050df6e, 0x4052974e, 0x4053d421,
    0x40544d48, 0x40544ec8, 0x40550e57, 0x40558287, 0x4055d122, 0x4056b22a,
    0x4058ea5c, 0x405acbc3, 0x405a89e7, 0x405a88ed, 0x405afadb, 0x405a1c60,
    0x405a6f46, 0x405b0a24, 0x405b5f44, 0x405cc0a9, 0x405d984b, 0x405ef9b8,
    0x4061178a, 0x406262bf, 0x40644904, 0x40660b20, 0x4067f7f1, 0x406a35e5,
    0x406c1e97, 0x406e16a9, 0x406eadb1, 0x406d0cba, 0x406d9ca0, 0x406f5a14,
    0x406e84a7, 0x406cd985};
+
// Online (streaming) variant of the global CMVN mean vector (80 mel bins),
// float bit patterns in int storage. Machine-generated — do not hand-edit.
const int global_cmvn_mean_online_hex[] = {

    0x413d5d27, 0x414785ae, 0x4156986a, 0x41612a4e, 0x416b063e, 0x41721c9b,
    0x4176c505, 0x41784b5b, 0x417a9575, 0x417adfb2, 0x417d4153, 0x417e611e,
    0x41805288, 0x41811c27, 0x4181250c, 0x41815cd4, 0x41810b77, 0x4180817c,
    0x41815881, 0x417feaf2, 0x417dd2bf, 0x41809f37, 0x417e7b47, 0x41803a6a,
    0x417f1ff4, 0x41804382, 0x417ead10, 0x41801220, 0x417eeb28, 0x417e9801,
    0x417f26b9, 0x417e95f9, 0x417eac06, 0x417f9aa5, 0x417fbb16, 0x41805651,
    0x4180daaa, 0x4180e84c, 0x41810566, 0x4180ab2c, 0x418111b0, 0x4180c6cc,
    0x41819e27, 0x418190cc, 0x4182205c, 0x4182f265, 0x4183e1a2, 0x41844012,
    0x4184b0cd, 0x41857447, 0x418527f7, 0x4184fdc6, 0x41851ad2, 0x4185a148,
    0x41860f8b, 0x41868888, 0x418712e4, 0x41871702, 0x41867ec3, 0x4185cc48,
    0x418559b4, 0x41847855, 0x418408f4, 0x418368f4, 0x4182b718, 0x4181d76d,
    0x41810e52, 0x4180b204, 0x418078a4, 0x41801179, 0x417f5579, 0x417e93b7,
    0x417d6f2c, 0x417c1a0b, 0x417a6c7a, 0x41787d18, 0x4174eceb, 0x416e3ed3,
    0x41644af8, 0x41566dd4

};
+
// Online (streaming) variant of the global CMVN scale vector (80 mel bins),
// float bit patterns in int storage. Machine-generated — do not hand-edit.
const int global_cmvn_std_online_hex[] = {

    0x40408fdd, 0x405293b6, 0x4058f2d2, 0x40546ddb, 0x4054984c, 0x4057971b,
    0x405ba086, 0x4061afa7, 0x4060a24c, 0x405cbb7e, 0x405923f7, 0x4058c91f,
    0x40585cf3, 0x4058c22a, 0x40594960, 0x405824a6, 0x405703f3, 0x40556377,
    0x4053e02d, 0x40540a7e, 0x405553c7, 0x4052ead5, 0x4052d23d, 0x40510308,
    0x4050a2f3, 0x40505b81, 0x404fed20, 0x4050a372, 0x40515196, 0x40504810,
    0x40501fdd, 0x404f2225, 0x404f0931, 0x404e8a2b, 0x404e773b, 0x404ea782,
    0x404ee17d, 0x404ef49c, 0x404e884d, 0x404f696b, 0x404edd0e, 0x404f23cc,
    0x404f74d4, 0x40501e89, 0x405009f3, 0x4050c422, 0x4052902b, 0x4053987c,
    0x40542997, 0x40543695, 0x4054cbef, 0x40553947, 0x4055ab7c, 0x4056887c,
    0x4058b710, 0x405a8d28, 0x405a6a27, 0x405a6b3b, 0x405ac8d3, 0x405a031d,
    0x405a2158, 0x405abb1b, 0x405b1350, 0x405c98c0, 0x405d5cf9, 0x405ead5b,
    0x40609748, 0x4061dfb9, 0x4063aa9f, 0x40655831, 0x40671a35, 0x40694bf5,
    0x406b1f59, 0x406cb49b, 0x406cf19e, 0x406b592b, 0x406b757c, 0x406c866d,
    0x406ac24f, 0x406678d9

};
+
+const unsigned int paraformer_cmvn_mean_hex[] = {
+
+ 0xc104fd75, 0xc1099d56, 0xc119dad7, 0xc126f9a7, 0xc133681f, 0xc13e221f,
+ 0xc145cc83, 0xc14a3166, 0xc14e1bda, 0xc14d4a62, 0xc14e41a9, 0xc14f4e7b,
+ 0xc153297e, 0xc1567ee5, 0xc157dbab, 0xc158dfa4, 0xc158e6f9, 0xc1584e70,
+ 0xc15aecea, 0xc15886b8, 0xc156bcb4, 0xc15a7ba9, 0xc1581d34, 0xc15c0a48,
+ 0xc15c463f, 0xc15dfc3b, 0xc15bb28b, 0xc15b4413, 0xc158f8c0, 0xc1588ede,
+ 0xc158c880, 0xc158ff19, 0xc159815a, 0xc159ed72, 0xc15a458d, 0xc15a93d3,
+ 0xc15a06ec, 0xc15953d8, 0xc1592e92, 0xc1579518, 0xc1587d76, 0xc157bc56,
+ 0xc159c47c, 0xc15a5ac4, 0xc15b7286, 0xc15cab60, 0xc15e7f8d, 0xc1607ee5,
+ 0xc162e9ad, 0xc165bdb0, 0xc167bf3e, 0xc169a0a5, 0xc16b4b68, 0xc16d5682,
+ 0xc16ebd51, 0xc170197a, 0xc170d1cc, 0xc1707fc1, 0xc16fd830, 0xc16ec4b1,
+ 0xc16de888, 0xc16d3b06, 0xc16cc155, 0xc16c4e31, 0xc16b6abe, 0xc169cde8,
+ 0xc1684578, 0xc166c2a4, 0xc165d326, 0xc164df46, 0xc163b4ad, 0xc1632d19,
+ 0xc162a94a, 0xc16280fc, 0xc161ae3e, 0xc15fec42, 0xc15cbadc, 0xc15664c3,
+ 0xc14c6d5d, 0xc13b64ae, 0xc104fd75, 0xc1099d56, 0xc119dad7, 0xc126f9a7,
+ 0xc133681f, 0xc13e221f, 0xc145cc83, 0xc14a3166, 0xc14e1bda, 0xc14d4a62,
+ 0xc14e41a9, 0xc14f4e7b, 0xc153297e, 0xc1567ee5, 0xc157dbab, 0xc158dfa4,
+ 0xc158e6f9, 0xc1584e70, 0xc15aecea, 0xc15886b8, 0xc156bcb4, 0xc15a7ba9,
+ 0xc1581d34, 0xc15c0a48, 0xc15c463f, 0xc15dfc3b, 0xc15bb28b, 0xc15b4413,
+ 0xc158f8c0, 0xc1588ede, 0xc158c880, 0xc158ff19, 0xc159815a, 0xc159ed72,
+ 0xc15a458d, 0xc15a93d3, 0xc15a06ec, 0xc15953d8, 0xc1592e92, 0xc1579518,
+ 0xc1587d76, 0xc157bc56, 0xc159c47c, 0xc15a5ac4, 0xc15b7286, 0xc15cab60,
+ 0xc15e7f8d, 0xc1607ee5, 0xc162e9ad, 0xc165bdb0, 0xc167bf3e, 0xc169a0a5,
+ 0xc16b4b68, 0xc16d5682, 0xc16ebd51, 0xc170197a, 0xc170d1cc, 0xc1707fc1,
+ 0xc16fd830, 0xc16ec4b1, 0xc16de888, 0xc16d3b06, 0xc16cc155, 0xc16c4e31,
+ 0xc16b6abe, 0xc169cde8, 0xc1684578, 0xc166c2a4, 0xc165d326, 0xc164df46,
+ 0xc163b4ad, 0xc1632d19, 0xc162a94a, 0xc16280fc, 0xc161ae3e, 0xc15fec42,
+ 0xc15cbadc, 0xc15664c3, 0xc14c6d5d, 0xc13b64ae, 0xc104fd75, 0xc1099d56,
+ 0xc119dad7, 0xc126f9a7, 0xc133681f, 0xc13e221f, 0xc145cc83, 0xc14a3166,
+ 0xc14e1bda, 0xc14d4a62, 0xc14e41a9, 0xc14f4e7b, 0xc153297e, 0xc1567ee5,
+ 0xc157dbab, 0xc158dfa4, 0xc158e6f9, 0xc1584e70, 0xc15aecea, 0xc15886b8,
+ 0xc156bcb4, 0xc15a7ba9, 0xc1581d34, 0xc15c0a48, 0xc15c463f, 0xc15dfc3b,
+ 0xc15bb28b, 0xc15b4413, 0xc158f8c0, 0xc1588ede, 0xc158c880, 0xc158ff19,
+ 0xc159815a, 0xc159ed72, 0xc15a458d, 0xc15a93d3, 0xc15a06ec, 0xc15953d8,
+ 0xc1592e92, 0xc1579518, 0xc1587d76, 0xc157bc56, 0xc159c47c, 0xc15a5ac4,
+ 0xc15b7286, 0xc15cab60, 0xc15e7f8d, 0xc1607ee5, 0xc162e9ad, 0xc165bdb0,
+ 0xc167bf3e, 0xc169a0a5, 0xc16b4b68, 0xc16d5682, 0xc16ebd51, 0xc170197a,
+ 0xc170d1cc, 0xc1707fc1, 0xc16fd830, 0xc16ec4b1, 0xc16de888, 0xc16d3b06,
+ 0xc16cc155, 0xc16c4e31, 0xc16b6abe, 0xc169cde8, 0xc1684578, 0xc166c2a4,
+ 0xc165d326, 0xc164df46, 0xc163b4ad, 0xc1632d19, 0xc162a94a, 0xc16280fc,
+ 0xc161ae3e, 0xc15fec42, 0xc15cbadc, 0xc15664c3, 0xc14c6d5d, 0xc13b64ae,
+ 0xc104fd75, 0xc1099d56, 0xc119dad7, 0xc126f9a7, 0xc133681f, 0xc13e221f,
+ 0xc145cc83, 0xc14a3166, 0xc14e1bda, 0xc14d4a62, 0xc14e41a9, 0xc14f4e7b,
+ 0xc153297e, 0xc1567ee5, 0xc157dbab, 0xc158dfa4, 0xc158e6f9, 0xc1584e70,
+ 0xc15aecea, 0xc15886b8, 0xc156bcb4, 0xc15a7ba9, 0xc1581d34, 0xc15c0a48,
+ 0xc15c463f, 0xc15dfc3b, 0xc15bb28b, 0xc15b4413, 0xc158f8c0, 0xc1588ede,
+ 0xc158c880, 0xc158ff19, 0xc159815a, 0xc159ed72, 0xc15a458d, 0xc15a93d3,
+ 0xc15a06ec, 0xc15953d8, 0xc1592e92, 0xc1579518, 0xc1587d76, 0xc157bc56,
+ 0xc159c47c, 0xc15a5ac4, 0xc15b7286, 0xc15cab60, 0xc15e7f8d, 0xc1607ee5,
+ 0xc162e9ad, 0xc165bdb0, 0xc167bf3e, 0xc169a0a5, 0xc16b4b68, 0xc16d5682,
+ 0xc16ebd51, 0xc170197a, 0xc170d1cc, 0xc1707fc1, 0xc16fd830, 0xc16ec4b1,
+ 0xc16de888, 0xc16d3b06, 0xc16cc155, 0xc16c4e31, 0xc16b6abe, 0xc169cde8,
+ 0xc1684578, 0xc166c2a4, 0xc165d326, 0xc164df46, 0xc163b4ad, 0xc1632d19,
+ 0xc162a94a, 0xc16280fc, 0xc161ae3e, 0xc15fec42, 0xc15cbadc, 0xc15664c3,
+ 0xc14c6d5d, 0xc13b64ae, 0xc104fd75, 0xc1099d56, 0xc119dad7, 0xc126f9a7,
+ 0xc133681f, 0xc13e221f, 0xc145cc83, 0xc14a3166, 0xc14e1bda, 0xc14d4a62,
+ 0xc14e41a9, 0xc14f4e7b, 0xc153297e, 0xc1567ee5, 0xc157dbab, 0xc158dfa4,
+ 0xc158e6f9, 0xc1584e70, 0xc15aecea, 0xc15886b8, 0xc156bcb4, 0xc15a7ba9,
+ 0xc1581d34, 0xc15c0a48, 0xc15c463f, 0xc15dfc3b, 0xc15bb28b, 0xc15b4413,
+ 0xc158f8c0, 0xc1588ede, 0xc158c880, 0xc158ff19, 0xc159815a, 0xc159ed72,
+ 0xc15a458d, 0xc15a93d3, 0xc15a06ec, 0xc15953d8, 0xc1592e92, 0xc1579518,
+ 0xc1587d76, 0xc157bc56, 0xc159c47c, 0xc15a5ac4, 0xc15b7286, 0xc15cab60,
+ 0xc15e7f8d, 0xc1607ee5, 0xc162e9ad, 0xc165bdb0, 0xc167bf3e, 0xc169a0a5,
+ 0xc16b4b68, 0xc16d5682, 0xc16ebd51, 0xc170197a, 0xc170d1cc, 0xc1707fc1,
+ 0xc16fd830, 0xc16ec4b1, 0xc16de888, 0xc16d3b06, 0xc16cc155, 0xc16c4e31,
+ 0xc16b6abe, 0xc169cde8, 0xc1684578, 0xc166c2a4, 0xc165d326, 0xc164df46,
+ 0xc163b4ad, 0xc1632d19, 0xc162a94a, 0xc16280fc, 0xc161ae3e, 0xc15fec42,
+ 0xc15cbadc, 0xc15664c3, 0xc14c6d5d, 0xc13b64ae, 0xc104fd75, 0xc1099d56,
+ 0xc119dad7, 0xc126f9a7, 0xc133681f, 0xc13e221f, 0xc145cc83, 0xc14a3166,
+ 0xc14e1bda, 0xc14d4a62, 0xc14e41a9, 0xc14f4e7b, 0xc153297e, 0xc1567ee5,
+ 0xc157dbab, 0xc158dfa4, 0xc158e6f9, 0xc1584e70, 0xc15aecea, 0xc15886b8,
+ 0xc156bcb4, 0xc15a7ba9, 0xc1581d34, 0xc15c0a48, 0xc15c463f, 0xc15dfc3b,
+ 0xc15bb28b, 0xc15b4413, 0xc158f8c0, 0xc1588ede, 0xc158c880, 0xc158ff19,
+ 0xc159815a, 0xc159ed72, 0xc15a458d, 0xc15a93d3, 0xc15a06ec, 0xc15953d8,
+ 0xc1592e92, 0xc1579518, 0xc1587d76, 0xc157bc56, 0xc159c47c, 0xc15a5ac4,
+ 0xc15b7286, 0xc15cab60, 0xc15e7f8d, 0xc1607ee5, 0xc162e9ad, 0xc165bdb0,
+ 0xc167bf3e, 0xc169a0a5, 0xc16b4b68, 0xc16d5682, 0xc16ebd51, 0xc170197a,
+ 0xc170d1cc, 0xc1707fc1, 0xc16fd830, 0xc16ec4b1, 0xc16de888, 0xc16d3b06,
+ 0xc16cc155, 0xc16c4e31, 0xc16b6abe, 0xc169cde8, 0xc1684578, 0xc166c2a4,
+ 0xc165d326, 0xc164df46, 0xc163b4ad, 0xc1632d19, 0xc162a94a, 0xc16280fc,
+ 0xc161ae3e, 0xc15fec42, 0xc15cbadc, 0xc15664c3, 0xc14c6d5d, 0xc13b64ae,
+ 0xc104fd75, 0xc1099d56, 0xc119dad7, 0xc126f9a7, 0xc133681f, 0xc13e221f,
+ 0xc145cc83, 0xc14a3166, 0xc14e1bda, 0xc14d4a62, 0xc14e41a9, 0xc14f4e7b,
+ 0xc153297e, 0xc1567ee5, 0xc157dbab, 0xc158dfa4, 0xc158e6f9, 0xc1584e70,
+ 0xc15aecea, 0xc15886b8, 0xc156bcb4, 0xc15a7ba9, 0xc1581d34, 0xc15c0a48,
+ 0xc15c463f, 0xc15dfc3b, 0xc15bb28b, 0xc15b4413, 0xc158f8c0, 0xc1588ede,
+ 0xc158c880, 0xc158ff19, 0xc159815a, 0xc159ed72, 0xc15a458d, 0xc15a93d3,
+ 0xc15a06ec, 0xc15953d8, 0xc1592e92, 0xc1579518, 0xc1587d76, 0xc157bc56,
+ 0xc159c47c, 0xc15a5ac4, 0xc15b7286, 0xc15cab60, 0xc15e7f8d, 0xc1607ee5,
+ 0xc162e9ad, 0xc165bdb0, 0xc167bf3e, 0xc169a0a5, 0xc16b4b68, 0xc16d5682,
+ 0xc16ebd51, 0xc170197a, 0xc170d1cc, 0xc1707fc1, 0xc16fd830, 0xc16ec4b1,
+ 0xc16de888, 0xc16d3b06, 0xc16cc155, 0xc16c4e31, 0xc16b6abe, 0xc169cde8,
+ 0xc1684578, 0xc166c2a4, 0xc165d326, 0xc164df46, 0xc163b4ad, 0xc1632d19,
+ 0xc162a94a, 0xc16280fc, 0xc161ae3e, 0xc15fec42, 0xc15cbadc, 0xc15664c3,
+ 0xc14c6d5d, 0xc13b64ae};
+
+const unsigned int paraformer_cmvn_var_hex[] = {
+
+ 0x40619618, 0x405fb77c, 0x405d3028, 0x405bef11, 0x405a189d, 0x4057aad5,
+ 0x4054f9cc, 0x40518e8c, 0x404fffdd, 0x40510d0d, 0x4052400d, 0x4052bab0,
+ 0x40526416, 0x40515cb8, 0x40506aee, 0x404fef8d, 0x404ff527, 0x40505b95,
+ 0x4050d61c, 0x4051d0a5, 0x4052abd2, 0x4052f14b, 0x4053d196, 0x4054800d,
+ 0x405545f2, 0x4055d71f, 0x40567588, 0x4056de4d, 0x40579b72, 0x40584d35,
+ 0x4058cd2f, 0x40594731, 0x4059a53f, 0x405a00ed, 0x405a34c1, 0x405a406e,
+ 0x405a1748, 0x405a0300, 0x405a1547, 0x405a66a7, 0x405a9be4, 0x405b04b2,
+ 0x405b5754, 0x405b9189, 0x405b9016, 0x405b7a07, 0x405b63f9, 0x405b3f45,
+ 0x405b0cb4, 0x405ac80b, 0x405ac1f7, 0x405abbd9, 0x405ac86a, 0x405ad72b,
+ 0x405af2f0, 0x405ab465, 0x405a6364, 0x405a1350, 0x4059baa3, 0x4059911d,
+ 0x40597921, 0x40595564, 0x40593b8d, 0x4059310f, 0x40594e46, 0x40599bae,
+ 0x4059e703, 0x4059feec, 0x405a053a, 0x4059feaa, 0x4059d7a0, 0x40599386,
+ 0x40592d0e, 0x4058ce4c, 0x40587335, 0x4058396a, 0x40584ee1, 0x4058925a,
+ 0x40592f6d, 0x405a9f0a, 0x40619618, 0x405fb77c, 0x405d3028, 0x405bef11,
+ 0x405a189d, 0x4057aad5, 0x4054f9cc, 0x40518e8c, 0x404fffdd, 0x40510d0d,
+ 0x4052400d, 0x4052bab0, 0x40526416, 0x40515cb8, 0x40506aee, 0x404fef8d,
+ 0x404ff527, 0x40505b95, 0x4050d61c, 0x4051d0a5, 0x4052abd2, 0x4052f14b,
+ 0x4053d196, 0x4054800d, 0x405545f2, 0x4055d71f, 0x40567588, 0x4056de4d,
+ 0x40579b72, 0x40584d35, 0x4058cd2f, 0x40594731, 0x4059a53f, 0x405a00ed,
+ 0x405a34c1, 0x405a406e, 0x405a1748, 0x405a0300, 0x405a1547, 0x405a66a7,
+ 0x405a9be4, 0x405b04b2, 0x405b5754, 0x405b9189, 0x405b9016, 0x405b7a07,
+ 0x405b63f9, 0x405b3f45, 0x405b0cb4, 0x405ac80b, 0x405ac1f7, 0x405abbd9,
+ 0x405ac86a, 0x405ad72b, 0x405af2f0, 0x405ab465, 0x405a6364, 0x405a1350,
+ 0x4059baa3, 0x4059911d, 0x40597921, 0x40595564, 0x40593b8d, 0x4059310f,
+ 0x40594e46, 0x40599bae, 0x4059e703, 0x4059feec, 0x405a053a, 0x4059feaa,
+ 0x4059d7a0, 0x40599386, 0x40592d0e, 0x4058ce4c, 0x40587335, 0x4058396a,
+ 0x40584ee1, 0x4058925a, 0x40592f6d, 0x405a9f0a, 0x40619618, 0x405fb77c,
+ 0x405d3028, 0x405bef11, 0x405a189d, 0x4057aad5, 0x4054f9cc, 0x40518e8c,
+ 0x404fffdd, 0x40510d0d, 0x4052400d, 0x4052bab0, 0x40526416, 0x40515cb8,
+ 0x40506aee, 0x404fef8d, 0x404ff527, 0x40505b95, 0x4050d61c, 0x4051d0a5,
+ 0x4052abd2, 0x4052f14b, 0x4053d196, 0x4054800d, 0x405545f2, 0x4055d71f,
+ 0x40567588, 0x4056de4d, 0x40579b72, 0x40584d35, 0x4058cd2f, 0x40594731,
+ 0x4059a53f, 0x405a00ed, 0x405a34c1, 0x405a406e, 0x405a1748, 0x405a0300,
+ 0x405a1547, 0x405a66a7, 0x405a9be4, 0x405b04b2, 0x405b5754, 0x405b9189,
+ 0x405b9016, 0x405b7a07, 0x405b63f9, 0x405b3f45, 0x405b0cb4, 0x405ac80b,
+ 0x405ac1f7, 0x405abbd9, 0x405ac86a, 0x405ad72b, 0x405af2f0, 0x405ab465,
+ 0x405a6364, 0x405a1350, 0x4059baa3, 0x4059911d, 0x40597921, 0x40595564,
+ 0x40593b8d, 0x4059310f, 0x40594e46, 0x40599bae, 0x4059e703, 0x4059feec,
+ 0x405a053a, 0x4059feaa, 0x4059d7a0, 0x40599386, 0x40592d0e, 0x4058ce4c,
+ 0x40587335, 0x4058396a, 0x40584ee1, 0x4058925a, 0x40592f6d, 0x405a9f0a,
+ 0x40619618, 0x405fb77c, 0x405d3028, 0x405bef11, 0x405a189d, 0x4057aad5,
+ 0x4054f9cc, 0x40518e8c, 0x404fffdd, 0x40510d0d, 0x4052400d, 0x4052bab0,
+ 0x40526416, 0x40515cb8, 0x40506aee, 0x404fef8d, 0x404ff527, 0x40505b95,
+ 0x4050d61c, 0x4051d0a5, 0x4052abd2, 0x4052f14b, 0x4053d196, 0x4054800d,
+ 0x405545f2, 0x4055d71f, 0x40567588, 0x4056de4d, 0x40579b72, 0x40584d35,
+ 0x4058cd2f, 0x40594731, 0x4059a53f, 0x405a00ed, 0x405a34c1, 0x405a406e,
+ 0x405a1748, 0x405a0300, 0x405a1547, 0x405a66a7, 0x405a9be4, 0x405b04b2,
+ 0x405b5754, 0x405b9189, 0x405b9016, 0x405b7a07, 0x405b63f9, 0x405b3f45,
+ 0x405b0cb4, 0x405ac80b, 0x405ac1f7, 0x405abbd9, 0x405ac86a, 0x405ad72b,
+ 0x405af2f0, 0x405ab465, 0x405a6364, 0x405a1350, 0x4059baa3, 0x4059911d,
+ 0x40597921, 0x40595564, 0x40593b8d, 0x4059310f, 0x40594e46, 0x40599bae,
+ 0x4059e703, 0x4059feec, 0x405a053a, 0x4059feaa, 0x4059d7a0, 0x40599386,
+ 0x40592d0e, 0x4058ce4c, 0x40587335, 0x4058396a, 0x40584ee1, 0x4058925a,
+ 0x40592f6d, 0x405a9f0a, 0x40619618, 0x405fb77c, 0x405d3028, 0x405bef11,
+ 0x405a189d, 0x4057aad5, 0x4054f9cc, 0x40518e8c, 0x404fffdd, 0x40510d0d,
+ 0x4052400d, 0x4052bab0, 0x40526416, 0x40515cb8, 0x40506aee, 0x404fef8d,
+ 0x404ff527, 0x40505b95, 0x4050d61c, 0x4051d0a5, 0x4052abd2, 0x4052f14b,
+ 0x4053d196, 0x4054800d, 0x405545f2, 0x4055d71f, 0x40567588, 0x4056de4d,
+ 0x40579b72, 0x40584d35, 0x4058cd2f, 0x40594731, 0x4059a53f, 0x405a00ed,
+ 0x405a34c1, 0x405a406e, 0x405a1748, 0x405a0300, 0x405a1547, 0x405a66a7,
+ 0x405a9be4, 0x405b04b2, 0x405b5754, 0x405b9189, 0x405b9016, 0x405b7a07,
+ 0x405b63f9, 0x405b3f45, 0x405b0cb4, 0x405ac80b, 0x405ac1f7, 0x405abbd9,
+ 0x405ac86a, 0x405ad72b, 0x405af2f0, 0x405ab465, 0x405a6364, 0x405a1350,
+ 0x4059baa3, 0x4059911d, 0x40597921, 0x40595564, 0x40593b8d, 0x4059310f,
+ 0x40594e46, 0x40599bae, 0x4059e703, 0x4059feec, 0x405a053a, 0x4059feaa,
+ 0x4059d7a0, 0x40599386, 0x40592d0e, 0x4058ce4c, 0x40587335, 0x4058396a,
+ 0x40584ee1, 0x4058925a, 0x40592f6d, 0x405a9f0a, 0x40619618, 0x405fb77c,
+ 0x405d3028, 0x405bef11, 0x405a189d, 0x4057aad5, 0x4054f9cc, 0x40518e8c,
+ 0x404fffdd, 0x40510d0d, 0x4052400d, 0x4052bab0, 0x40526416, 0x40515cb8,
+ 0x40506aee, 0x404fef8d, 0x404ff527, 0x40505b95, 0x4050d61c, 0x4051d0a5,
+ 0x4052abd2, 0x4052f14b, 0x4053d196, 0x4054800d, 0x405545f2, 0x4055d71f,
+ 0x40567588, 0x4056de4d, 0x40579b72, 0x40584d35, 0x4058cd2f, 0x40594731,
+ 0x4059a53f, 0x405a00ed, 0x405a34c1, 0x405a406e, 0x405a1748, 0x405a0300,
+ 0x405a1547, 0x405a66a7, 0x405a9be4, 0x405b04b2, 0x405b5754, 0x405b9189,
+ 0x405b9016, 0x405b7a07, 0x405b63f9, 0x405b3f45, 0x405b0cb4, 0x405ac80b,
+ 0x405ac1f7, 0x405abbd9, 0x405ac86a, 0x405ad72b, 0x405af2f0, 0x405ab465,
+ 0x405a6364, 0x405a1350, 0x4059baa3, 0x4059911d, 0x40597921, 0x40595564,
+ 0x40593b8d, 0x4059310f, 0x40594e46, 0x40599bae, 0x4059e703, 0x4059feec,
+ 0x405a053a, 0x4059feaa, 0x4059d7a0, 0x40599386, 0x40592d0e, 0x4058ce4c,
+ 0x40587335, 0x4058396a, 0x40584ee1, 0x4058925a, 0x40592f6d, 0x405a9f0a,
+ 0x40619618, 0x405fb77c, 0x405d3028, 0x405bef11, 0x405a189d, 0x4057aad5,
+ 0x4054f9cc, 0x40518e8c, 0x404fffdd, 0x40510d0d, 0x4052400d, 0x4052bab0,
+ 0x40526416, 0x40515cb8, 0x40506aee, 0x404fef8d, 0x404ff527, 0x40505b95,
+ 0x4050d61c, 0x4051d0a5, 0x4052abd2, 0x4052f14b, 0x4053d196, 0x4054800d,
+ 0x405545f2, 0x4055d71f, 0x40567588, 0x4056de4d, 0x40579b72, 0x40584d35,
+ 0x4058cd2f, 0x40594731, 0x4059a53f, 0x405a00ed, 0x405a34c1, 0x405a406e,
+ 0x405a1748, 0x405a0300, 0x405a1547, 0x405a66a7, 0x405a9be4, 0x405b04b2,
+ 0x405b5754, 0x405b9189, 0x405b9016, 0x405b7a07, 0x405b63f9, 0x405b3f45,
+ 0x405b0cb4, 0x405ac80b, 0x405ac1f7, 0x405abbd9, 0x405ac86a, 0x405ad72b,
+ 0x405af2f0, 0x405ab465, 0x405a6364, 0x405a1350, 0x4059baa3, 0x4059911d,
+ 0x40597921, 0x40595564, 0x40593b8d, 0x4059310f, 0x40594e46, 0x40599bae,
+ 0x4059e703, 0x4059feec, 0x405a053a, 0x4059feaa, 0x4059d7a0, 0x40599386,
+ 0x40592d0e, 0x4058ce4c, 0x40587335, 0x4058396a, 0x40584ee1, 0x4058925a,
+ 0x40592f6d, 0x405a9f0a
+
+};
+
+const int pos_enc_coe_hex[] = {
+ 0x3f800000, 0x3f84b063, 0x3f898cc0, 0x3f8e96b2, 0x3f93cfe5, 0x3f993a15,
+ 0x3f9ed70c, 0x3fa4a8a8, 0x3faab0d5, 0x3fb0f193, 0x3fb76cf5, 0x3fbe2520,
+ 0x3fc51c50, 0x3fcc54d2, 0x3fd3d10c, 0x3fdb9378, 0x3fe39ea9, 0x3febf549,
+ 0x3ff49a1b, 0x3ffd8ffe, 0x40036cf4, 0x40083d78, 0x400d3b22, 0x40126799,
+ 0x4017c496, 0x401d53df, 0x4023174b, 0x402910c4, 0x402f4244, 0x4035adda,
+ 0x403c55a4, 0x40433bd9, 0x404a62c2, 0x4051ccbd, 0x40597c3f, 0x406173d4,
+ 0x4069b621, 0x407245e2, 0x407b25ed, 0x40822c9a, 0x4086f161, 0x408be2e0,
+ 0x409102bc, 0x409652a6, 0x409bd461, 0x40a189c1, 0x40a774aa, 0x40ad9711,
+ 0x40b3f300, 0x40ba8a92, 0x40c15ff6, 0x40c8756f, 0x40cfcd58, 0x40d76a1e,
+ 0x40df4e48, 0x40e77c73, 0x40eff755, 0x40f8c1be, 0x4100ef4c, 0x4105a873,
+ 0x410a8de6, 0x410fa144, 0x4114e43b, 0x411a588a, 0x41200000, 0x4125dc7c,
+ 0x412beff0, 0x41323c5f, 0x4138c3df, 0x413f889a, 0x41468cd0, 0x414dd2d2,
+ 0x41555d0a, 0x415d2df7, 0x41654832, 0x416dae69, 0x41766364, 0x417f6a07,
+ 0x418462a7, 0x41893c2b, 0x418e432a, 0x4193794e, 0x4198e051, 0x419e79ff,
+ 0x41a44831, 0x41aa4cd6, 0x41b089ea, 0x41b70180, 0x41bdb5bc, 0x41c4a8d7,
+ 0x41cbdd1e, 0x41d354f5, 0x41db12d6, 0x41e31950, 0x41eb6b0d, 0x41f40ad0,
+ 0x41fcfb72, 0x42031ff6, 0x4207eda7, 0x420ce865, 0x421211d5, 0x42176bad,
+ 0x421cf7b4, 0x4222b7c0, 0x4228adb9, 0x422edb98, 0x4235436b, 0x423be74f,
+ 0x4242c979, 0x4249ec31, 0x425151d4, 0x4258fcd6, 0x4260efc0, 0x42692d37,
+ 0x4271b7f3, 0x427a92cb, 0x4281e057, 0x4286a253, 0x428b90ed, 0x4290adc8,
+ 0x4295fa95, 0x429b7917, 0x42a12b1f, 0x42a71290, 0x42ad3160, 0x42b38995,
+ 0x42ba1d4a, 0x42c0eead, 0x42c80000, 0x42cf539b, 0x42d6ebec, 0x42decb76,
+ 0x42e6f4d6, 0x42ef6ac1, 0x42f83003, 0x4300a3c3, 0x43055a26, 0x430a3cbb,
+ 0x430f4d1f, 0x43148d01, 0x4319fe1e, 0x431fa244, 0x43257b51, 0x432b8b36,
+ 0x4331d3f4, 0x433857a1, 0x433f1865, 0x4346187e, 0x434d5a3e, 0x4354e00b,
+ 0x435cac64, 0x4364c1e0, 0x436d232b, 0x4375d30c, 0x437ed466, 0x43841519,
+ 0x4388ebc5, 0x438defd2, 0x439322e8, 0x439886c2, 0x439e1d27, 0x43a3e7f3,
+ 0x43a9e911, 0x43b0227e, 0x43b6964a, 0x43bd4698, 0x43c435a1, 0x43cb65b0,
+ 0x43d2d927, 0x43da927e, 0x43e29445, 0x43eae123, 0x43f37bd8, 0x43fc673e,
+ 0x4402d325, 0x44079e06, 0x440c95d8, 0x4411bc42, 0x441712f8, 0x441c9bbf,
+ 0x4422586d, 0x44284ae8, 0x442e7528, 0x4434d93a, 0x443b793b, 0x4442575d,
+ 0x444975e6, 0x4450d734, 0x44587db7, 0x44606bfa, 0x4468a49c, 0x44712a58,
+ 0x447a0000, 0x44819441, 0x44865373, 0x448b3f2a, 0x44905906, 0x4495a2b9,
+ 0x449b1e02, 0x44a0ccb4, 0x44a6b0b0, 0x44accbe9, 0x44b32067, 0x44b9b042,
+ 0x44c07da6, 0x44c78ad5, 0x44ceda26, 0x44d66e03, 0x44de48f1, 0x44e66d89,
+ 0x44eede7f, 0x44f79e9e, 0x45005867, 0x45050c07, 0x4509ebbf, 0x450ef92c,
+ 0x451435fb, 0x4519a3e8, 0x451f44bf, 0x45251a60, 0x452b26b7, 0x45316bc7,
+ 0x4537eba3, 0x453ea872, 0x4545a471, 0x454ce1f0, 0x45546355, 0x455c2b1d,
+ 0x45643bdc, 0x456c983e, 0x45754309, 0x457e3f1c, 0x4583c7b8, 0x45889b8f,
+ 0x458d9cab, 0x4592ccb6, 0x45982d67, 0x459dc087, 0x45a387ee, 0x45a98587,
+ 0x45afbb4e, 0x45b62b53, 0x45bcd7b6, 0x45c3c2af, 0x45caee88, 0x45d25da1,
+ 0x45da1272, 0x45e20f88, 0x45ea5789, 0x45f2ed34, 0x45fbd360, 0x46028680,
+ 0x46074e93, 0x460c437c, 0x461166e2, 0x4616ba77};
+
+const int pos_enc_div_term_hex[] = {
+ 0x3f800000, 0x3f76f410, 0x3f6e39f8, 0x3f65ced3, 0x3f5dafd7, 0x3f55da52,
+ 0x3f4e4bac, 0x3f470165, 0x3f3ff911, 0x3f39305c, 0x3f32a506, 0x3f2c54e5,
+ 0x3f263de0, 0x3f205df3, 0x3f1ab32b, 0x3f153ba8, 0x3f0ff59a, 0x3f0adf41,
+ 0x3f05f6ee, 0x3f013b01, 0x3ef953cf, 0x3ef0843c, 0x3ee80460, 0x3edfd167,
+ 0x3ed7e89b, 0x3ed0475c, 0x3ec8eb24, 0x3ec1d181, 0x3ebaf81a, 0x3eb45caa,
+ 0x3eadfcff, 0x3ea7d6fd, 0x3ea1e89b, 0x3e9c2fe1, 0x3e96aaea, 0x3e9157e1,
+ 0x3e8c3504, 0x3e87409d, 0x3e827909, 0x3e7bb965, 0x3e72d424, 0x3e6a3f5c,
+ 0x3e61f836, 0x3e59fbf3, 0x3e5247ed, 0x3e4ad998, 0x3e43ae7c, 0x3e3cc43a,
+ 0x3e361887, 0x3e2fa92d, 0x3e29740a, 0x3e23770f, 0x3e1db040, 0x3e181db4,
+ 0x3e12bd91, 0x3e0d8e0f, 0x3e088d77, 0x3e03ba20, 0x3dfe24e1, 0x3df529bb,
+ 0x3dec7fd5, 0x3de42450, 0x3ddc1466, 0x3dd44d6c, 0x3dcccccd, 0x3dc5900d,
+ 0x3dbe94c7, 0x3db7d8a9, 0x3db15978, 0x3dab150e, 0x3da50957, 0x3d9f3451,
+ 0x3d99940e, 0x3d9426b0, 0x3d8eea6c, 0x3d89dd84, 0x3d84fe4d, 0x3d804b29,
+ 0x3d778512, 0x3d6ec5da, 0x3d6655c3, 0x3d5e3202, 0x3d5657e4, 0x3d4ec4ce,
+ 0x3d47763f, 0x3d4069ca, 0x3d399d19, 0x3d330dec, 0x3d2cba15, 0x3d269f7d,
+ 0x3d20bc1d, 0x3d1b0e01, 0x3d159348, 0x3d104a21, 0x3d0b30cc, 0x3d064597,
+ 0x3d0186e2, 0x3cf9e635, 0x3cf11176, 0x3ce88c9c, 0x3ce054d2, 0x3cd86761,
+ 0x3cd0c1a8, 0x3cc9611d, 0x3cc24350, 0x3cbb65e3, 0x3cb4c691, 0x3cae6328,
+ 0x3ca8398b, 0x3ca247ad, 0x3c9c8b97, 0x3c970362, 0x3c91ad39, 0x3c8c8757,
+ 0x3c879008, 0x3c82c5a5, 0x3c7c4d33, 0x3c7362b9, 0x3c6ac8e7, 0x3c627ce5,
+ 0x3c5a7bf1, 0x3c52c366, 0x3c4b50b4, 0x3c442163, 0x3c3d3311, 0x3c368373,
+ 0x3c301052, 0x3c29d789, 0x3c23d70a, 0x3c1e0cd7, 0x3c187705, 0x3c1313ba,
+ 0x3c0de12d, 0x3c08dda5, 0x3c040779, 0x3bfeba1b, 0x3bf5b9b0, 0x3bed0ab3,
+ 0x3be4aa46, 0x3bdc95a0, 0x3bd4ca14, 0x3bcd450e, 0x3bc6040e, 0x3bbf04ae,
+ 0x3bb8449c, 0x3bb1c19b, 0x3bab7983, 0x3ba56a3f, 0x3b9f91cc, 0x3b99ee3b,
+ 0x3b947dae, 0x3b8f3e56, 0x3b8a2e77, 0x3b854c64, 0x3b80967d, 0x3b781668,
+ 0x3b6f520d, 0x3b66dd02, 0x3b5eb47a, 0x3b56d5bf, 0x3b4f3e37, 0x3b47eb5e,
+ 0x3b40dac5, 0x3b3a0a16, 0x3b33770f, 0x3b2d1f81, 0x3b270153, 0x3b211a7e,
+ 0x3b1b690d, 0x3b15eb1c, 0x3b109edb, 0x3b0b8287, 0x3b06946f, 0x3b01d2f1,
+ 0x3afa78f1, 0x3af19f03, 0x3ae91528, 0x3ae0d88b, 0x3ad8e673, 0x3ad13c3c,
+ 0x3ac9d75c, 0x3ac2b561, 0x3abbd3ec, 0x3ab530b7, 0x3aaec98e, 0x3aa89c52,
+ 0x3aa2a6f6, 0x3a9ce782, 0x3a975c0e, 0x3a9202c3, 0x3a8cd9db, 0x3a87dfa1,
+ 0x3a83126f, 0x3a7ce158, 0x3a73f1a2, 0x3a6b52c4, 0x3a6301e2, 0x3a5afc3b,
+ 0x3a533f27, 0x3a4bc816, 0x3a44948c, 0x3a3da229, 0x3a36ee9e, 0x3a3077b3,
+ 0x3a2a3b43, 0x3a24373e, 0x3a1e69a5, 0x3a18d08b, 0x3a136a16, 0x3a0e347c,
+ 0x3a092e02, 0x3a0454ff, 0x39ff4fad, 0x39f649f8, 0x39ed95e3, 0x39e5308a,
+ 0x39dd1726, 0x39d54706, 0x39cdbd95, 0x39c67853, 0x39bf74d7, 0x39b8b0cf,
+ 0x39b229fb, 0x39abde33, 0x39a5cb5f, 0x399fef7e, 0x399a489e, 0x3994d4df,
+ 0x398f9272, 0x398a7f9b, 0x39859aa9, 0x3980e1fe, 0x3978a814, 0x396fde93,
+ 0x39676491, 0x395f373e, 0x395753e5, 0x394fb7e7, 0x394860c1, 0x39414c02,
+ 0x393a7753, 0x3933e06f, 0x392d8529, 0x39276363, 0x39217917, 0x391bc44d,
+ 0x39164323, 0x3910f3c6, 0x390bd472, 0x3906e374, 0x39021f2b, 0x38fb0c03,
+ 0x38f22ce3, 0x38e99e04, 0x38e15c92, 0x38d965ce};
+#endif
diff --git a/funasr/runtime/onnxruntime/src/tmp.h b/funasr/runtime/onnxruntime/src/tmp.h
new file mode 100644
index 000000000..b57303f82
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/tmp.h
@@ -0,0 +1,112 @@
+
+#ifndef WENETPARAMS_H
+#define WENETPARAMS_H
+// #pragma pack(1)
+
+#define vocab_size 5538
+
+typedef struct {
+ float conv0_weight[512 * 9];
+ float conv0_bias[512];
+
+ float conv1_weight[512 * 512 * 9];
+ float conv1_bias[512];
+
+ float out0_weight[9728 * 512];
+ float out0_bias[512];
+
+} EncEmbedParams;
+
+typedef struct {
+ float linear_q_weight[512 * 512];
+ float linear_q_bias[512];
+ float linear_k_weight[512 * 512];
+ float linear_k_bias[512];
+ float linear_v_weight[512 * 512];
+ float linear_v_bias[512];
+ float linear_out_weight[512 * 512];
+ float linear_out_bias[512];
+} SelfAttnParams;
+
+typedef struct {
+ SelfAttnParams linear0;
+ float linear_pos_weight[512 * 512];
+ float pos_bias_u[512];
+ float pos_bias_v[512];
+
+} EncSelfAttnParams;
+
+typedef struct {
+ float w1_weight[512 * 2048];
+ float w1_bias[2048];
+ float w2_weight[2048 * 512];
+ float w2_bias[512];
+} FeedForwardParams;
+
+typedef struct {
+ float weight[512];
+ float bias[512];
+} NormParams;
+
+typedef struct {
+ float pointwise_conv1_weight[1024 * 512];
+ float pointwise_conv1_bias[1024];
+
+ float depthwise_conv_weight[512 * 15];
+ float depthwise_conv_bias[512];
+
+ float pointwise_conv2_weight[512 * 512];
+ float pointwise_conv2_bias[512];
+ NormParams norm;
+} EncConvParams;
+
+typedef struct {
+ EncSelfAttnParams self_attn;
+ FeedForwardParams feedforward;
+ FeedForwardParams feedforward_macaron;
+ EncConvParams conv_module;
+ NormParams norm_ff;
+ NormParams norm_mha;
+ NormParams norm_macaron;
+ NormParams norm_conv;
+ NormParams norm_final;
+ // float concat_weight[1024 * 512];
+ // float concat_bias[512];
+} SubEncoderParams;
+
+typedef struct {
+ EncEmbedParams embed;
+ SubEncoderParams sub_encoder[12];
+ NormParams after_norm;
+} EncoderParams;
+
+typedef struct {
+ SelfAttnParams self_attn;
+ SelfAttnParams src_attn;
+ FeedForwardParams feedward;
+ NormParams norm1;
+ NormParams norm2;
+ NormParams norm3;
+ // float concat_weight1[1024 * 512];
+ // float concat_bias1[512];
+ // float concat_weight2[1024 * 512];
+ // float concat_bias2[512];
+} SubDecoderParams;
+
+typedef struct {
+ float embed_weight[vocab_size * 512];
+ SubDecoderParams sub_decoder[6];
+ NormParams after_norm;
+ float output_weight[vocab_size * 512];
+ float output_bias[vocab_size];
+} DecoderParams;
+
+typedef struct {
+ EncoderParams encoder;
+ float ctc_weight[512 * vocab_size];
+ float ctc_bias[vocab_size];
+ DecoderParams decoder;
+} WenetParams;
+
+// #pragma pack()
+#endif
diff --git a/funasr/runtime/onnxruntime/src/util.cpp b/funasr/runtime/onnxruntime/src/util.cpp
new file mode 100644
index 000000000..5a72c72b9
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/util.cpp
@@ -0,0 +1,180 @@
+
+#include "precomp.h"
+
+float *loadparams(const char *filename)
+{
+
+ FILE *fp;
+ fp = fopen(filename, "rb");
+ fseek(fp, 0, SEEK_END);
+ uint32_t nFileLen = ftell(fp);
+ fseek(fp, 0, SEEK_SET);
+
+ float *params_addr = (float *)aligned_malloc(32, nFileLen);
+ int n = fread(params_addr, 1, nFileLen, fp);
+ fclose(fp);
+
+ return params_addr;
+}
+
+int val_align(int val, int align)
+{
+ float tmp = ceil((float)val / (float)align) * (float)align;
+ return (int)tmp;
+}
+
+void disp_params(float *din, int size)
+{
+ int i;
+ for (i = 0; i < size; i++) {
+ printf("%f ", din[i]);
+ }
+ printf("\n");
+}
+void SaveDataFile(const char *filename, void *data, uint32_t len)
+{
+ FILE *fp;
+ fp = fopen(filename, "wb+");
+ fwrite(data, 1, len, fp);
+ fclose(fp);
+}
+
+void basic_norm(Tensor *&din, float norm)
+{
+
+ int Tmax = din->size[2];
+
+ int i, j;
+ for (i = 0; i < Tmax; i++) {
+ float sum = 0;
+ for (j = 0; j < 512; j++) {
+ int ii = i * 512 + j;
+ sum += din->buff[ii] * din->buff[ii];
+ }
+ float mean = sqrt(sum / 512 + norm);
+ for (j = 0; j < 512; j++) {
+ int ii = i * 512 + j;
+ din->buff[ii] = din->buff[ii] / mean;
+ }
+ }
+}
+
+void findmax(float *din, int len, float &max_val, int &max_idx)
+{
+ int i;
+ max_val = -INFINITY;
+ max_idx = -1;
+ for (i = 0; i < len; i++) {
+ if (din[i] > max_val) {
+ max_val = din[i];
+ max_idx = i;
+ }
+ }
+}
+
+string pathAppend(const string &p1, const string &p2)
+{
+
+ char sep = '/';
+ string tmp = p1;
+
+#ifdef _WIN32
+ sep = '\\';
+#endif
+
+ if (p1[p1.length()-1] != sep) { // Need to add a
+ tmp += sep; // path separator
+ return (tmp + p2);
+ } else
+ return (p1 + p2);
+}
+
+void relu(Tensor *din)
+{
+ int i;
+ for (i = 0; i < din->buff_size; i++) {
+ float val = din->buff[i];
+ din->buff[i] = val < 0 ? 0 : val;
+ }
+}
+
+void swish(Tensor *din)
+{
+ int i;
+ for (i = 0; i < din->buff_size; i++) {
+ float val = din->buff[i];
+ din->buff[i] = val / (1 + exp(-val));
+ }
+}
+
+void sigmoid(Tensor *din)
+{
+ int i;
+ for (i = 0; i < din->buff_size; i++) {
+ float val = din->buff[i];
+ din->buff[i] = 1 / (1 + exp(-val));
+ }
+}
+
+void doubleswish(Tensor *din)
+{
+ int i;
+ for (i = 0; i < din->buff_size; i++) {
+ float val = din->buff[i];
+ din->buff[i] = val / (1 + exp(-val + 1));
+ }
+}
+
+void softmax(float *din, int mask, int len)
+{
+ float *tmp = (float *)malloc(mask * sizeof(float));
+ int i;
+ float sum = 0;
+ float max = -INFINITY;
+
+ for (i = 0; i < mask; i++) {
+ max = max < din[i] ? din[i] : max;
+ }
+
+ for (i = 0; i < mask; i++) {
+ tmp[i] = exp(din[i] - max);
+ sum += tmp[i];
+ }
+ for (i = 0; i < mask; i++) {
+ din[i] = tmp[i] / sum;
+ }
+ free(tmp);
+ for (i = mask; i < len; i++) {
+ din[i] = 0;
+ }
+}
+
+void log_softmax(float *din, int len)
+{
+ float *tmp = (float *)malloc(len * sizeof(float));
+ int i;
+ float sum = 0;
+ for (i = 0; i < len; i++) {
+ tmp[i] = exp(din[i]);
+ sum += tmp[i];
+ }
+ for (i = 0; i < len; i++) {
+ din[i] = log(tmp[i] / sum);
+ }
+ free(tmp);
+}
+
+void glu(Tensor *din, Tensor *dout)
+{
+ int mm = din->buff_size / 1024;
+ int i, j;
+ for (i = 0; i < mm; i++) {
+ for (j = 0; j < 512; j++) {
+ int in_off = i * 1024 + j;
+ int out_off = i * 512 + j;
+ float a = din->buff[in_off];
+ float b = din->buff[in_off + 512];
+ dout->buff[out_off] = a / (1 + exp(-b));
+ }
+ }
+}
diff --git a/funasr/runtime/onnxruntime/src/util.h b/funasr/runtime/onnxruntime/src/util.h
new file mode 100644
index 000000000..48a27db28
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/util.h
@@ -0,0 +1,30 @@
+
+
+#ifndef UTIL_H
+#define UTIL_H
+
+using namespace std;
+
+extern float *loadparams(const char *filename);
+
+extern void SaveDataFile(const char *filename, void *data, uint32_t len);
+extern void relu(Tensor *din);
+extern void swish(Tensor *din);
+extern void sigmoid(Tensor *din);
+extern void doubleswish(Tensor *din);
+
+extern void softmax(float *din, int mask, int len);
+
+extern void log_softmax(float *din, int len);
+extern int val_align(int val, int align);
+extern void disp_params(float *din, int size);
+
+extern void basic_norm(Tensor *&din, float norm);
+
+extern void findmax(float *din, int len, float &max_val, int &max_idx);
+
+extern void glu(Tensor *din, Tensor *dout);
+
+string pathAppend(const string &p1, const string &p2);
+
+#endif
diff --git a/funasr/runtime/onnxruntime/tester/CMakeLists.txt b/funasr/runtime/onnxruntime/tester/CMakeLists.txt
new file mode 100644
index 000000000..d79427135
--- /dev/null
+++ b/funasr/runtime/onnxruntime/tester/CMakeLists.txt
@@ -0,0 +1,20 @@
+
+
+if(WIN32)
+ if(CMAKE_CL_64)
+ link_directories( ${CMAKE_SOURCE_DIR}/win/lib/x64 )
+ else()
+ link_directories( ${CMAKE_SOURCE_DIR}/win/lib/x86 )
+ endif()
+endif()
+
+set(EXTRA_LIBS rapidasr)
+
+
+include_directories(${CMAKE_SOURCE_DIR}/include)
+set(EXECNAME "tester")
+
+add_executable(${EXECNAME} "tester.cpp")
+target_link_libraries(${EXECNAME} PUBLIC ${EXTRA_LIBS})
+
+
diff --git a/funasr/runtime/onnxruntime/tester/tester.cpp b/funasr/runtime/onnxruntime/tester/tester.cpp
new file mode 100644
index 000000000..ba5c61ccb
--- /dev/null
+++ b/funasr/runtime/onnxruntime/tester/tester.cpp
@@ -0,0 +1,118 @@
+
+#ifndef _WIN32
+#include
+#else
+#include
+#endif
+
+#include "librapidasrapi.h"
+
+#include
+#include
+using namespace std;
+
+int main(int argc, char *argv[])
+{
+
+ if (argc < 2)
+ {
+ printf("Usage: %s /path/to/model_dir /path/to/wav/file", argv[0]);
+ exit(-1);
+ }
+ struct timeval start, end;
+ gettimeofday(&start, NULL);
+ int nThreadNum = 4;
+ RPASR_HANDLE AsrHanlde=RapidAsrInit(argv[1], nThreadNum);
+
+ if (!AsrHanlde)
+ {
+ printf("Cannot load ASR Model from: %s, there must be files model.onnx and vocab.txt", argv[1]);
+ exit(-1);
+ }
+
+
+
+ gettimeofday(&end, NULL);
+ long seconds = (end.tv_sec - start.tv_sec);
+ long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
+ printf("Model initialization takes %lfs.\n", (double)modle_init_micros / 1000000);
+
+
+
+ gettimeofday(&start, NULL);
+ float snippet_time = 0.0f;
+
+
+ RPASR_RESULT Result=RapidAsrRecogFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+
+ gettimeofday(&end, NULL);
+
+ if (Result)
+ {
+ string msg = RapidAsrGetResult(Result, 0);
+ setbuf(stdout, NULL);
+ cout << "Result: \"";
+ cout << msg << endl;
+ cout << "\"." << endl;
+ snippet_time = RapidAsrGetRetSnippetTime(Result);
+ RapidAsrFreeResult(Result);
+ }
+ else
+ {
+ cout <<"no return data!";
+ }
+
+
+ //char* buff = nullptr;
+ //int len = 0;
+ //ifstream ifs(argv[2], std::ios::binary | std::ios::in);
+ //if (ifs.is_open())
+ //{
+ // ifs.seekg(0, std::ios::end);
+ // len = ifs.tellg();
+ // ifs.seekg(0, std::ios::beg);
+
+ // buff = new char[len];
+
+ // ifs.read(buff, len);
+
+
+ // //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+
+ // RPASR_RESULT Result=RapidAsrRecogPCMBuffer(AsrHanlde, buff,len, RASR_NONE, NULL);
+ // //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+ // gettimeofday(&end, NULL);
+ //
+ // if (Result)
+ // {
+ // string msg = RapidAsrGetResult(Result, 0);
+ // setbuf(stdout, NULL);
+ // cout << "Result: \"";
+ // cout << msg << endl;
+ // cout << "\"." << endl;
+ // snippet_time = RapidAsrGetRetSnippetTime(Result);
+ // RapidAsrFreeResult(Result);
+ // }
+ // else
+ // {
+ // cout <<"no return data!";
+ // }
+
+ //
+ //delete[]buff;
+ //}
+
+
+ printf("Audio length %lfs.\n", (double)snippet_time);
+ seconds = (end.tv_sec - start.tv_sec);
+ long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
+ printf("Model inference takes %lfs.\n", (double)taking_micros / 1000000);
+
+ printf("Model inference RTF: %04lf.\n", (double)taking_micros/ (snippet_time*1000000));
+
+ RapidAsrUninit(AsrHanlde);
+
+ return 0;
+}
+
+
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/third_party/install_openblas.sh b/funasr/runtime/onnxruntime/third_party/install_openblas.sh
new file mode 100644
index 000000000..4a41012e7
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/install_openblas.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+
+OPENBLAS_VERSION=0.3.13
+
+WGET=${WGET:-wget}
+
+set -e
+
+if ! command -v gfortran 2>/dev/null; then
+ echo "$0: gfortran is not installed. Please install it, e.g. by:"
+ echo " apt-get install gfortran"
+ echo "(if on Debian or Ubuntu), or:"
+ echo " yum install gcc-gfortran"
+ echo "(if on RedHat/CentOS). On a Mac, if brew is installed, it's:"
+ echo " brew install gfortran"
+ exit 1
+fi
+
+
+tarball=OpenBLAS-$OPENBLAS_VERSION.tar.gz
+
+rm -rf xianyi-OpenBLAS-* OpenBLAS OpenBLAS-*.tar.gz
+
+if [ -d "$DOWNLOAD_DIR" ]; then
+ cp -p "$DOWNLOAD_DIR/$tarball" .
+else
+ url=$($WGET -qO- "https://api.github.com/repos/xianyi/OpenBLAS/releases/tags/v${OPENBLAS_VERSION}" | python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])')
+ test -n "$url"
+ $WGET -t3 -nv -O $tarball "$url"
+fi
+
+tar xzf $tarball
+mv xianyi-OpenBLAS-* OpenBLAS
+
+make PREFIX=$(pwd)/OpenBLAS/install USE_LOCKING=1 USE_THREAD=0 -C OpenBLAS all install
+if [ $? -eq 0 ]; then
+ echo "OpenBLAS is installed successfully."
+ rm $tarball
+fi
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/CMakeLists.txt b/funasr/runtime/onnxruntime/third_party/webrtc/CMakeLists.txt
new file mode 100644
index 000000000..51812eb7c
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/CMakeLists.txt
@@ -0,0 +1,16 @@
+
+
+if(WIN32)
+ add_definitions(-DWEBRTC_WIN)
+else()
+ add_definitions(-DWEBRTC_POSIX)
+endif()
+
+
+include_directories("..")
+
+file(GLOB_RECURSE files "*.c" "rtc_base/checks.cc")
+
+message("${files}")
+
+add_library(webrtcvad ${files})
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/complex_bit_reverse.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/complex_bit_reverse.c
new file mode 100644
index 000000000..c8bd2dc45
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/complex_bit_reverse.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+/* Tables for data buffer indexes that are bit reversed and thus need to be
+ * swapped. Note that, index_7[{0, 2, 4, ...}] are for the left side of the swap
+ * operations, while index_7[{1, 3, 5, ...}] are for the right side of the
+ * operation. Same for index_8.
+ */
+
+/* Indexes for the case of stages == 7. */
+static const int16_t index_7[112] = {
+ 1, 64, 2, 32, 3, 96, 4, 16, 5, 80, 6, 48, 7, 112, 9, 72, 10, 40, 11, 104,
+ 12, 24, 13, 88, 14, 56, 15, 120, 17, 68, 18, 36, 19, 100, 21, 84, 22, 52,
+ 23, 116, 25, 76, 26, 44, 27, 108, 29, 92, 30, 60, 31, 124, 33, 66, 35, 98,
+ 37, 82, 38, 50, 39, 114, 41, 74, 43, 106, 45, 90, 46, 58, 47, 122, 49, 70,
+ 51, 102, 53, 86, 55, 118, 57, 78, 59, 110, 61, 94, 63, 126, 67, 97, 69,
+ 81, 71, 113, 75, 105, 77, 89, 79, 121, 83, 101, 87, 117, 91, 109, 95, 125,
+ 103, 115, 111, 123
+};
+
+/* Indexes for the case of stages == 8. */
+static const int16_t index_8[240] = {
+ 1, 128, 2, 64, 3, 192, 4, 32, 5, 160, 6, 96, 7, 224, 8, 16, 9, 144, 10, 80,
+ 11, 208, 12, 48, 13, 176, 14, 112, 15, 240, 17, 136, 18, 72, 19, 200, 20,
+ 40, 21, 168, 22, 104, 23, 232, 25, 152, 26, 88, 27, 216, 28, 56, 29, 184,
+ 30, 120, 31, 248, 33, 132, 34, 68, 35, 196, 37, 164, 38, 100, 39, 228, 41,
+ 148, 42, 84, 43, 212, 44, 52, 45, 180, 46, 116, 47, 244, 49, 140, 50, 76,
+ 51, 204, 53, 172, 54, 108, 55, 236, 57, 156, 58, 92, 59, 220, 61, 188, 62,
+ 124, 63, 252, 65, 130, 67, 194, 69, 162, 70, 98, 71, 226, 73, 146, 74, 82,
+ 75, 210, 77, 178, 78, 114, 79, 242, 81, 138, 83, 202, 85, 170, 86, 106, 87,
+ 234, 89, 154, 91, 218, 93, 186, 94, 122, 95, 250, 97, 134, 99, 198, 101,
+ 166, 103, 230, 105, 150, 107, 214, 109, 182, 110, 118, 111, 246, 113, 142,
+ 115, 206, 117, 174, 119, 238, 121, 158, 123, 222, 125, 190, 127, 254, 131,
+ 193, 133, 161, 135, 225, 137, 145, 139, 209, 141, 177, 143, 241, 147, 201,
+ 149, 169, 151, 233, 155, 217, 157, 185, 159, 249, 163, 197, 167, 229, 171,
+ 213, 173, 181, 175, 245, 179, 205, 183, 237, 187, 221, 191, 253, 199, 227,
+ 203, 211, 207, 243, 215, 235, 223, 251, 239, 247
+};
+
+void WebRtcSpl_ComplexBitReverse(int16_t* __restrict complex_data, int stages) {
+ /* For any specific value of stages, we know exactly the indexes that are
+ * bit reversed. Currently (Feb. 2012) in WebRTC the only possible values of
+ * stages are 7 and 8, so we use tables to save unnecessary iterations and
+ * calculations for these two cases.
+ */
+ if (stages == 7 || stages == 8) {
+ int m = 0;
+ int length = 112;
+ const int16_t* index = index_7;
+
+ if (stages == 8) {
+ length = 240;
+ index = index_8;
+ }
+
+ /* Decimation in time. Swap the elements with bit-reversed indexes. */
+ for (m = 0; m < length; m += 2) {
+ /* We declare a int32_t* type pointer, to load both the 16-bit real
+ * and imaginary elements from complex_data in one instruction, reducing
+ * complexity.
+ */
+ int32_t* complex_data_ptr = (int32_t*)complex_data;
+ int32_t temp = 0;
+
+ temp = complex_data_ptr[index[m]]; /* Real and imaginary */
+ complex_data_ptr[index[m]] = complex_data_ptr[index[m + 1]];
+ complex_data_ptr[index[m + 1]] = temp;
+ }
+ }
+ else {
+ int m = 0, mr = 0, l = 0;
+ int n = 1 << stages;
+ int nn = n - 1;
+
+ /* Decimation in time - re-order data */
+ for (m = 1; m <= nn; ++m) {
+ int32_t* complex_data_ptr = (int32_t*)complex_data;
+ int32_t temp = 0;
+
+ /* Find out indexes that are bit-reversed. */
+ l = n;
+ do {
+ l >>= 1;
+ } while (l > nn - mr);
+ mr = (mr & (l - 1)) + l;
+
+ if (mr <= m) {
+ continue;
+ }
+
+ /* Swap the elements with bit-reversed indexes.
+ * This is similar to the loop in the stages == 7 or 8 cases.
+ */
+ temp = complex_data_ptr[m]; /* Real and imaginary */
+ complex_data_ptr[m] = complex_data_ptr[mr];
+ complex_data_ptr[mr] = temp;
+ }
+ }
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/complex_fft.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/complex_fft.c
new file mode 100644
index 000000000..328087233
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/complex_fft.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This file contains the function WebRtcSpl_ComplexFFT().
+ * The description header can be found in signal_processing_library.h
+ *
+ */
+
+#include "webrtc/common_audio/signal_processing/complex_fft_tables.h"
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/rtc_base/system/arch.h"
+
+#define CFFTSFT 14
+#define CFFTRND 1
+#define CFFTRND2 16384
+
+#define CIFFTSFT 14
+#define CIFFTRND 1
+
+
+int WebRtcSpl_ComplexFFT(int16_t frfi[], int stages, int mode)
+{
+ int i, j, l, k, istep, n, m;
+ int16_t wr, wi;
+ int32_t tr32, ti32, qr32, qi32;
+
+ /* The 1024-value is a constant given from the size of kSinTable1024[],
+ * and should not be changed depending on the input parameter 'stages'
+ */
+ n = 1 << stages;
+ if (n > 1024)
+ return -1;
+
+ l = 1;
+ k = 10 - 1; /* Constant for given kSinTable1024[]. Do not change
+ depending on the input parameter 'stages' */
+
+ if (mode == 0)
+ {
+ // mode==0: Low-complexity and Low-accuracy mode
+ while (l < n)
+ {
+ istep = l << 1;
+
+ for (m = 0; m < l; ++m)
+ {
+ j = m << k;
+
+ /* The 256-value is a constant given as 1/4 of the size of
+ * kSinTable1024[], and should not be changed depending on the input
+ * parameter 'stages'. It will result in 0 <= j < N_SINE_WAVE/2
+ */
+ wr = kSinTable1024[j + 256];
+ wi = -kSinTable1024[j];
+
+ for (i = m; i < n; i += istep)
+ {
+ j = i + l;
+
+ tr32 = (wr * frfi[2 * j] - wi * frfi[2 * j + 1]) >> 15;
+
+ ti32 = (wr * frfi[2 * j + 1] + wi * frfi[2 * j]) >> 15;
+
+ qr32 = (int32_t)frfi[2 * i];
+ qi32 = (int32_t)frfi[2 * i + 1];
+ frfi[2 * j] = (int16_t)((qr32 - tr32) >> 1);
+ frfi[2 * j + 1] = (int16_t)((qi32 - ti32) >> 1);
+ frfi[2 * i] = (int16_t)((qr32 + tr32) >> 1);
+ frfi[2 * i + 1] = (int16_t)((qi32 + ti32) >> 1);
+ }
+ }
+
+ --k;
+ l = istep;
+
+ }
+
+ } else
+ {
+ // mode==1: High-complexity and High-accuracy mode
+ while (l < n)
+ {
+ istep = l << 1;
+
+ for (m = 0; m < l; ++m)
+ {
+ j = m << k;
+
+ /* The 256-value is a constant given as 1/4 of the size of
+ * kSinTable1024[], and should not be changed depending on the input
+ * parameter 'stages'. It will result in 0 <= j < N_SINE_WAVE/2
+ */
+ wr = kSinTable1024[j + 256];
+ wi = -kSinTable1024[j];
+
+#ifdef WEBRTC_ARCH_ARM_V7
+ int32_t wri = 0;
+ __asm __volatile("pkhbt %0, %1, %2, lsl #16" : "=r"(wri) :
+ "r"((int32_t)wr), "r"((int32_t)wi));
+#endif
+
+ for (i = m; i < n; i += istep)
+ {
+ j = i + l;
+
+#ifdef WEBRTC_ARCH_ARM_V7
+ register int32_t frfi_r;
+ __asm __volatile(
+ "pkhbt %[frfi_r], %[frfi_even], %[frfi_odd],"
+ " lsl #16\n\t"
+ "smlsd %[tr32], %[wri], %[frfi_r], %[cfftrnd]\n\t"
+ "smladx %[ti32], %[wri], %[frfi_r], %[cfftrnd]\n\t"
+ :[frfi_r]"=&r"(frfi_r),
+ [tr32]"=&r"(tr32),
+ [ti32]"=r"(ti32)
+ :[frfi_even]"r"((int32_t)frfi[2*j]),
+ [frfi_odd]"r"((int32_t)frfi[2*j +1]),
+ [wri]"r"(wri),
+ [cfftrnd]"r"(CFFTRND));
+#else
+ tr32 = wr * frfi[2 * j] - wi * frfi[2 * j + 1] + CFFTRND;
+
+ ti32 = wr * frfi[2 * j + 1] + wi * frfi[2 * j] + CFFTRND;
+#endif
+
+ tr32 >>= 15 - CFFTSFT;
+ ti32 >>= 15 - CFFTSFT;
+
+ qr32 = ((int32_t)frfi[2 * i]) * (1 << CFFTSFT);
+ qi32 = ((int32_t)frfi[2 * i + 1]) * (1 << CFFTSFT);
+
+ frfi[2 * j] = (int16_t)(
+ (qr32 - tr32 + CFFTRND2) >> (1 + CFFTSFT));
+ frfi[2 * j + 1] = (int16_t)(
+ (qi32 - ti32 + CFFTRND2) >> (1 + CFFTSFT));
+ frfi[2 * i] = (int16_t)(
+ (qr32 + tr32 + CFFTRND2) >> (1 + CFFTSFT));
+ frfi[2 * i + 1] = (int16_t)(
+ (qi32 + ti32 + CFFTRND2) >> (1 + CFFTSFT));
+ }
+ }
+
+ --k;
+ l = istep;
+ }
+ }
+ return 0;
+}
+
+int WebRtcSpl_ComplexIFFT(int16_t frfi[], int stages, int mode)
+{
+ size_t i, j, l, istep, n, m;
+ int k, scale, shift;
+ int16_t wr, wi;
+ int32_t tr32, ti32, qr32, qi32;
+ int32_t tmp32, round2;
+
+ /* The 1024-value is a constant given from the size of kSinTable1024[],
+ * and should not be changed depending on the input parameter 'stages'
+ */
+ n = ((size_t)1) << stages;
+ if (n > 1024)
+ return -1;
+
+ scale = 0;
+
+ l = 1;
+ k = 10 - 1; /* Constant for given kSinTable1024[]. Do not change
+ depending on the input parameter 'stages' */
+
+ while (l < n)
+ {
+ // variable scaling, depending upon data
+ shift = 0;
+ round2 = 8192;
+
+ tmp32 = WebRtcSpl_MaxAbsValueW16(frfi, 2 * n);
+ if (tmp32 > 13573)
+ {
+ shift++;
+ scale++;
+ round2 <<= 1;
+ }
+ if (tmp32 > 27146)
+ {
+ shift++;
+ scale++;
+ round2 <<= 1;
+ }
+
+ istep = l << 1;
+
+ if (mode == 0)
+ {
+ // mode==0: Low-complexity and Low-accuracy mode
+ for (m = 0; m < l; ++m)
+ {
+ j = m << k;
+
+ /* The 256-value is a constant given as 1/4 of the size of
+ * kSinTable1024[], and should not be changed depending on the input
+ * parameter 'stages'. It will result in 0 <= j < N_SINE_WAVE/2
+ */
+ wr = kSinTable1024[j + 256];
+ wi = kSinTable1024[j];
+
+ for (i = m; i < n; i += istep)
+ {
+ j = i + l;
+
+ tr32 = (wr * frfi[2 * j] - wi * frfi[2 * j + 1]) >> 15;
+
+ ti32 = (wr * frfi[2 * j + 1] + wi * frfi[2 * j]) >> 15;
+
+ qr32 = (int32_t)frfi[2 * i];
+ qi32 = (int32_t)frfi[2 * i + 1];
+ frfi[2 * j] = (int16_t)((qr32 - tr32) >> shift);
+ frfi[2 * j + 1] = (int16_t)((qi32 - ti32) >> shift);
+ frfi[2 * i] = (int16_t)((qr32 + tr32) >> shift);
+ frfi[2 * i + 1] = (int16_t)((qi32 + ti32) >> shift);
+ }
+ }
+ } else
+ {
+ // mode==1: High-complexity and High-accuracy mode
+
+ for (m = 0; m < l; ++m)
+ {
+ j = m << k;
+
+ /* The 256-value is a constant given as 1/4 of the size of
+ * kSinTable1024[], and should not be changed depending on the input
+ * parameter 'stages'. It will result in 0 <= j < N_SINE_WAVE/2
+ */
+ wr = kSinTable1024[j + 256];
+ wi = kSinTable1024[j];
+
+#ifdef WEBRTC_ARCH_ARM_V7
+ int32_t wri = 0;
+ __asm __volatile("pkhbt %0, %1, %2, lsl #16" : "=r"(wri) :
+ "r"((int32_t)wr), "r"((int32_t)wi));
+#endif
+
+ for (i = m; i < n; i += istep)
+ {
+ j = i + l;
+
+#ifdef WEBRTC_ARCH_ARM_V7
+ register int32_t frfi_r;
+ __asm __volatile(
+ "pkhbt %[frfi_r], %[frfi_even], %[frfi_odd], lsl #16\n\t"
+ "smlsd %[tr32], %[wri], %[frfi_r], %[cifftrnd]\n\t"
+ "smladx %[ti32], %[wri], %[frfi_r], %[cifftrnd]\n\t"
+ :[frfi_r]"=&r"(frfi_r),
+ [tr32]"=&r"(tr32),
+ [ti32]"=r"(ti32)
+ :[frfi_even]"r"((int32_t)frfi[2*j]),
+ [frfi_odd]"r"((int32_t)frfi[2*j +1]),
+ [wri]"r"(wri),
+ [cifftrnd]"r"(CIFFTRND)
+ );
+#else
+
+ tr32 = wr * frfi[2 * j] - wi * frfi[2 * j + 1] + CIFFTRND;
+
+ ti32 = wr * frfi[2 * j + 1] + wi * frfi[2 * j] + CIFFTRND;
+#endif
+ tr32 >>= 15 - CIFFTSFT;
+ ti32 >>= 15 - CIFFTSFT;
+
+ qr32 = ((int32_t)frfi[2 * i]) * (1 << CIFFTSFT);
+ qi32 = ((int32_t)frfi[2 * i + 1]) * (1 << CIFFTSFT);
+
+ frfi[2 * j] = (int16_t)(
+ (qr32 - tr32 + round2) >> (shift + CIFFTSFT));
+ frfi[2 * j + 1] = (int16_t)(
+ (qi32 - ti32 + round2) >> (shift + CIFFTSFT));
+ frfi[2 * i] = (int16_t)(
+ (qr32 + tr32 + round2) >> (shift + CIFFTSFT));
+ frfi[2 * i + 1] = (int16_t)(
+ (qi32 + ti32 + round2) >> (shift + CIFFTSFT));
+ }
+ }
+
+ }
+ --k;
+ l = istep;
+ }
+ return scale;
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/complex_fft_tables.h b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/complex_fft_tables.h
new file mode 100644
index 000000000..90fac072d
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/complex_fft_tables.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef COMMON_AUDIO_SIGNAL_PROCESSING_COMPLEX_FFT_TABLES_H_
+#define COMMON_AUDIO_SIGNAL_PROCESSING_COMPLEX_FFT_TABLES_H_
+
+#include <stdint.h>
+
+static const int16_t kSinTable1024[] = {
+ 0, 201, 402, 603, 804, 1005, 1206, 1406, 1607,
+ 1808, 2009, 2209, 2410, 2610, 2811, 3011, 3211, 3411,
+ 3611, 3811, 4011, 4210, 4409, 4608, 4807, 5006, 5205,
+ 5403, 5601, 5799, 5997, 6195, 6392, 6589, 6786, 6982,
+ 7179, 7375, 7571, 7766, 7961, 8156, 8351, 8545, 8739,
+ 8932, 9126, 9319, 9511, 9703, 9895, 10087, 10278, 10469,
+ 10659, 10849, 11038, 11227, 11416, 11604, 11792, 11980, 12166,
+ 12353, 12539, 12724, 12909, 13094, 13278, 13462, 13645, 13827,
+ 14009, 14191, 14372, 14552, 14732, 14911, 15090, 15268, 15446,
+ 15623, 15799, 15975, 16150, 16325, 16499, 16672, 16845, 17017,
+ 17189, 17360, 17530, 17699, 17868, 18036, 18204, 18371, 18537,
+ 18702, 18867, 19031, 19194, 19357, 19519, 19680, 19840, 20000,
+ 20159, 20317, 20474, 20631, 20787, 20942, 21096, 21249, 21402,
+ 21554, 21705, 21855, 22004, 22153, 22301, 22448, 22594, 22739,
+ 22883, 23027, 23169, 23311, 23452, 23592, 23731, 23869, 24006,
+ 24143, 24278, 24413, 24546, 24679, 24811, 24942, 25072, 25201,
+ 25329, 25456, 25582, 25707, 25831, 25954, 26077, 26198, 26318,
+ 26437, 26556, 26673, 26789, 26905, 27019, 27132, 27244, 27355,
+ 27466, 27575, 27683, 27790, 27896, 28001, 28105, 28208, 28309,
+ 28410, 28510, 28608, 28706, 28802, 28897, 28992, 29085, 29177,
+ 29268, 29358, 29446, 29534, 29621, 29706, 29790, 29873, 29955,
+ 30036, 30116, 30195, 30272, 30349, 30424, 30498, 30571, 30643,
+ 30713, 30783, 30851, 30918, 30984, 31049, 31113, 31175, 31236,
+ 31297, 31356, 31413, 31470, 31525, 31580, 31633, 31684, 31735,
+ 31785, 31833, 31880, 31926, 31970, 32014, 32056, 32097, 32137,
+ 32176, 32213, 32249, 32284, 32318, 32350, 32382, 32412, 32441,
+ 32468, 32495, 32520, 32544, 32567, 32588, 32609, 32628, 32646,
+ 32662, 32678, 32692, 32705, 32717, 32727, 32736, 32744, 32751,
+ 32757, 32761, 32764, 32766, 32767, 32766, 32764, 32761, 32757,
+ 32751, 32744, 32736, 32727, 32717, 32705, 32692, 32678, 32662,
+ 32646, 32628, 32609, 32588, 32567, 32544, 32520, 32495, 32468,
+ 32441, 32412, 32382, 32350, 32318, 32284, 32249, 32213, 32176,
+ 32137, 32097, 32056, 32014, 31970, 31926, 31880, 31833, 31785,
+ 31735, 31684, 31633, 31580, 31525, 31470, 31413, 31356, 31297,
+ 31236, 31175, 31113, 31049, 30984, 30918, 30851, 30783, 30713,
+ 30643, 30571, 30498, 30424, 30349, 30272, 30195, 30116, 30036,
+ 29955, 29873, 29790, 29706, 29621, 29534, 29446, 29358, 29268,
+ 29177, 29085, 28992, 28897, 28802, 28706, 28608, 28510, 28410,
+ 28309, 28208, 28105, 28001, 27896, 27790, 27683, 27575, 27466,
+ 27355, 27244, 27132, 27019, 26905, 26789, 26673, 26556, 26437,
+ 26318, 26198, 26077, 25954, 25831, 25707, 25582, 25456, 25329,
+ 25201, 25072, 24942, 24811, 24679, 24546, 24413, 24278, 24143,
+ 24006, 23869, 23731, 23592, 23452, 23311, 23169, 23027, 22883,
+ 22739, 22594, 22448, 22301, 22153, 22004, 21855, 21705, 21554,
+ 21402, 21249, 21096, 20942, 20787, 20631, 20474, 20317, 20159,
+ 20000, 19840, 19680, 19519, 19357, 19194, 19031, 18867, 18702,
+ 18537, 18371, 18204, 18036, 17868, 17699, 17530, 17360, 17189,
+ 17017, 16845, 16672, 16499, 16325, 16150, 15975, 15799, 15623,
+ 15446, 15268, 15090, 14911, 14732, 14552, 14372, 14191, 14009,
+ 13827, 13645, 13462, 13278, 13094, 12909, 12724, 12539, 12353,
+ 12166, 11980, 11792, 11604, 11416, 11227, 11038, 10849, 10659,
+ 10469, 10278, 10087, 9895, 9703, 9511, 9319, 9126, 8932,
+ 8739, 8545, 8351, 8156, 7961, 7766, 7571, 7375, 7179,
+ 6982, 6786, 6589, 6392, 6195, 5997, 5799, 5601, 5403,
+ 5205, 5006, 4807, 4608, 4409, 4210, 4011, 3811, 3611,
+ 3411, 3211, 3011, 2811, 2610, 2410, 2209, 2009, 1808,
+ 1607, 1406, 1206, 1005, 804, 603, 402, 201, 0,
+ -201, -402, -603, -804, -1005, -1206, -1406, -1607, -1808,
+ -2009, -2209, -2410, -2610, -2811, -3011, -3211, -3411, -3611,
+ -3811, -4011, -4210, -4409, -4608, -4807, -5006, -5205, -5403,
+ -5601, -5799, -5997, -6195, -6392, -6589, -6786, -6982, -7179,
+ -7375, -7571, -7766, -7961, -8156, -8351, -8545, -8739, -8932,
+ -9126, -9319, -9511, -9703, -9895, -10087, -10278, -10469, -10659,
+ -10849, -11038, -11227, -11416, -11604, -11792, -11980, -12166, -12353,
+ -12539, -12724, -12909, -13094, -13278, -13462, -13645, -13827, -14009,
+ -14191, -14372, -14552, -14732, -14911, -15090, -15268, -15446, -15623,
+ -15799, -15975, -16150, -16325, -16499, -16672, -16845, -17017, -17189,
+ -17360, -17530, -17699, -17868, -18036, -18204, -18371, -18537, -18702,
+ -18867, -19031, -19194, -19357, -19519, -19680, -19840, -20000, -20159,
+ -20317, -20474, -20631, -20787, -20942, -21096, -21249, -21402, -21554,
+ -21705, -21855, -22004, -22153, -22301, -22448, -22594, -22739, -22883,
+ -23027, -23169, -23311, -23452, -23592, -23731, -23869, -24006, -24143,
+ -24278, -24413, -24546, -24679, -24811, -24942, -25072, -25201, -25329,
+ -25456, -25582, -25707, -25831, -25954, -26077, -26198, -26318, -26437,
+ -26556, -26673, -26789, -26905, -27019, -27132, -27244, -27355, -27466,
+ -27575, -27683, -27790, -27896, -28001, -28105, -28208, -28309, -28410,
+ -28510, -28608, -28706, -28802, -28897, -28992, -29085, -29177, -29268,
+ -29358, -29446, -29534, -29621, -29706, -29790, -29873, -29955, -30036,
+ -30116, -30195, -30272, -30349, -30424, -30498, -30571, -30643, -30713,
+ -30783, -30851, -30918, -30984, -31049, -31113, -31175, -31236, -31297,
+ -31356, -31413, -31470, -31525, -31580, -31633, -31684, -31735, -31785,
+ -31833, -31880, -31926, -31970, -32014, -32056, -32097, -32137, -32176,
+ -32213, -32249, -32284, -32318, -32350, -32382, -32412, -32441, -32468,
+ -32495, -32520, -32544, -32567, -32588, -32609, -32628, -32646, -32662,
+ -32678, -32692, -32705, -32717, -32727, -32736, -32744, -32751, -32757,
+ -32761, -32764, -32766, -32767, -32766, -32764, -32761, -32757, -32751,
+ -32744, -32736, -32727, -32717, -32705, -32692, -32678, -32662, -32646,
+ -32628, -32609, -32588, -32567, -32544, -32520, -32495, -32468, -32441,
+ -32412, -32382, -32350, -32318, -32284, -32249, -32213, -32176, -32137,
+ -32097, -32056, -32014, -31970, -31926, -31880, -31833, -31785, -31735,
+ -31684, -31633, -31580, -31525, -31470, -31413, -31356, -31297, -31236,
+ -31175, -31113, -31049, -30984, -30918, -30851, -30783, -30713, -30643,
+ -30571, -30498, -30424, -30349, -30272, -30195, -30116, -30036, -29955,
+ -29873, -29790, -29706, -29621, -29534, -29446, -29358, -29268, -29177,
+ -29085, -28992, -28897, -28802, -28706, -28608, -28510, -28410, -28309,
+ -28208, -28105, -28001, -27896, -27790, -27683, -27575, -27466, -27355,
+ -27244, -27132, -27019, -26905, -26789, -26673, -26556, -26437, -26318,
+ -26198, -26077, -25954, -25831, -25707, -25582, -25456, -25329, -25201,
+ -25072, -24942, -24811, -24679, -24546, -24413, -24278, -24143, -24006,
+ -23869, -23731, -23592, -23452, -23311, -23169, -23027, -22883, -22739,
+ -22594, -22448, -22301, -22153, -22004, -21855, -21705, -21554, -21402,
+ -21249, -21096, -20942, -20787, -20631, -20474, -20317, -20159, -20000,
+ -19840, -19680, -19519, -19357, -19194, -19031, -18867, -18702, -18537,
+ -18371, -18204, -18036, -17868, -17699, -17530, -17360, -17189, -17017,
+ -16845, -16672, -16499, -16325, -16150, -15975, -15799, -15623, -15446,
+ -15268, -15090, -14911, -14732, -14552, -14372, -14191, -14009, -13827,
+ -13645, -13462, -13278, -13094, -12909, -12724, -12539, -12353, -12166,
+ -11980, -11792, -11604, -11416, -11227, -11038, -10849, -10659, -10469,
+ -10278, -10087, -9895, -9703, -9511, -9319, -9126, -8932, -8739,
+ -8545, -8351, -8156, -7961, -7766, -7571, -7375, -7179, -6982,
+ -6786, -6589, -6392, -6195, -5997, -5799, -5601, -5403, -5205,
+ -5006, -4807, -4608, -4409, -4210, -4011, -3811, -3611, -3411,
+ -3211, -3011, -2811, -2610, -2410, -2209, -2009, -1808, -1607,
+ -1406, -1206, -1005, -804, -603, -402, -201};
+
+#endif // COMMON_AUDIO_SIGNAL_PROCESSING_COMPLEX_FFT_TABLES_H_
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/cross_correlation.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/cross_correlation.c
new file mode 100644
index 000000000..d7c9f2b9a
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/cross_correlation.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+/* C version of WebRtcSpl_CrossCorrelation() for generic platforms. */
+void WebRtcSpl_CrossCorrelationC(int32_t* cross_correlation,
+ const int16_t* seq1,
+ const int16_t* seq2,
+ size_t dim_seq,
+ size_t dim_cross_correlation,
+ int right_shifts,
+ int step_seq2) {
+ size_t i = 0, j = 0;
+
+ for (i = 0; i < dim_cross_correlation; i++) {
+ int32_t corr = 0;
+ for (j = 0; j < dim_seq; j++)
+ corr += (seq1[j] * seq2[j]) >> right_shifts;
+ seq2 += step_seq2;
+ *cross_correlation++ = corr;
+ }
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/division_operations.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/division_operations.c
new file mode 100644
index 000000000..2d420525b
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/division_operations.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This file contains implementations of the divisions
+ * WebRtcSpl_DivU32U16()
+ * WebRtcSpl_DivW32W16()
+ * WebRtcSpl_DivW32W16ResW16()
+ * WebRtcSpl_DivResultInQ31()
+ * WebRtcSpl_DivW32HiLow()
+ *
+ * The description header can be found in signal_processing_library.h
+ *
+ */
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/rtc_base/sanitizer.h"
+
+uint32_t WebRtcSpl_DivU32U16(uint32_t num, uint16_t den)
+{
+ // Guard against division with 0
+ if (den != 0)
+ {
+ return (uint32_t)(num / den);
+ } else
+ {
+ return (uint32_t)0xFFFFFFFF;
+ }
+}
+
+int32_t WebRtcSpl_DivW32W16(int32_t num, int16_t den)
+{
+ // Guard against division with 0
+ if (den != 0)
+ {
+ return (int32_t)(num / den);
+ } else
+ {
+ return (int32_t)0x7FFFFFFF;
+ }
+}
+
+int16_t WebRtcSpl_DivW32W16ResW16(int32_t num, int16_t den)
+{
+ // Guard against division with 0
+ if (den != 0)
+ {
+ return (int16_t)(num / den);
+ } else
+ {
+ return (int16_t)0x7FFF;
+ }
+}
+
+int32_t WebRtcSpl_DivResultInQ31(int32_t num, int32_t den)
+{
+ int32_t L_num = num;
+ int32_t L_den = den;
+ int32_t div = 0;
+ int k = 31;
+ int change_sign = 0;
+
+ if (num == 0)
+ return 0;
+
+ if (num < 0)
+ {
+ change_sign++;
+ L_num = -num;
+ }
+ if (den < 0)
+ {
+ change_sign++;
+ L_den = -den;
+ }
+ while (k--)
+ {
+ div <<= 1;
+ L_num <<= 1;
+ if (L_num >= L_den)
+ {
+ L_num -= L_den;
+ div++;
+ }
+ }
+ if (change_sign == 1)
+ {
+ div = -div;
+ }
+ return div;
+}
+
+int32_t RTC_NO_SANITIZE("signed-integer-overflow") // bugs.webrtc.org/5486
+WebRtcSpl_DivW32HiLow(int32_t num, int16_t den_hi, int16_t den_low)
+{
+ int16_t approx, tmp_hi, tmp_low, num_hi, num_low;
+ int32_t tmpW32;
+
+ approx = (int16_t)WebRtcSpl_DivW32W16((int32_t)0x1FFFFFFF, den_hi);
+ // result in Q14 (Note: 3FFFFFFF = 0.5 in Q30)
+
+ // tmpW32 = 1/den = approx * (2.0 - den * approx) (in Q30)
+ tmpW32 = (den_hi * approx << 1) + ((den_low * approx >> 15) << 1);
+ // tmpW32 = den * approx
+
+ tmpW32 = (int32_t)0x7fffffffL - tmpW32; // result in Q30 (tmpW32 = 2.0-(den*approx))
+ // UBSan: 2147483647 - -2 cannot be represented in type 'int'
+
+ // Store tmpW32 in hi and low format
+ tmp_hi = (int16_t)(tmpW32 >> 16);
+ tmp_low = (int16_t)((tmpW32 - ((int32_t)tmp_hi << 16)) >> 1);
+
+ // tmpW32 = 1/den in Q29
+ tmpW32 = (tmp_hi * approx + (tmp_low * approx >> 15)) << 1;
+
+ // 1/den in hi and low format
+ tmp_hi = (int16_t)(tmpW32 >> 16);
+ tmp_low = (int16_t)((tmpW32 - ((int32_t)tmp_hi << 16)) >> 1);
+
+ // Store num in hi and low format
+ num_hi = (int16_t)(num >> 16);
+ num_low = (int16_t)((num - ((int32_t)num_hi << 16)) >> 1);
+
+ // num * (1/den) by 32 bit multiplication (result in Q28)
+
+ tmpW32 = num_hi * tmp_hi + (num_hi * tmp_low >> 15) +
+ (num_low * tmp_hi >> 15);
+
+ // Put result in Q31 (convert from Q28)
+ tmpW32 = WEBRTC_SPL_LSHIFT_W32(tmpW32, 3);
+
+ return tmpW32;
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/dot_product_with_scale.cc b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/dot_product_with_scale.cc
new file mode 100644
index 000000000..d9661af11
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/dot_product_with_scale.cc
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/common_audio/signal_processing/dot_product_with_scale.h"
+
+#include "webrtc/rtc_base/numerics/safe_conversions.h"
+
+int32_t WebRtcSpl_DotProductWithScale(const int16_t* vector1,
+ const int16_t* vector2,
+ size_t length,
+ int scaling) {
+ int64_t sum = 0;
+ size_t i = 0;
+
+ /* Unroll the loop to improve performance. */
+ for (i = 0; i + 3 < length; i += 4) {
+ sum += (vector1[i + 0] * vector2[i + 0]) >> scaling;
+ sum += (vector1[i + 1] * vector2[i + 1]) >> scaling;
+ sum += (vector1[i + 2] * vector2[i + 2]) >> scaling;
+ sum += (vector1[i + 3] * vector2[i + 3]) >> scaling;
+ }
+ for (; i < length; i++) {
+ sum += (vector1[i] * vector2[i]) >> scaling;
+ }
+
+  return rtc::saturated_cast<int32_t>(sum);
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/dot_product_with_scale.h b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/dot_product_with_scale.h
new file mode 100644
index 000000000..bb892d40c
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/dot_product_with_scale.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef COMMON_AUDIO_SIGNAL_PROCESSING_DOT_PRODUCT_WITH_SCALE_H_
+#define COMMON_AUDIO_SIGNAL_PROCESSING_DOT_PRODUCT_WITH_SCALE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Calculates the dot product between two (int16_t) vectors.
+//
+// Input:
+// - vector1 : Vector 1
+// - vector2 : Vector 2
+// - vector_length : Number of samples used in the dot product
+// - scaling : The number of right bit shifts to apply on each term
+// during calculation to avoid overflow, i.e., the
+// output will be in Q(-|scaling|)
+//
+// Return value : The dot product in Q(-scaling)
+int32_t WebRtcSpl_DotProductWithScale(const int16_t* vector1,
+ const int16_t* vector2,
+ size_t length,
+ int scaling);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // COMMON_AUDIO_SIGNAL_PROCESSING_DOT_PRODUCT_WITH_SCALE_H_
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/downsample_fast.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/downsample_fast.c
new file mode 100644
index 000000000..e575861ed
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/downsample_fast.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+#include "webrtc/rtc_base/checks.h"
+#include "webrtc/rtc_base/sanitizer.h"
+
+// TODO(Bjornv): Change the function parameter order to WebRTC code style.
+// C version of WebRtcSpl_DownsampleFast() for generic platforms.
+int WebRtcSpl_DownsampleFastC(const int16_t* data_in,
+ size_t data_in_length,
+ int16_t* data_out,
+ size_t data_out_length,
+ const int16_t* __restrict coefficients,
+ size_t coefficients_length,
+ int factor,
+ size_t delay) {
+ int16_t* const original_data_out = data_out;
+ size_t i = 0;
+ size_t j = 0;
+ int32_t out_s32 = 0;
+ size_t endpos = delay + factor * (data_out_length - 1) + 1;
+
+ // Return error if any of the running conditions doesn't meet.
+ if (data_out_length == 0 || coefficients_length == 0
+ || data_in_length < endpos) {
+ return -1;
+ }
+
+ rtc_MsanCheckInitialized(coefficients, sizeof(coefficients[0]),
+ coefficients_length);
+
+ for (i = delay; i < endpos; i += factor) {
+ out_s32 = 2048; // Round value, 0.5 in Q12.
+
+ for (j = 0; j < coefficients_length; j++) {
+ // Negative overflow is permitted here, because this is
+ // auto-regressive filters, and the state for each batch run is
+ // stored in the "negative" positions of the output vector.
+ rtc_MsanCheckInitialized(&data_in[(ptrdiff_t) i - (ptrdiff_t) j],
+ sizeof(data_in[0]), 1);
+ // out_s32 is in Q12 domain.
+ out_s32 += coefficients[j] * data_in[(ptrdiff_t) i - (ptrdiff_t) j];
+ }
+
+ out_s32 >>= 12; // Q0.
+
+ // Saturate and store the output.
+ *data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
+ }
+
+ RTC_DCHECK_EQ(original_data_out + data_out_length, data_out);
+ rtc_MsanCheckInitialized(original_data_out, sizeof(original_data_out[0]),
+ data_out_length);
+
+ return 0;
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/energy.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/energy.c
new file mode 100644
index 000000000..e83f1a698
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/energy.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This file contains the function WebRtcSpl_Energy().
+ * The description header can be found in signal_processing_library.h
+ *
+ */
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+int32_t WebRtcSpl_Energy(int16_t* vector,
+ size_t vector_length,
+ int* scale_factor)
+{
+ int32_t en = 0; // Accumulated energy, in Q(-scaling).
+ size_t i;
+ int scaling =
+ WebRtcSpl_GetScalingSquare(vector, vector_length, vector_length); // Right shift that keeps the sum within 32 bits.
+ size_t looptimes = vector_length;
+ int16_t *vectorptr = vector;
+
+ for (i = 0; i < looptimes; i++)
+ {
+ en += (*vectorptr * *vectorptr) >> scaling; // int16*int16 promotes to int before the shift.
+ vectorptr++;
+ }
+ *scale_factor = scaling; // Report the shift so callers can undo the scaling.
+
+ return en;
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/get_scaling_square.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/get_scaling_square.c
new file mode 100644
index 000000000..82e3c8b09
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/get_scaling_square.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This file contains the function WebRtcSpl_GetScalingSquare().
+ * The description header can be found in signal_processing_library.h
+ *
+ */
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+int16_t WebRtcSpl_GetScalingSquare(int16_t* in_vector,
+ size_t in_vector_length,
+ size_t times)
+{
+ int16_t nbits = WebRtcSpl_GetSizeInBits((uint32_t)times); // Bits needed to represent |times| terms.
+ size_t i;
+ int16_t smax = -1; // Running max of |sabs|; -1 so any sample wins.
+ int16_t sabs;
+ int16_t *sptr = in_vector;
+ int16_t t;
+ size_t looptimes = in_vector_length;
+
+ for (i = looptimes; i > 0; i--)
+ {
+ sabs = (*sptr > 0 ? *sptr++ : -*sptr++); // Absolute value; NOTE(review): -32768 would overflow — presumably assumed absent.
+ smax = (sabs > smax ? sabs : smax);
+ }
+ t = WebRtcSpl_NormW32(WEBRTC_SPL_MUL(smax, smax)); // Headroom left after squaring the largest sample.
+
+ if (smax == 0)
+ {
+ return 0; // Since norm(0) returns 0
+ } else
+ {
+ return (t > nbits) ? 0 : nbits - t; // Shift needed so |times| squared terms cannot overflow 32 bits.
+ }
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/include/real_fft.h b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/include/real_fft.h
new file mode 100644
index 000000000..84450667d
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/include/real_fft.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_REAL_FFT_H_
+#define COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_REAL_FFT_H_
+
+#include <stdint.h>
+
+// For ComplexFFT(), the maximum fft order is 10;
+// WebRTC APM uses orders of only 7 and 8.
+enum { kMaxFFTOrder = 10 };
+
+struct RealFFT;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct RealFFT* WebRtcSpl_CreateRealFFT(int order);
+void WebRtcSpl_FreeRealFFT(struct RealFFT* self);
+
+// Compute an FFT for a real-valued signal of length of 2^order,
+// where 1 < order <= MAX_FFT_ORDER. Transform length is determined by the
+// specification structure, which must be initialized prior to calling the FFT
+// function with WebRtcSpl_CreateRealFFT().
+// The relationship between the input and output sequences can
+// be expressed in terms of the DFT, i.e.:
+// x[n] = (2^(-scalefactor)/N) . SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N)
+// n=0,1,2,...N-1
+// N=2^order.
+// The conjugate-symmetric output sequence is represented using a CCS vector,
+// which is of length N+2, and is organized as follows:
+// Index: 0 1 2 3 4 5 . . . N-2 N-1 N N+1
+// Component: R0 0 R1 I1 R2 I2 . . . R[N/2-1] I[N/2-1] R[N/2] 0
+// where R[n] and I[n], respectively, denote the real and imaginary components
+// for FFT bin 'n'. Bins are numbered from 0 to N/2, where N is the FFT length.
+// Bin index 0 corresponds to the DC component, and bin index N/2 corresponds to
+// the foldover frequency.
+//
+// Input Arguments:
+// self - pointer to preallocated and initialized FFT specification structure.
+// real_data_in - the input signal. For an ARM Neon platform, it must be
+// aligned on a 32-byte boundary.
+//
+// Output Arguments:
+// complex_data_out - the output complex signal with (2^order + 2) 16-bit
+// elements. For an ARM Neon platform, it must be different
+// from real_data_in, and aligned on a 32-byte boundary.
+//
+// Return Value:
+// 0 - FFT calculation is successful.
+// -1 - Error with bad arguments (null pointers).
+int WebRtcSpl_RealForwardFFT(struct RealFFT* self,
+ const int16_t* real_data_in,
+ int16_t* complex_data_out);
+
+// Compute the inverse FFT for a conjugate-symmetric input sequence of length of
+// 2^order, where 1 < order <= MAX_FFT_ORDER. Transform length is determined by
+// the specification structure, which must be initialized prior to calling the
+// FFT function with WebRtcSpl_CreateRealFFT().
+// For a transform of length M, the input sequence is represented using a packed
+// CCS vector of length M+2, which is explained in the comments for
+// WebRtcSpl_RealForwardFFTC above.
+//
+// Input Arguments:
+// self - pointer to preallocated and initialized FFT specification structure.
+// complex_data_in - the input complex signal with (2^order + 2) 16-bit
+// elements. For an ARM Neon platform, it must be aligned on
+// a 32-byte boundary.
+//
+// Output Arguments:
+// real_data_out - the output real signal. For an ARM Neon platform, it must
+// be different to complex_data_in, and aligned on a 32-byte
+// boundary.
+//
+// Return Value:
+// 0 or a positive number - a value that the elements in the |real_data_out|
+// should be shifted left with in order to get
+// correct physical values.
+// -1 - Error with bad arguments (null pointers).
+int WebRtcSpl_RealInverseFFT(struct RealFFT* self,
+ const int16_t* complex_data_in,
+ int16_t* real_data_out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_REAL_FFT_H_
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/include/signal_processing_library.h b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/include/signal_processing_library.h
new file mode 100644
index 000000000..ccbb306d8
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/include/signal_processing_library.h
@@ -0,0 +1,1612 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * This header file includes all of the fix point signal processing library
+ * (SPL) function descriptions and declarations. For specific function calls,
+ * see bottom of file.
+ */
+
+#ifndef COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_SIGNAL_PROCESSING_LIBRARY_H_
+#define COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_SIGNAL_PROCESSING_LIBRARY_H_
+
+#include <string.h>
+#include "webrtc/common_audio/signal_processing/dot_product_with_scale.h"
+
+// Macros specific for the fixed point implementation
+#define WEBRTC_SPL_WORD16_MAX 32767
+#define WEBRTC_SPL_WORD16_MIN -32768
+#define WEBRTC_SPL_WORD32_MAX (int32_t)0x7fffffff
+#define WEBRTC_SPL_WORD32_MIN (int32_t)0x80000000
+#define WEBRTC_SPL_MAX_LPC_ORDER 14
+#define WEBRTC_SPL_MIN(A, B) (A < B ? A : B) // Get min value
+#define WEBRTC_SPL_MAX(A, B) (A > B ? A : B) // Get max value
+// TODO(kma/bjorn): For the next two macros, investigate how to correct the code
+// for inputs of a = WEBRTC_SPL_WORD16_MIN or WEBRTC_SPL_WORD32_MIN.
+#define WEBRTC_SPL_ABS_W16(a) (((int16_t)a >= 0) ? ((int16_t)a) : -((int16_t)a))
+#define WEBRTC_SPL_ABS_W32(a) (((int32_t)a >= 0) ? ((int32_t)a) : -((int32_t)a))
+
+#define WEBRTC_SPL_MUL(a, b) ((int32_t)((int32_t)(a) * (int32_t)(b)))
+#define WEBRTC_SPL_UMUL(a, b) ((uint32_t)((uint32_t)(a) * (uint32_t)(b)))
+#define WEBRTC_SPL_UMUL_32_16(a, b) ((uint32_t)((uint32_t)(a) * (uint16_t)(b)))
+#define WEBRTC_SPL_MUL_16_U16(a, b) ((int32_t)(int16_t)(a) * (uint16_t)(b))
+
+// clang-format off
+// clang-format would choose some indentation
+// leading to presubmit error (cpplint.py)
+#ifndef WEBRTC_ARCH_ARM_V7
+// For ARMv7 platforms, these are inline functions in spl_inl_armv7.h
+#ifndef MIPS32_LE
+// For MIPS platforms, these are inline functions in spl_inl_mips.h
+#define WEBRTC_SPL_MUL_16_16(a, b) ((int32_t)(((int16_t)(a)) * ((int16_t)(b))))
+#define WEBRTC_SPL_MUL_16_32_RSFT16(a, b) \
+ (WEBRTC_SPL_MUL_16_16(a, b >> 16) + \
+ ((WEBRTC_SPL_MUL_16_16(a, (b & 0xffff) >> 1) + 0x4000) >> 15))
+#endif
+#endif
+
+#define WEBRTC_SPL_MUL_16_32_RSFT11(a, b) \
+ (WEBRTC_SPL_MUL_16_16(a, (b) >> 16) * (1 << 5) + \
+ (((WEBRTC_SPL_MUL_16_U16(a, (uint16_t)(b)) >> 1) + 0x0200) >> 10))
+#define WEBRTC_SPL_MUL_16_32_RSFT14(a, b) \
+ (WEBRTC_SPL_MUL_16_16(a, (b) >> 16) * (1 << 2) + \
+ (((WEBRTC_SPL_MUL_16_U16(a, (uint16_t)(b)) >> 1) + 0x1000) >> 13))
+#define WEBRTC_SPL_MUL_16_32_RSFT15(a, b) \
+ ((WEBRTC_SPL_MUL_16_16(a, (b) >> 16) * (1 << 1)) + \
+ (((WEBRTC_SPL_MUL_16_U16(a, (uint16_t)(b)) >> 1) + 0x2000) >> 14))
+// clang-format on
+
+#define WEBRTC_SPL_MUL_16_16_RSFT(a, b, c) (WEBRTC_SPL_MUL_16_16(a, b) >> (c))
+
+#define WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(a, b, c) \
+ ((WEBRTC_SPL_MUL_16_16(a, b) + ((int32_t)(((int32_t)1) << ((c)-1)))) >> (c))
+
+// C + the 32 most significant bits of A * B
+#define WEBRTC_SPL_SCALEDIFF32(A, B, C) \
+ (C + (B >> 16) * A + (((uint32_t)(B & 0x0000FFFF) * A) >> 16))
+
+#define WEBRTC_SPL_SAT(a, b, c) (b > a ? a : b < c ? c : b)
+
+// Shifting with negative numbers allowed
+// Positive means left shift
+#define WEBRTC_SPL_SHIFT_W32(x, c) ((c) >= 0 ? (x) * (1 << (c)) : (x) >> -(c))
+
+// Shifting with negative numbers not allowed
+// We cannot do casting here due to signed/unsigned problem
+#define WEBRTC_SPL_LSHIFT_W32(x, c) ((x) << (c))
+
+#define WEBRTC_SPL_RSHIFT_U32(x, c) ((uint32_t)(x) >> (c))
+
+#define WEBRTC_SPL_RAND(a) ((int16_t)((((int16_t)a * 18816) >> 7) & 0x00007fff))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define WEBRTC_SPL_MEMCPY_W16(v1, v2, length) \
+ memcpy(v1, v2, (length) * sizeof(int16_t))
+
+// inline functions:
+#include "webrtc/common_audio/signal_processing/include/spl_inl.h"
+
+// third party math functions
+#include "webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h"
+
+// Initialize SPL. Currently it contains only function pointer initialization.
+// If the underlying platform is known to be ARM-Neon (WEBRTC_HAS_NEON defined),
+// the pointers will be assigned to code optimized for Neon; otherwise, generic
+// C code will be assigned.
+// Note that this function MUST be called in any application that uses SPL
+// functions.
+void WebRtcSpl_Init(void);
+
+int16_t WebRtcSpl_GetScalingSquare(int16_t* in_vector,
+ size_t in_vector_length,
+ size_t times);
+
+// Copy and set operations. Implementation in copy_set_operations.c.
+// Descriptions at bottom of file.
+void WebRtcSpl_MemSetW16(int16_t* vector,
+ int16_t set_value,
+ size_t vector_length);
+void WebRtcSpl_MemSetW32(int32_t* vector,
+ int32_t set_value,
+ size_t vector_length);
+void WebRtcSpl_MemCpyReversedOrder(int16_t* out_vector,
+ int16_t* in_vector,
+ size_t vector_length);
+void WebRtcSpl_CopyFromEndW16(const int16_t* in_vector,
+ size_t in_vector_length,
+ size_t samples,
+ int16_t* out_vector);
+void WebRtcSpl_ZerosArrayW16(int16_t* vector, size_t vector_length);
+void WebRtcSpl_ZerosArrayW32(int32_t* vector, size_t vector_length);
+// End: Copy and set operations.
+
+// Minimum and maximum operation functions and their pointers.
+// Implementation in min_max_operations.c.
+
+// Returns the largest absolute value in a signed 16-bit vector.
+//
+// Input:
+// - vector : 16-bit input vector.
+// - length : Number of samples in vector.
+//
+// Return value : Maximum absolute value in vector.
+typedef int16_t (*MaxAbsValueW16)(const int16_t* vector, size_t length);
+extern MaxAbsValueW16 WebRtcSpl_MaxAbsValueW16;
+int16_t WebRtcSpl_MaxAbsValueW16C(const int16_t* vector, size_t length);
+#if defined(WEBRTC_HAS_NEON)
+int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, size_t length);
+#endif
+#if defined(MIPS32_LE)
+int16_t WebRtcSpl_MaxAbsValueW16_mips(const int16_t* vector, size_t length);
+#endif
+
+// Returns the largest absolute value in a signed 32-bit vector.
+//
+// Input:
+// - vector : 32-bit input vector.
+// - length : Number of samples in vector.
+//
+// Return value : Maximum absolute value in vector.
+typedef int32_t (*MaxAbsValueW32)(const int32_t* vector, size_t length);
+extern MaxAbsValueW32 WebRtcSpl_MaxAbsValueW32;
+int32_t WebRtcSpl_MaxAbsValueW32C(const int32_t* vector, size_t length);
+#if defined(WEBRTC_HAS_NEON)
+int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, size_t length);
+#endif
+#if defined(MIPS_DSP_R1_LE)
+int32_t WebRtcSpl_MaxAbsValueW32_mips(const int32_t* vector, size_t length);
+#endif
+
+// Returns the maximum value of a 16-bit vector.
+//
+// Input:
+// - vector : 16-bit input vector.
+// - length : Number of samples in vector.
+//
+// Return value : Maximum sample value in |vector|.
+typedef int16_t (*MaxValueW16)(const int16_t* vector, size_t length);
+extern MaxValueW16 WebRtcSpl_MaxValueW16;
+int16_t WebRtcSpl_MaxValueW16C(const int16_t* vector, size_t length);
+#if defined(WEBRTC_HAS_NEON)
+int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, size_t length);
+#endif
+#if defined(MIPS32_LE)
+int16_t WebRtcSpl_MaxValueW16_mips(const int16_t* vector, size_t length);
+#endif
+
+// Returns the maximum value of a 32-bit vector.
+//
+// Input:
+// - vector : 32-bit input vector.
+// - length : Number of samples in vector.
+//
+// Return value : Maximum sample value in |vector|.
+typedef int32_t (*MaxValueW32)(const int32_t* vector, size_t length);
+extern MaxValueW32 WebRtcSpl_MaxValueW32;
+int32_t WebRtcSpl_MaxValueW32C(const int32_t* vector, size_t length);
+#if defined(WEBRTC_HAS_NEON)
+int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, size_t length);
+#endif
+#if defined(MIPS32_LE)
+int32_t WebRtcSpl_MaxValueW32_mips(const int32_t* vector, size_t length);
+#endif
+
+// Returns the minimum value of a 16-bit vector.
+//
+// Input:
+// - vector : 16-bit input vector.
+// - length : Number of samples in vector.
+//
+// Return value : Minimum sample value in |vector|.
+typedef int16_t (*MinValueW16)(const int16_t* vector, size_t length);
+extern MinValueW16 WebRtcSpl_MinValueW16;
+int16_t WebRtcSpl_MinValueW16C(const int16_t* vector, size_t length);
+#if defined(WEBRTC_HAS_NEON)
+int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, size_t length);
+#endif
+#if defined(MIPS32_LE)
+int16_t WebRtcSpl_MinValueW16_mips(const int16_t* vector, size_t length);
+#endif
+
+// Returns the minimum value of a 32-bit vector.
+//
+// Input:
+// - vector : 32-bit input vector.
+// - length : Number of samples in vector.
+//
+// Return value : Minimum sample value in |vector|.
+typedef int32_t (*MinValueW32)(const int32_t* vector, size_t length);
+extern MinValueW32 WebRtcSpl_MinValueW32;
+int32_t WebRtcSpl_MinValueW32C(const int32_t* vector, size_t length);
+#if defined(WEBRTC_HAS_NEON)
+int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, size_t length);
+#endif
+#if defined(MIPS32_LE)
+int32_t WebRtcSpl_MinValueW32_mips(const int32_t* vector, size_t length);
+#endif
+
+// Returns the vector index to the largest absolute value of a 16-bit vector.
+//
+// Input:
+// - vector : 16-bit input vector.
+// - length : Number of samples in vector.
+//
+// Return value : Index to the maximum absolute value in vector.
+// If there are multiple equal maxima, return the index of the
+// first. -32768 will always have precedence over 32767 (despite
+// -32768 presenting an int16 absolute value of 32767).
+size_t WebRtcSpl_MaxAbsIndexW16(const int16_t* vector, size_t length);
+
+// Returns the vector index to the maximum sample value of a 16-bit vector.
+//
+// Input:
+// - vector : 16-bit input vector.
+// - length : Number of samples in vector.
+//
+// Return value : Index to the maximum value in vector (if multiple
+// indexes have the maximum, return the first).
+size_t WebRtcSpl_MaxIndexW16(const int16_t* vector, size_t length);
+
+// Returns the vector index to the maximum sample value of a 32-bit vector.
+//
+// Input:
+// - vector : 32-bit input vector.
+// - length : Number of samples in vector.
+//
+// Return value : Index to the maximum value in vector (if multiple
+// indexes have the maximum, return the first).
+size_t WebRtcSpl_MaxIndexW32(const int32_t* vector, size_t length);
+
+// Returns the vector index to the minimum sample value of a 16-bit vector.
+//
+// Input:
+// - vector : 16-bit input vector.
+// - length : Number of samples in vector.
+//
+// Return value : Index to the minimum value in vector (if multiple
+// indexes have the minimum, return the first).
+size_t WebRtcSpl_MinIndexW16(const int16_t* vector, size_t length);
+
+// Returns the vector index to the minimum sample value of a 32-bit vector.
+//
+// Input:
+// - vector : 32-bit input vector.
+// - length : Number of samples in vector.
+//
+// Return value : Index to the minimum value in vector (if multiple
+// indexes have the minimum, return the first).
+size_t WebRtcSpl_MinIndexW32(const int32_t* vector, size_t length);
+
+// End: Minimum and maximum operations.
+
+// Vector scaling operations. Implementation in vector_scaling_operations.c.
+// Description at bottom of file.
+void WebRtcSpl_VectorBitShiftW16(int16_t* out_vector,
+ size_t vector_length,
+ const int16_t* in_vector,
+ int16_t right_shifts);
+void WebRtcSpl_VectorBitShiftW32(int32_t* out_vector,
+ size_t vector_length,
+ const int32_t* in_vector,
+ int16_t right_shifts);
+void WebRtcSpl_VectorBitShiftW32ToW16(int16_t* out_vector,
+ size_t vector_length,
+ const int32_t* in_vector,
+ int right_shifts);
+void WebRtcSpl_ScaleVector(const int16_t* in_vector,
+ int16_t* out_vector,
+ int16_t gain,
+ size_t vector_length,
+ int16_t right_shifts);
+void WebRtcSpl_ScaleVectorWithSat(const int16_t* in_vector,
+ int16_t* out_vector,
+ int16_t gain,
+ size_t vector_length,
+ int16_t right_shifts);
+void WebRtcSpl_ScaleAndAddVectors(const int16_t* in_vector1,
+ int16_t gain1,
+ int right_shifts1,
+ const int16_t* in_vector2,
+ int16_t gain2,
+ int right_shifts2,
+ int16_t* out_vector,
+ size_t vector_length);
+
+// The functions (with related pointer) perform the vector operation:
+// out_vector[k] = ((scale1 * in_vector1[k]) + (scale2 * in_vector2[k])
+// + round_value) >> right_shifts,
+// where round_value = (1 << right_shifts) >> 1.
+//
+// Input:
+// - in_vector1 : Input vector 1
+// - in_vector1_scale : Gain to be used for vector 1
+// - in_vector2 : Input vector 2
+// - in_vector2_scale : Gain to be used for vector 2
+// - right_shifts : Number of right bit shifts to be applied
+// - length : Number of elements in the input vectors
+//
+// Output:
+// - out_vector : Output vector
+// Return value : 0 if OK, -1 if (in_vector1 == null
+// || in_vector2 == null || out_vector == null
+// || length <= 0 || right_shift < 0).
+typedef int (*ScaleAndAddVectorsWithRound)(const int16_t* in_vector1,
+ int16_t in_vector1_scale,
+ const int16_t* in_vector2,
+ int16_t in_vector2_scale,
+ int right_shifts,
+ int16_t* out_vector,
+ size_t length);
+extern ScaleAndAddVectorsWithRound WebRtcSpl_ScaleAndAddVectorsWithRound;
+int WebRtcSpl_ScaleAndAddVectorsWithRoundC(const int16_t* in_vector1,
+ int16_t in_vector1_scale,
+ const int16_t* in_vector2,
+ int16_t in_vector2_scale,
+ int right_shifts,
+ int16_t* out_vector,
+ size_t length);
+#if defined(MIPS_DSP_R1_LE)
+int WebRtcSpl_ScaleAndAddVectorsWithRound_mips(const int16_t* in_vector1,
+ int16_t in_vector1_scale,
+ const int16_t* in_vector2,
+ int16_t in_vector2_scale,
+ int right_shifts,
+ int16_t* out_vector,
+ size_t length);
+#endif
+// End: Vector scaling operations.
+
+// iLBC specific functions. Implementations in ilbc_specific_functions.c.
+// Description at bottom of file.
+void WebRtcSpl_ReverseOrderMultArrayElements(int16_t* out_vector,
+ const int16_t* in_vector,
+ const int16_t* window,
+ size_t vector_length,
+ int16_t right_shifts);
+void WebRtcSpl_ElementwiseVectorMult(int16_t* out_vector,
+ const int16_t* in_vector,
+ const int16_t* window,
+ size_t vector_length,
+ int16_t right_shifts);
+void WebRtcSpl_AddVectorsAndShift(int16_t* out_vector,
+ const int16_t* in_vector1,
+ const int16_t* in_vector2,
+ size_t vector_length,
+ int16_t right_shifts);
+void WebRtcSpl_AddAffineVectorToVector(int16_t* out_vector,
+ int16_t* in_vector,
+ int16_t gain,
+ int32_t add_constant,
+ int16_t right_shifts,
+ size_t vector_length);
+void WebRtcSpl_AffineTransformVector(int16_t* out_vector,
+ int16_t* in_vector,
+ int16_t gain,
+ int32_t add_constant,
+ int16_t right_shifts,
+ size_t vector_length);
+// End: iLBC specific functions.
+
+// Signal processing operations.
+
+// A 32-bit fix-point implementation of auto-correlation computation
+//
+// Input:
+// - in_vector : Vector to calculate autocorrelation upon
+// - in_vector_length : Length (in samples) of |vector|
+// - order : The order up to which the autocorrelation should be
+// calculated
+//
+// Output:
+// - result : auto-correlation values (values should be seen
+// relative to each other since the absolute values
+// might have been down shifted to avoid overflow)
+//
+// - scale : The number of left shifts required to obtain the
+// auto-correlation in Q0
+//
+// Return value : Number of samples in |result|, i.e. (order+1)
+size_t WebRtcSpl_AutoCorrelation(const int16_t* in_vector,
+ size_t in_vector_length,
+ size_t order,
+ int32_t* result,
+ int* scale);
+
+// A 32-bit fix-point implementation of the Levinson-Durbin algorithm that
+// does NOT use the 64 bit class
+//
+// Input:
+// - auto_corr : Vector with autocorrelation values of length >= |order|+1
+// - order : The LPC filter order (support up to order 20)
+//
+// Output:
+// - lpc_coef : lpc_coef[0..order] LPC coefficients in Q12
+// - refl_coef : refl_coef[0...order-1]| Reflection coefficients in Q15
+//
+// Return value : 1 for stable 0 for unstable
+int16_t WebRtcSpl_LevinsonDurbin(const int32_t* auto_corr,
+ int16_t* lpc_coef,
+ int16_t* refl_coef,
+ size_t order);
+
+// Converts reflection coefficients |refl_coef| to LPC coefficients |lpc_coef|.
+// This version is a 16 bit operation.
+//
+// NOTE: The 16 bit refl_coef -> lpc_coef conversion might result in a
+// "slightly unstable" filter (i.e., a pole just outside the unit circle) in
+// "rare" cases even if the reflection coefficients are stable.
+//
+// Input:
+// - refl_coef : Reflection coefficients in Q15 that should be converted
+// to LPC coefficients
+// - use_order : Number of coefficients in |refl_coef|
+//
+// Output:
+// - lpc_coef : LPC coefficients in Q12
+void WebRtcSpl_ReflCoefToLpc(const int16_t* refl_coef,
+ int use_order,
+ int16_t* lpc_coef);
+
+// Converts LPC coefficients |lpc_coef| to reflection coefficients |refl_coef|.
+// This version is a 16 bit operation.
+// The conversion is implemented by the step-down algorithm.
+//
+// Input:
+// - lpc_coef : LPC coefficients in Q12, that should be converted to
+// reflection coefficients
+// - use_order : Number of coefficients in |lpc_coef|
+//
+// Output:
+// - refl_coef : Reflection coefficients in Q15.
+void WebRtcSpl_LpcToReflCoef(int16_t* lpc_coef,
+ int use_order,
+ int16_t* refl_coef);
+
+// Calculates reflection coefficients (16 bit) from auto-correlation values
+//
+// Input:
+// - auto_corr : Auto-correlation values
+// - use_order : Number of coefficients wanted be calculated
+//
+// Output:
+// - refl_coef : Reflection coefficients in Q15.
+void WebRtcSpl_AutoCorrToReflCoef(const int32_t* auto_corr,
+ int use_order,
+ int16_t* refl_coef);
+
+// The functions (with related pointer) calculate the cross-correlation between
+// two sequences |seq1| and |seq2|.
+// |seq1| is fixed and |seq2| slides as the pointer is increased with the
+// amount |step_seq2|. Note the arguments should obey the relationship:
+// |dim_seq| - 1 + |step_seq2| * (|dim_cross_correlation| - 1) <
+// buffer size of |seq2|
+//
+// Input:
+// - seq1 : First sequence (fixed throughout the correlation)
+// - seq2 : Second sequence (slides |step_vector2| for each
+// new correlation)
+// - dim_seq : Number of samples to use in the cross-correlation
+// - dim_cross_correlation : Number of cross-correlations to calculate (the
+// start position for |vector2| is updated for each
+// new one)
+// - right_shifts : Number of right bit shifts to use. This will
+// become the output Q-domain.
+// - step_seq2 : How many (positive or negative) steps the
+// |vector2| pointer should be updated for each new
+// cross-correlation value.
+//
+// Output:
+// - cross_correlation : The cross-correlation in Q(-right_shifts)
+typedef void (*CrossCorrelation)(int32_t* cross_correlation,
+ const int16_t* seq1,
+ const int16_t* seq2,
+ size_t dim_seq,
+ size_t dim_cross_correlation,
+ int right_shifts,
+ int step_seq2);
+extern CrossCorrelation WebRtcSpl_CrossCorrelation;
+void WebRtcSpl_CrossCorrelationC(int32_t* cross_correlation,
+ const int16_t* seq1,
+ const int16_t* seq2,
+ size_t dim_seq,
+ size_t dim_cross_correlation,
+ int right_shifts,
+ int step_seq2);
+#if defined(WEBRTC_HAS_NEON)
+void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
+ const int16_t* seq1,
+ const int16_t* seq2,
+ size_t dim_seq,
+ size_t dim_cross_correlation,
+ int right_shifts,
+ int step_seq2);
+#endif
+#if defined(MIPS32_LE)
+void WebRtcSpl_CrossCorrelation_mips(int32_t* cross_correlation,
+ const int16_t* seq1,
+ const int16_t* seq2,
+ size_t dim_seq,
+ size_t dim_cross_correlation,
+ int right_shifts,
+ int step_seq2);
+#endif
+
+// Creates (the first half of) a Hanning window. Size must be at least 1 and
+// at most 512.
+//
+// Input:
+// - size : Length of the requested Hanning window (1 to 512)
+//
+// Output:
+// - window : Hanning vector in Q14.
+void WebRtcSpl_GetHanningWindow(int16_t* window, size_t size);
+
+// Calculates y[k] = sqrt(1 - x[k]^2) for each element of the input vector
+// |in_vector|. Input and output values are in Q15.
+//
+// Inputs:
+// - in_vector : Values to calculate sqrt(1 - x^2) of
+// - vector_length : Length of vector |in_vector|
+//
+// Output:
+// - out_vector : Output values in Q15
+void WebRtcSpl_SqrtOfOneMinusXSquared(int16_t* in_vector,
+ size_t vector_length,
+ int16_t* out_vector);
+// End: Signal processing operations.
+
+// Randomization functions. Implementations collected in
+// randomization_functions.c and descriptions at bottom of this file.
+int16_t WebRtcSpl_RandU(uint32_t* seed);
+int16_t WebRtcSpl_RandN(uint32_t* seed);
+int16_t WebRtcSpl_RandUArray(int16_t* vector,
+ int16_t vector_length,
+ uint32_t* seed);
+// End: Randomization functions.
+
+// Math functions
+int32_t WebRtcSpl_Sqrt(int32_t value);
+
+// Divisions. Implementations collected in division_operations.c and
+// descriptions at bottom of this file.
+uint32_t WebRtcSpl_DivU32U16(uint32_t num, uint16_t den);
+int32_t WebRtcSpl_DivW32W16(int32_t num, int16_t den);
+int16_t WebRtcSpl_DivW32W16ResW16(int32_t num, int16_t den);
+int32_t WebRtcSpl_DivResultInQ31(int32_t num, int32_t den);
+int32_t WebRtcSpl_DivW32HiLow(int32_t num, int16_t den_hi, int16_t den_low);
+// End: Divisions.
+
+int32_t WebRtcSpl_Energy(int16_t* vector,
+ size_t vector_length,
+ int* scale_factor);
+
+// Filter operations.
+size_t WebRtcSpl_FilterAR(const int16_t* ar_coef,
+ size_t ar_coef_length,
+ const int16_t* in_vector,
+ size_t in_vector_length,
+ int16_t* filter_state,
+ size_t filter_state_length,
+ int16_t* filter_state_low,
+ size_t filter_state_low_length,
+ int16_t* out_vector,
+ int16_t* out_vector_low,
+ size_t out_vector_low_length);
+
+// WebRtcSpl_FilterMAFastQ12(...)
+//
+// Performs a MA filtering on a vector in Q12
+//
+// Input:
+// - in_vector : Input samples (state in positions
+// in_vector[-order] .. in_vector[-1])
+// - ma_coef : Filter coefficients (in Q12)
+// - ma_coef_length : Number of B coefficients (order+1)
+// - vector_length : Number of samples to be filtered
+//
+// Output:
+// - out_vector : Filtered samples
+//
+void WebRtcSpl_FilterMAFastQ12(const int16_t* in_vector,
+ int16_t* out_vector,
+ const int16_t* ma_coef,
+ size_t ma_coef_length,
+ size_t vector_length);
+
+// Performs an AR filtering on a vector in Q12
+// Input:
+// - data_in : Input samples
+// - data_out : State information in positions
+// data_out[-order] .. data_out[-1]
+// - coefficients : Filter coefficients (in Q12)
+// - coefficients_length: Number of coefficients (order+1)
+// - data_length : Number of samples to be filtered
+// Output:
+// - data_out : Filtered samples
+void WebRtcSpl_FilterARFastQ12(const int16_t* data_in,
+ int16_t* data_out,
+ const int16_t* __restrict coefficients,
+ size_t coefficients_length,
+ size_t data_length);
+
+// The functions (with related pointer) perform a MA down sampling filter
+// on a vector.
+// Input:
+// - data_in : Input samples (state in positions
+// data_in[-order] .. data_in[-1])
+// - data_in_length : Number of samples in |data_in| to be filtered.
+// This must be at least
+// |delay| + |factor|*(|out_vector_length|-1) + 1)
+// - data_out_length : Number of down sampled samples desired
+// - coefficients : Filter coefficients (in Q12)
+// - coefficients_length: Number of coefficients (order+1)
+// - factor : Decimation factor
+// - delay : Delay of filter (compensated for in out_vector)
+// Output:
+// - data_out : Filtered samples
+// Return value : 0 if OK, -1 if |in_vector| is too short
+typedef int (*DownsampleFast)(const int16_t* data_in,
+ size_t data_in_length,
+ int16_t* data_out,
+ size_t data_out_length,
+ const int16_t* __restrict coefficients,
+ size_t coefficients_length,
+ int factor,
+ size_t delay);
+extern DownsampleFast WebRtcSpl_DownsampleFast;
+int WebRtcSpl_DownsampleFastC(const int16_t* data_in,
+ size_t data_in_length,
+ int16_t* data_out,
+ size_t data_out_length,
+ const int16_t* __restrict coefficients,
+ size_t coefficients_length,
+ int factor,
+ size_t delay);
+#if defined(WEBRTC_HAS_NEON)
+int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
+ size_t data_in_length,
+ int16_t* data_out,
+ size_t data_out_length,
+ const int16_t* __restrict coefficients,
+ size_t coefficients_length,
+ int factor,
+ size_t delay);
+#endif
+#if defined(MIPS32_LE)
+int WebRtcSpl_DownsampleFast_mips(const int16_t* data_in,
+ size_t data_in_length,
+ int16_t* data_out,
+ size_t data_out_length,
+ const int16_t* __restrict coefficients,
+ size_t coefficients_length,
+ int factor,
+ size_t delay);
+#endif
+
+// End: Filter operations.
+
+// FFT operations
+
+int WebRtcSpl_ComplexFFT(int16_t vector[], int stages, int mode);
+int WebRtcSpl_ComplexIFFT(int16_t vector[], int stages, int mode);
+
+// Treat a 16-bit complex data buffer |complex_data| as an array of 32-bit
+// values, and swap elements whose indexes are bit-reverses of each other.
+//
+// Input:
+// - complex_data : Complex data buffer containing 2^|stages| real
+// elements interleaved with 2^|stages| imaginary
+// elements: [Re Im Re Im Re Im....]
+// - stages : Number of FFT stages. Must be at least 3 and at most
+// 10, since the table WebRtcSpl_kSinTable1024[] is 1024
+// elements long.
+//
+// Output:
+// - complex_data : The complex data buffer.
+
+void WebRtcSpl_ComplexBitReverse(int16_t* __restrict complex_data, int stages);
+
+// End: FFT operations
+
+/************************************************************
+ *
+ * RESAMPLING FUNCTIONS AND THEIR STRUCTS ARE DEFINED BELOW
+ *
+ ************************************************************/
+
+/*******************************************************************
+ * resample.c
+ *
+ * Includes the following resampling combinations
+ * 22 kHz -> 16 kHz
+ * 16 kHz -> 22 kHz
+ * 22 kHz -> 8 kHz
+ * 8 kHz -> 22 kHz
+ *
+ ******************************************************************/
+
+// state structure for 22 -> 16 resampler
+typedef struct {
+ int32_t S_22_44[8];
+ int32_t S_44_32[8];
+ int32_t S_32_16[8];
+} WebRtcSpl_State22khzTo16khz;
+
+void WebRtcSpl_Resample22khzTo16khz(const int16_t* in,
+ int16_t* out,
+ WebRtcSpl_State22khzTo16khz* state,
+ int32_t* tmpmem);
+
+void WebRtcSpl_ResetResample22khzTo16khz(WebRtcSpl_State22khzTo16khz* state);
+
+// state structure for 16 -> 22 resampler
+typedef struct {
+ int32_t S_16_32[8];
+ int32_t S_32_22[8];
+} WebRtcSpl_State16khzTo22khz;
+
+void WebRtcSpl_Resample16khzTo22khz(const int16_t* in,
+ int16_t* out,
+ WebRtcSpl_State16khzTo22khz* state,
+ int32_t* tmpmem);
+
+void WebRtcSpl_ResetResample16khzTo22khz(WebRtcSpl_State16khzTo22khz* state);
+
+// state structure for 22 -> 8 resampler
+typedef struct {
+ int32_t S_22_22[16];
+ int32_t S_22_16[8];
+ int32_t S_16_8[8];
+} WebRtcSpl_State22khzTo8khz;
+
+void WebRtcSpl_Resample22khzTo8khz(const int16_t* in,
+ int16_t* out,
+ WebRtcSpl_State22khzTo8khz* state,
+ int32_t* tmpmem);
+
+void WebRtcSpl_ResetResample22khzTo8khz(WebRtcSpl_State22khzTo8khz* state);
+
+// state structure for 8 -> 22 resampler
+typedef struct {
+ int32_t S_8_16[8];
+ int32_t S_16_11[8];
+ int32_t S_11_22[8];
+} WebRtcSpl_State8khzTo22khz;
+
+void WebRtcSpl_Resample8khzTo22khz(const int16_t* in,
+ int16_t* out,
+ WebRtcSpl_State8khzTo22khz* state,
+ int32_t* tmpmem);
+
+void WebRtcSpl_ResetResample8khzTo22khz(WebRtcSpl_State8khzTo22khz* state);
+
+/*******************************************************************
+ * resample_fractional.c
+ * Functions for internal use in the other resample functions
+ *
+ * Includes the following resampling combinations
+ * 48 kHz -> 32 kHz
+ * 32 kHz -> 24 kHz
+ * 44 kHz -> 32 kHz
+ *
+ ******************************************************************/
+
+void WebRtcSpl_Resample48khzTo32khz(const int32_t* In, int32_t* Out, size_t K);
+
+void WebRtcSpl_Resample32khzTo24khz(const int32_t* In, int32_t* Out, size_t K);
+
+void WebRtcSpl_Resample44khzTo32khz(const int32_t* In, int32_t* Out, size_t K);
+
+/*******************************************************************
+ * resample_48khz.c
+ *
+ * Includes the following resampling combinations
+ * 48 kHz -> 16 kHz
+ * 16 kHz -> 48 kHz
+ * 48 kHz -> 8 kHz
+ * 8 kHz -> 48 kHz
+ *
+ ******************************************************************/
+
+typedef struct {
+ int32_t S_48_48[16];
+ int32_t S_48_32[8];
+ int32_t S_32_16[8];
+} WebRtcSpl_State48khzTo16khz;
+
+void WebRtcSpl_Resample48khzTo16khz(const int16_t* in,
+ int16_t* out,
+ WebRtcSpl_State48khzTo16khz* state,
+ int32_t* tmpmem);
+
+void WebRtcSpl_ResetResample48khzTo16khz(WebRtcSpl_State48khzTo16khz* state);
+
+typedef struct {
+ int32_t S_16_32[8];
+ int32_t S_32_24[8];
+ int32_t S_24_48[8];
+} WebRtcSpl_State16khzTo48khz;
+
+void WebRtcSpl_Resample16khzTo48khz(const int16_t* in,
+ int16_t* out,
+ WebRtcSpl_State16khzTo48khz* state,
+ int32_t* tmpmem);
+
+void WebRtcSpl_ResetResample16khzTo48khz(WebRtcSpl_State16khzTo48khz* state);
+
+typedef struct {
+ int32_t S_48_24[8];
+ int32_t S_24_24[16];
+ int32_t S_24_16[8];
+ int32_t S_16_8[8];
+} WebRtcSpl_State48khzTo8khz;
+
+void WebRtcSpl_Resample48khzTo8khz(const int16_t* in,
+ int16_t* out,
+ WebRtcSpl_State48khzTo8khz* state,
+ int32_t* tmpmem);
+
+void WebRtcSpl_ResetResample48khzTo8khz(WebRtcSpl_State48khzTo8khz* state);
+
+typedef struct {
+ int32_t S_8_16[8];
+ int32_t S_16_12[8];
+ int32_t S_12_24[8];
+ int32_t S_24_48[8];
+} WebRtcSpl_State8khzTo48khz;
+
+void WebRtcSpl_Resample8khzTo48khz(const int16_t* in,
+ int16_t* out,
+ WebRtcSpl_State8khzTo48khz* state,
+ int32_t* tmpmem);
+
+void WebRtcSpl_ResetResample8khzTo48khz(WebRtcSpl_State8khzTo48khz* state);
+
+/*******************************************************************
+ * resample_by_2.c
+ *
+ * Includes down and up sampling by a factor of two.
+ *
+ ******************************************************************/
+
+void WebRtcSpl_DownsampleBy2(const int16_t* in,
+ size_t len,
+ int16_t* out,
+ int32_t* filtState);
+
+void WebRtcSpl_UpsampleBy2(const int16_t* in,
+ size_t len,
+ int16_t* out,
+ int32_t* filtState);
+
+/************************************************************
+ * END OF RESAMPLING FUNCTIONS
+ ************************************************************/
+void WebRtcSpl_AnalysisQMF(const int16_t* in_data,
+ size_t in_data_length,
+ int16_t* low_band,
+ int16_t* high_band,
+ int32_t* filter_state1,
+ int32_t* filter_state2);
+void WebRtcSpl_SynthesisQMF(const int16_t* low_band,
+ const int16_t* high_band,
+ size_t band_length,
+ int16_t* out_data,
+ int32_t* filter_state1,
+ int32_t* filter_state2);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_SIGNAL_PROCESSING_LIBRARY_H_
+
+//
+// WebRtcSpl_AddSatW16(...)
+// WebRtcSpl_AddSatW32(...)
+//
+// Returns the result of a saturated 16-bit, respectively 32-bit, addition of
+// the numbers specified by the |var1| and |var2| parameters.
+//
+// Input:
+// - var1 : Input variable 1
+// - var2 : Input variable 2
+//
+// Return value : Added and saturated value
+//
+
+//
+// WebRtcSpl_SubSatW16(...)
+// WebRtcSpl_SubSatW32(...)
+//
+// Returns the result of a saturated 16-bit, respectively 32-bit, subtraction
+// of the numbers specified by the |var1| and |var2| parameters.
+//
+// Input:
+// - var1 : Input variable 1
+// - var2 : Input variable 2
+//
+// Returned value : Subtracted and saturated value
+//
+
+//
+// WebRtcSpl_GetSizeInBits(...)
+//
+// Returns the # of bits that are needed at the most to represent the number
+// specified by the |value| parameter.
+//
+// Input:
+// - value : Input value
+//
+// Return value : Number of bits needed to represent |value|
+//
+
+//
+// WebRtcSpl_NormW32(...)
+//
+// Norm returns the # of left shifts required to 32-bit normalize the 32-bit
+// signed number specified by the |value| parameter.
+//
+// Input:
+// - value : Input value
+//
+// Return value : Number of bit shifts needed to 32-bit normalize |value|
+//
+
+//
+// WebRtcSpl_NormW16(...)
+//
+// Norm returns the # of left shifts required to 16-bit normalize the 16-bit
+// signed number specified by the |value| parameter.
+//
+// Input:
+// - value : Input value
+//
+// Return value       : Number of bit shifts needed to 16-bit normalize |value|
+//
+
+//
+// WebRtcSpl_NormU32(...)
+//
+// Norm returns the # of left shifts required to 32-bit normalize the unsigned
+// 32-bit number specified by the |value| parameter.
+//
+// Input:
+// - value : Input value
+//
+// Return value : Number of bit shifts needed to 32-bit normalize |value|
+//
+
+//
+// WebRtcSpl_GetScalingSquare(...)
+//
+// Returns the # of bits required to scale the samples specified in the
+// |in_vector| parameter so that, if the squares of the samples are added the
+// # of times specified by the |times| parameter, the 32-bit addition will not
+// overflow (result in int32_t).
+//
+// Input:
+// - in_vector : Input vector to check scaling on
+// - in_vector_length : Samples in |in_vector|
+// - times : Number of additions to be performed
+//
+// Return value : Number of right bit shifts needed to avoid
+// overflow in the addition calculation
+//
+
+//
+// WebRtcSpl_MemSetW16(...)
+//
+// Sets all the values in the int16_t vector |vector| of length
+// |vector_length| to the specified value |set_value|
+//
+// Input:
+// - vector : Pointer to the int16_t vector
+// - set_value : Value specified
+// - vector_length : Length of vector
+//
+
+//
+// WebRtcSpl_MemSetW32(...)
+//
+// Sets all the values in the int32_t vector |vector| of length
+// |vector_length| to the specified value |set_value|
+//
+// Input:
+//      - vector        : Pointer to the int32_t vector
+// - set_value : Value specified
+// - vector_length : Length of vector
+//
+
+//
+// WebRtcSpl_MemCpyReversedOrder(...)
+//
+// Copies all the values from the source int16_t vector |in_vector| to a
+// destination int16_t vector |out_vector|. It is done in reversed order,
+// meaning that the first sample of |in_vector| is copied to the last sample of
+// the |out_vector|. The procedure continues until the last sample of
+// |in_vector| has been copied to the first sample of |out_vector|. This
+// creates a reversed vector. Used in e.g. prediction in iLBC.
+//
+// Input:
+// - in_vector : Pointer to the first sample in a int16_t vector
+// of length |length|
+// - vector_length : Number of elements to copy
+//
+// Output:
+// - out_vector : Pointer to the last sample in a int16_t vector
+// of length |length|
+//
+
+//
+// WebRtcSpl_CopyFromEndW16(...)
+//
+// Copies the rightmost |samples| of |in_vector| (of length |in_vector_length|)
+// to the vector |out_vector|.
+//
+// Input:
+// - in_vector : Input vector
+// - in_vector_length : Number of samples in |in_vector|
+// - samples : Number of samples to extract (from right side)
+// from |in_vector|
+//
+// Output:
+// - out_vector : Vector with the requested samples
+//
+
+//
+// WebRtcSpl_ZerosArrayW16(...)
+// WebRtcSpl_ZerosArrayW32(...)
+//
+// Inserts the value "zero" in all positions of a w16 and a w32 vector
+// respectively.
+//
+// Input:
+// - vector_length : Number of samples in vector
+//
+// Output:
+// - vector : Vector containing all zeros
+//
+
+//
+// WebRtcSpl_VectorBitShiftW16(...)
+// WebRtcSpl_VectorBitShiftW32(...)
+//
+// Bit shifts all the values in a vector up or downwards. Different calls for
+// int16_t and int32_t vectors respectively.
+//
+// Input:
+// - vector_length : Length of vector
+// - in_vector : Pointer to the vector that should be bit shifted
+// - right_shifts : Number of right bit shifts (negative value gives left
+// shifts)
+//
+// Output:
+// - out_vector : Pointer to the result vector (can be the same as
+// |in_vector|)
+//
+
+//
+// WebRtcSpl_VectorBitShiftW32ToW16(...)
+//
+// Bit shifts all the values in a int32_t vector up or downwards and
+// stores the result as an int16_t vector. The function will saturate the
+// signal if needed, before storing in the output vector.
+//
+// Input:
+// - vector_length : Length of vector
+// - in_vector : Pointer to the vector that should be bit shifted
+// - right_shifts : Number of right bit shifts (negative value gives left
+// shifts)
+//
+// Output:
+// - out_vector : Pointer to the result vector (can be the same as
+// |in_vector|)
+//
+
+//
+// WebRtcSpl_ScaleVector(...)
+//
+// Performs the vector operation:
+// out_vector[k] = (gain*in_vector[k])>>right_shifts
+//
+// Input:
+// - in_vector : Input vector
+// - gain : Scaling gain
+// - vector_length : Elements in the |in_vector|
+// - right_shifts : Number of right bit shifts applied
+//
+// Output:
+// - out_vector : Output vector (can be the same as |in_vector|)
+//
+
+//
+// WebRtcSpl_ScaleVectorWithSat(...)
+//
+// Performs the vector operation:
+// out_vector[k] = SATURATE( (gain*in_vector[k])>>right_shifts )
+//
+// Input:
+// - in_vector : Input vector
+// - gain : Scaling gain
+// - vector_length : Elements in the |in_vector|
+// - right_shifts : Number of right bit shifts applied
+//
+// Output:
+// - out_vector : Output vector (can be the same as |in_vector|)
+//
+
+//
+// WebRtcSpl_ScaleAndAddVectors(...)
+//
+// Performs the vector operation:
+// out_vector[k] = (gain1*in_vector1[k])>>right_shifts1
+// + (gain2*in_vector2[k])>>right_shifts2
+//
+// Input:
+// - in_vector1 : Input vector 1
+// - gain1 : Gain to be used for vector 1
+// - right_shifts1 : Right bit shift to be used for vector 1
+// - in_vector2 : Input vector 2
+// - gain2 : Gain to be used for vector 2
+// - right_shifts2 : Right bit shift to be used for vector 2
+// - vector_length : Elements in the input vectors
+//
+// Output:
+// - out_vector : Output vector
+//
+
+//
+// WebRtcSpl_ReverseOrderMultArrayElements(...)
+//
+// Performs the vector operation:
+// out_vector[n] = (in_vector[n]*window[-n])>>right_shifts
+//
+// Input:
+// - in_vector : Input vector
+// - window : Window vector (should be reversed). The pointer
+// should be set to the last value in the vector
+// - right_shifts : Number of right bit shift to be applied after the
+// multiplication
+// - vector_length : Number of elements in |in_vector|
+//
+// Output:
+// - out_vector : Output vector (can be same as |in_vector|)
+//
+
+//
+// WebRtcSpl_ElementwiseVectorMult(...)
+//
+// Performs the vector operation:
+// out_vector[n] = (in_vector[n]*window[n])>>right_shifts
+//
+// Input:
+// - in_vector : Input vector
+// - window : Window vector.
+// - right_shifts : Number of right bit shift to be applied after the
+// multiplication
+// - vector_length : Number of elements in |in_vector|
+//
+// Output:
+// - out_vector : Output vector (can be same as |in_vector|)
+//
+
+//
+// WebRtcSpl_AddVectorsAndShift(...)
+//
+// Performs the vector operation:
+// out_vector[k] = (in_vector1[k] + in_vector2[k])>>right_shifts
+//
+// Input:
+// - in_vector1 : Input vector 1
+// - in_vector2 : Input vector 2
+// - right_shifts : Number of right bit shift to be applied after the
+// multiplication
+// - vector_length : Number of elements in |in_vector1| and |in_vector2|
+//
+// Output:
+// - out_vector : Output vector (can be same as |in_vector1|)
+//
+
+//
+// WebRtcSpl_AddAffineVectorToVector(...)
+//
+// Adds an affine transformed vector to another vector |out_vector|, i.e,
+// performs
+// out_vector[k] += (in_vector[k]*gain+add_constant)>>right_shifts
+//
+// Input:
+// - in_vector : Input vector
+// - gain : Gain value, used to multiply the in vector with
+// - add_constant : Constant value to add (usually 1<<(right_shifts-1),
+// but others can be used as well
+// - right_shifts : Number of right bit shifts (0-16)
+// - vector_length : Number of samples in |in_vector| and |out_vector|
+//
+// Output:
+// - out_vector : Vector with the output
+//
+
+//
+// WebRtcSpl_AffineTransformVector(...)
+//
+// Affine transforms a vector, i.e, performs
+// out_vector[k] = (in_vector[k]*gain+add_constant)>>right_shifts
+//
+// Input:
+// - in_vector : Input vector
+// - gain : Gain value, used to multiply the in vector with
+// - add_constant : Constant value to add (usually 1<<(right_shifts-1),
+// but others can be used as well
+// - right_shifts : Number of right bit shifts (0-16)
+// - vector_length : Number of samples in |in_vector| and |out_vector|
+//
+// Output:
+// - out_vector : Vector with the output
+//
+
+//
+// WebRtcSpl_IncreaseSeed(...)
+//
+// Increases the seed (and returns the new value)
+//
+// Input:
+// - seed : Seed for random calculation
+//
+// Output:
+// - seed : Updated seed value
+//
+// Return value : The new seed value
+//
+
+//
+// WebRtcSpl_RandU(...)
+//
+// Produces a uniformly distributed value in the int16_t range
+//
+// Input:
+// - seed : Seed for random calculation
+//
+// Output:
+// - seed : Updated seed value
+//
+// Return value : Uniformly distributed value in the range
+// [Word16_MIN...Word16_MAX]
+//
+
+//
+// WebRtcSpl_RandN(...)
+//
+// Produces a normal distributed value in the int16_t range
+//
+// Input:
+// - seed : Seed for random calculation
+//
+// Output:
+// - seed : Updated seed value
+//
+// Return value : N(0,1) value in the Q13 domain
+//
+
+//
+// WebRtcSpl_RandUArray(...)
+//
+// Produces a uniformly distributed vector with elements in the int16_t
+// range
+//
+// Input:
+// - vector_length : Samples wanted in the vector
+// - seed : Seed for random calculation
+//
+// Output:
+// - vector : Vector with the uniform values
+// - seed : Updated seed value
+//
+// Return value : Number of samples in vector, i.e., |vector_length|
+//
+
+//
+// WebRtcSpl_Sqrt(...)
+//
+// Returns the square root of the input value |value|. The precision of this
+// function is integer precision, i.e., sqrt(8) gives 2 as answer.
+// If |value| is a negative number then 0 is returned.
+//
+// Algorithm:
+//
+// A sixth order Taylor Series expansion is used here to compute the square
+// root of a number y^0.5 = (1+x)^0.5
+// where
+// x = y-1
+//  = 1 + (x/2) - 0.5*(x/2)^2 + 0.5*(x/2)^3 - 0.625*(x/2)^4 + 0.875*(x/2)^5
+// 0.5 <= x < 1
+//
+// Input:
+// - value : Value to calculate sqrt of
+//
+// Return value : Result of the sqrt calculation
+//
+
+//
+// WebRtcSpl_DivU32U16(...)
+//
+// Divides a uint32_t |num| by a uint16_t |den|.
+//
+// If |den|==0, (uint32_t)0xFFFFFFFF is returned.
+//
+// Input:
+// - num : Numerator
+// - den : Denominator
+//
+// Return value : Result of the division (as a uint32_t), i.e., the
+// integer part of num/den.
+//
+
+//
+// WebRtcSpl_DivW32W16(...)
+//
+// Divides a int32_t |num| by a int16_t |den|.
+//
+// If |den|==0, (int32_t)0x7FFFFFFF is returned.
+//
+// Input:
+// - num : Numerator
+// - den : Denominator
+//
+// Return value : Result of the division (as a int32_t), i.e., the
+// integer part of num/den.
+//
+
+//
+// WebRtcSpl_DivW32W16ResW16(...)
+//
+// Divides a int32_t |num| by a int16_t |den|, assuming that the
+// result is less than 32768, otherwise an unpredictable result will occur.
+//
+// If |den|==0, (int16_t)0x7FFF is returned.
+//
+// Input:
+// - num : Numerator
+// - den : Denominator
+//
+// Return value : Result of the division (as a int16_t), i.e., the
+// integer part of num/den.
+//
+
+//
+// WebRtcSpl_DivResultInQ31(...)
+//
+// Divides a int32_t |num| by a int16_t |den|, assuming that the
+// absolute value of the denominator is larger than the numerator, otherwise
+// an unpredictable result will occur.
+//
+// Input:
+// - num : Numerator
+// - den : Denominator
+//
+// Return value : Result of the division in Q31.
+//
+
+//
+// WebRtcSpl_DivW32HiLow(...)
+//
+// Divides a int32_t |num| by a denominator in hi, low format. The
+// absolute value of the denominator has to be larger (or equal to) the
+// numerator.
+//
+// Input:
+// - num : Numerator
+// - den_hi : High part of denominator
+// - den_low : Low part of denominator
+//
+// Return value : Divided value in Q31
+//
+
+//
+// WebRtcSpl_Energy(...)
+//
+// Calculates the energy of a vector
+//
+// Input:
+// - vector : Vector which the energy should be calculated on
+// - vector_length : Number of samples in vector
+//
+// Output:
+// - scale_factor : Number of left bit shifts needed to get the physical
+// energy value, i.e, to get the Q0 value
+//
+// Return value : Energy value in Q(-|scale_factor|)
+//
+
+//
+// WebRtcSpl_FilterAR(...)
+//
+// Performs a 32-bit AR filtering on a vector in Q12
+//
+// Input:
+// - ar_coef : AR-coefficient vector (values in Q12),
+// ar_coef[0] must be 4096.
+// - ar_coef_length : Number of coefficients in |ar_coef|.
+// - in_vector : Vector to be filtered.
+// - in_vector_length : Number of samples in |in_vector|.
+// - filter_state : Current state (higher part) of the filter.
+// - filter_state_length : Length (in samples) of |filter_state|.
+// - filter_state_low : Current state (lower part) of the filter.
+// - filter_state_low_length : Length (in samples) of |filter_state_low|.
+// - out_vector_low_length : Maximum length (in samples) of
+// |out_vector_low|.
+//
+// Output:
+// - filter_state : Updated state (upper part) vector.
+// - filter_state_low : Updated state (lower part) vector.
+// - out_vector : Vector containing the upper part of the
+// filtered values.
+// - out_vector_low : Vector containing the lower part of the
+// filtered values.
+//
+// Return value : Number of samples in the |out_vector|.
+//
+
+//
+// WebRtcSpl_ComplexIFFT(...)
+//
+// Complex Inverse FFT
+//
+// Computes an inverse complex 2^|stages|-point FFT on the input vector, which
+// is in bit-reversed order. The original content of the vector is destroyed in
+// the process, since the input is overwritten by the output, normal-ordered,
+// FFT vector. With X as the input complex vector, y as the output complex
+// vector and with M = 2^|stages|, the following is computed:
+//
+// M-1
+// y(k) = sum[X(i)*[cos(2*pi*i*k/M) + j*sin(2*pi*i*k/M)]]
+// i=0
+//
+// The implementations are optimized for speed, not for code size. It uses the
+// decimation-in-time algorithm with radix-2 butterfly technique.
+//
+// Input:
+// - vector : In pointer to complex vector containing 2^|stages|
+// real elements interleaved with 2^|stages| imaginary
+// elements.
+// [ReImReImReIm....]
+// The elements are in Q(-scale) domain, see more on Return
+// Value below.
+//
+// - stages : Number of FFT stages. Must be at least 3 and at most 10,
+// since the table WebRtcSpl_kSinTable1024[] is 1024
+// elements long.
+//
+//      - mode          : This parameter lets the user choose how the FFT
+// should work.
+// mode==0: Low-complexity and Low-accuracy mode
+// mode==1: High-complexity and High-accuracy mode
+//
+// Output:
+// - vector : Out pointer to the FFT vector (the same as input).
+//
+// Return Value : The scale value that tells the number of left bit shifts
+// that the elements in the |vector| should be shifted with
+// in order to get Q0 values, i.e. the physically correct
+// values. The scale parameter is always 0 or positive,
+// except if N>1024 (|stages|>10), which returns a scale
+// value of -1, indicating error.
+//
+
+//
+// WebRtcSpl_ComplexFFT(...)
+//
+// Complex FFT
+//
+// Computes a complex 2^|stages|-point FFT on the input vector, which is in
+// bit-reversed order. The original content of the vector is destroyed in
+// the process, since the input is overwritten by the output, normal-ordered,
+// FFT vector. With x as the input complex vector, Y as the output complex
+// vector and with M = 2^|stages|, the following is computed:
+//
+// M-1
+// Y(k) = 1/M * sum[x(i)*[cos(2*pi*i*k/M) + j*sin(2*pi*i*k/M)]]
+// i=0
+//
+// The implementations are optimized for speed, not for code size. It uses the
+// decimation-in-time algorithm with radix-2 butterfly technique.
+//
+// This routine prevents overflow by scaling by 2 before each FFT stage. This is
+// a fixed scaling, for proper normalization - there will be log2(n) passes, so
+// this results in an overall factor of 1/n, distributed to maximize arithmetic
+// accuracy.
+//
+// Input:
+// - vector : In pointer to complex vector containing 2^|stages| real
+// elements interleaved with 2^|stages| imaginary elements.
+// [ReImReImReIm....]
+// The output is in the Q0 domain.
+//
+// - stages : Number of FFT stages. Must be at least 3 and at most 10,
+// since the table WebRtcSpl_kSinTable1024[] is 1024
+// elements long.
+//
+//      - mode          : This parameter lets the user choose how the FFT
+// should work.
+// mode==0: Low-complexity and Low-accuracy mode
+// mode==1: High-complexity and High-accuracy mode
+//
+// Output:
+// - vector : The output FFT vector is in the Q0 domain.
+//
+// Return value : The scale parameter is always 0, except if N>1024,
+// which returns a scale value of -1, indicating error.
+//
+
+//
+// WebRtcSpl_AnalysisQMF(...)
+//
+// Splits a 0-2*F Hz signal into two sub bands: 0-F Hz and F-2*F Hz. The
+// current version has F = 8000, therefore, a super-wideband audio signal is
+// split to lower-band 0-8 kHz and upper-band 8-16 kHz.
+//
+// Input:
+// - in_data : Wide band speech signal, 320 samples (10 ms)
+//
+// Input & Output:
+// - filter_state1 : Filter state for first All-pass filter
+// - filter_state2 : Filter state for second All-pass filter
+//
+// Output:
+// - low_band : Lower-band signal 0-8 kHz band, 160 samples (10 ms)
+// - high_band : Upper-band signal 8-16 kHz band (flipped in frequency
+// domain), 160 samples (10 ms)
+//
+
+//
+// WebRtcSpl_SynthesisQMF(...)
+//
+// Combines the two sub bands (0-F and F-2*F Hz) into a signal of 0-2*F
+// Hz, (current version has F = 8000 Hz). So the filter combines lower-band
+// (0-8 kHz) and upper-band (8-16 kHz) channels to obtain super-wideband 0-16
+// kHz audio.
+//
+// Input:
+// - low_band : The signal with the 0-8 kHz band, 160 samples (10 ms)
+// - high_band : The signal with the 8-16 kHz band, 160 samples (10 ms)
+//
+// Input & Output:
+// - filter_state1 : Filter state for first All-pass filter
+// - filter_state2 : Filter state for second All-pass filter
+//
+// Output:
+// - out_data : Super-wideband speech signal, 0-16 kHz
+//
+
+// int16_t WebRtcSpl_SatW32ToW16(...)
+//
+// This function saturates a 32-bit word into a 16-bit word.
+//
+// Input:
+// - value32 : The value of a 32-bit word.
+//
+// Output:
+// - out16 : the saturated 16-bit word.
+//
+
+// int32_t WebRtc_MulAccumW16(...)
+//
+// This function multiplies a 16-bit word by a 16-bit word, and accumulates this
+// value to a 32-bit integer.
+//
+// Input:
+// - a : The value of the first 16-bit word.
+// - b : The value of the second 16-bit word.
+// - c : The value of an 32-bit integer.
+//
+// Return Value: The value of a * b + c.
+//
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/include/spl_inl.h b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/include/spl_inl.h
new file mode 100644
index 000000000..d24b3a5f5
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/include/spl_inl.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This header file includes the inline functions in
+// the fix point signal processing library.
+
+#ifndef COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_SPL_INL_H_
+#define COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_SPL_INL_H_
+
+#include "webrtc/rtc_base/compile_assert_c.h"
+
+extern const int8_t kWebRtcSpl_CountLeadingZeros32_Table[64];
+
+// Don't call this directly except in tests!
+static __inline int WebRtcSpl_CountLeadingZeros32_NotBuiltin(uint32_t n) {
+ // Normalize n by rounding up to the nearest number that is a sequence of 0
+ // bits followed by a sequence of 1 bits. This number has the same number of
+ // leading zeros as the original n. There are exactly 33 such values.
+ n |= n >> 1;
+ n |= n >> 2;
+ n |= n >> 4;
+ n |= n >> 8;
+ n |= n >> 16;
+
+ // Multiply the modified n with a constant selected (by exhaustive search)
+ // such that each of the 33 possible values of n give a product whose 6 most
+ // significant bits are unique. Then look up the answer in the table.
+ return kWebRtcSpl_CountLeadingZeros32_Table[(n * 0x8c0b2891) >> 26];
+}
+
+// Don't call this directly except in tests!
+static __inline int WebRtcSpl_CountLeadingZeros64_NotBuiltin(uint64_t n) {
+ const int leading_zeros = n >> 32 == 0 ? 32 : 0;
+ return leading_zeros + WebRtcSpl_CountLeadingZeros32_NotBuiltin(
+ (uint32_t)(n >> (32 - leading_zeros)));
+}
+
+// Returns the number of leading zero bits in the argument.
+static __inline int WebRtcSpl_CountLeadingZeros32(uint32_t n) {
+#ifdef __GNUC__
+ RTC_COMPILE_ASSERT(sizeof(unsigned int) == sizeof(uint32_t));
+ return n == 0 ? 32 : __builtin_clz(n);
+#else
+ return WebRtcSpl_CountLeadingZeros32_NotBuiltin(n);
+#endif
+}
+
+// Returns the number of leading zero bits in the argument.
+static __inline int WebRtcSpl_CountLeadingZeros64(uint64_t n) {
+#ifdef __GNUC__
+ RTC_COMPILE_ASSERT(sizeof(unsigned long long) == sizeof(uint64_t)); // NOLINT
+ return n == 0 ? 64 : __builtin_clzll(n);
+#else
+ return WebRtcSpl_CountLeadingZeros64_NotBuiltin(n);
+#endif
+}
+
+#ifdef WEBRTC_ARCH_ARM_V7
+#include "webrtc/common_audio/signal_processing/include/spl_inl_armv7.h"
+#else
+
+#if defined(MIPS32_LE)
+#include "webrtc/common_audio/signal_processing/include/spl_inl_mips.h"
+#endif
+
+#if !defined(MIPS_DSP_R1_LE)
+static __inline int16_t WebRtcSpl_SatW32ToW16(int32_t value32) {
+ int16_t out16 = (int16_t)value32;
+
+ if (value32 > 32767)
+ out16 = 32767;
+ else if (value32 < -32768)
+ out16 = -32768;
+
+ return out16;
+}
+
+static __inline int32_t WebRtcSpl_AddSatW32(int32_t a, int32_t b) {
+ // Do the addition in unsigned numbers, since signed overflow is undefined
+ // behavior.
+ const int32_t sum = (int32_t)((uint32_t)a + (uint32_t)b);
+
+ // a + b can't overflow if a and b have different signs. If they have the
+ // same sign, a + b also has the same sign iff it didn't overflow.
+ if ((a < 0) == (b < 0) && (a < 0) != (sum < 0)) {
+ // The direction of the overflow is obvious from the sign of a + b.
+ return sum < 0 ? INT32_MAX : INT32_MIN;
+ }
+ return sum;
+}
+
+static __inline int32_t WebRtcSpl_SubSatW32(int32_t a, int32_t b) {
+ // Do the subtraction in unsigned numbers, since signed overflow is undefined
+ // behavior.
+ const int32_t diff = (int32_t)((uint32_t)a - (uint32_t)b);
+
+ // a - b can't overflow if a and b have the same sign. If they have different
+ // signs, a - b has the same sign as a iff it didn't overflow.
+ if ((a < 0) != (b < 0) && (a < 0) != (diff < 0)) {
+ // The direction of the overflow is obvious from the sign of a - b.
+ return diff < 0 ? INT32_MAX : INT32_MIN;
+ }
+ return diff;
+}
+
+static __inline int16_t WebRtcSpl_AddSatW16(int16_t a, int16_t b) {
+ return WebRtcSpl_SatW32ToW16((int32_t)a + (int32_t)b);
+}
+
+static __inline int16_t WebRtcSpl_SubSatW16(int16_t var1, int16_t var2) {
+ return WebRtcSpl_SatW32ToW16((int32_t)var1 - (int32_t)var2);
+}
+#endif // #if !defined(MIPS_DSP_R1_LE)
+
+#if !defined(MIPS32_LE)
+static __inline int16_t WebRtcSpl_GetSizeInBits(uint32_t n) {
+ return 32 - WebRtcSpl_CountLeadingZeros32(n);
+}
+
+// Return the number of steps a can be left-shifted without overflow,
+// or 0 if a == 0.
+static __inline int16_t WebRtcSpl_NormW32(int32_t a) {
+ return a == 0 ? 0 : WebRtcSpl_CountLeadingZeros32(a < 0 ? ~a : a) - 1;
+}
+
+// Return the number of steps a can be left-shifted without overflow,
+// or 0 if a == 0.
+static __inline int16_t WebRtcSpl_NormU32(uint32_t a) {
+ return a == 0 ? 0 : WebRtcSpl_CountLeadingZeros32(a);
+}
+
+// Return the number of steps a can be left-shifted without overflow,
+// or 0 if a == 0.
+static __inline int16_t WebRtcSpl_NormW16(int16_t a) {
+ const int32_t a32 = a;
+ return a == 0 ? 0 : WebRtcSpl_CountLeadingZeros32(a < 0 ? ~a32 : a32) - 17;
+}
+
+static __inline int32_t WebRtc_MulAccumW16(int16_t a, int16_t b, int32_t c) {
+ return (a * b + c);
+}
+#endif // #if !defined(MIPS32_LE)
+
+#endif // WEBRTC_ARCH_ARM_V7
+
+#endif // COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_SPL_INL_H_
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/min_max_operations.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/min_max_operations.c
new file mode 100644
index 000000000..75975bb09
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/min_max_operations.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * This file contains the implementation of functions
+ * WebRtcSpl_MaxAbsValueW16C()
+ * WebRtcSpl_MaxAbsValueW32C()
+ * WebRtcSpl_MaxValueW16C()
+ * WebRtcSpl_MaxValueW32C()
+ * WebRtcSpl_MinValueW16C()
+ * WebRtcSpl_MinValueW32C()
+ * WebRtcSpl_MaxAbsIndexW16()
+ * WebRtcSpl_MaxIndexW16()
+ * WebRtcSpl_MaxIndexW32()
+ * WebRtcSpl_MinIndexW16()
+ * WebRtcSpl_MinIndexW32()
+ *
+ */
+
+#include
+
+#include "webrtc/rtc_base/checks.h"
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+// TODO(bjorn/kma): Consolidate function pairs (e.g. combine
+// WebRtcSpl_MaxAbsValueW16C and WebRtcSpl_MaxAbsIndexW16 into a single one.)
+// TODO(kma): Move the next six functions into min_max_operations_c.c.
+
+// Maximum absolute value of word16 vector. C version for generic platforms.
+int16_t WebRtcSpl_MaxAbsValueW16C(const int16_t* vector, size_t length) {
+ size_t i = 0;
+ int absolute = 0, maximum = 0;
+
+ RTC_DCHECK_GT(length, 0);
+
+ for (i = 0; i < length; i++) {
+ absolute = abs((int)vector[i]);
+
+ if (absolute > maximum) {
+ maximum = absolute;
+ }
+ }
+
+ // Guard the case for abs(-32768).
+ if (maximum > WEBRTC_SPL_WORD16_MAX) {
+ maximum = WEBRTC_SPL_WORD16_MAX;
+ }
+
+ return (int16_t)maximum;
+}
+
+// Maximum absolute value of word32 vector. C version for generic platforms.
+int32_t WebRtcSpl_MaxAbsValueW32C(const int32_t* vector, size_t length) {
+ // Use uint32_t for the local variables, to accommodate the return value
+ // of abs(0x80000000), which is 0x80000000.
+
+ uint32_t absolute = 0, maximum = 0;
+ size_t i = 0;
+
+ RTC_DCHECK_GT(length, 0);
+
+ for (i = 0; i < length; i++) {
+ absolute = abs((int)vector[i]);
+ if (absolute > maximum) {
+ maximum = absolute;
+ }
+ }
+
+ maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX);
+
+ return (int32_t)maximum;
+}
+
+// Maximum value of word16 vector. C version for generic platforms.
+int16_t WebRtcSpl_MaxValueW16C(const int16_t* vector, size_t length) {
+ int16_t maximum = WEBRTC_SPL_WORD16_MIN;
+ size_t i = 0;
+
+ RTC_DCHECK_GT(length, 0);
+
+ for (i = 0; i < length; i++) {
+ if (vector[i] > maximum)
+ maximum = vector[i];
+ }
+ return maximum;
+}
+
+// Maximum value of word32 vector. C version for generic platforms.
+int32_t WebRtcSpl_MaxValueW32C(const int32_t* vector, size_t length) {
+ int32_t maximum = WEBRTC_SPL_WORD32_MIN;
+ size_t i = 0;
+
+ RTC_DCHECK_GT(length, 0);
+
+ for (i = 0; i < length; i++) {
+ if (vector[i] > maximum)
+ maximum = vector[i];
+ }
+ return maximum;
+}
+
+// Minimum value of word16 vector. C version for generic platforms.
+int16_t WebRtcSpl_MinValueW16C(const int16_t* vector, size_t length) {
+ int16_t minimum = WEBRTC_SPL_WORD16_MAX;
+ size_t i = 0;
+
+ RTC_DCHECK_GT(length, 0);
+
+ for (i = 0; i < length; i++) {
+ if (vector[i] < minimum)
+ minimum = vector[i];
+ }
+ return minimum;
+}
+
+// Minimum value of word32 vector. C version for generic platforms.
+int32_t WebRtcSpl_MinValueW32C(const int32_t* vector, size_t length) {
+ int32_t minimum = WEBRTC_SPL_WORD32_MAX;
+ size_t i = 0;
+
+ RTC_DCHECK_GT(length, 0);
+
+ for (i = 0; i < length; i++) {
+ if (vector[i] < minimum)
+ minimum = vector[i];
+ }
+ return minimum;
+}
+
+// Index of maximum absolute value in a word16 vector.
+size_t WebRtcSpl_MaxAbsIndexW16(const int16_t* vector, size_t length) {
+ // Use type int for local variables, to accomodate the value of abs(-32768).
+
+ size_t i = 0, index = 0;
+ int absolute = 0, maximum = 0;
+
+ RTC_DCHECK_GT(length, 0);
+
+ for (i = 0; i < length; i++) {
+ absolute = abs((int)vector[i]);
+
+ if (absolute > maximum) {
+ maximum = absolute;
+ index = i;
+ }
+ }
+
+ return index;
+}
+
+// Index of maximum value in a word16 vector.
+size_t WebRtcSpl_MaxIndexW16(const int16_t* vector, size_t length) {
+ size_t i = 0, index = 0;
+ int16_t maximum = WEBRTC_SPL_WORD16_MIN;
+
+ RTC_DCHECK_GT(length, 0);
+
+ for (i = 0; i < length; i++) {
+ if (vector[i] > maximum) {
+ maximum = vector[i];
+ index = i;
+ }
+ }
+
+ return index;
+}
+
+// Index of maximum value in a word32 vector.
+size_t WebRtcSpl_MaxIndexW32(const int32_t* vector, size_t length) {
+ size_t i = 0, index = 0;
+ int32_t maximum = WEBRTC_SPL_WORD32_MIN;
+
+ RTC_DCHECK_GT(length, 0);
+
+ for (i = 0; i < length; i++) {
+ if (vector[i] > maximum) {
+ maximum = vector[i];
+ index = i;
+ }
+ }
+
+ return index;
+}
+
+// Index of minimum value in a word16 vector.
+size_t WebRtcSpl_MinIndexW16(const int16_t* vector, size_t length) {
+ size_t i = 0, index = 0;
+ int16_t minimum = WEBRTC_SPL_WORD16_MAX;
+
+ RTC_DCHECK_GT(length, 0);
+
+ for (i = 0; i < length; i++) {
+ if (vector[i] < minimum) {
+ minimum = vector[i];
+ index = i;
+ }
+ }
+
+ return index;
+}
+
+// Index of minimum value in a word32 vector.
+size_t WebRtcSpl_MinIndexW32(const int32_t* vector, size_t length) {
+ size_t i = 0, index = 0;
+ int32_t minimum = WEBRTC_SPL_WORD32_MAX;
+
+ RTC_DCHECK_GT(length, 0);
+
+ for (i = 0; i < length; i++) {
+ if (vector[i] < minimum) {
+ minimum = vector[i];
+ index = i;
+ }
+ }
+
+ return index;
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/resample_48khz.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/resample_48khz.c
new file mode 100644
index 000000000..2220cc333
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/resample_48khz.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This file contains resampling functions between 48 kHz and nb/wb.
+ * The description header can be found in signal_processing_library.h
+ *
+ */
+
+#include
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/common_audio/signal_processing/resample_by_2_internal.h"
+
+////////////////////////////
+///// 48 kHz -> 16 kHz /////
+////////////////////////////
+
+// 48 -> 16 resampler
+void WebRtcSpl_Resample48khzTo16khz(const int16_t* in, int16_t* out,
+ WebRtcSpl_State48khzTo16khz* state, int32_t* tmpmem)
+{
+ ///// 48 --> 48(LP) /////
+ // int16_t in[480]
+ // int32_t out[480]
+ /////
+ WebRtcSpl_LPBy2ShortToInt(in, 480, tmpmem + 16, state->S_48_48);
+
+ ///// 48 --> 32 /////
+ // int32_t in[480]
+ // int32_t out[320]
+ /////
+ // copy state to and from input array
+ memcpy(tmpmem + 8, state->S_48_32, 8 * sizeof(int32_t));
+ memcpy(state->S_48_32, tmpmem + 488, 8 * sizeof(int32_t));
+ WebRtcSpl_Resample48khzTo32khz(tmpmem + 8, tmpmem, 160);
+
+ ///// 32 --> 16 /////
+ // int32_t in[320]
+ // int16_t out[160]
+ /////
+ WebRtcSpl_DownBy2IntToShort(tmpmem, 320, out, state->S_32_16);
+}
+
+// initialize state of 48 -> 16 resampler
+void WebRtcSpl_ResetResample48khzTo16khz(WebRtcSpl_State48khzTo16khz* state)
+{
+ memset(state->S_48_48, 0, 16 * sizeof(int32_t));
+ memset(state->S_48_32, 0, 8 * sizeof(int32_t));
+ memset(state->S_32_16, 0, 8 * sizeof(int32_t));
+}
+
+////////////////////////////
+///// 16 kHz -> 48 kHz /////
+////////////////////////////
+
+// 16 -> 48 resampler
+void WebRtcSpl_Resample16khzTo48khz(const int16_t* in, int16_t* out,
+ WebRtcSpl_State16khzTo48khz* state, int32_t* tmpmem)
+{
+ ///// 16 --> 32 /////
+ // int16_t in[160]
+ // int32_t out[320]
+ /////
+ WebRtcSpl_UpBy2ShortToInt(in, 160, tmpmem + 16, state->S_16_32);
+
+ ///// 32 --> 24 /////
+ // int32_t in[320]
+ // int32_t out[240]
+ // copy state to and from input array
+ /////
+ memcpy(tmpmem + 8, state->S_32_24, 8 * sizeof(int32_t));
+ memcpy(state->S_32_24, tmpmem + 328, 8 * sizeof(int32_t));
+ WebRtcSpl_Resample32khzTo24khz(tmpmem + 8, tmpmem, 80);
+
+ ///// 24 --> 48 /////
+ // int32_t in[240]
+ // int16_t out[480]
+ /////
+ WebRtcSpl_UpBy2IntToShort(tmpmem, 240, out, state->S_24_48);
+}
+
+// initialize state of 16 -> 48 resampler
+void WebRtcSpl_ResetResample16khzTo48khz(WebRtcSpl_State16khzTo48khz* state)
+{
+ memset(state->S_16_32, 0, 8 * sizeof(int32_t));
+ memset(state->S_32_24, 0, 8 * sizeof(int32_t));
+ memset(state->S_24_48, 0, 8 * sizeof(int32_t));
+}
+
+////////////////////////////
+///// 48 kHz -> 8 kHz /////
+////////////////////////////
+
+// 48 -> 8 resampler
+void WebRtcSpl_Resample48khzTo8khz(const int16_t* in, int16_t* out,
+ WebRtcSpl_State48khzTo8khz* state, int32_t* tmpmem)
+{
+ ///// 48 --> 24 /////
+ // int16_t in[480]
+ // int32_t out[240]
+ /////
+ WebRtcSpl_DownBy2ShortToInt(in, 480, tmpmem + 256, state->S_48_24);
+
+ ///// 24 --> 24(LP) /////
+ // int32_t in[240]
+ // int32_t out[240]
+ /////
+ WebRtcSpl_LPBy2IntToInt(tmpmem + 256, 240, tmpmem + 16, state->S_24_24);
+
+ ///// 24 --> 16 /////
+ // int32_t in[240]
+ // int32_t out[160]
+ /////
+ // copy state to and from input array
+ memcpy(tmpmem + 8, state->S_24_16, 8 * sizeof(int32_t));
+ memcpy(state->S_24_16, tmpmem + 248, 8 * sizeof(int32_t));
+ WebRtcSpl_Resample48khzTo32khz(tmpmem + 8, tmpmem, 80);
+
+ ///// 16 --> 8 /////
+ // int32_t in[160]
+ // int16_t out[80]
+ /////
+ WebRtcSpl_DownBy2IntToShort(tmpmem, 160, out, state->S_16_8);
+}
+
+// initialize state of 48 -> 8 resampler
+void WebRtcSpl_ResetResample48khzTo8khz(WebRtcSpl_State48khzTo8khz* state)
+{
+ memset(state->S_48_24, 0, 8 * sizeof(int32_t));
+ memset(state->S_24_24, 0, 16 * sizeof(int32_t));
+ memset(state->S_24_16, 0, 8 * sizeof(int32_t));
+ memset(state->S_16_8, 0, 8 * sizeof(int32_t));
+}
+
+////////////////////////////
+///// 8 kHz -> 48 kHz /////
+////////////////////////////
+
+// 8 -> 48 resampler
+void WebRtcSpl_Resample8khzTo48khz(const int16_t* in, int16_t* out,
+ WebRtcSpl_State8khzTo48khz* state, int32_t* tmpmem)
+{
+ ///// 8 --> 16 /////
+ // int16_t in[80]
+ // int32_t out[160]
+ /////
+ WebRtcSpl_UpBy2ShortToInt(in, 80, tmpmem + 264, state->S_8_16);
+
+ ///// 16 --> 12 /////
+ // int32_t in[160]
+ // int32_t out[120]
+ /////
+ // copy state to and from input array
+ memcpy(tmpmem + 256, state->S_16_12, 8 * sizeof(int32_t));
+ memcpy(state->S_16_12, tmpmem + 416, 8 * sizeof(int32_t));
+ WebRtcSpl_Resample32khzTo24khz(tmpmem + 256, tmpmem + 240, 40);
+
+ ///// 12 --> 24 /////
+ // int32_t in[120]
+ // int16_t out[240]
+ /////
+ WebRtcSpl_UpBy2IntToInt(tmpmem + 240, 120, tmpmem, state->S_12_24);
+
+ ///// 24 --> 48 /////
+ // int32_t in[240]
+ // int16_t out[480]
+ /////
+ WebRtcSpl_UpBy2IntToShort(tmpmem, 240, out, state->S_24_48);
+}
+
+// initialize state of 8 -> 48 resampler
+void WebRtcSpl_ResetResample8khzTo48khz(WebRtcSpl_State8khzTo48khz* state)
+{
+ memset(state->S_8_16, 0, 8 * sizeof(int32_t));
+ memset(state->S_16_12, 0, 8 * sizeof(int32_t));
+ memset(state->S_12_24, 0, 8 * sizeof(int32_t));
+ memset(state->S_24_48, 0, 8 * sizeof(int32_t));
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/resample_by_2_internal.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/resample_by_2_internal.c
new file mode 100644
index 000000000..72bc0f92b
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/resample_by_2_internal.c
@@ -0,0 +1,689 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This header file contains some internal resampling functions.
+ *
+ */
+
+#include "webrtc/common_audio/signal_processing/resample_by_2_internal.h"
+#include "webrtc/rtc_base/sanitizer.h"
+
+// allpass filter coefficients.
+static const int16_t kResampleAllpass[2][3] = {
+ {821, 6110, 12382},
+ {3050, 9368, 15063}
+};
+
+//
+// decimator
+// input: int32_t (shifted 15 positions to the left, + offset 16384) OVERWRITTEN!
+// output: int16_t (saturated) (of length len/2)
+// state: filter state array; length = 8
+
+void RTC_NO_SANITIZE("signed-integer-overflow") // bugs.webrtc.org/5486
+WebRtcSpl_DownBy2IntToShort(int32_t *in, int32_t len, int16_t *out,
+ int32_t *state)
+{
+ int32_t tmp0, tmp1, diff;
+ int32_t i;
+
+ len >>= 1;
+
+ // lower allpass filter (operates on even input samples)
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = in[i << 1];
+ diff = tmp0 - state[1];
+ // UBSan: -1771017321 - 999586185 cannot be represented in type 'int'
+
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[0] + diff * kResampleAllpass[1][0];
+ state[0] = tmp0;
+ diff = tmp1 - state[2];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[1] + diff * kResampleAllpass[1][1];
+ state[1] = tmp1;
+ diff = tmp0 - state[3];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[3] = state[2] + diff * kResampleAllpass[1][2];
+ state[2] = tmp0;
+
+ // divide by two and store temporarily
+ in[i << 1] = (state[3] >> 1);
+ }
+
+ in++;
+
+ // upper allpass filter (operates on odd input samples)
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = in[i << 1];
+ diff = tmp0 - state[5];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[4] + diff * kResampleAllpass[0][0];
+ state[4] = tmp0;
+ diff = tmp1 - state[6];
+ // scale down and round
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[5] + diff * kResampleAllpass[0][1];
+ state[5] = tmp1;
+ diff = tmp0 - state[7];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[7] = state[6] + diff * kResampleAllpass[0][2];
+ state[6] = tmp0;
+
+ // divide by two and store temporarily
+ in[i << 1] = (state[7] >> 1);
+ }
+
+ in--;
+
+ // combine allpass outputs
+ for (i = 0; i < len; i += 2)
+ {
+ // divide by two, add both allpass outputs and round
+ tmp0 = (in[i << 1] + in[(i << 1) + 1]) >> 15;
+ tmp1 = (in[(i << 1) + 2] + in[(i << 1) + 3]) >> 15;
+ if (tmp0 > (int32_t)0x00007FFF)
+ tmp0 = 0x00007FFF;
+ if (tmp0 < (int32_t)0xFFFF8000)
+ tmp0 = 0xFFFF8000;
+ out[i] = (int16_t)tmp0;
+ if (tmp1 > (int32_t)0x00007FFF)
+ tmp1 = 0x00007FFF;
+ if (tmp1 < (int32_t)0xFFFF8000)
+ tmp1 = 0xFFFF8000;
+ out[i + 1] = (int16_t)tmp1;
+ }
+}
+
+//
+// decimator
+// input: int16_t
+// output: int32_t (shifted 15 positions to the left, + offset 16384) (of length len/2)
+// state: filter state array; length = 8
+
+void RTC_NO_SANITIZE("signed-integer-overflow") // bugs.webrtc.org/5486
+WebRtcSpl_DownBy2ShortToInt(const int16_t *in,
+ int32_t len,
+ int32_t *out,
+ int32_t *state)
+{
+ int32_t tmp0, tmp1, diff;
+ int32_t i;
+
+ len >>= 1;
+
+ // lower allpass filter (operates on even input samples)
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = ((int32_t)in[i << 1] << 15) + (1 << 14);
+ diff = tmp0 - state[1];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[0] + diff * kResampleAllpass[1][0];
+ state[0] = tmp0;
+ diff = tmp1 - state[2];
+ // UBSan: -1379909682 - 834099714 cannot be represented in type 'int'
+
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[1] + diff * kResampleAllpass[1][1];
+ state[1] = tmp1;
+ diff = tmp0 - state[3];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[3] = state[2] + diff * kResampleAllpass[1][2];
+ state[2] = tmp0;
+
+ // divide by two and store temporarily
+ out[i] = (state[3] >> 1);
+ }
+
+ in++;
+
+ // upper allpass filter (operates on odd input samples)
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = ((int32_t)in[i << 1] << 15) + (1 << 14);
+ diff = tmp0 - state[5];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[4] + diff * kResampleAllpass[0][0];
+ state[4] = tmp0;
+ diff = tmp1 - state[6];
+ // scale down and round
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[5] + diff * kResampleAllpass[0][1];
+ state[5] = tmp1;
+ diff = tmp0 - state[7];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[7] = state[6] + diff * kResampleAllpass[0][2];
+ state[6] = tmp0;
+
+ // divide by two and store temporarily
+ out[i] += (state[7] >> 1);
+ }
+
+ in--;
+}
+
+//
+// interpolator
+// input: int16_t
+// output: int32_t (normalized, not saturated) (of length len*2)
+// state: filter state array; length = 8
+void WebRtcSpl_UpBy2ShortToInt(const int16_t *in, int32_t len, int32_t *out,
+ int32_t *state)
+{
+ int32_t tmp0, tmp1, diff;
+ int32_t i;
+
+ // upper allpass filter (generates odd output samples)
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = ((int32_t)in[i] << 15) + (1 << 14);
+ diff = tmp0 - state[5];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[4] + diff * kResampleAllpass[0][0];
+ state[4] = tmp0;
+ diff = tmp1 - state[6];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[5] + diff * kResampleAllpass[0][1];
+ state[5] = tmp1;
+ diff = tmp0 - state[7];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[7] = state[6] + diff * kResampleAllpass[0][2];
+ state[6] = tmp0;
+
+ // scale down, round and store
+ out[i << 1] = state[7] >> 15;
+ }
+
+ out++;
+
+ // lower allpass filter (generates even output samples)
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = ((int32_t)in[i] << 15) + (1 << 14);
+ diff = tmp0 - state[1];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[0] + diff * kResampleAllpass[1][0];
+ state[0] = tmp0;
+ diff = tmp1 - state[2];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[1] + diff * kResampleAllpass[1][1];
+ state[1] = tmp1;
+ diff = tmp0 - state[3];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[3] = state[2] + diff * kResampleAllpass[1][2];
+ state[2] = tmp0;
+
+ // scale down, round and store
+ out[i << 1] = state[3] >> 15;
+ }
+}
+
+//
+// interpolator
+// input: int32_t (shifted 15 positions to the left, + offset 16384)
+// output: int32_t (shifted 15 positions to the left, + offset 16384) (of length len*2)
+// state: filter state array; length = 8
+void WebRtcSpl_UpBy2IntToInt(const int32_t *in, int32_t len, int32_t *out,
+ int32_t *state)
+{
+ int32_t tmp0, tmp1, diff;
+ int32_t i;
+
+ // upper allpass filter (generates odd output samples)
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = in[i];
+ diff = tmp0 - state[5];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[4] + diff * kResampleAllpass[0][0];
+ state[4] = tmp0;
+ diff = tmp1 - state[6];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[5] + diff * kResampleAllpass[0][1];
+ state[5] = tmp1;
+ diff = tmp0 - state[7];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[7] = state[6] + diff * kResampleAllpass[0][2];
+ state[6] = tmp0;
+
+ // scale down, round and store
+ out[i << 1] = state[7];
+ }
+
+ out++;
+
+ // lower allpass filter (generates even output samples)
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = in[i];
+ diff = tmp0 - state[1];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[0] + diff * kResampleAllpass[1][0];
+ state[0] = tmp0;
+ diff = tmp1 - state[2];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[1] + diff * kResampleAllpass[1][1];
+ state[1] = tmp1;
+ diff = tmp0 - state[3];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[3] = state[2] + diff * kResampleAllpass[1][2];
+ state[2] = tmp0;
+
+ // scale down, round and store
+ out[i << 1] = state[3];
+ }
+}
+
+//
+// interpolator
+// input: int32_t (shifted 15 positions to the left, + offset 16384)
+// output: int16_t (saturated) (of length len*2)
+// state: filter state array; length = 8
+void WebRtcSpl_UpBy2IntToShort(const int32_t *in, int32_t len, int16_t *out,
+ int32_t *state)
+{
+ int32_t tmp0, tmp1, diff;
+ int32_t i;
+
+ // upper allpass filter (generates odd output samples)
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = in[i];
+ diff = tmp0 - state[5];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[4] + diff * kResampleAllpass[0][0];
+ state[4] = tmp0;
+ diff = tmp1 - state[6];
+ // scale down and round
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[5] + diff * kResampleAllpass[0][1];
+ state[5] = tmp1;
+ diff = tmp0 - state[7];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[7] = state[6] + diff * kResampleAllpass[0][2];
+ state[6] = tmp0;
+
+ // scale down, saturate and store
+ tmp1 = state[7] >> 15;
+ if (tmp1 > (int32_t)0x00007FFF)
+ tmp1 = 0x00007FFF;
+ if (tmp1 < (int32_t)0xFFFF8000)
+ tmp1 = 0xFFFF8000;
+ out[i << 1] = (int16_t)tmp1;
+ }
+
+ out++;
+
+ // lower allpass filter (generates even output samples)
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = in[i];
+ diff = tmp0 - state[1];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[0] + diff * kResampleAllpass[1][0];
+ state[0] = tmp0;
+ diff = tmp1 - state[2];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[1] + diff * kResampleAllpass[1][1];
+ state[1] = tmp1;
+ diff = tmp0 - state[3];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[3] = state[2] + diff * kResampleAllpass[1][2];
+ state[2] = tmp0;
+
+ // scale down, saturate and store
+ tmp1 = state[3] >> 15;
+ if (tmp1 > (int32_t)0x00007FFF)
+ tmp1 = 0x00007FFF;
+ if (tmp1 < (int32_t)0xFFFF8000)
+ tmp1 = 0xFFFF8000;
+ out[i << 1] = (int16_t)tmp1;
+ }
+}
+
+// lowpass filter
+// input: int16_t
+// output: int32_t (normalized, not saturated)
+// state: filter state array; length = 8
+void WebRtcSpl_LPBy2ShortToInt(const int16_t* in, int32_t len, int32_t* out,
+ int32_t* state)
+{
+ int32_t tmp0, tmp1, diff;
+ int32_t i;
+
+ len >>= 1;
+
+ // lower allpass filter: odd input -> even output samples
+ in++;
+ // initial state of polyphase delay element
+ tmp0 = state[12];
+ for (i = 0; i < len; i++)
+ {
+ diff = tmp0 - state[1];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[0] + diff * kResampleAllpass[1][0];
+ state[0] = tmp0;
+ diff = tmp1 - state[2];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[1] + diff * kResampleAllpass[1][1];
+ state[1] = tmp1;
+ diff = tmp0 - state[3];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[3] = state[2] + diff * kResampleAllpass[1][2];
+ state[2] = tmp0;
+
+ // scale down, round and store
+ out[i << 1] = state[3] >> 1;
+ tmp0 = ((int32_t)in[i << 1] << 15) + (1 << 14);
+ }
+ in--;
+
+ // upper allpass filter: even input -> even output samples
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = ((int32_t)in[i << 1] << 15) + (1 << 14);
+ diff = tmp0 - state[5];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[4] + diff * kResampleAllpass[0][0];
+ state[4] = tmp0;
+ diff = tmp1 - state[6];
+ // scale down and round
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[5] + diff * kResampleAllpass[0][1];
+ state[5] = tmp1;
+ diff = tmp0 - state[7];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[7] = state[6] + diff * kResampleAllpass[0][2];
+ state[6] = tmp0;
+
+ // average the two allpass outputs, scale down and store
+ out[i << 1] = (out[i << 1] + (state[7] >> 1)) >> 15;
+ }
+
+ // switch to odd output samples
+ out++;
+
+ // lower allpass filter: even input -> odd output samples
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = ((int32_t)in[i << 1] << 15) + (1 << 14);
+ diff = tmp0 - state[9];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[8] + diff * kResampleAllpass[1][0];
+ state[8] = tmp0;
+ diff = tmp1 - state[10];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[9] + diff * kResampleAllpass[1][1];
+ state[9] = tmp1;
+ diff = tmp0 - state[11];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[11] = state[10] + diff * kResampleAllpass[1][2];
+ state[10] = tmp0;
+
+ // scale down, round and store
+ out[i << 1] = state[11] >> 1;
+ }
+
+ // upper allpass filter: odd input -> odd output samples
+ in++;
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = ((int32_t)in[i << 1] << 15) + (1 << 14);
+ diff = tmp0 - state[13];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[12] + diff * kResampleAllpass[0][0];
+ state[12] = tmp0;
+ diff = tmp1 - state[14];
+ // scale down and round
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[13] + diff * kResampleAllpass[0][1];
+ state[13] = tmp1;
+ diff = tmp0 - state[15];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[15] = state[14] + diff * kResampleAllpass[0][2];
+ state[14] = tmp0;
+
+ // average the two allpass outputs, scale down and store
+ out[i << 1] = (out[i << 1] + (state[15] >> 1)) >> 15;
+ }
+}
+
+// lowpass filter
+// input: int32_t (shifted 15 positions to the left, + offset 16384)
+// output: int32_t (normalized, not saturated)
+// state: filter state array; length = 8
+void RTC_NO_SANITIZE("signed-integer-overflow") // bugs.webrtc.org/5486
+WebRtcSpl_LPBy2IntToInt(const int32_t* in, int32_t len, int32_t* out,
+ int32_t* state)
+{
+ int32_t tmp0, tmp1, diff;
+ int32_t i;
+
+ len >>= 1;
+
+ // lower allpass filter: odd input -> even output samples
+ in++;
+ // initial state of polyphase delay element
+ tmp0 = state[12];
+ for (i = 0; i < len; i++)
+ {
+ diff = tmp0 - state[1];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[0] + diff * kResampleAllpass[1][0];
+ state[0] = tmp0;
+ diff = tmp1 - state[2];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[1] + diff * kResampleAllpass[1][1];
+ state[1] = tmp1;
+ diff = tmp0 - state[3];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[3] = state[2] + diff * kResampleAllpass[1][2];
+ state[2] = tmp0;
+
+ // scale down, round and store
+ out[i << 1] = state[3] >> 1;
+ tmp0 = in[i << 1];
+ }
+ in--;
+
+ // upper allpass filter: even input -> even output samples
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = in[i << 1];
+ diff = tmp0 - state[5];
+ // UBSan: -794814117 - 1566149201 cannot be represented in type 'int'
+
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[4] + diff * kResampleAllpass[0][0];
+ state[4] = tmp0;
+ diff = tmp1 - state[6];
+ // scale down and round
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[5] + diff * kResampleAllpass[0][1];
+ state[5] = tmp1;
+ diff = tmp0 - state[7];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[7] = state[6] + diff * kResampleAllpass[0][2];
+ state[6] = tmp0;
+
+ // average the two allpass outputs, scale down and store
+ out[i << 1] = (out[i << 1] + (state[7] >> 1)) >> 15;
+ }
+
+ // switch to odd output samples
+ out++;
+
+ // lower allpass filter: even input -> odd output samples
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = in[i << 1];
+ diff = tmp0 - state[9];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[8] + diff * kResampleAllpass[1][0];
+ state[8] = tmp0;
+ diff = tmp1 - state[10];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[9] + diff * kResampleAllpass[1][1];
+ state[9] = tmp1;
+ diff = tmp0 - state[11];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[11] = state[10] + diff * kResampleAllpass[1][2];
+ state[10] = tmp0;
+
+ // scale down, round and store
+ out[i << 1] = state[11] >> 1;
+ }
+
+ // upper allpass filter: odd input -> odd output samples
+ in++;
+ for (i = 0; i < len; i++)
+ {
+ tmp0 = in[i << 1];
+ diff = tmp0 - state[13];
+ // scale down and round
+ diff = (diff + (1 << 13)) >> 14;
+ tmp1 = state[12] + diff * kResampleAllpass[0][0];
+ state[12] = tmp0;
+ diff = tmp1 - state[14];
+ // scale down and round
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ tmp0 = state[13] + diff * kResampleAllpass[0][1];
+ state[13] = tmp1;
+ diff = tmp0 - state[15];
+ // scale down and truncate
+ diff = diff >> 14;
+ if (diff < 0)
+ diff += 1;
+ state[15] = state[14] + diff * kResampleAllpass[0][2];
+ state[14] = tmp0;
+
+ // average the two allpass outputs, scale down and store
+ out[i << 1] = (out[i << 1] + (state[15] >> 1)) >> 15;
+ }
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/resample_by_2_internal.h b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/resample_by_2_internal.h
new file mode 100644
index 000000000..145395a4c
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/resample_by_2_internal.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * This header file contains some internal resampling functions.
+ *
+ */
+
+#ifndef COMMON_AUDIO_SIGNAL_PROCESSING_RESAMPLE_BY_2_INTERNAL_H_
+#define COMMON_AUDIO_SIGNAL_PROCESSING_RESAMPLE_BY_2_INTERNAL_H_
+
+#include
+
+/*******************************************************************
+ * resample_by_2_fast.c
+ * Functions for internal use in the other resample functions
+ ******************************************************************/
+void WebRtcSpl_DownBy2IntToShort(int32_t* in,
+ int32_t len,
+ int16_t* out,
+ int32_t* state);
+
+void WebRtcSpl_DownBy2ShortToInt(const int16_t* in,
+ int32_t len,
+ int32_t* out,
+ int32_t* state);
+
+void WebRtcSpl_UpBy2ShortToInt(const int16_t* in,
+ int32_t len,
+ int32_t* out,
+ int32_t* state);
+
+void WebRtcSpl_UpBy2IntToInt(const int32_t* in,
+ int32_t len,
+ int32_t* out,
+ int32_t* state);
+
+void WebRtcSpl_UpBy2IntToShort(const int32_t* in,
+ int32_t len,
+ int16_t* out,
+ int32_t* state);
+
+void WebRtcSpl_LPBy2ShortToInt(const int16_t* in,
+ int32_t len,
+ int32_t* out,
+ int32_t* state);
+
+void WebRtcSpl_LPBy2IntToInt(const int32_t* in,
+ int32_t len,
+ int32_t* out,
+ int32_t* state);
+
+#endif // COMMON_AUDIO_SIGNAL_PROCESSING_RESAMPLE_BY_2_INTERNAL_H_
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/resample_fractional.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/resample_fractional.c
new file mode 100644
index 000000000..6409fbac4
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/resample_fractional.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This file contains the resampling functions between 48, 44, 32 and 24 kHz.
+ * The description headers can be found in signal_processing_library.h
+ *
+ */
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+// interpolation coefficients
+static const int16_t kCoefficients48To32[2][8] = {
+ {778, -2050, 1087, 23285, 12903, -3783, 441, 222},
+ {222, 441, -3783, 12903, 23285, 1087, -2050, 778}
+};
+
+static const int16_t kCoefficients32To24[3][8] = {
+ {767, -2362, 2434, 24406, 10620, -3838, 721, 90},
+ {386, -381, -2646, 19062, 19062, -2646, -381, 386},
+ {90, 721, -3838, 10620, 24406, 2434, -2362, 767}
+};
+
+static const int16_t kCoefficients44To32[4][9] = {
+ {117, -669, 2245, -6183, 26267, 13529, -3245, 845, -138},
+ {-101, 612, -2283, 8532, 29790, -5138, 1789, -524, 91},
+ {50, -292, 1016, -3064, 32010, 3933, -1147, 315, -53},
+ {-156, 974, -3863, 18603, 21691, -6246, 2353, -712, 126}
+};
+
+// Resampling ratio: 2/3
+// input: int32_t (normalized, not saturated) :: size 3 * K
+// output: int32_t (shifted 15 positions to the left, + offset 16384) :: size 2 * K
+// K: number of blocks
+
+void WebRtcSpl_Resample48khzTo32khz(const int32_t *In, int32_t *Out, size_t K)
+{
+ /////////////////////////////////////////////////////////////
+ // Filter operation:
+ //
+ // Perform resampling (3 input samples -> 2 output samples);
+ // process in sub blocks of size 3 samples.
+ int32_t tmp;
+ size_t m;
+
+ for (m = 0; m < K; m++)
+ {
+ tmp = 1 << 14;
+ tmp += kCoefficients48To32[0][0] * In[0];
+ tmp += kCoefficients48To32[0][1] * In[1];
+ tmp += kCoefficients48To32[0][2] * In[2];
+ tmp += kCoefficients48To32[0][3] * In[3];
+ tmp += kCoefficients48To32[0][4] * In[4];
+ tmp += kCoefficients48To32[0][5] * In[5];
+ tmp += kCoefficients48To32[0][6] * In[6];
+ tmp += kCoefficients48To32[0][7] * In[7];
+ Out[0] = tmp;
+
+ tmp = 1 << 14;
+ tmp += kCoefficients48To32[1][0] * In[1];
+ tmp += kCoefficients48To32[1][1] * In[2];
+ tmp += kCoefficients48To32[1][2] * In[3];
+ tmp += kCoefficients48To32[1][3] * In[4];
+ tmp += kCoefficients48To32[1][4] * In[5];
+ tmp += kCoefficients48To32[1][5] * In[6];
+ tmp += kCoefficients48To32[1][6] * In[7];
+ tmp += kCoefficients48To32[1][7] * In[8];
+ Out[1] = tmp;
+
+ // update pointers
+ In += 3;
+ Out += 2;
+ }
+}
+
+// Resampling ratio: 3/4
+// input: int32_t (normalized, not saturated) :: size 4 * K
+// output: int32_t (shifted 15 positions to the left, + offset 16384) :: size 3 * K
+// K: number of blocks
+
+void WebRtcSpl_Resample32khzTo24khz(const int32_t *In, int32_t *Out, size_t K)
+{
+ /////////////////////////////////////////////////////////////
+ // Filter operation:
+ //
+ // Perform resampling (4 input samples -> 3 output samples);
+ // process in sub blocks of size 4 samples.
+ size_t m;
+ int32_t tmp;
+
+ for (m = 0; m < K; m++)
+ {
+ tmp = 1 << 14;
+ tmp += kCoefficients32To24[0][0] * In[0];
+ tmp += kCoefficients32To24[0][1] * In[1];
+ tmp += kCoefficients32To24[0][2] * In[2];
+ tmp += kCoefficients32To24[0][3] * In[3];
+ tmp += kCoefficients32To24[0][4] * In[4];
+ tmp += kCoefficients32To24[0][5] * In[5];
+ tmp += kCoefficients32To24[0][6] * In[6];
+ tmp += kCoefficients32To24[0][7] * In[7];
+ Out[0] = tmp;
+
+ tmp = 1 << 14;
+ tmp += kCoefficients32To24[1][0] * In[1];
+ tmp += kCoefficients32To24[1][1] * In[2];
+ tmp += kCoefficients32To24[1][2] * In[3];
+ tmp += kCoefficients32To24[1][3] * In[4];
+ tmp += kCoefficients32To24[1][4] * In[5];
+ tmp += kCoefficients32To24[1][5] * In[6];
+ tmp += kCoefficients32To24[1][6] * In[7];
+ tmp += kCoefficients32To24[1][7] * In[8];
+ Out[1] = tmp;
+
+ tmp = 1 << 14;
+ tmp += kCoefficients32To24[2][0] * In[2];
+ tmp += kCoefficients32To24[2][1] * In[3];
+ tmp += kCoefficients32To24[2][2] * In[4];
+ tmp += kCoefficients32To24[2][3] * In[5];
+ tmp += kCoefficients32To24[2][4] * In[6];
+ tmp += kCoefficients32To24[2][5] * In[7];
+ tmp += kCoefficients32To24[2][6] * In[8];
+ tmp += kCoefficients32To24[2][7] * In[9];
+ Out[2] = tmp;
+
+ // update pointers
+ In += 4;
+ Out += 3;
+ }
+}
+
+//
+// fractional resampling filters
+// Fout = 11/16 * Fin
+// Fout = 8/11 * Fin
+//
+
+// compute two inner-products and store them to output array
+static void WebRtcSpl_ResampDotProduct(const int32_t *in1, const int32_t *in2,
+ const int16_t *coef_ptr, int32_t *out1,
+ int32_t *out2)
+{
+ int32_t tmp1 = 16384;
+ int32_t tmp2 = 16384;
+ int16_t coef;
+
+ coef = coef_ptr[0];
+ tmp1 += coef * in1[0];
+ tmp2 += coef * in2[-0];
+
+ coef = coef_ptr[1];
+ tmp1 += coef * in1[1];
+ tmp2 += coef * in2[-1];
+
+ coef = coef_ptr[2];
+ tmp1 += coef * in1[2];
+ tmp2 += coef * in2[-2];
+
+ coef = coef_ptr[3];
+ tmp1 += coef * in1[3];
+ tmp2 += coef * in2[-3];
+
+ coef = coef_ptr[4];
+ tmp1 += coef * in1[4];
+ tmp2 += coef * in2[-4];
+
+ coef = coef_ptr[5];
+ tmp1 += coef * in1[5];
+ tmp2 += coef * in2[-5];
+
+ coef = coef_ptr[6];
+ tmp1 += coef * in1[6];
+ tmp2 += coef * in2[-6];
+
+ coef = coef_ptr[7];
+ tmp1 += coef * in1[7];
+ tmp2 += coef * in2[-7];
+
+ coef = coef_ptr[8];
+ *out1 = tmp1 + coef * in1[8];
+ *out2 = tmp2 + coef * in2[-8];
+}
+
+// Resampling ratio: 8/11
+// input: int32_t (normalized, not saturated) :: size 11 * K
+// output: int32_t (shifted 15 positions to the left, + offset 16384) :: size 8 * K
+// K: number of blocks
+
+void WebRtcSpl_Resample44khzTo32khz(const int32_t *In, int32_t *Out, size_t K)
+{
+ /////////////////////////////////////////////////////////////
+ // Filter operation:
+ //
+ // Perform resampling (11 input samples -> 8 output samples);
+ // process in sub blocks of size 11 samples.
+ int32_t tmp;
+ size_t m;
+
+ for (m = 0; m < K; m++)
+ {
+ tmp = 1 << 14;
+
+ // first output sample
+ Out[0] = ((int32_t)In[3] << 15) + tmp;
+
+ // sum and accumulate filter coefficients and input samples
+ tmp += kCoefficients44To32[3][0] * In[5];
+ tmp += kCoefficients44To32[3][1] * In[6];
+ tmp += kCoefficients44To32[3][2] * In[7];
+ tmp += kCoefficients44To32[3][3] * In[8];
+ tmp += kCoefficients44To32[3][4] * In[9];
+ tmp += kCoefficients44To32[3][5] * In[10];
+ tmp += kCoefficients44To32[3][6] * In[11];
+ tmp += kCoefficients44To32[3][7] * In[12];
+ tmp += kCoefficients44To32[3][8] * In[13];
+ Out[4] = tmp;
+
+ // sum and accumulate filter coefficients and input samples
+ WebRtcSpl_ResampDotProduct(&In[0], &In[17], kCoefficients44To32[0], &Out[1], &Out[7]);
+
+ // sum and accumulate filter coefficients and input samples
+ WebRtcSpl_ResampDotProduct(&In[2], &In[15], kCoefficients44To32[1], &Out[2], &Out[6]);
+
+ // sum and accumulate filter coefficients and input samples
+ WebRtcSpl_ResampDotProduct(&In[3], &In[14], kCoefficients44To32[2], &Out[3], &Out[5]);
+
+ // update pointers
+ In += 11;
+ Out += 8;
+ }
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/spl_init.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/spl_init.c
new file mode 100644
index 000000000..82fba1dac
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/spl_init.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* The global function contained in this file initializes SPL function
+ * pointers, currently only for ARM platforms.
+ *
+ * Some code came from common/rtcd.c in the WebM project.
+ */
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/system_wrappers/include/cpu_features_wrapper.h"
+
+/* Declare function pointers. */
+MaxAbsValueW16 WebRtcSpl_MaxAbsValueW16;
+MaxAbsValueW32 WebRtcSpl_MaxAbsValueW32;
+MaxValueW16 WebRtcSpl_MaxValueW16;
+MaxValueW32 WebRtcSpl_MaxValueW32;
+MinValueW16 WebRtcSpl_MinValueW16;
+MinValueW32 WebRtcSpl_MinValueW32;
+CrossCorrelation WebRtcSpl_CrossCorrelation;
+DownsampleFast WebRtcSpl_DownsampleFast;
+ScaleAndAddVectorsWithRound WebRtcSpl_ScaleAndAddVectorsWithRound;
+
+#if (!defined(WEBRTC_HAS_NEON)) && !defined(MIPS32_LE)
+/* Initialize function pointers to the generic C version. */
+static void InitPointersToC(void) {
+ WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16C;
+ WebRtcSpl_MaxAbsValueW32 = WebRtcSpl_MaxAbsValueW32C;
+ WebRtcSpl_MaxValueW16 = WebRtcSpl_MaxValueW16C;
+ WebRtcSpl_MaxValueW32 = WebRtcSpl_MaxValueW32C;
+ WebRtcSpl_MinValueW16 = WebRtcSpl_MinValueW16C;
+ WebRtcSpl_MinValueW32 = WebRtcSpl_MinValueW32C;
+ WebRtcSpl_CrossCorrelation = WebRtcSpl_CrossCorrelationC;
+ WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFastC;
+ WebRtcSpl_ScaleAndAddVectorsWithRound =
+ WebRtcSpl_ScaleAndAddVectorsWithRoundC;
+}
+#endif
+
+#if defined(WEBRTC_HAS_NEON)
+/* Initialize function pointers to the Neon version. */
+static void InitPointersToNeon(void) {
+ WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16Neon;
+ WebRtcSpl_MaxAbsValueW32 = WebRtcSpl_MaxAbsValueW32Neon;
+ WebRtcSpl_MaxValueW16 = WebRtcSpl_MaxValueW16Neon;
+ WebRtcSpl_MaxValueW32 = WebRtcSpl_MaxValueW32Neon;
+ WebRtcSpl_MinValueW16 = WebRtcSpl_MinValueW16Neon;
+ WebRtcSpl_MinValueW32 = WebRtcSpl_MinValueW32Neon;
+ WebRtcSpl_CrossCorrelation = WebRtcSpl_CrossCorrelationNeon;
+ WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFastNeon;
+ WebRtcSpl_ScaleAndAddVectorsWithRound =
+ WebRtcSpl_ScaleAndAddVectorsWithRoundC;
+}
+#endif
+
+#if defined(MIPS32_LE)
+/* Initialize function pointers to the MIPS version. */
+static void InitPointersToMIPS(void) {
+ WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16_mips;
+ WebRtcSpl_MaxValueW16 = WebRtcSpl_MaxValueW16_mips;
+ WebRtcSpl_MaxValueW32 = WebRtcSpl_MaxValueW32_mips;
+ WebRtcSpl_MinValueW16 = WebRtcSpl_MinValueW16_mips;
+ WebRtcSpl_MinValueW32 = WebRtcSpl_MinValueW32_mips;
+ WebRtcSpl_CrossCorrelation = WebRtcSpl_CrossCorrelation_mips;
+ WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFast_mips;
+#if defined(MIPS_DSP_R1_LE)
+ WebRtcSpl_MaxAbsValueW32 = WebRtcSpl_MaxAbsValueW32_mips;
+ WebRtcSpl_ScaleAndAddVectorsWithRound =
+ WebRtcSpl_ScaleAndAddVectorsWithRound_mips;
+#else
+ WebRtcSpl_MaxAbsValueW32 = WebRtcSpl_MaxAbsValueW32C;
+ WebRtcSpl_ScaleAndAddVectorsWithRound =
+ WebRtcSpl_ScaleAndAddVectorsWithRoundC;
+#endif
+}
+#endif
+
+static void InitFunctionPointers(void) {
+#if defined(WEBRTC_HAS_NEON)
+ InitPointersToNeon();
+#elif defined(MIPS32_LE)
+ InitPointersToMIPS();
+#else
+ InitPointersToC();
+#endif /* WEBRTC_HAS_NEON */
+}
+
+#if defined(WEBRTC_POSIX)
+#include <pthread.h>
+
+static void once(void (*func)(void)) {
+ static pthread_once_t lock = PTHREAD_ONCE_INIT;
+ pthread_once(&lock, func);
+}
+
+#elif defined(_WIN32)
+#include <windows.h>
+
+static void once(void (*func)(void)) {
+ /* Didn't use InitializeCriticalSection() since there's no race-free context
+ * in which to execute it.
+ *
+ * TODO(kma): Change to different implementation (e.g.
+ * InterlockedCompareExchangePointer) to avoid issues similar to
+ * http://code.google.com/p/webm/issues/detail?id=467.
+ */
+ static CRITICAL_SECTION lock = {(void *)((size_t)-1), -1, 0, 0, 0, 0};
+ static int done = 0;
+
+ EnterCriticalSection(&lock);
+ if (!done) {
+ func();
+ done = 1;
+ }
+ LeaveCriticalSection(&lock);
+}
+
+/* There's no fallback version as an #else block here to ensure thread safety.
+ * In case of neither pthread for WEBRTC_POSIX nor _WIN32 is present, build
+ * system should pick it up.
+ */
+#endif /* WEBRTC_POSIX */
+
+void WebRtcSpl_Init(void) {
+ once(InitFunctionPointers);
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/spl_inl.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/spl_inl.c
new file mode 100644
index 000000000..efa6a65f0
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/spl_inl.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2016 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdint.h>
+
+#include "webrtc/common_audio/signal_processing/include/spl_inl.h"
+
+// Table used by WebRtcSpl_CountLeadingZeros32_NotBuiltin. For each uint32_t n
+// that's a sequence of 0 bits followed by a sequence of 1 bits, the entry at
+// index (n * 0x8c0b2891) >> 26 in this table gives the number of zero bits in
+// n.
+const int8_t kWebRtcSpl_CountLeadingZeros32_Table[64] = {
+ 32, 8, 17, -1, -1, 14, -1, -1, -1, 20, -1, -1, -1, 28, -1, 18,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 26, 25, 24,
+ 4, 11, 23, 31, 3, 7, 10, 16, 22, 30, -1, -1, 2, 6, 13, 9,
+ -1, 15, -1, 21, -1, 29, 19, -1, -1, -1, -1, -1, 1, 27, 5, 12,
+};
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/spl_sqrt.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/spl_sqrt.c
new file mode 100644
index 000000000..f79ac9fa6
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/spl_sqrt.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This file contains the function WebRtcSpl_Sqrt().
+ * The description header can be found in signal_processing_library.h
+ *
+ */
+
+#include "webrtc/rtc_base/checks.h"
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+int32_t WebRtcSpl_SqrtLocal(int32_t in);
+
+int32_t WebRtcSpl_SqrtLocal(int32_t in)
+{
+
+ int16_t x_half, t16;
+ int32_t A, B, x2;
+
+ /* The following block performs:
+ y=in/2
+ x=y-2^30
+ x_half=x/2^31
+ t = 1 + (x_half) - 0.5*((x_half)^2) + 0.5*((x_half)^3) - 0.625*((x_half)^4)
+ + 0.875*((x_half)^5)
+ */
+
+ B = in / 2;
+
+ B = B - ((int32_t)0x40000000); // B = in/2 - 1/2
+ x_half = (int16_t)(B >> 16); // x_half = x/2 = (in-1)/2
+ B = B + ((int32_t)0x40000000); // B = 1 + x/2
+ B = B + ((int32_t)0x40000000); // Add 0.5 twice (since 1.0 does not exist in Q31)
+
+ x2 = ((int32_t)x_half) * ((int32_t)x_half) * 2; // A = (x/2)^2
+ A = -x2; // A = -(x/2)^2
+ B = B + (A >> 1); // B = 1 + x/2 - 0.5*(x/2)^2
+
+ A >>= 16;
+ A = A * A * 2; // A = (x/2)^4
+ t16 = (int16_t)(A >> 16);
+ B += -20480 * t16 * 2; // B = B - 0.625*A
+ // After this, B = 1 + x/2 - 0.5*(x/2)^2 - 0.625*(x/2)^4
+
+ A = x_half * t16 * 2; // A = (x/2)^5
+ t16 = (int16_t)(A >> 16);
+ B += 28672 * t16 * 2; // B = B + 0.875*A
+ // After this, B = 1 + x/2 - 0.5*(x/2)^2 - 0.625*(x/2)^4 + 0.875*(x/2)^5
+
+ t16 = (int16_t)(x2 >> 16);
+ A = x_half * t16 * 2; // A = x/2^3
+
+ B = B + (A >> 1); // B = B + 0.5*A
+ // After this, B = 1 + x/2 - 0.5*(x/2)^2 + 0.5*(x/2)^3 - 0.625*(x/2)^4 + 0.875*(x/2)^5
+
+ B = B + ((int32_t)32768); // Round off bit
+
+ return B;
+}
+
+int32_t WebRtcSpl_Sqrt(int32_t value)
+{
+ /*
+ Algorithm:
+
+ Six term Taylor Series is used here to compute the square root of a number
+ y^0.5 = (1+x)^0.5 where x = y-1
+ = 1+(x/2)-0.5*((x/2)^2+0.5*((x/2)^3-0.625*((x/2)^4+0.875*((x/2)^5)
+ 0.5 <= x < 1
+
+ Example of how the algorithm works, with ut=sqrt(in), and
+ with in=73632 and ut=271 (even shift value case):
+
+ in=73632
+ y= in/131072
+ x=y-1
+ t = 1 + (x/2) - 0.5*((x/2)^2) + 0.5*((x/2)^3) - 0.625*((x/2)^4) + 0.875*((x/2)^5)
+ ut=t*(1/sqrt(2))*512
+
+ or:
+
+ in=73632
+ in2=73632*2^14
+ y= in2/2^31
+ x=y-1
+ t = 1 + (x/2) - 0.5*((x/2)^2) + 0.5*((x/2)^3) - 0.625*((x/2)^4) + 0.875*((x/2)^5)
+ ut=t*(1/sqrt(2))
+ ut2=ut*2^9
+
+ which gives:
+
+ in = 73632
+ in2 = 1206386688
+ y = 0.56176757812500
+ x = -0.43823242187500
+ t = 0.74973506527313
+ ut = 0.53014274874797
+ ut2 = 2.714330873589594e+002
+
+ or:
+
+ in=73632
+ in2=73632*2^14
+ y=in2/2
+ x=y-2^30
+ x_half=x/2^31
+ t = 1 + (x_half) - 0.5*((x_half)^2) + 0.5*((x_half)^3) - 0.625*((x_half)^4)
+ + 0.875*((x_half)^5)
+ ut=t*(1/sqrt(2))
+ ut2=ut*2^9
+
+ which gives:
+
+ in = 73632
+ in2 = 1206386688
+ y = 603193344
+ x = -470548480
+ x_half = -0.21911621093750
+ t = 0.74973506527313
+ ut = 0.53014274874797
+ ut2 = 2.714330873589594e+002
+
+ */
+
+ int16_t x_norm, nshift, t16, sh;
+ int32_t A;
+
+ int16_t k_sqrt_2 = 23170; // 1/sqrt2 (==5a82)
+
+ A = value;
+
+ // The convention in this function is to calculate sqrt(abs(A)). Negate the
+ // input if it is negative.
+ if (A < 0) {
+ if (A == WEBRTC_SPL_WORD32_MIN) {
+ // This number cannot be held in an int32_t after negating.
+ // Map it to the maximum positive value.
+ A = WEBRTC_SPL_WORD32_MAX;
+ } else {
+ A = -A;
+ }
+ } else if (A == 0) {
+ return 0; // sqrt(0) = 0
+ }
+
+ sh = WebRtcSpl_NormW32(A); // # shifts to normalize A
+ A = WEBRTC_SPL_LSHIFT_W32(A, sh); // Normalize A
+ if (A < (WEBRTC_SPL_WORD32_MAX - 32767))
+ {
+ A = A + ((int32_t)32768); // Round off bit
+ } else
+ {
+ A = WEBRTC_SPL_WORD32_MAX;
+ }
+
+ x_norm = (int16_t)(A >> 16); // x_norm = AH
+
+ nshift = (sh / 2);
+ RTC_DCHECK_GE(nshift, 0);
+
+ A = (int32_t)WEBRTC_SPL_LSHIFT_W32((int32_t)x_norm, 16);
+ A = WEBRTC_SPL_ABS_W32(A); // A = abs(x_norm<<16)
+ A = WebRtcSpl_SqrtLocal(A); // A = sqrt(A)
+
+ if (2 * nshift == sh) {
+ // Even shift value case
+
+ t16 = (int16_t)(A >> 16); // t16 = AH
+
+ A = k_sqrt_2 * t16 * 2; // A = 1/sqrt(2)*t16
+ A = A + ((int32_t)32768); // Round off
+ A = A & ((int32_t)0x7fff0000); // Round off
+
+ A >>= 15; // A = A>>16
+
+ } else
+ {
+ A >>= 16; // A = A>>16
+ }
+
+ A = A & ((int32_t)0x0000ffff);
+ A >>= nshift; // De-normalize the result.
+
+ return A;
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/vector_scaling_operations.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/vector_scaling_operations.c
new file mode 100644
index 000000000..e1f391d10
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/signal_processing/vector_scaling_operations.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This file contains implementations of the functions
+ * WebRtcSpl_VectorBitShiftW16()
+ * WebRtcSpl_VectorBitShiftW32()
+ * WebRtcSpl_VectorBitShiftW32ToW16()
+ * WebRtcSpl_ScaleVector()
+ * WebRtcSpl_ScaleVectorWithSat()
+ * WebRtcSpl_ScaleAndAddVectors()
+ * WebRtcSpl_ScaleAndAddVectorsWithRoundC()
+ */
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+void WebRtcSpl_VectorBitShiftW16(int16_t *res, size_t length,
+ const int16_t *in, int16_t right_shifts)
+{
+ size_t i;
+
+ if (right_shifts > 0)
+ {
+ for (i = length; i > 0; i--)
+ {
+ (*res++) = ((*in++) >> right_shifts);
+ }
+ } else
+ {
+ for (i = length; i > 0; i--)
+ {
+ (*res++) = ((*in++) * (1 << (-right_shifts)));
+ }
+ }
+}
+
+void WebRtcSpl_VectorBitShiftW32(int32_t *out_vector,
+ size_t vector_length,
+ const int32_t *in_vector,
+ int16_t right_shifts)
+{
+ size_t i;
+
+ if (right_shifts > 0)
+ {
+ for (i = vector_length; i > 0; i--)
+ {
+ (*out_vector++) = ((*in_vector++) >> right_shifts);
+ }
+ } else
+ {
+ for (i = vector_length; i > 0; i--)
+ {
+ (*out_vector++) = ((*in_vector++) << (-right_shifts));
+ }
+ }
+}
+
+void WebRtcSpl_VectorBitShiftW32ToW16(int16_t* out, size_t length,
+ const int32_t* in, int right_shifts) {
+ size_t i;
+ int32_t tmp_w32;
+
+ if (right_shifts >= 0) {
+ for (i = length; i > 0; i--) {
+ tmp_w32 = (*in++) >> right_shifts;
+ (*out++) = WebRtcSpl_SatW32ToW16(tmp_w32);
+ }
+ } else {
+ int left_shifts = -right_shifts;
+ for (i = length; i > 0; i--) {
+ tmp_w32 = (*in++) << left_shifts;
+ (*out++) = WebRtcSpl_SatW32ToW16(tmp_w32);
+ }
+ }
+}
+
+void WebRtcSpl_ScaleVector(const int16_t *in_vector, int16_t *out_vector,
+ int16_t gain, size_t in_vector_length,
+ int16_t right_shifts)
+{
+ // Performs vector operation: out_vector = (gain*in_vector)>>right_shifts
+ size_t i;
+ const int16_t *inptr;
+ int16_t *outptr;
+
+ inptr = in_vector;
+ outptr = out_vector;
+
+ for (i = 0; i < in_vector_length; i++)
+ {
+ *outptr++ = (int16_t)((*inptr++ * gain) >> right_shifts);
+ }
+}
+
+void WebRtcSpl_ScaleVectorWithSat(const int16_t *in_vector, int16_t *out_vector,
+ int16_t gain, size_t in_vector_length,
+ int16_t right_shifts)
+{
+ // Performs vector operation: out_vector = (gain*in_vector)>>right_shifts
+ size_t i;
+ const int16_t *inptr;
+ int16_t *outptr;
+
+ inptr = in_vector;
+ outptr = out_vector;
+
+ for (i = 0; i < in_vector_length; i++) {
+ *outptr++ = WebRtcSpl_SatW32ToW16((*inptr++ * gain) >> right_shifts);
+ }
+}
+
+void WebRtcSpl_ScaleAndAddVectors(const int16_t *in1, int16_t gain1, int shift1,
+ const int16_t *in2, int16_t gain2, int shift2,
+ int16_t *out, size_t vector_length)
+{
+ // Performs vector operation: out = (gain1*in1)>>shift1 + (gain2*in2)>>shift2
+ size_t i;
+ const int16_t *in1ptr;
+ const int16_t *in2ptr;
+ int16_t *outptr;
+
+ in1ptr = in1;
+ in2ptr = in2;
+ outptr = out;
+
+ for (i = 0; i < vector_length; i++)
+ {
+ *outptr++ = (int16_t)((gain1 * *in1ptr++) >> shift1) +
+ (int16_t)((gain2 * *in2ptr++) >> shift2);
+ }
+}
+
+// C version of WebRtcSpl_ScaleAndAddVectorsWithRound() for generic platforms.
+int WebRtcSpl_ScaleAndAddVectorsWithRoundC(const int16_t* in_vector1,
+ int16_t in_vector1_scale,
+ const int16_t* in_vector2,
+ int16_t in_vector2_scale,
+ int right_shifts,
+ int16_t* out_vector,
+ size_t length) {
+ size_t i = 0;
+ int round_value = (1 << right_shifts) >> 1;
+
+ if (in_vector1 == NULL || in_vector2 == NULL || out_vector == NULL ||
+ length == 0 || right_shifts < 0) {
+ return -1;
+ }
+
+ for (i = 0; i < length; i++) {
+ out_vector[i] = (int16_t)((
+ in_vector1[i] * in_vector1_scale + in_vector2[i] * in_vector2_scale +
+ round_value) >> right_shifts);
+ }
+
+ return 0;
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c
new file mode 100644
index 000000000..25bb0a114
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c
@@ -0,0 +1,77 @@
+/*
+ * Written by Wilco Dijkstra, 1996. The following email exchange establishes the
+ * license.
+ *
+ * From: Wilco Dijkstra
+ * Date: Fri, Jun 24, 2011 at 3:20 AM
+ * Subject: Re: sqrt routine
+ * To: Kevin Ma
+ * Hi Kevin,
+ * Thanks for asking. Those routines are public domain (originally posted to
+ * comp.sys.arm a long time ago), so you can use them freely for any purpose.
+ * Cheers,
+ * Wilco
+ *
+ * ----- Original Message -----
+ * From: "Kevin Ma"
+ * To:
+ * Sent: Thursday, June 23, 2011 11:44 PM
+ * Subject: Fwd: sqrt routine
+ * Hi Wilco,
+ * I saw your sqrt routine from several web sites, including
+ * http://www.finesse.demon.co.uk/steven/sqrt.html.
+ * Just wonder if there's any copyright information with your Successive
+ * approximation routines, or if I can freely use it for any purpose.
+ * Thanks.
+ * Kevin
+ */
+
+// Minor modifications in code style for WebRTC, 2012.
+
+#include "webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h"
+
+/*
+ * Algorithm:
+ * Successive approximation of the equation (root + delta) ^ 2 = N
+ * until delta < 1. If delta < 1 we have the integer part of SQRT (N).
+ * Use delta = 2^i for i = 15 .. 0.
+ *
+ * Output precision is 16 bits. Note for large input values (close to
+ * 0x7FFFFFFF), bit 15 (the highest bit of the low 16-bit half word)
+ * contains the MSB information (a non-sign value). Do with caution
+ * if you need to cast the output to int16_t type.
+ *
+ * If the input value is negative, it returns 0.
+ */
+
+#define WEBRTC_SPL_SQRT_ITER(N) \
+ try1 = root + (1 << (N)); \
+ if (value >= try1 << (N)) \
+ { \
+ value -= try1 << (N); \
+ root |= 2 << (N); \
+ }
+
+int32_t WebRtcSpl_SqrtFloor(int32_t value)
+{
+ int32_t root = 0, try1;
+
+ WEBRTC_SPL_SQRT_ITER (15);
+ WEBRTC_SPL_SQRT_ITER (14);
+ WEBRTC_SPL_SQRT_ITER (13);
+ WEBRTC_SPL_SQRT_ITER (12);
+ WEBRTC_SPL_SQRT_ITER (11);
+ WEBRTC_SPL_SQRT_ITER (10);
+ WEBRTC_SPL_SQRT_ITER ( 9);
+ WEBRTC_SPL_SQRT_ITER ( 8);
+ WEBRTC_SPL_SQRT_ITER ( 7);
+ WEBRTC_SPL_SQRT_ITER ( 6);
+ WEBRTC_SPL_SQRT_ITER ( 5);
+ WEBRTC_SPL_SQRT_ITER ( 4);
+ WEBRTC_SPL_SQRT_ITER ( 3);
+ WEBRTC_SPL_SQRT_ITER ( 2);
+ WEBRTC_SPL_SQRT_ITER ( 1);
+ WEBRTC_SPL_SQRT_ITER ( 0);
+
+ return root >> 1;
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h
new file mode 100644
index 000000000..eaa58e30a
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdint.h>
+
+//
+// WebRtcSpl_SqrtFloor(...)
+//
+// Returns the square root of the input value |value|. The precision of this
+// function is rounding down integer precision, i.e., sqrt(8) gives 2 as answer.
+// If |value| is a negative number then 0 is returned.
+//
+// Algorithm:
+//
+// An iterative 4 cycle/bit routine
+//
+// Input:
+// - value : Value to calculate sqrt of
+//
+// Return value : Result of the sqrt calculation
+//
+int32_t WebRtcSpl_SqrtFloor(int32_t value);
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/include/webrtc_vad.h b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/include/webrtc_vad.h
new file mode 100644
index 000000000..f5bbadf5b
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/include/webrtc_vad.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * This header file includes the VAD API calls. Specific function calls are
+ * given below.
+ */
+
+#ifndef COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_ // NOLINT
+#define COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct WebRtcVadInst VadInst;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Creates an instance to the VAD structure.
+VadInst* WebRtcVad_Create(void);
+
+// Frees the dynamic memory of a specified VAD instance.
+//
+// - handle [i] : Pointer to VAD instance that should be freed.
+void WebRtcVad_Free(VadInst* handle);
+
+// Initializes a VAD instance.
+//
+// - handle [i/o] : Instance that should be initialized.
+//
+// returns : 0 - (OK),
+// -1 - (null pointer or Default mode could not be set).
+int WebRtcVad_Init(VadInst* handle);
+
+// Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
+// restrictive in reporting speech. Put in other words the probability of being
+// speech when the VAD returns 1 is increased with increasing mode. As a
+// consequence also the missed detection rate goes up.
+//
+// - handle [i/o] : VAD instance.
+// - mode [i] : Aggressiveness mode (0, 1, 2, or 3).
+//
+// returns : 0 - (OK),
+// -1 - (null pointer, mode could not be set or the VAD instance
+// has not been initialized).
+int WebRtcVad_set_mode(VadInst* handle, int mode);
+
+// Calculates a VAD decision for the |audio_frame|. For valid sampling rates
+// frame lengths, see the description of WebRtcVad_ValidRatesAndFrameLengths().
+//
+// - handle [i/o] : VAD Instance. Needs to be initialized by
+// WebRtcVad_Init() before call.
+// - fs [i] : Sampling frequency (Hz): 8000, 16000, or 32000
+// - audio_frame [i] : Audio frame buffer.
+// - frame_length [i] : Length of audio frame buffer in number of samples.
+//
+// returns : 1 - (Active Voice),
+// 0 - (Non-active Voice),
+// -1 - (Error)
+int WebRtcVad_Process(VadInst* handle,
+ int fs,
+ const int16_t* audio_frame,
+ size_t frame_length);
+
+// Checks for valid combinations of |rate| and |frame_length|. We support 10,
+// 20 and 30 ms frames and the rates 8000, 16000 and 32000 Hz.
+//
+// - rate [i] : Sampling frequency (Hz).
+// - frame_length [i] : Speech frame buffer length in number of samples.
+//
+// returns : 0 - (valid combination), -1 - (invalid combination)
+int WebRtcVad_ValidRateAndFrameLength(int rate, size_t frame_length);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_ // NOLINT
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_core.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_core.c
new file mode 100644
index 000000000..eb336f9b7
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_core.c
@@ -0,0 +1,685 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/common_audio/vad/vad_core.h"
+
+#include "webrtc/rtc_base/sanitizer.h"
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/common_audio/vad/vad_filterbank.h"
+#include "webrtc/common_audio/vad/vad_gmm.h"
+#include "webrtc/common_audio/vad/vad_sp.h"
+
+// Spectrum Weighting
+static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
+static const int16_t kNoiseUpdateConst = 655; // Q15
+static const int16_t kSpeechUpdateConst = 6554; // Q15
+static const int16_t kBackEta = 154; // Q8
+// Minimum difference between the two models, Q5
+static const int16_t kMinimumDifference[kNumChannels] = {
+ 544, 544, 576, 576, 576, 576 };
+// Upper limit of mean value for speech model, Q7
+static const int16_t kMaximumSpeech[kNumChannels] = {
+ 11392, 11392, 11520, 11520, 11520, 11520 };
+// Minimum value for mean value
+static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
+// Upper limit of mean value for noise model, Q7
+static const int16_t kMaximumNoise[kNumChannels] = {
+ 9216, 9088, 8960, 8832, 8704, 8576 };
+// Start values for the Gaussian models, Q7
+// Weights for the two Gaussians for the six channels (noise)
+static const int16_t kNoiseDataWeights[kTableSize] = {
+ 34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
+// Weights for the two Gaussians for the six channels (speech)
+static const int16_t kSpeechDataWeights[kTableSize] = {
+ 48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
+// Means for the two Gaussians for the six channels (noise)
+static const int16_t kNoiseDataMeans[kTableSize] = {
+ 6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
+// Means for the two Gaussians for the six channels (speech)
+static const int16_t kSpeechDataMeans[kTableSize] = {
+ 8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
+};
+// Stds for the two Gaussians for the six channels (noise)
+static const int16_t kNoiseDataStds[kTableSize] = {
+ 378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
+// Stds for the two Gaussians for the six channels (speech)
+static const int16_t kSpeechDataStds[kTableSize] = {
+ 555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
+
+// Constants used in GmmProbability().
+//
+// Maximum number of counted speech (VAD = 1) frames in a row.
+static const int16_t kMaxSpeechFrames = 6;
+// Minimum standard deviation for both speech and noise.
+static const int16_t kMinStd = 384;
+
+// Constants in WebRtcVad_InitCore().
+// Default aggressiveness mode.
+static const short kDefaultMode = 0;
+static const int kInitCheck = 42;
+
+// Constants used in WebRtcVad_set_mode_core().
+//
+// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
+//
+// Mode 0, Quality.
+static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
+static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
+static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
+static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
+// Mode 1, Low bitrate.
+static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
+static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
+static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
+static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
+// Mode 2, Aggressive.
+static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
+static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
+static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
+static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
+// Mode 3, Very aggressive.
+static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
+static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
+static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
+static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
+
+// Calculates the weighted average w.r.t. number of Gaussians. The |data| are
+// updated with an |offset| before averaging.
+//
+// - data [i/o] : Data to average.
+// - offset [i] : An offset added to |data|.
+// - weights [i] : Weights used for averaging.
+//
+// returns : The weighted average.
+static int32_t WeightedAverage(int16_t* data, int16_t offset,
+ const int16_t* weights) {
+ int k;
+ int32_t weighted_average = 0;
+
+ for (k = 0; k < kNumGaussians; k++) {
+ data[k * kNumChannels] += offset;
+ weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
+ }
+ return weighted_average;
+}
+
+// An s16 x s32 -> s32 multiplication that's allowed to overflow. (It's still
+// undefined behavior, so not a good idea; this just makes UBSan ignore the
+// violation, so that our old code can continue to do what it's always been
+// doing.)
+static inline int32_t RTC_NO_SANITIZE("signed-integer-overflow")
+ OverflowingMulS16ByS32ToS32(int16_t a, int32_t b) {
+ return a * b;
+}
+
+// Calculates the probabilities for both speech and background noise using
+// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
+// type of signal is most probable.
+//
+// - self [i/o] : Pointer to VAD instance
+// - features [i] : Feature vector of length |kNumChannels|
+// = log10(energy in frequency band)
+// - total_power [i] : Total power in audio frame.
+// - frame_length [i] : Number of input samples
+//
+// - returns : the VAD decision (0 - noise, 1 - speech).
+static int16_t GmmProbability(VadInstT* self, int16_t* features,
+ int16_t total_power, size_t frame_length) {
+ int channel, k;
+ int16_t feature_minimum;
+ int16_t h0, h1;
+ int16_t log_likelihood_ratio;
+ int16_t vadflag = 0;
+ int16_t shifts_h0, shifts_h1;
+ int16_t tmp_s16, tmp1_s16, tmp2_s16;
+ int16_t diff;
+ int gaussian;
+ int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
+ int16_t delt, ndelt;
+ int16_t maxspe, maxmu;
+ int16_t deltaN[kTableSize], deltaS[kTableSize];
+ int16_t ngprvec[kTableSize] = { 0 }; // Conditional probability = 0.
+ int16_t sgprvec[kTableSize] = { 0 }; // Conditional probability = 0.
+ int32_t h0_test, h1_test;
+ int32_t tmp1_s32, tmp2_s32;
+ int32_t sum_log_likelihood_ratios = 0;
+ int32_t noise_global_mean, speech_global_mean;
+ int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
+ int16_t overhead1, overhead2, individualTest, totalTest;
+
+ // Set various thresholds based on frame lengths (80, 160 or 240 samples).
+ if (frame_length == 80) {
+ overhead1 = self->over_hang_max_1[0];
+ overhead2 = self->over_hang_max_2[0];
+ individualTest = self->individual[0];
+ totalTest = self->total[0];
+ } else if (frame_length == 160) {
+ overhead1 = self->over_hang_max_1[1];
+ overhead2 = self->over_hang_max_2[1];
+ individualTest = self->individual[1];
+ totalTest = self->total[1];
+ } else {
+ overhead1 = self->over_hang_max_1[2];
+ overhead2 = self->over_hang_max_2[2];
+ individualTest = self->individual[2];
+ totalTest = self->total[2];
+ }
+
+ if (total_power > kMinEnergy) {
+ // The signal power of current frame is large enough for processing. The
+ // processing consists of two parts:
+ // 1) Calculating the likelihood of speech and thereby a VAD decision.
+ // 2) Updating the underlying model, w.r.t., the decision made.
+
+ // The detection scheme is an LRT with hypothesis
+ // H0: Noise
+ // H1: Speech
+ //
+ // We combine a global LRT with local tests, for each frequency sub-band,
+ // here defined as |channel|.
+ for (channel = 0; channel < kNumChannels; channel++) {
+ // For each channel we model the probability with a GMM consisting of
+ // |kNumGaussians|, with different means and standard deviations depending
+ // on H0 or H1.
+ h0_test = 0;
+ h1_test = 0;
+ for (k = 0; k < kNumGaussians; k++) {
+ gaussian = channel + k * kNumChannels;
+ // Probability under H0, that is, probability of frame being noise.
+ // Value given in Q27 = Q7 * Q20.
+ tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
+ self->noise_means[gaussian],
+ self->noise_stds[gaussian],
+ &deltaN[gaussian]);
+ noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
+ h0_test += noise_probability[k]; // Q27
+
+ // Probability under H1, that is, probability of frame being speech.
+ // Value given in Q27 = Q7 * Q20.
+ tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
+ self->speech_means[gaussian],
+ self->speech_stds[gaussian],
+ &deltaS[gaussian]);
+ speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
+ h1_test += speech_probability[k]; // Q27
+ }
+
+      // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
+      // Approximation:
+      // log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
+ // = log2(h1_test) - log2(h0_test)
+ // = log2(2^(31-shifts_h1)*(1+b1))
+ // - log2(2^(31-shifts_h0)*(1+b0))
+ // = shifts_h0 - shifts_h1
+ // + log2(1+b1) - log2(1+b0)
+ // ~= shifts_h0 - shifts_h1
+ //
+ // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
+ // Further, b0 and b1 are independent and on the average the two terms
+ // cancel.
+ shifts_h0 = WebRtcSpl_NormW32(h0_test);
+ shifts_h1 = WebRtcSpl_NormW32(h1_test);
+ if (h0_test == 0) {
+ shifts_h0 = 31;
+ }
+ if (h1_test == 0) {
+ shifts_h1 = 31;
+ }
+ log_likelihood_ratio = shifts_h0 - shifts_h1;
+
+ // Update |sum_log_likelihood_ratios| with spectrum weighting. This is
+ // used for the global VAD decision.
+ sum_log_likelihood_ratios +=
+ (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);
+
+ // Local VAD decision.
+ if ((log_likelihood_ratio * 4) > individualTest) {
+ vadflag = 1;
+ }
+
+ // TODO(bjornv): The conditional probabilities below are applied on the
+ // hard coded number of Gaussians set to two. Find a way to generalize.
+ // Calculate local noise probabilities used later when updating the GMM.
+ h0 = (int16_t) (h0_test >> 12); // Q15
+ if (h0 > 0) {
+ // High probability of noise. Assign conditional probabilities for each
+ // Gaussian in the GMM.
+ tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2; // Q29
+ ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0); // Q14
+ ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
+ } else {
+ // Low noise probability. Assign conditional probability 1 to the first
+ // Gaussian and 0 to the rest (which is already set at initialization).
+ ngprvec[channel] = 16384;
+ }
+
+ // Calculate local speech probabilities used later when updating the GMM.
+ h1 = (int16_t) (h1_test >> 12); // Q15
+ if (h1 > 0) {
+ // High probability of speech. Assign conditional probabilities for each
+ // Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
+ tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2; // Q29
+ sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1); // Q14
+ sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
+ }
+ }
+
+ // Make a global VAD decision.
+ vadflag |= (sum_log_likelihood_ratios >= totalTest);
+
+ // Update the model parameters.
+ maxspe = 12800;
+ for (channel = 0; channel < kNumChannels; channel++) {
+
+ // Get minimum value in past which is used for long term correction in Q4.
+ feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);
+
+ // Compute the "global" mean, that is the sum of the two means weighted.
+ noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
+ &kNoiseDataWeights[channel]);
+ tmp1_s16 = (int16_t) (noise_global_mean >> 6); // Q8
+
+ for (k = 0; k < kNumGaussians; k++) {
+ gaussian = channel + k * kNumChannels;
+
+ nmk = self->noise_means[gaussian];
+ smk = self->speech_means[gaussian];
+ nsk = self->noise_stds[gaussian];
+ ssk = self->speech_stds[gaussian];
+
+ // Update noise mean vector if the frame consists of noise only.
+ nmk2 = nmk;
+ if (!vadflag) {
+ // deltaN = (x-mu)/sigma^2
+ // ngprvec[k] = |noise_probability[k]| /
+ // (|noise_probability[0]| + |noise_probability[1]|)
+
+ // (Q14 * Q11 >> 11) = Q14.
+ delt = (int16_t)((ngprvec[gaussian] * deltaN[gaussian]) >> 11);
+ // Q7 + (Q14 * Q15 >> 22) = Q7.
+ nmk2 = nmk + (int16_t)((delt * kNoiseUpdateConst) >> 22);
+ }
+
+ // Long term correction of the noise mean.
+ // Q8 - Q8 = Q8.
+ ndelt = (feature_minimum << 4) - tmp1_s16;
+ // Q7 + (Q8 * Q8) >> 9 = Q7.
+ nmk3 = nmk2 + (int16_t)((ndelt * kBackEta) >> 9);
+
+          // Control that the noise mean does not drift too much.
+ tmp_s16 = (int16_t) ((k + 5) << 7);
+ if (nmk3 < tmp_s16) {
+ nmk3 = tmp_s16;
+ }
+ tmp_s16 = (int16_t) ((72 + k - channel) << 7);
+ if (nmk3 > tmp_s16) {
+ nmk3 = tmp_s16;
+ }
+ self->noise_means[gaussian] = nmk3;
+
+ if (vadflag) {
+ // Update speech mean vector:
+ // |deltaS| = (x-mu)/sigma^2
+ // sgprvec[k] = |speech_probability[k]| /
+ // (|speech_probability[0]| + |speech_probability[1]|)
+
+ // (Q14 * Q11) >> 11 = Q14.
+ delt = (int16_t)((sgprvec[gaussian] * deltaS[gaussian]) >> 11);
+ // Q14 * Q15 >> 21 = Q8.
+ tmp_s16 = (int16_t)((delt * kSpeechUpdateConst) >> 21);
+ // Q7 + (Q8 >> 1) = Q7. With rounding.
+ smk2 = smk + ((tmp_s16 + 1) >> 1);
+
+          // Control that the speech mean does not drift too much.
+ maxmu = maxspe + 640;
+ if (smk2 < kMinimumMean[k]) {
+ smk2 = kMinimumMean[k];
+ }
+ if (smk2 > maxmu) {
+ smk2 = maxmu;
+ }
+ self->speech_means[gaussian] = smk2; // Q7.
+
+ // (Q7 >> 3) = Q4. With rounding.
+ tmp_s16 = ((smk + 4) >> 3);
+
+ tmp_s16 = features[channel] - tmp_s16; // Q4
+ // (Q11 * Q4 >> 3) = Q12.
+ tmp1_s32 = (deltaS[gaussian] * tmp_s16) >> 3;
+ tmp2_s32 = tmp1_s32 - 4096;
+ tmp_s16 = sgprvec[gaussian] >> 2;
+ // (Q14 >> 2) * Q12 = Q24.
+ tmp1_s32 = tmp_s16 * tmp2_s32;
+
+ tmp2_s32 = tmp1_s32 >> 4; // Q20
+
+ // 0.1 * Q20 / Q7 = Q13.
+ if (tmp2_s32 > 0) {
+ tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
+ } else {
+ tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
+ tmp_s16 = -tmp_s16;
+ }
+ // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
+ // Note that division by 4 equals shift by 2, hence,
+ // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
+ tmp_s16 += 128; // Rounding.
+ ssk += (tmp_s16 >> 8);
+ if (ssk < kMinStd) {
+ ssk = kMinStd;
+ }
+ self->speech_stds[gaussian] = ssk;
+ } else {
+ // Update GMM variance vectors.
+ // deltaN * (features[channel] - nmk) - 1
+ // Q4 - (Q7 >> 3) = Q4.
+ tmp_s16 = features[channel] - (nmk >> 3);
+ // (Q11 * Q4 >> 3) = Q12.
+ tmp1_s32 = (deltaN[gaussian] * tmp_s16) >> 3;
+ tmp1_s32 -= 4096;
+
+ // (Q14 >> 2) * Q12 = Q24.
+ tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
+ tmp2_s32 = OverflowingMulS16ByS32ToS32(tmp_s16, tmp1_s32);
+ // Q20 * approx 0.001 (2^-10=0.0009766), hence,
+ // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
+ tmp1_s32 = tmp2_s32 >> 14;
+
+ // Q20 / Q7 = Q13.
+ if (tmp1_s32 > 0) {
+ tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
+ } else {
+ tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
+ tmp_s16 = -tmp_s16;
+ }
+ tmp_s16 += 32; // Rounding
+ nsk += tmp_s16 >> 6; // Q13 >> 6 = Q7.
+ if (nsk < kMinStd) {
+ nsk = kMinStd;
+ }
+ self->noise_stds[gaussian] = nsk;
+ }
+ }
+
+ // Separate models if they are too close.
+ // |noise_global_mean| in Q14 (= Q7 * Q7).
+ noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
+ &kNoiseDataWeights[channel]);
+
+ // |speech_global_mean| in Q14 (= Q7 * Q7).
+ speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
+ &kSpeechDataWeights[channel]);
+
+ // |diff| = "global" speech mean - "global" noise mean.
+ // (Q14 >> 9) - (Q14 >> 9) = Q5.
+ diff = (int16_t) (speech_global_mean >> 9) -
+ (int16_t) (noise_global_mean >> 9);
+ if (diff < kMinimumDifference[channel]) {
+ tmp_s16 = kMinimumDifference[channel] - diff;
+
+ // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
+ // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
+ tmp1_s16 = (int16_t)((13 * tmp_s16) >> 2);
+ tmp2_s16 = (int16_t)((3 * tmp_s16) >> 2);
+
+ // Move Gaussian means for speech model by |tmp1_s16| and update
+ // |speech_global_mean|. Note that |self->speech_means[channel]| is
+ // changed after the call.
+ speech_global_mean = WeightedAverage(&self->speech_means[channel],
+ tmp1_s16,
+ &kSpeechDataWeights[channel]);
+
+ // Move Gaussian means for noise model by -|tmp2_s16| and update
+ // |noise_global_mean|. Note that |self->noise_means[channel]| is
+ // changed after the call.
+ noise_global_mean = WeightedAverage(&self->noise_means[channel],
+ -tmp2_s16,
+ &kNoiseDataWeights[channel]);
+ }
+
+      // Control that the speech & noise means do not drift too much.
+ maxspe = kMaximumSpeech[channel];
+ tmp2_s16 = (int16_t) (speech_global_mean >> 7);
+ if (tmp2_s16 > maxspe) {
+ // Upper limit of speech model.
+ tmp2_s16 -= maxspe;
+
+ for (k = 0; k < kNumGaussians; k++) {
+ self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
+ }
+ }
+
+ tmp2_s16 = (int16_t) (noise_global_mean >> 7);
+ if (tmp2_s16 > kMaximumNoise[channel]) {
+ tmp2_s16 -= kMaximumNoise[channel];
+
+ for (k = 0; k < kNumGaussians; k++) {
+ self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
+ }
+ }
+ }
+ self->frame_counter++;
+ }
+
+ // Smooth with respect to transition hysteresis.
+ if (!vadflag) {
+ if (self->over_hang > 0) {
+ vadflag = 2 + self->over_hang;
+ self->over_hang--;
+ }
+ self->num_of_speech = 0;
+ } else {
+ self->num_of_speech++;
+ if (self->num_of_speech > kMaxSpeechFrames) {
+ self->num_of_speech = kMaxSpeechFrames;
+ self->over_hang = overhead2;
+ } else {
+ self->over_hang = overhead1;
+ }
+ }
+ return vadflag;
+}
+
+// Initialize the VAD. Set aggressiveness mode to default value.
+int WebRtcVad_InitCore(VadInstT* self) {
+ int i;
+
+ if (self == NULL) {
+ return -1;
+ }
+
+ // Initialization of general struct variables.
+ self->vad = 1; // Speech active (=1).
+ self->frame_counter = 0;
+ self->over_hang = 0;
+ self->num_of_speech = 0;
+
+ // Initialization of downsampling filter state.
+ memset(self->downsampling_filter_states, 0,
+ sizeof(self->downsampling_filter_states));
+
+ // Initialization of 48 to 8 kHz downsampling.
+ WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);
+
+ // Read initial PDF parameters.
+ for (i = 0; i < kTableSize; i++) {
+ self->noise_means[i] = kNoiseDataMeans[i];
+ self->speech_means[i] = kSpeechDataMeans[i];
+ self->noise_stds[i] = kNoiseDataStds[i];
+ self->speech_stds[i] = kSpeechDataStds[i];
+ }
+
+ // Initialize Index and Minimum value vectors.
+ for (i = 0; i < 16 * kNumChannels; i++) {
+ self->low_value_vector[i] = 10000;
+ self->index_vector[i] = 0;
+ }
+
+ // Initialize splitting filter states.
+ memset(self->upper_state, 0, sizeof(self->upper_state));
+ memset(self->lower_state, 0, sizeof(self->lower_state));
+
+ // Initialize high pass filter states.
+ memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));
+
+ // Initialize mean value memory, for WebRtcVad_FindMinimum().
+ for (i = 0; i < kNumChannels; i++) {
+ self->mean_value[i] = 1600;
+ }
+
+ // Set aggressiveness mode to default (=|kDefaultMode|).
+ if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
+ return -1;
+ }
+
+ self->init_flag = kInitCheck;
+
+ return 0;
+}
+
+// Set aggressiveness mode
+int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
+ int return_value = 0;
+
+ switch (mode) {
+ case 0:
+ // Quality mode.
+ memcpy(self->over_hang_max_1, kOverHangMax1Q,
+ sizeof(self->over_hang_max_1));
+ memcpy(self->over_hang_max_2, kOverHangMax2Q,
+ sizeof(self->over_hang_max_2));
+ memcpy(self->individual, kLocalThresholdQ,
+ sizeof(self->individual));
+ memcpy(self->total, kGlobalThresholdQ,
+ sizeof(self->total));
+ break;
+ case 1:
+ // Low bitrate mode.
+ memcpy(self->over_hang_max_1, kOverHangMax1LBR,
+ sizeof(self->over_hang_max_1));
+ memcpy(self->over_hang_max_2, kOverHangMax2LBR,
+ sizeof(self->over_hang_max_2));
+ memcpy(self->individual, kLocalThresholdLBR,
+ sizeof(self->individual));
+ memcpy(self->total, kGlobalThresholdLBR,
+ sizeof(self->total));
+ break;
+ case 2:
+ // Aggressive mode.
+ memcpy(self->over_hang_max_1, kOverHangMax1AGG,
+ sizeof(self->over_hang_max_1));
+ memcpy(self->over_hang_max_2, kOverHangMax2AGG,
+ sizeof(self->over_hang_max_2));
+ memcpy(self->individual, kLocalThresholdAGG,
+ sizeof(self->individual));
+ memcpy(self->total, kGlobalThresholdAGG,
+ sizeof(self->total));
+ break;
+ case 3:
+ // Very aggressive mode.
+ memcpy(self->over_hang_max_1, kOverHangMax1VAG,
+ sizeof(self->over_hang_max_1));
+ memcpy(self->over_hang_max_2, kOverHangMax2VAG,
+ sizeof(self->over_hang_max_2));
+ memcpy(self->individual, kLocalThresholdVAG,
+ sizeof(self->individual));
+ memcpy(self->total, kGlobalThresholdVAG,
+ sizeof(self->total));
+ break;
+ default:
+ return_value = -1;
+ break;
+ }
+
+ return return_value;
+}
+
+// Calculate VAD decision by first extracting feature values and then calculate
+// probability for both speech and background noise.
+
+int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
+ size_t frame_length) {
+ int vad;
+ size_t i;
+ int16_t speech_nb[240]; // 30 ms in 8 kHz.
+ // |tmp_mem| is a temporary memory used by resample function, length is
+ // frame length in 10 ms (480 samples) + 256 extra.
+ int32_t tmp_mem[480 + 256] = { 0 };
+ const size_t kFrameLen10ms48khz = 480;
+ const size_t kFrameLen10ms8khz = 80;
+ size_t num_10ms_frames = frame_length / kFrameLen10ms48khz;
+
+ for (i = 0; i < num_10ms_frames; i++) {
+ WebRtcSpl_Resample48khzTo8khz(speech_frame,
+ &speech_nb[i * kFrameLen10ms8khz],
+ &inst->state_48_to_8,
+ tmp_mem);
+ }
+
+ // Do VAD on an 8 kHz signal
+ vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);
+
+ return vad;
+}
+
+int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
+ size_t frame_length)
+{
+ size_t len;
+ int vad;
+ int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB)
+ int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
+
+
+ // Downsample signal 32->16->8 before doing VAD
+ WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
+ frame_length);
+ len = frame_length / 2;
+
+ WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
+ len /= 2;
+
+ // Do VAD on an 8 kHz signal
+ vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
+
+ return vad;
+}
+
+int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
+ size_t frame_length)
+{
+ size_t len;
+ int vad;
+ int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
+
+ // Wideband: Downsample signal before doing VAD
+ WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states,
+ frame_length);
+
+ len = frame_length / 2;
+ vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
+
+ return vad;
+}
+
+int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
+ size_t frame_length)
+{
+ int16_t feature_vector[kNumChannels], total_power;
+
+ // Get power in the bands
+ total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
+ feature_vector);
+
+ // Make a VAD
+ inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);
+
+ return inst->vad;
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_core.h b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_core.h
new file mode 100644
index 000000000..8f0cfc063
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_core.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * This header file includes the descriptions of the core VAD calls.
+ */
+
+#ifndef COMMON_AUDIO_VAD_VAD_CORE_H_
+#define COMMON_AUDIO_VAD_VAD_CORE_H_
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+enum { kNumChannels = 6 }; // Number of frequency bands (named channels).
+enum { kNumGaussians = 2 }; // Number of Gaussians per channel in the GMM.
+enum { kTableSize = kNumChannels * kNumGaussians };
+enum { kMinEnergy = 10 }; // Minimum energy required to trigger audio signal.
+
+typedef struct VadInstT_ {
+ int vad;
+ int32_t downsampling_filter_states[4];
+ WebRtcSpl_State48khzTo8khz state_48_to_8;
+ int16_t noise_means[kTableSize];
+ int16_t speech_means[kTableSize];
+ int16_t noise_stds[kTableSize];
+ int16_t speech_stds[kTableSize];
+ // TODO(bjornv): Change to |frame_count|.
+ int32_t frame_counter;
+ int16_t over_hang; // Over Hang
+ int16_t num_of_speech;
+ // TODO(bjornv): Change to |age_vector|.
+ int16_t index_vector[16 * kNumChannels];
+ int16_t low_value_vector[16 * kNumChannels];
+ // TODO(bjornv): Change to |median|.
+ int16_t mean_value[kNumChannels];
+ int16_t upper_state[5];
+ int16_t lower_state[5];
+ int16_t hp_filter_state[4];
+ int16_t over_hang_max_1[3];
+ int16_t over_hang_max_2[3];
+ int16_t individual[3];
+ int16_t total[3];
+
+ int init_flag;
+} VadInstT;
+
+// Initializes the core VAD component. The default aggressiveness mode is
+// controlled by |kDefaultMode| in vad_core.c.
+//
+// - self [i/o] : Instance that should be initialized
+//
+// returns : 0 (OK), -1 (null pointer in or if the default mode can't be
+// set)
+int WebRtcVad_InitCore(VadInstT* self);
+
+/****************************************************************************
+ * WebRtcVad_set_mode_core(...)
+ *
+ * This function changes the VAD settings
+ *
+ * Input:
+ * - inst : VAD instance
+ * - mode : Aggressiveness degree
+ * 0 (High quality) - 3 (Highly aggressive)
+ *
+ * Output:
+ * - inst : Changed instance
+ *
+ * Return value : 0 - Ok
+ * -1 - Error
+ */
+
+int WebRtcVad_set_mode_core(VadInstT* self, int mode);
+
+/****************************************************************************
+ * WebRtcVad_CalcVad48khz(...)
+ * WebRtcVad_CalcVad32khz(...)
+ * WebRtcVad_CalcVad16khz(...)
+ * WebRtcVad_CalcVad8khz(...)
+ *
+ * Calculate probability for active speech and make VAD decision.
+ *
+ * Input:
+ * - inst : Instance that should be initialized
+ * - speech_frame : Input speech frame
+ * - frame_length : Number of input samples
+ *
+ * Output:
+ * - inst : Updated filter states etc.
+ *
+ * Return value : VAD decision
+ * 0 - No active speech
+ * 1-6 - Active speech
+ */
+int WebRtcVad_CalcVad48khz(VadInstT* inst,
+ const int16_t* speech_frame,
+ size_t frame_length);
+int WebRtcVad_CalcVad32khz(VadInstT* inst,
+ const int16_t* speech_frame,
+ size_t frame_length);
+int WebRtcVad_CalcVad16khz(VadInstT* inst,
+ const int16_t* speech_frame,
+ size_t frame_length);
+int WebRtcVad_CalcVad8khz(VadInstT* inst,
+ const int16_t* speech_frame,
+ size_t frame_length);
+
+#endif // COMMON_AUDIO_VAD_VAD_CORE_H_
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_filterbank.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_filterbank.c
new file mode 100644
index 000000000..7d25e2abe
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_filterbank.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/common_audio/vad/vad_filterbank.h"
+
+#include "webrtc/rtc_base/checks.h"
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+// Constants used in LogOfEnergy().
+static const int16_t kLogConst = 24660; // 160*log10(2) in Q9.
+static const int16_t kLogEnergyIntPart = 14336; // 14 in Q10
+
+// Coefficients used by HighPassFilter, Q14.
+static const int16_t kHpZeroCoefs[3] = { 6631, -13262, 6631 };
+static const int16_t kHpPoleCoefs[3] = { 16384, -7756, 5620 };
+
+// Allpass filter coefficients, upper and lower, in Q15.
+// Upper: 0.64, Lower: 0.17
+static const int16_t kAllPassCoefsQ15[2] = { 20972, 5571 };
+
+// Adjustment for division with two in SplitFilter.
+static const int16_t kOffsetVector[6] = { 368, 368, 272, 176, 176, 176 };
+
+// High pass filtering, with a cut-off frequency at 80 Hz, if the |data_in| is
+// sampled at 500 Hz.
+//
+// - data_in [i] : Input audio data sampled at 500 Hz.
+// - data_length [i] : Length of input and output data.
+// - filter_state [i/o] : State of the filter.
+// - data_out [o] : Output audio data in the frequency interval
+// 80 - 250 Hz.
+static void HighPassFilter(const int16_t* data_in, size_t data_length,
+ int16_t* filter_state, int16_t* data_out) {
+ size_t i;
+ const int16_t* in_ptr = data_in;
+ int16_t* out_ptr = data_out;
+ int32_t tmp32 = 0;
+
+
+ // The sum of the absolute values of the impulse response:
+ // The zero/pole-filter has a max amplification of a single sample of: 1.4546
+ // Impulse response: 0.4047 -0.6179 -0.0266 0.1993 0.1035 -0.0194
+ // The all-zero section has a max amplification of a single sample of: 1.6189
+ // Impulse response: 0.4047 -0.8094 0.4047 0 0 0
+ // The all-pole section has a max amplification of a single sample of: 1.9931
+ // Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532
+
+ for (i = 0; i < data_length; i++) {
+ // All-zero section (filter coefficients in Q14).
+ tmp32 = kHpZeroCoefs[0] * *in_ptr;
+ tmp32 += kHpZeroCoefs[1] * filter_state[0];
+ tmp32 += kHpZeroCoefs[2] * filter_state[1];
+ filter_state[1] = filter_state[0];
+ filter_state[0] = *in_ptr++;
+
+ // All-pole section (filter coefficients in Q14).
+ tmp32 -= kHpPoleCoefs[1] * filter_state[2];
+ tmp32 -= kHpPoleCoefs[2] * filter_state[3];
+ filter_state[3] = filter_state[2];
+ filter_state[2] = (int16_t) (tmp32 >> 14);
+ *out_ptr++ = filter_state[2];
+ }
+}
+
+// All pass filtering of |data_in|, used before splitting the signal into two
+// frequency bands (low pass vs high pass).
+// Note that |data_in| and |data_out| can NOT correspond to the same address.
+//
+// - data_in [i] : Input audio signal given in Q0.
+// - data_length [i] : Length of input and output data.
+// - filter_coefficient [i] : Given in Q15.
+// - filter_state [i/o] : State of the filter given in Q(-1).
+// - data_out [o] : Output audio signal given in Q(-1).
+static void AllPassFilter(const int16_t* data_in, size_t data_length,
+ int16_t filter_coefficient, int16_t* filter_state,
+ int16_t* data_out) {
+ // The filter can only cause overflow (in the w16 output variable)
+ // if more than 4 consecutive input numbers are of maximum value and
+ // has the the same sign as the impulse responses first taps.
+ // First 6 taps of the impulse response:
+ // 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990
+
+ size_t i;
+ int16_t tmp16 = 0;
+ int32_t tmp32 = 0;
+ int32_t state32 = ((int32_t) (*filter_state) * (1 << 16)); // Q15
+
+ for (i = 0; i < data_length; i++) {
+ tmp32 = state32 + filter_coefficient * *data_in;
+ tmp16 = (int16_t) (tmp32 >> 16); // Q(-1)
+ *data_out++ = tmp16;
+ state32 = (*data_in * (1 << 14)) - filter_coefficient * tmp16; // Q14
+ state32 *= 2; // Q15.
+ data_in += 2;
+ }
+
+ *filter_state = (int16_t) (state32 >> 16); // Q(-1)
+}
+
+// Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to
+// an upper (high pass) part and a lower (low pass) part respectively.
+//
+// - data_in [i] : Input audio data to be split into two frequency bands.
+// - data_length [i] : Length of |data_in|.
+// - upper_state [i/o] : State of the upper filter, given in Q(-1).
+// - lower_state [i/o] : State of the lower filter, given in Q(-1).
+// - hp_data_out [o] : Output audio data of the upper half of the spectrum.
+// The length is |data_length| / 2.
+// - lp_data_out [o] : Output audio data of the lower half of the spectrum.
+// The length is |data_length| / 2.
+static void SplitFilter(const int16_t* data_in, size_t data_length,
+ int16_t* upper_state, int16_t* lower_state,
+ int16_t* hp_data_out, int16_t* lp_data_out) {
+ size_t i;
+ size_t half_length = data_length >> 1; // Downsampling by 2.
+ int16_t tmp_out;
+
+ // All-pass filtering upper branch.
+ AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state,
+ hp_data_out);
+
+ // All-pass filtering lower branch.
+ AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state,
+ lp_data_out);
+
+ // Make LP and HP signals.
+ for (i = 0; i < half_length; i++) {
+ tmp_out = *hp_data_out;
+ *hp_data_out++ -= *lp_data_out;
+ *lp_data_out++ += tmp_out;
+ }
+}
+
+// Calculates the energy of |data_in| in dB, and also updates an overall
+// |total_energy| if necessary.
+//
+// - data_in [i] : Input audio data for energy calculation.
+// - data_length [i] : Length of input data.
+// - offset [i] : Offset value added to |log_energy|.
+// - total_energy [i/o] : An external energy updated with the energy of
+// |data_in|.
+// NOTE: |total_energy| is only updated if
+// |total_energy| <= |kMinEnergy|.
+// - log_energy [o] : 10 * log10("energy of |data_in|") given in Q4.
+static void LogOfEnergy(const int16_t* data_in, size_t data_length,
+ int16_t offset, int16_t* total_energy,
+ int16_t* log_energy) {
+ // |tot_rshifts| accumulates the number of right shifts performed on |energy|.
+ int tot_rshifts = 0;
+ // The |energy| will be normalized to 15 bits. We use unsigned integer because
+ // we eventually will mask out the fractional part.
+ uint32_t energy = 0;
+
+ RTC_DCHECK(data_in);
+ RTC_DCHECK_GT(data_length, 0);
+
+ energy = (uint32_t) WebRtcSpl_Energy((int16_t*) data_in, data_length,
+ &tot_rshifts);
+
+ if (energy != 0) {
+ // By construction, normalizing to 15 bits is equivalent with 17 leading
+ // zeros of an unsigned 32 bit value.
+ int normalizing_rshifts = 17 - WebRtcSpl_NormU32(energy);
+ // In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
+ // (14 << 10), which is what we initialize |log2_energy| with. For a more
+ // detailed derivations, see below.
+ int16_t log2_energy = kLogEnergyIntPart;
+
+ tot_rshifts += normalizing_rshifts;
+ // Normalize |energy| to 15 bits.
+ // |tot_rshifts| is now the total number of right shifts performed on
+ // |energy| after normalization. This means that |energy| is in
+ // Q(-tot_rshifts).
+ if (normalizing_rshifts < 0) {
+ energy <<= -normalizing_rshifts;
+ } else {
+ energy >>= normalizing_rshifts;
+ }
+
+ // Calculate the energy of |data_in| in dB, in Q4.
+ //
+ // 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
+ // 160 * log10(|energy| * 2^|tot_rshifts|) =
+ // 160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) =
+ // 160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) =
+ // (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) =
+ // |kLogConst| * (|log2_energy| + |tot_rshifts|)
+ //
+ // We know by construction that |energy| is normalized to 15 bits. Hence,
+ // |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
+ // Further, we'd like |log2_energy| in Q10
+ // log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
+ // 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
+ // 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
+ // (14 << 10) + 2^10 * (frac_Q15 * 2^-14) =
+ // (14 << 10) + (frac_Q15 * 2^-4) = (14 << 10) + (frac_Q15 >> 4)
+ //
+ // Note that frac_Q15 = (|energy| & 0x00003FFF)
+
+ // Calculate and add the fractional part to |log2_energy|.
+ log2_energy += (int16_t) ((energy & 0x00003FFF) >> 4);
+
+ // |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0.
+ // Note that we in our derivation above have accounted for an output in Q4.
+ *log_energy = (int16_t)(((kLogConst * log2_energy) >> 19) +
+ ((tot_rshifts * kLogConst) >> 9));
+
+ if (*log_energy < 0) {
+ *log_energy = 0;
+ }
+ } else {
+ *log_energy = offset;
+ return;
+ }
+
+ *log_energy += offset;
+
+ // Update the approximate |total_energy| with the energy of |data_in|, if
+ // |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as an
+ // energy indicator in WebRtcVad_GmmProbability() in vad_core.c.
+ if (*total_energy <= kMinEnergy) {
+ if (tot_rshifts >= 0) {
+ // We know by construction that the |energy| > |kMinEnergy| in Q0, so add
+ // an arbitrary value such that |total_energy| exceeds |kMinEnergy|.
+ *total_energy += kMinEnergy + 1;
+ } else {
+ // By construction |energy| is represented by 15 bits, hence any number of
+ // right shifted |energy| will fit in an int16_t. In addition, adding the
+ // value to |total_energy| is wrap around safe as long as
+ // |kMinEnergy| < 8192.
+ *total_energy += (int16_t) (energy >> -tot_rshifts); // Q0.
+ }
+ }
+}
+
+int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
+ size_t data_length, int16_t* features) {
+ int16_t total_energy = 0;
+ // We expect |data_length| to be 80, 160 or 240 samples, which corresponds to
+ // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will
+ // have at most 120 samples after the first split and at most 60 samples after
+ // the second split.
+ int16_t hp_120[120], lp_120[120];
+ int16_t hp_60[60], lp_60[60];
+ const size_t half_data_length = data_length >> 1;
+ size_t length = half_data_length; // |data_length| / 2, corresponds to
+ // bandwidth = 2000 Hz after downsampling.
+
+ // Initialize variables for the first SplitFilter().
+ int frequency_band = 0;
+ const int16_t* in_ptr = data_in; // [0 - 4000] Hz.
+ int16_t* hp_out_ptr = hp_120; // [2000 - 4000] Hz.
+ int16_t* lp_out_ptr = lp_120; // [0 - 2000] Hz.
+
+ RTC_DCHECK_LE(data_length, 240);
+ RTC_DCHECK_LT(4, kNumChannels - 1); // Checking maximum |frequency_band|.
+
+ // Split at 2000 Hz and downsample.
+ SplitFilter(in_ptr, data_length, &self->upper_state[frequency_band],
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
+
+ // For the upper band (2000 Hz - 4000 Hz) split at 3000 Hz and downsample.
+ frequency_band = 1;
+ in_ptr = hp_120; // [2000 - 4000] Hz.
+ hp_out_ptr = hp_60; // [3000 - 4000] Hz.
+ lp_out_ptr = lp_60; // [2000 - 3000] Hz.
+ SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
+
+ // Energy in 3000 Hz - 4000 Hz.
+ length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz.
+
+ LogOfEnergy(hp_60, length, kOffsetVector[5], &total_energy, &features[5]);
+
+ // Energy in 2000 Hz - 3000 Hz.
+ LogOfEnergy(lp_60, length, kOffsetVector[4], &total_energy, &features[4]);
+
+ // For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample.
+ frequency_band = 2;
+ in_ptr = lp_120; // [0 - 2000] Hz.
+ hp_out_ptr = hp_60; // [1000 - 2000] Hz.
+ lp_out_ptr = lp_60; // [0 - 1000] Hz.
+ length = half_data_length; // |data_length| / 2 <=> bandwidth = 2000 Hz.
+ SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
+
+ // Energy in 1000 Hz - 2000 Hz.
+ length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz.
+ LogOfEnergy(hp_60, length, kOffsetVector[3], &total_energy, &features[3]);
+
+ // For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample.
+ frequency_band = 3;
+ in_ptr = lp_60; // [0 - 1000] Hz.
+ hp_out_ptr = hp_120; // [500 - 1000] Hz.
+ lp_out_ptr = lp_120; // [0 - 500] Hz.
+ SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
+
+ // Energy in 500 Hz - 1000 Hz.
+ length >>= 1; // |data_length| / 8 <=> bandwidth = 500 Hz.
+ LogOfEnergy(hp_120, length, kOffsetVector[2], &total_energy, &features[2]);
+
+ // For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample.
+ frequency_band = 4;
+ in_ptr = lp_120; // [0 - 500] Hz.
+ hp_out_ptr = hp_60; // [250 - 500] Hz.
+ lp_out_ptr = lp_60; // [0 - 250] Hz.
+ SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
+
+ // Energy in 250 Hz - 500 Hz.
+ length >>= 1; // |data_length| / 16 <=> bandwidth = 250 Hz.
+ LogOfEnergy(hp_60, length, kOffsetVector[1], &total_energy, &features[1]);
+
+ // Remove 0 Hz - 80 Hz, by high pass filtering the lower band.
+ HighPassFilter(lp_60, length, self->hp_filter_state, hp_120);
+
+ // Energy in 80 Hz - 250 Hz.
+ LogOfEnergy(hp_120, length, kOffsetVector[0], &total_energy, &features[0]);
+
+ return total_energy;
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_filterbank.h b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_filterbank.h
new file mode 100644
index 000000000..b40050519
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_filterbank.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * This file includes feature calculating functionality used in vad_core.c.
+ */
+
+#ifndef COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
+#define COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
+
+#include "webrtc/common_audio/vad/vad_core.h"
+
+// Takes |data_length| samples of |data_in| and calculates the logarithm of the
+// energy of each of the |kNumChannels| = 6 frequency bands used by the VAD:
+// 80 Hz - 250 Hz
+// 250 Hz - 500 Hz
+// 500 Hz - 1000 Hz
+// 1000 Hz - 2000 Hz
+// 2000 Hz - 3000 Hz
+// 3000 Hz - 4000 Hz
+//
+// The values are given in Q4 and written to |features|. Further, an approximate
+// overall energy is returned. The return value is used in
+// WebRtcVad_GmmProbability() as a signal indicator, hence it is arbitrary above
+// the threshold |kMinEnergy|.
+//
+// - self [i/o] : State information of the VAD.
+// - data_in [i] : Input audio data, for feature extraction.
+// - data_length [i] : Audio data size, in number of samples.
+// - features [o] : 10 * log10(energy in each frequency band), Q4.
+// - returns : Total energy of the signal (NOTE! This value is not
+// exact. It is only used in a comparison.)
+int16_t WebRtcVad_CalculateFeatures(VadInstT* self,
+ const int16_t* data_in,
+ size_t data_length,
+ int16_t* features);
+
+#endif // COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_gmm.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_gmm.c
new file mode 100644
index 000000000..176270cc5
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_gmm.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/common_audio/vad/vad_gmm.h"
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+static const int32_t kCompVar = 22005;
+static const int16_t kLog2Exp = 5909; // log2(exp(1)) in Q12.
+
+// For a normal distribution, the probability of |input| is calculated and
+// returned (in Q20). The formula for normal distributed probability is
+//
+// 1 / s * exp(-(x - m)^2 / (2 * s^2))
+//
+// where the parameters are given in the following Q domains:
+// m = |mean| (Q7)
+// s = |std| (Q7)
+// x = |input| (Q4)
+// in addition to the probability we output |delta| (in Q11) used when updating
+// the noise/speech model.
+int32_t WebRtcVad_GaussianProbability(int16_t input,
+ int16_t mean,
+ int16_t std,
+ int16_t* delta) {
+ int16_t tmp16, inv_std, inv_std2, exp_value = 0;
+ int32_t tmp32;
+
+ // Calculate |inv_std| = 1 / s, in Q10.
+ // 131072 = 1 in Q17, and (|std| >> 1) is for rounding instead of truncation.
+ // Q-domain: Q17 / Q7 = Q10.
+ tmp32 = (int32_t) 131072 + (int32_t) (std >> 1);
+ inv_std = (int16_t) WebRtcSpl_DivW32W16(tmp32, std);
+
+ // Calculate |inv_std2| = 1 / s^2, in Q14.
+ tmp16 = (inv_std >> 2); // Q10 -> Q8.
+ // Q-domain: (Q8 * Q8) >> 2 = Q14.
+ inv_std2 = (int16_t)((tmp16 * tmp16) >> 2);
+ // TODO(bjornv): Investigate if changing to
+ // inv_std2 = (int16_t)((inv_std * inv_std) >> 6);
+ // gives better accuracy.
+
+ tmp16 = (input << 3); // Q4 -> Q7
+ tmp16 = tmp16 - mean; // Q7 - Q7 = Q7
+
+ // To be used later, when updating noise/speech model.
+ // |delta| = (x - m) / s^2, in Q11.
+ // Q-domain: (Q14 * Q7) >> 10 = Q11.
+ *delta = (int16_t)((inv_std2 * tmp16) >> 10);
+
+ // Calculate the exponent |tmp32| = (x - m)^2 / (2 * s^2), in Q10. Replacing
+ // division by two with one shift.
+ // Q-domain: (Q11 * Q7) >> 8 = Q10.
+ tmp32 = (*delta * tmp16) >> 9;
+
+ // If the exponent is small enough to give a non-zero probability we calculate
+ // |exp_value| ~= exp(-(x - m)^2 / (2 * s^2))
+ // ~= exp2(-log2(exp(1)) * |tmp32|).
+ if (tmp32 < kCompVar) {
+ // Calculate |tmp16| = log2(exp(1)) * |tmp32|, in Q10.
+ // Q-domain: (Q12 * Q10) >> 12 = Q10.
+ tmp16 = (int16_t)((kLog2Exp * tmp32) >> 12);
+ tmp16 = -tmp16;
+ exp_value = (0x0400 | (tmp16 & 0x03FF));
+ tmp16 ^= 0xFFFF;
+ tmp16 >>= 10;
+ tmp16 += 1;
+ // Get |exp_value| = exp(-|tmp32|) in Q10.
+ exp_value >>= tmp16;
+ }
+
+ // Calculate and return (1 / s) * exp(-(x - m)^2 / (2 * s^2)), in Q20.
+ // Q-domain: Q10 * Q10 = Q20.
+ return inv_std * exp_value;
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_gmm.h b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_gmm.h
new file mode 100644
index 000000000..6b2d11ba3
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_gmm.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Gaussian probability calculations internally used in vad_core.c.
+
+#ifndef COMMON_AUDIO_VAD_VAD_GMM_H_
+#define COMMON_AUDIO_VAD_VAD_GMM_H_
+
+#include <stdint.h>
+
+// Calculates the probability for |input|, given that |input| comes from a
+// normal distribution with mean and standard deviation (|mean|, |std|).
+//
+// Inputs:
+// - input : input sample in Q4.
+// - mean : mean input in the statistical model, Q7.
+// - std : standard deviation, Q7.
+//
+// Output:
+//
+// - delta : input used when updating the model, Q11.
+// |delta| = (|input| - |mean|) / |std|^2.
+//
+// Return:
+// (probability for |input|) =
+// 1 / |std| * exp(-(|input| - |mean|)^2 / (2 * |std|^2));
+int32_t WebRtcVad_GaussianProbability(int16_t input,
+ int16_t mean,
+ int16_t std,
+ int16_t* delta);
+
+#endif // COMMON_AUDIO_VAD_VAD_GMM_H_
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_sp.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_sp.c
new file mode 100644
index 000000000..97d5d6ce4
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_sp.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/common_audio/vad/vad_sp.h"
+
+#include "webrtc/rtc_base/checks.h"
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/common_audio/vad/vad_core.h"
+
+// Allpass filter coefficients, upper and lower, in Q13.
+// Upper: 0.64, Lower: 0.17.
+static const int16_t kAllPassCoefsQ13[2] = { 5243, 1392 }; // Q13.
+static const int16_t kSmoothingDown = 6553; // 0.2 in Q15.
+static const int16_t kSmoothingUp = 32439; // 0.99 in Q15.
+
+// TODO(bjornv): Move this function to vad_filterbank.c.
+// Downsampling filter based on splitting filter and allpass functions.
+void WebRtcVad_Downsampling(const int16_t* signal_in,
+ int16_t* signal_out,
+ int32_t* filter_state,
+ size_t in_length) {
+ int16_t tmp16_1 = 0, tmp16_2 = 0;
+ int32_t tmp32_1 = filter_state[0];
+ int32_t tmp32_2 = filter_state[1];
+ size_t n = 0;
+ // Downsampling by 2 gives half length.
+ size_t half_length = (in_length >> 1);
+
+ // Filter coefficients in Q13, filter state in Q0.
+ for (n = 0; n < half_length; n++) {
+ // All-pass filtering upper branch.
+ tmp16_1 = (int16_t) ((tmp32_1 >> 1) +
+ ((kAllPassCoefsQ13[0] * *signal_in) >> 14));
+ *signal_out = tmp16_1;
+ tmp32_1 = (int32_t)(*signal_in++) - ((kAllPassCoefsQ13[0] * tmp16_1) >> 12);
+
+ // All-pass filtering lower branch.
+ tmp16_2 = (int16_t) ((tmp32_2 >> 1) +
+ ((kAllPassCoefsQ13[1] * *signal_in) >> 14));
+ *signal_out++ += tmp16_2;
+ tmp32_2 = (int32_t)(*signal_in++) - ((kAllPassCoefsQ13[1] * tmp16_2) >> 12);
+ }
+ // Store the filter states.
+ filter_state[0] = tmp32_1;
+ filter_state[1] = tmp32_2;
+}
+
+// Inserts |feature_value| into |low_value_vector|, if it is one of the 16
+// smallest values the last 100 frames. Then calculates and returns the median
+// of the five smallest values.
+int16_t WebRtcVad_FindMinimum(VadInstT* self,
+ int16_t feature_value,
+ int channel) {
+ int i = 0, j = 0;
+ int position = -1;
+ // Offset to beginning of the 16 minimum values in memory.
+ const int offset = (channel << 4);
+ int16_t current_median = 1600;
+ int16_t alpha = 0;
+ int32_t tmp32 = 0;
+ // Pointer to memory for the 16 minimum values and the age of each value of
+ // the |channel|.
+ int16_t* age = &self->index_vector[offset];
+ int16_t* smallest_values = &self->low_value_vector[offset];
+
+ RTC_DCHECK_LT(channel, kNumChannels);
+
+ // Each value in |smallest_values| is getting 1 loop older. Update |age|, and
+ // remove old values.
+ for (i = 0; i < 16; i++) {
+ if (age[i] != 100) {
+ age[i]++;
+ } else {
+ // Too old value. Remove from memory and shift larger values downwards.
+ for (j = i; j < 15; j++) {
+ smallest_values[j] = smallest_values[j + 1];
+ age[j] = age[j + 1];
+ }
+ age[15] = 101;
+ smallest_values[15] = 10000;
+ }
+ }
+
+ // Check if |feature_value| is smaller than any of the values in
+ // |smallest_values|. If so, find the |position| where to insert the new value
+ // (|feature_value|).
+ if (feature_value < smallest_values[7]) {
+ if (feature_value < smallest_values[3]) {
+ if (feature_value < smallest_values[1]) {
+ if (feature_value < smallest_values[0]) {
+ position = 0;
+ } else {
+ position = 1;
+ }
+ } else if (feature_value < smallest_values[2]) {
+ position = 2;
+ } else {
+ position = 3;
+ }
+ } else if (feature_value < smallest_values[5]) {
+ if (feature_value < smallest_values[4]) {
+ position = 4;
+ } else {
+ position = 5;
+ }
+ } else if (feature_value < smallest_values[6]) {
+ position = 6;
+ } else {
+ position = 7;
+ }
+ } else if (feature_value < smallest_values[15]) {
+ if (feature_value < smallest_values[11]) {
+ if (feature_value < smallest_values[9]) {
+ if (feature_value < smallest_values[8]) {
+ position = 8;
+ } else {
+ position = 9;
+ }
+ } else if (feature_value < smallest_values[10]) {
+ position = 10;
+ } else {
+ position = 11;
+ }
+ } else if (feature_value < smallest_values[13]) {
+ if (feature_value < smallest_values[12]) {
+ position = 12;
+ } else {
+ position = 13;
+ }
+ } else if (feature_value < smallest_values[14]) {
+ position = 14;
+ } else {
+ position = 15;
+ }
+ }
+
+ // If we have detected a new small value, insert it at the correct position
+ // and shift larger values up.
+ if (position > -1) {
+ for (i = 15; i > position; i--) {
+ smallest_values[i] = smallest_values[i - 1];
+ age[i] = age[i - 1];
+ }
+ smallest_values[position] = feature_value;
+ age[position] = 1;
+ }
+
+ // Get |current_median|.
+ if (self->frame_counter > 2) {
+ current_median = smallest_values[2];
+ } else if (self->frame_counter > 0) {
+ current_median = smallest_values[0];
+ }
+
+ // Smooth the median value.
+ if (self->frame_counter > 0) {
+ if (current_median < self->mean_value[channel]) {
+ alpha = kSmoothingDown; // 0.2 in Q15.
+ } else {
+ alpha = kSmoothingUp; // 0.99 in Q15.
+ }
+ }
+ tmp32 = (alpha + 1) * self->mean_value[channel];
+ tmp32 += (WEBRTC_SPL_WORD16_MAX - alpha) * current_median;
+ tmp32 += 16384;
+ self->mean_value[channel] = (int16_t) (tmp32 >> 15);
+
+ return self->mean_value[channel];
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_sp.h b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_sp.h
new file mode 100644
index 000000000..002fcd8d6
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/vad_sp.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file includes specific signal processing tools used in vad_core.c.
+
+#ifndef COMMON_AUDIO_VAD_VAD_SP_H_
+#define COMMON_AUDIO_VAD_VAD_SP_H_
+
+#include "webrtc/common_audio/vad/vad_core.h"
+
+// Downsamples the signal by a factor 2, eg. 32->16 or 16->8.
+//
+// Inputs:
+// - signal_in : Input signal.
+// - in_length : Length of input signal in samples.
+//
+// Input & Output:
+// - filter_state : Current filter states of the two all-pass filters. The
+// |filter_state| is updated after all samples have been
+// processed.
+//
+// Output:
+// - signal_out : Downsampled signal (of length |in_length| / 2).
+void WebRtcVad_Downsampling(const int16_t* signal_in,
+ int16_t* signal_out,
+ int32_t* filter_state,
+ size_t in_length);
+
+// Updates and returns the smoothed feature minimum. As minimum we use the
+// median of the five smallest feature values in a 100 frames long window.
+// As long as |handle->frame_counter| is zero, that is, we haven't received any
+// "valid" data, FindMinimum() outputs the default value of 1600.
+//
+// Inputs:
+// - feature_value : New feature value to update with.
+// - channel : Channel number.
+//
+// Input & Output:
+// - handle : State information of the VAD.
+//
+// Returns:
+// : Smoothed minimum value for a moving window.
+int16_t WebRtcVad_FindMinimum(VadInstT* handle,
+ int16_t feature_value,
+ int channel);
+
+#endif // COMMON_AUDIO_VAD_VAD_SP_H_
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/webrtc_vad.c b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/webrtc_vad.c
new file mode 100644
index 000000000..4315b0995
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/common_audio/vad/webrtc_vad.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/common_audio/vad/include/webrtc_vad.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/common_audio/vad/vad_core.h"
+
+static const int kInitCheck = 42;
+static const int kValidRates[] = { 8000, 16000, 32000, 48000 };
+static const size_t kRatesSize = sizeof(kValidRates) / sizeof(*kValidRates);
+static const int kMaxFrameLengthMs = 30;
+
+VadInst* WebRtcVad_Create() {
+ VadInstT* self = (VadInstT*)malloc(sizeof(VadInstT));
+
+ WebRtcSpl_Init();
+ self->init_flag = 0;
+
+ return (VadInst*)self;
+}
+
+void WebRtcVad_Free(VadInst* handle) {
+ free(handle);
+}
+
+// TODO(bjornv): Move WebRtcVad_InitCore() code here.
+int WebRtcVad_Init(VadInst* handle) {
+ // Initialize the core VAD component.
+ return WebRtcVad_InitCore((VadInstT*) handle);
+}
+
+// TODO(bjornv): Move WebRtcVad_set_mode_core() code here.
+int WebRtcVad_set_mode(VadInst* handle, int mode) {
+ VadInstT* self = (VadInstT*) handle;
+
+ if (handle == NULL) {
+ return -1;
+ }
+ if (self->init_flag != kInitCheck) {
+ return -1;
+ }
+
+ return WebRtcVad_set_mode_core(self, mode);
+}
+
+int WebRtcVad_Process(VadInst* handle, int fs, const int16_t* audio_frame,
+ size_t frame_length) {
+ int vad = -1;
+ VadInstT* self = (VadInstT*) handle;
+
+ if (handle == NULL) {
+ return -1;
+ }
+
+ if (self->init_flag != kInitCheck) {
+ return -1;
+ }
+ if (audio_frame == NULL) {
+ return -1;
+ }
+ if (WebRtcVad_ValidRateAndFrameLength(fs, frame_length) != 0) {
+ return -1;
+ }
+
+ if (fs == 48000) {
+ vad = WebRtcVad_CalcVad48khz(self, audio_frame, frame_length);
+ } else if (fs == 32000) {
+ vad = WebRtcVad_CalcVad32khz(self, audio_frame, frame_length);
+ } else if (fs == 16000) {
+ vad = WebRtcVad_CalcVad16khz(self, audio_frame, frame_length);
+ } else if (fs == 8000) {
+ vad = WebRtcVad_CalcVad8khz(self, audio_frame, frame_length);
+ }
+
+ if (vad > 0) {
+ vad = 1;
+ }
+ return vad;
+}
+
+int WebRtcVad_ValidRateAndFrameLength(int rate, size_t frame_length) {
+ int return_value = -1;
+ size_t i;
+ int valid_length_ms;
+ size_t valid_length;
+
+ // We only allow 10, 20 or 30 ms frames. Loop through valid frame rates and
+ // see if we have a matching pair.
+ for (i = 0; i < kRatesSize; i++) {
+ if (kValidRates[i] == rate) {
+ for (valid_length_ms = 10; valid_length_ms <= kMaxFrameLengthMs;
+ valid_length_ms += 10) {
+ valid_length = (size_t)(kValidRates[i] / 1000 * valid_length_ms);
+ if (frame_length == valid_length) {
+ return_value = 0;
+ break;
+ }
+ }
+ break;
+ }
+ }
+
+ return return_value;
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/rtc_base/checks.cc b/funasr/runtime/onnxruntime/third_party/webrtc/rtc_base/checks.cc
new file mode 100644
index 000000000..03baf3156
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/rtc_base/checks.cc
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2006 The WebRTC Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Most of this was borrowed (with minor modifications) from V8's and Chromium's
+// src/base/logging.cc.
+
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+
+#if defined(WEBRTC_ANDROID)
+#define RTC_LOG_TAG_ANDROID "rtc"
+#include <android/log.h>  // NOLINT
+#endif
+
+#if defined(WEBRTC_WIN)
+#include <windows.h>
+#endif
+
+#if defined(WEBRTC_WIN)
+#define LAST_SYSTEM_ERROR (::GetLastError())
+#elif defined(__native_client__) && __native_client__
+#define LAST_SYSTEM_ERROR (0)
+#elif defined(WEBRTC_POSIX)
+#include <errno.h>
+#define LAST_SYSTEM_ERROR (errno)
+#endif // WEBRTC_WIN
+
+#include "webrtc/rtc_base/checks.h"
+
+namespace {
+#if defined(__GNUC__)
+__attribute__((__format__(__printf__, 2, 3)))
+#endif
+ void AppendFormat(std::string* s, const char* fmt, ...) {
+ va_list args, copy;
+ va_start(args, fmt);
+ va_copy(copy, args);
+ const int predicted_length = std::vsnprintf(nullptr, 0, fmt, copy);
+ va_end(copy);
+
+ if (predicted_length > 0) {
+ const size_t size = s->size();
+ s->resize(size + predicted_length);
+ // Pass "+ 1" to vsnprintf to include space for the '\0'.
+ std::vsnprintf(&((*s)[size]), predicted_length + 1, fmt, args);
+ }
+ va_end(args);
+}
+}
+
+namespace rtc {
+namespace webrtc_checks_impl {
+
+// Reads one argument from args, appends it to s and advances fmt.
+// Returns true iff an argument was sucessfully parsed.
+bool ParseArg(va_list* args, const CheckArgType** fmt, std::string* s) {
+ if (**fmt == CheckArgType::kEnd)
+ return false;
+
+ switch (**fmt) {
+ case CheckArgType::kInt:
+ AppendFormat(s, "%d", va_arg(*args, int));
+ break;
+ case CheckArgType::kLong:
+ AppendFormat(s, "%ld", va_arg(*args, long));
+ break;
+ case CheckArgType::kLongLong:
+ AppendFormat(s, "%lld", va_arg(*args, long long));
+ break;
+ case CheckArgType::kUInt:
+ AppendFormat(s, "%u", va_arg(*args, unsigned));
+ break;
+ case CheckArgType::kULong:
+ AppendFormat(s, "%lu", va_arg(*args, unsigned long));
+ break;
+ case CheckArgType::kULongLong:
+ AppendFormat(s, "%llu", va_arg(*args, unsigned long long));
+ break;
+ case CheckArgType::kDouble:
+ AppendFormat(s, "%g", va_arg(*args, double));
+ break;
+ case CheckArgType::kLongDouble:
+ AppendFormat(s, "%Lg", va_arg(*args, long double));
+ break;
+ case CheckArgType::kCharP:
+ s->append(va_arg(*args, const char*));
+ break;
+ case CheckArgType::kStdString:
+ s->append(*va_arg(*args, const std::string*));
+ break;
+ case CheckArgType::kVoidP:
+ AppendFormat(s, "%p", va_arg(*args, const void*));
+ break;
+ default:
+ s->append("[Invalid CheckArgType]");
+ return false;
+ }
+ (*fmt)++;
+ return true;
+}
+
+RTC_NORETURN void FatalLog(const char* file,
+ int line,
+ const char* message,
+ const CheckArgType* fmt,
+ ...) {
+ va_list args;
+ va_start(args, fmt);
+
+ std::string s;
+ AppendFormat(&s,
+ "\n\n"
+ "#\n"
+ "# Fatal error in: %s, line %d\n"
+ "# last system error: %u\n"
+ "# Check failed: %s",
+ file, line, LAST_SYSTEM_ERROR, message);
+
+ if (*fmt == CheckArgType::kCheckOp) {
+ // This log message was generated by RTC_CHECK_OP, so we have to complete
+ // the error message using the operands that have been passed as the first
+ // two arguments.
+ fmt++;
+
+ std::string s1, s2;
+ if (ParseArg(&args, &fmt, &s1) && ParseArg(&args, &fmt, &s2))
+ AppendFormat(&s, " (%s vs. %s)\n# ", s1.c_str(), s2.c_str());
+ } else {
+ s.append("\n# ");
+ }
+
+ // Append all the user-supplied arguments to the message.
+ while (ParseArg(&args, &fmt, &s))
+ ;
+
+ va_end(args);
+
+ const char* output = s.c_str();
+
+#if defined(WEBRTC_ANDROID)
+ __android_log_print(ANDROID_LOG_ERROR, RTC_LOG_TAG_ANDROID, "%s\n", output);
+#endif
+
+ fflush(stdout);
+ fprintf(stderr, "%s", output);
+ fflush(stderr);
+ abort();
+}
+
+} // namespace webrtc_checks_impl
+} // namespace rtc
+
+// Function to call from the C version of the RTC_CHECK and RTC_DCHECK macros.
+RTC_NORETURN void rtc_FatalMessage(const char* file, int line,
+ const char* msg) {
+ static constexpr rtc::webrtc_checks_impl::CheckArgType t[] = {
+ rtc::webrtc_checks_impl::CheckArgType::kEnd};
+ FatalLog(file, line, msg, t);
+}
diff --git a/funasr/runtime/onnxruntime/third_party/webrtc/rtc_base/checks.h b/funasr/runtime/onnxruntime/third_party/webrtc/rtc_base/checks.h
new file mode 100644
index 000000000..3dbef18f5
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/webrtc/rtc_base/checks.h
@@ -0,0 +1,400 @@
+/*
+ * Copyright 2006 The WebRTC Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef RTC_BASE_CHECKS_H_
+#define RTC_BASE_CHECKS_H_
+
+// If you for some reson need to know if DCHECKs are on, test the value of
+// RTC_DCHECK_IS_ON. (Test its value, not if it's defined; it'll always be
+// defined, to either a true or a false value.)
+#if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON)
+#define RTC_DCHECK_IS_ON 1
+#else
+#define RTC_DCHECK_IS_ON 0
+#endif
+
+// Annotate a function that will not return control flow to the caller.
+#if defined(_MSC_VER)
+#define RTC_NORETURN __declspec(noreturn)
+#elif defined(__GNUC__)
+#define RTC_NORETURN __attribute__ ((__noreturn__))
+#else
+#define RTC_NORETURN
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+RTC_NORETURN void rtc_FatalMessage(const char* file, int line, const char* msg);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#ifdef __cplusplus
+// C++ version.
+
+#include <string>
+
+#include "webrtc/rtc_base/numerics/safe_compare.h"
+#include "webrtc/rtc_base/system/inline.h"
+
+// The macros here print a message to stderr and abort under various
+// conditions. All will accept additional stream messages. For example:
+// RTC_DCHECK_EQ(foo, bar) << "I'm printed when foo != bar.";
+//
+// - RTC_CHECK(x) is an assertion that x is always true, and that if it isn't,
+// it's better to terminate the process than to continue. During development,
+// the reason that it's better to terminate might simply be that the error
+// handling code isn't in place yet; in production, the reason might be that
+// the author of the code truly believes that x will always be true, but that
+// she recognizes that if she is wrong, abrupt and unpleasant process
+// termination is still better than carrying on with the assumption violated.
+//
+// RTC_CHECK always evaluates its argument, so it's OK for x to have side
+// effects.
+//
+// - RTC_DCHECK(x) is the same as RTC_CHECK(x)---an assertion that x is always
+// true---except that x will only be evaluated in debug builds; in production
+// builds, x is simply assumed to be true. This is useful if evaluating x is
+// expensive and the expected cost of failing to detect the violated
+// assumption is acceptable. You should not handle cases where a production
+// build fails to spot a violated condition, even those that would result in
+// crashes. If the code needs to cope with the error, make it cope, but don't
+// call RTC_DCHECK; if the condition really can't occur, but you'd sleep
+// better at night knowing that the process will suicide instead of carrying
+// on in case you were wrong, use RTC_CHECK instead of RTC_DCHECK.
+//
+// RTC_DCHECK only evaluates its argument in debug builds, so if x has visible
+// side effects, you need to write e.g.
+// bool w = x; RTC_DCHECK(w);
+//
+// - RTC_CHECK_EQ, _NE, _GT, ..., and RTC_DCHECK_EQ, _NE, _GT, ... are
+// specialized variants of RTC_CHECK and RTC_DCHECK that print prettier
+// messages if the condition doesn't hold. Prefer them to raw RTC_CHECK and
+// RTC_DCHECK.
+//
+// - FATAL() aborts unconditionally.
+//
+// TODO(ajm): Ideally, checks.h would be combined with logging.h, but
+// consolidation with system_wrappers/logging.h should happen first.
+
+namespace rtc {
+namespace webrtc_checks_impl {
+enum class CheckArgType : int8_t {
+ kEnd = 0,
+ kInt,
+ kLong,
+ kLongLong,
+ kUInt,
+ kULong,
+ kULongLong,
+ kDouble,
+ kLongDouble,
+ kCharP,
+ kStdString,
+ kVoidP,
+
+ // kCheckOp doesn't represent an argument type. Instead, it is sent as the
+ // first argument from RTC_CHECK_OP to make FatalLog use the next two
+ // arguments to build the special CHECK_OP error message
+ // (the "a == b (1 vs. 2)" bit).
+ kCheckOp,
+};
+
+RTC_NORETURN void FatalLog(const char* file,
+ int line,
+ const char* message,
+ const CheckArgType* fmt,
+ ...);
+
+// Wrapper for log arguments. Only ever make values of this type with the
+// MakeVal() functions.
+template <CheckArgType N, typename T>
+struct Val {
+ static constexpr CheckArgType Type() { return N; }
+ T GetVal() const { return val; }
+ T val;
+};
+
+inline Val<CheckArgType::kInt, int> MakeVal(int x) {
+ return {x};
+}
+inline Val<CheckArgType::kLong, long> MakeVal(long x) {
+ return {x};
+}
+inline Val<CheckArgType::kLongLong, long long> MakeVal(long long x) {
+ return {x};
+}
+inline Val<CheckArgType::kUInt, unsigned int> MakeVal(unsigned int x) {
+ return {x};
+}
+inline Val<CheckArgType::kULong, unsigned long> MakeVal(unsigned long x) {
+ return {x};
+}
+inline Val<CheckArgType::kULongLong, unsigned long long> MakeVal(
+ unsigned long long x) {
+ return {x};
+}
+
+inline Val<CheckArgType::kDouble, double> MakeVal(double x) {
+ return {x};
+}
+inline Val<CheckArgType::kLongDouble, long double> MakeVal(long double x) {
+ return {x};
+}
+
+inline Val<CheckArgType::kCharP, const char*> MakeVal(const char* x) {
+ return {x};
+}
+inline Val<CheckArgType::kStdString, const std::string*> MakeVal(
+ const std::string& x) {
+ return {&x};
+}
+
+inline Val<CheckArgType::kVoidP, const void*> MakeVal(const void* x) {
+ return {x};
+}
+
+// Ephemeral type that represents the result of the logging << operator.
+template <typename... Ts>
+class LogStreamer;
+
+// Base case: Before the first << argument.
+template <>
+class LogStreamer<> final {
+ public:
+ template <
+ typename U,
+      typename std::enable_if<std::is_arithmetic<U>::value>::type* = nullptr>
+  RTC_FORCE_INLINE LogStreamer<decltype(MakeVal(std::declval<U>()))> operator<<(
+      U arg) const {
+    return LogStreamer<decltype(MakeVal(std::declval<U>()))>(MakeVal(arg),
+                                                             this);
+ }
+
+ template <
+ typename U,
+      typename std::enable_if<!std::is_arithmetic<U>::value>::type* = nullptr>
+ RTC_FORCE_INLINE LogStreamer