", "", text)
+ text = re.sub(" ", "", text)
+ text = text.lower()
+ else:
+ idx = outs[0]
+ text = " "
+
+ text = [x for x in text]
+ text = " ".join(text)
+ out = "{} {}\n".format(idx, text)
+ f.write(out)
diff --git a/egs/aishell/tranformer/utils/run.pl b/egs/aishell/tranformer/utils/run.pl
new file mode 100755
index 000000000..483f95bc6
--- /dev/null
+++ b/egs/aishell/tranformer/utils/run.pl
@@ -0,0 +1,356 @@
+#!/usr/bin/env perl
+use warnings; #sed replacement for -w perl parameter
+# In general, doing
+# run.pl some.log a b c is like running the command a b c in
+# the bash shell, and putting the standard error and output into some.log.
+# To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
+# run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
+# and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
+# If any of the jobs fails, this script will fail.
+
+# A typical example is:
+# run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz
+# and run.pl will run something like:
+# ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log
+#
+# Basically it takes the command-line arguments, quotes them
+# as necessary to preserve spaces, and evaluates them with bash.
+# In addition it puts the command line at the top of the log, and
+# the start and end times of the command at the beginning and end.
+# The reason why this is useful is so that we can create a different
+# version of this program that uses a queueing system instead.
+
+#use Data::Dumper;
+
+@ARGV < 2 && die "usage: run.pl log-file command-line arguments...";
+
+# Defaults; the option parsing below overrides them.
+$job_pick = 'all';   # which jobs to run (--pick): 'all', 'failed' or 'incomplete'
+$max_jobs_run = -1;  # -1 means that --max-jobs-run / -tc was not given
+$jobstart = 1;
+$jobend = 1;
+$ignored_opts = ""; # These will be ignored.
+
+# First parse an option like JOB=1:4, and any
+# options that would normally be given to
+# queue.pl, which we will just discard.
+
+for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
+  # allow the JOB=1:n option to be interleaved with the
+  # options to qsub.
+  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
+    # parse any options that would normally go to qsub, but which will be ignored here.
+    my $switch = shift @ARGV;
+    if ($switch eq "-V") {
+      $ignored_opts .= "-V ";
+    } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
+      # we do support the option --max-jobs-run n, and its GridEngine form -tc n.
+      # if the option appears multiple times, the smallest value is used.
+      if ( $max_jobs_run <= 0 ) {
+        $max_jobs_run = shift @ARGV;
+      } else {
+        my $new_constraint = shift @ARGV;
+        if ( ($new_constraint < $max_jobs_run) ) {
+          $max_jobs_run = $new_constraint;
+        }
+      }
+
+      if (! ($max_jobs_run > 0)) {
+        die "run.pl: invalid option --max-jobs-run $max_jobs_run";
+      }
+    } else {
+      my $argument = shift @ARGV;
+      if ($argument =~ m/^--/) {
+        print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
+      }
+      if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
+        $ignored_opts .= "-sync "; # Note: in the
+        # corresponding code in queue.pl it says instead, just "$sync = 1;".
+      } elsif ($switch eq "-pe") { # e.g. -pe smp 5
+        my $argument2 = shift @ARGV;
+        $ignored_opts .= "$switch $argument $argument2 ";
+      } elsif ($switch eq "--gpu") {
+        $using_gpu = $argument;
+      } elsif ($switch eq "--pick") {
+        if($argument =~ m/^(all|failed|incomplete)$/) {
+          $job_pick = $argument;
+        } else {
+          print STDERR "run.pl: ERROR: --pick argument must be one of 'all', 'failed' or 'incomplete'\n"; # newline-terminated like the other diagnostics
+        }
+      } else {
+        # Ignore option.
+        $ignored_opts .= "$switch $argument ";
+      }
+    }
+  }
+  if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
+    $jobname = $1;
+    $jobstart = $2;
+    $jobend = $3;
+    if ($jobstart > $jobend) {
+      die "run.pl: invalid job range $ARGV[0]";
+    }
+    if ($jobstart <= 0) {
+      die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
+    }
+    shift;
+  } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
+    $jobname = $1;
+    $jobstart = $2;
+    $jobend = $2;
+    shift;
+  } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
+    print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n";
+  }
+}
+
+# Users found this message confusing so we are removing it.
+# if ($ignored_opts ne "") {
+# print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n";
+# }
+
+if ($max_jobs_run == -1) { # If --max-jobs-run option not set,
+  # then work out the number of processors if possible,
+  # and set it based on that.
+  $max_jobs_run = 0;
+  if ($using_gpu) {
+    if (open(P, "nvidia-smi -L |")) {
+      $max_jobs_run++ while (<P>); # count the lines that nvidia-smi -L prints
+      close(P);
+    }
+    if ($max_jobs_run == 0) {
+      $max_jobs_run = 1;
+      print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n";
+    }
+  } elsif (open(P, "</proc/cpuinfo")) { # Linux
+    while (<P>) { if (m/^processor/) { $max_jobs_run++; } }
+    if ($max_jobs_run == 0) {
+      print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
+      $max_jobs_run = 10; # reasonable default.
+    }
+    close(P);
+  } elsif (open(P, "sysctl -a |")) { # BSD/Darwin
+    while (<P>) {
+      if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4
+        $max_jobs_run = $1;
+        last;
+      }
+    }
+    close(P);
+    if ($max_jobs_run == 0) {
+      print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n";
+      $max_jobs_run = 10; # reasonable default.
+    }
+  } else {
+    # allow at most 32 jobs at once, on non-UNIX systems; change this code
+    # if you need to change this default.
+    $max_jobs_run = 32;
+  }
+  # The just-computed value of $max_jobs_run is just the number of processors
+  # (or our best guess); and if it happens that the number of jobs we need to
+  # run is just slightly above $max_jobs_run, it will make sense to increase
+  # $max_jobs_run to equal the number of jobs, so we don't have a small number
+  # of leftover jobs.
+  $num_jobs = $jobend - $jobstart + 1;
+  if (!$using_gpu &&
+      $num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) {
+    $max_jobs_run = $num_jobs;
+  }
+}
+
+sub pick_or_exit {
+  # pick_or_exit( $logfile )
+  #
+  # Implements the --pick option.  It is called in the child process just
+  # before a job is started and decides, from the log file of a previous
+  # attempt, whether the job has to run now.  When the job has to run the
+  # subroutine simply returns; otherwise it ends the calling process with
+  # exit(), so it never returns a useful value.
+  #
+  # PRE: the global $job_pick holds the value of --pick.
+  #
+  # Only the last "# Ended (code ...)" line of the old log is considered:
+  #
+  #   $job_pick eq 'all' (default)   -> return; the old log is not read
+  #   log cannot be opened           -> return; job not executed yet
+  #   log has no "# Ended" line      -> return; job is incomplete
+  #   last status is "code 0"        -> exit(0); job already completed
+  #   last status is a failure and
+  #     $job_pick is 'failed'        -> return; the failed job is re-run
+  #     $job_pick is 'incomplete'    -> exit(1); the exit code signals the
+  #                                     failure as if the job had just run
+  #
+  # Keeping this logic in a subroutine keeps the main execution loop
+  # readable.
+  #
+  # Alexandre Felipe, (o.alexandre.felipe@gmail.com) 14th of August of 2020
+  #
+  my ($previous_log) = @_;
+
+  return if $job_pick eq 'all'; # no need to bother with the previous log
+
+  open(my $log_fh, "<", $previous_log) or return; # job not executed yet
+
+  # Scan the whole log, remembering the most recent accounting line.
+  my $last_status;
+  while (my $line = <$log_fh>) {
+    $last_status = $line if $line =~ m/# Ended \(code .*/;
+  }
+  close $log_fh;
+
+  return unless defined $last_status; # incomplete
+
+  exit(0) if $last_status =~ m/# Ended \(code 0\).*/; # complete
+
+  if ($last_status =~ m/# Ended \(code \d+(; signal \d+)?\).*/) {
+    # failed; it is re-run only when failed jobs were asked for
+    exit(1) unless $job_pick =~ m/^(failed|all)$/;
+    return;
+  }
+
+  # Any other accounting line: treated like an incomplete job, which is
+  # always run.
+  return;
+}
+
+
+# The first remaining argument is the log file; the rest is the command.
+$logfile = shift @ARGV;
+# Several jobs need several logs, so the log name must contain the job name.
+if (defined $jobname && $jobend > $jobstart && $logfile !~ m/$jobname/) {
+  print STDERR "run.pl: you are trying to run a parallel job but "
+    . "you are putting the output into just one log file ($logfile)\n";
+  exit(1);
+}
+
+# Rebuild the command line for bash: arguments made only of non-space characters are used as
+# they are, arguments with a double quote get single quotes, all others get double quotes.
+$cmd = "";
+for my $arg (@ARGV) {
+  my $word = ($arg =~ m/^\S+$/) ? $arg
+           : ($arg =~ m:\":)    ? "'$arg'"
+           :                      "\"$arg\"";
+  $cmd .= $word . " ";
+}
+$ret = 0;          # exit status of run.pl itself
+$numfail = 0;      # number of jobs that ended with a non-zero status
+%active_pids=();   # pid => job id, for the children that are still running
+
+use POSIX ":sys_wait_h";
+for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
+ if (scalar(keys %active_pids) >= $max_jobs_run) {
+
+ # The job limit is reached: block until any child changes status,
+ # then work out which job that child was running.
+ $r = waitpid(-1, 0);
+ $code = $?;
+ if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
+ if ( defined $active_pids{$r} ) {
+ $jid=$active_pids{$r};
+ $fail[$jid]=$code; # raw wait status ($?) of that job; 0 means success
+ if ($code !=0) { $numfail++;}
+ delete $active_pids{$r};
+ # print STDERR "Finished: $r/$jid " . Dumper(\%active_pids) . "\n";
+ } else {
+ die "run.pl: Cannot find the PID of the child process that just finished.";
+ }
+
+ # In theory we could do a non-blocking waitpid over all running jobs, just
+ # to find out whether more than one job finished during the previous waitpid().
+ # However, we just omit this and will reap the next one in the next pass
+ # through the for(;;) cycle
+ }
+ $childpid = fork();
+ if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
+ if ($childpid == 0) { # We're in the child... this branch
+ # executes the job and exits (possibly with an error status).
+ if (defined $jobname) {
+ $cmd =~ s/$jobname/$jobid/g; # substitute the job number for the job name
+ $logfile =~ s/$jobname/$jobid/g;
+ }
+ # exit if the job does not need to be executed
+ pick_or_exit( $logfile );
+
+ system("mkdir -p `dirname $logfile` 2>/dev/null");
+ open(F, ">$logfile") || die "run.pl: Error opening log file $logfile";
+ print F "# " . $cmd . "\n";
+ print F "# Started at " . `date`;
+ $starttime = `date +'%s'`;
+ print F "#\n";
+ close(F);
+
+ # Pipe into bash.. make sure we're not using any other shell.
+ open(B, "|bash") || die "run.pl: Error opening shell command";
+ print B "( " . $cmd . ") 2>>$logfile >> $logfile";
+ close(B); # If there was an error, exit status is in $?
+ $ret = $?;
+
+ $lowbits = $ret & 127; # reported as "signal" in the log
+ $highbits = $ret >> 8; # reported as "code" in the log
+ if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
+ else { $return_str = "code $highbits"; }
+
+ $endtime = `date +'%s'`;
+ open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)";
+ $enddate = `date`;
+ chop $enddate;
+ print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n";
+ print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
+ close(F);
+ exit($ret == 0 ? 0 : 1); # any failure is reported to the parent as exit status 1
+ } else {
+ $pid[$jobid] = $childpid;
+ $active_pids{$childpid} = $jobid;
+ # print STDERR "Queued: " . Dumper(\%active_pids) . "\n";
+ }
+}
+
+# All jobs have been submitted; reap every child that is still running and
+# record its wait status.
+foreach my $running_job (values %active_pids) {
+  my $reaped = waitpid($pid[$running_job], 0);
+  my $status = $?;
+  if ($reaped == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
+  next if $reaped == 0;
+  $fail[$running_job] = $status;
+  $numfail++ if $status != 0;
+}
+
+# Some sanity checks:
+# every job of the range must have a status recorded in @fail, and the
+# number of non-zero statuses in it must be equal to $numfail.
+# @fail is walked by job id, as the JOB ids do not start at zero.
+my $nonzero_statuses = 0;
+for (my $job = $jobstart; $job <= $jobend; $job++) {
+  my $status = $fail[$job];
+  defined $status
+    or die "run.pl: Sanity check failed: we have indication that some jobs are running " .
+           "even after we waited for all jobs to finish" ;
+  $nonzero_statuses++ if $status != 0;
+}
+if ($nonzero_statuses != $numfail) {
+  die "run.pl: Sanity check failed: cannot find out how many jobs failed ($nonzero_statuses x $numfail)."
+}
+
+if ($numfail > 0) { $ret = 1; }
+
+# Tell the user where to look when something failed.
+if ($ret != 0) {
+  $njobs = $jobend - $jobstart + 1;
+  if ($njobs == 1) {
+    if (defined $jobname) {
+      # only one numbered job, so replace the job name with that job's number.
+      $logfile =~ s/$jobname/$jobstart/;
+    }
+    print STDERR "run.pl: job failed, log is in $logfile\n";
+    if ($logfile =~ m/JOB/) {
+      # Newline-terminated, so the hint is not glued to whatever is printed next.
+      print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
+    }
+  } else {
+    $logfile =~ s/$jobname/*/g; # one log per job: report them as a glob pattern
+    print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
+  }
+}
+
+exit ($ret);
diff --git a/egs/aishell/tranformer/utils/shuffle_list.pl b/egs/aishell/tranformer/utils/shuffle_list.pl
new file mode 100755
index 000000000..a116200f4
--- /dev/null
+++ b/egs/aishell/tranformer/utils/shuffle_list.pl
@@ -0,0 +1,44 @@
+#!/usr/bin/env perl
+
+# Copyright 2013 Johns Hopkins University (author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Seed the random number generator with the value of --srand (which has to be
+# a plain non-negative integer) if given, and with 0 otherwise.
+if (@ARGV > 0 && $ARGV[0] eq "--srand") {
+  $n = $ARGV[1];
+  (defined $n && $n =~ m/^\d+$/) ||
+    die "Bad argument to --srand option: \"" . (defined $n ? $n : "") . "\"";
+  srand($n);
+  shift; shift;
+} else {
+  srand(0); # Gives inconsistent behavior if we don't seed.
+}
+
+if (@ARGV > 1 || (@ARGV == 1 && $ARGV[0] =~ m/^-.+/)) { # >1 args, or an option we
+  # don't understand.
+  print "Usage: shuffle_list.pl [--srand N] [input file] > output\n";
+  print "randomizes the order of lines of input.\n";
+  exit(1);
+}
+
+# Attach a random key to every input line and sort the lines on that key.  The
+# keys are still compared as strings: changing the comparison would change the
+# order that is produced for a given seed.
+my @lines;
+while (<>) { push @lines, [ (rand(), $_)] ; }
+@lines = sort { $a->[0] cmp $b->[0] } @lines;
+foreach $l (@lines) { print $l->[1]; }
\ No newline at end of file
diff --git a/egs/aishell/tranformer/utils/split_data.py b/egs/aishell/tranformer/utils/split_data.py
new file mode 100755
index 000000000..060eae6d3
--- /dev/null
+++ b/egs/aishell/tranformer/utils/split_data.py
@@ -0,0 +1,60 @@
+import os
+import sys
+import random
+
+
+in_dir = sys.argv[1]  # directory that contains the input wav.scp and text files
+out_dir = sys.argv[2]  # directory under which split<N>/<k>/ is created
+num_split = sys.argv[3]  # number of splits; still a string here, converted with int() below
+
+
+def split_scp(scp, num):
+    """Cut ``scp`` into ``num`` contiguous chunks; the last chunk also takes the remainder."""
+    assert len(scp) >= num
+    avg = len(scp) // num
+    out = []
+    begin = 0
+    for i in range(num):
+        if i == num - 1:
+            out.append(scp[begin:])
+        else:
+            out.append(scp[begin:begin+avg])
+        begin += avg
+
+    return out
+
+
+for required in ("wav.scp", "text"):
+    required_path = "{}/{}".format(in_dir, required)
+    if not os.path.exists(required_path):
+        # Stop with a clear message instead of discarding the result of the check.
+        sys.exit("split_data.py: no such file {}".format(required_path))
+
+with open("{}/wav.scp".format(in_dir), 'r') as infile:
+    wav_list = infile.readlines()
+
+with open("{}/text".format(in_dir), 'r') as infile:
+    text_list = infile.readlines()
+
+assert len(wav_list) == len(text_list)
+
+# Shuffle both lists jointly, so that line i of wav.scp stays paired with line i of text.
+x = list(zip(wav_list, text_list))
+random.shuffle(x)
+wav_shuffle_list, text_shuffle_list = zip(*x)
+
+num_split = int(num_split)
+wav_split_list = split_scp(wav_shuffle_list, num_split)
+text_split_list = split_scp(text_shuffle_list, num_split)
+
+# Chunk k (1-based) of each list goes to <out_dir>/split<num_split>/<k>/.
+for idx, (wav_chunk, text_chunk) in enumerate(zip(wav_split_list, text_split_list), 1):
+    path = out_dir + "/split" + str(num_split) + "/" + str(idx)
+    if not os.path.exists(path):
+        os.makedirs(path)
+    with open("{}/wav.scp".format(path), 'w') as wav_writer:
+        for line in wav_chunk:
+            wav_writer.write(line)
+    with open("{}/text".format(path), 'w') as text_writer:
+        for line in text_chunk:
+            text_writer.write(line)
diff --git a/egs/aishell/tranformer/utils/split_scp.pl b/egs/aishell/tranformer/utils/split_scp.pl
new file mode 100755
index 000000000..0876dcb6d
--- /dev/null
+++ b/egs/aishell/tranformer/utils/split_scp.pl
@@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+
+# Copyright 2010-2011 Microsoft Corporation
+
+# See ../../COPYING for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This program splits up any kind of .scp or archive-type file.
+# If there is no utt2spk option it will work on any text file and
+# will split it up with an approximately equal number of lines in
+# each output file.
+# With the --utt2spk option it will work on anything that has the
+# utterance-id as the first entry on each line; the utt2spk file is
+# of the form "utterance speaker" (on each line).
+# It splits it into equal size chunks as far as it can. If you use the utt2spk
+# option it will make sure these chunks coincide with speaker boundaries. In
+# this case, if there are more chunks than speakers (and in some other
+# circumstances), some of the resulting chunks will be empty and it will print
+# an error message and exit with nonzero status.
+# You will normally call this like:
+# split_scp.pl scp scp.1 scp.2 scp.3 ...
+# or
+# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
+# Note that you can use this script to split the utt2spk file itself,
+# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
+
+# You can also call the scripts like:
+# split_scp.pl -j 3 0 scp scp.0
+# [note: with this option, it assumes zero-based indexing of the split parts,
+# i.e. the second number must be 0 <= n < num-jobs.]
+
+use warnings;
+
+$num_jobs = 0;      # value given with -j; 0 means that -j was not used
+$job_id = 0;        # index of the split that is written when -j is used
+$utt2spk_file = "";
+$one_based = 0;
+
+# @ARGV is re-checked before every test: an option may have consumed the last argument.
+for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
+  if (@ARGV > 0 && $ARGV[0] eq "-j") {
+    shift @ARGV;
+    $num_jobs = shift @ARGV;
+    $job_id = shift @ARGV;
+  }
+  if (@ARGV > 0 && $ARGV[0] =~ /--utt2spk=(.+)/) {
+    $utt2spk_file=$1;
+    shift;
+  }
+  if (@ARGV > 0 && $ARGV[0] eq '--one-based') {
+    $one_based = 1;
+    shift @ARGV;
+  }
+}
+
+if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
+                       $job_id - $one_based >= $num_jobs)) {
+  die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
+      ($one_based ? " --one-based" : "") . "'\n"
+}
+
+$one_based and $job_id--;   # from here on $job_id is zero-based
+
+if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
+  die
+"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
+ or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
+ ... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
+}
+
+$error = 0;
+$inscp = shift @ARGV;
+if ($num_jobs == 0) { # without -j option
+  @OUTPUTS = @ARGV;
+} else { # with -j: only split $job_id is really written, the others go to /dev/null
+  for ($j = 0; $j < $num_jobs; $j++) {
+    if ($j == $job_id) {
+      if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
+      else { push @OUTPUTS, "-"; }
+    } else {
+      push @OUTPUTS, "/dev/null";
+    }
+  }
+}
+
+if ($utt2spk_file ne "") { # We have the --utt2spk option...
+  open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
+  while(<$u_fh>) {
+    @A = split;
+    @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
+    ($u,$s) = @A;
+    $utt2spk{$u} = $s;
+  }
+  close $u_fh;
+  open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
+  @spkrs = ();
+  while(<$i_fh>) {
+    @A = split;
+    if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
+    $u = $A[0];
+    $s = $utt2spk{$u};
+    defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
+    if(!defined $spk_count{$s}) {
+      push @spkrs, $s;
+      $spk_count{$s} = 0;
+      $spk_data{$s} = []; # ref to new empty array.
+    }
+    $spk_count{$s}++;
+    push @{$spk_data{$s}}, $_;
+  }
+  # Now split as equally as possible ..
+  # First allocate spks to files by allocating an approximately
+  # equal number of speakers.
+  $numspks = @spkrs; # number of speakers.
+  $numscps = @OUTPUTS; # number of output files.
+  if ($numspks < $numscps) {
+    die "$0: Refusing to split data because number of speakers $numspks " .
+        "is less than the number of output .scp files $numscps\n";
+  }
+  for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+    $scparray[$scpidx] = []; # [] is array reference.
+  }
+  for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
+    $scpidx = int(($spkidx*$numscps) / $numspks);
+    $spk = $spkrs[$spkidx];
+    push @{$scparray[$scpidx]}, $spk;
+    $scpcount[$scpidx] += $spk_count{$spk};
+  }
+
+  # Now will try to reassign beginning + ending speakers
+  # to different scp's and see if it gets more balanced.
+  # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
+  # We can show that if considering changing just 2 scp's, we minimize
+  # this by minimizing the squared difference in sizes. This is
+  # equivalent to minimizing the absolute difference in sizes. This
+  # shows this method is bound to converge.
+
+  $changed = 1;
+  while($changed) {
+    $changed = 0;
+    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+      # First try to reassign ending spk of this scp.
+      if($scpidx < $numscps-1) {
+        $sz = @{$scparray[$scpidx]};
+        if($sz > 0) {
+          $spk = $scparray[$scpidx]->[$sz-1];
+          $count = $spk_count{$spk};
+          $nutt1 = $scpcount[$scpidx];
+          $nutt2 = $scpcount[$scpidx+1];
+          if( abs( ($nutt2+$count) - ($nutt1-$count))
+              < abs($nutt2 - $nutt1)) { # Would decrease
+            # size-diff by reassigning spk...
+            $scpcount[$scpidx+1] += $count;
+            $scpcount[$scpidx] -= $count;
+            pop @{$scparray[$scpidx]};
+            unshift @{$scparray[$scpidx+1]}, $spk;
+            $changed = 1;
+          }
+        }
+      }
+      if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
+        $spk = $scparray[$scpidx]->[0];
+        $count = $spk_count{$spk};
+        $nutt1 = $scpcount[$scpidx-1];
+        $nutt2 = $scpcount[$scpidx];
+        if( abs( ($nutt2-$count) - ($nutt1+$count))
+            < abs($nutt2 - $nutt1)) { # Would decrease
+          # size-diff by reassigning spk...
+          $scpcount[$scpidx-1] += $count;
+          $scpcount[$scpidx] -= $count;
+          shift @{$scparray[$scpidx]};
+          push @{$scparray[$scpidx-1]}, $spk;
+          $changed = 1;
+        }
+      }
+    }
+  }
+  # Now print out the files...
+  for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+    $scpfile = $OUTPUTS[$scpidx];
+    ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
+                     : open($f_fh, '>&', \*STDOUT)) ||
+      die "$0: Could not open scp file $scpfile for writing: $!\n";
+    $count = 0;
+    if(@{$scparray[$scpidx]} == 0) {
+      print STDERR "$0: error: split_scp.pl producing empty .scp file " .
+                   "$scpfile (too many splits and too few speakers?)\n";
+      $error = 1;
+    } else {
+      foreach $spk ( @{$scparray[$scpidx]} ) {
+        print $f_fh @{$spk_data{$spk}};
+        $count += $spk_count{$spk};
+      }
+      $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
+    }
+    close($f_fh) || die "$0: Error closing scp file $scpfile: $!\n"; # write errors can surface at close
+  }
+} else {
+  # This block is the "normal" case where there is no --utt2spk
+  # option and we just break into equal size chunks.
+
+  open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
+
+  $numscps = @OUTPUTS; # size of array.
+  @F = ();
+  while(<$i_fh>) {
+    push @F, $_;
+  }
+  $numlines = @F;
+  if($numlines == 0) {
+    print STDERR "$0: error: empty input scp file $inscp\n";
+    $error = 1;
+  }
+  $linesperscp = int( $numlines / $numscps); # the "whole part"..
+  $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
+  $remainder = $numlines - ($linesperscp * $numscps);
+  ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
+  # [just doing int() rounds down].
+  $n = 0;
+  for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
+    $scpfile = $OUTPUTS[$scpidx];
+    ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
+                     : open($o_fh, '>&', \*STDOUT)) ||
+      die "$0: Could not open scp file $scpfile for writing: $!\n";
+    for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) { # the first $remainder files get one extra line
+      print $o_fh $F[$n++];
+    }
+    close($o_fh) || die "$0: Error closing scp file $scpfile: $!\n";
+  }
+  $n == $numlines || die "$n != $numlines [code error]";
+}
+
+exit ($error);
diff --git a/egs/aishell/tranformer/utils/subset_data_dir_tr_cv.sh b/egs/aishell/tranformer/utils/subset_data_dir_tr_cv.sh
new file mode 100755
index 000000000..e16cebdf1
--- /dev/null
+++ b/egs/aishell/tranformer/utils/subset_data_dir_tr_cv.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+dev_num_utt=1000  # number of utterances that go to the dev set
+
+echo "$0 $@"
+. utils/parse_options.sh || exit 1;
+
+train_data=$1
+out_dir=$2
+
+[ ! -f "${train_data}/wav.scp" ] && echo "$0: no such file ${train_data}/wav.scp" && exit 1;
+[ ! -f "${train_data}/text" ] && echo "$0: no such file ${train_data}/text" && exit 1;
+
+num_utt=$(wc -l <"${train_data}/wav.scp")
+# Otherwise the line count passed to tail below would be negative.
+if (( num_utt < dev_num_utt )); then
+  echo "$0: ${train_data}/wav.scp has only ${num_utt} utterances, need at least ${dev_num_utt}" && exit 1;
+fi
+num_train_utt=$((num_utt - dev_num_utt))
+
+mkdir -p "${out_dir}/train" "${out_dir}/dev"
+
+# Both files are shuffled with the same seed so that they get the same
+# permutation; this relies on wav.scp and text having the same number of lines.
+for name in wav.scp text; do
+  utils/shuffle_list.pl --srand 1 "${train_data}/${name}" > "${out_dir}/train/${name}.shuf"
+  head -n "${dev_num_utt}" "${out_dir}/train/${name}.shuf" > "${out_dir}/dev/${name}"
+  tail -n "${num_train_utt}" "${out_dir}/train/${name}.shuf" > "${out_dir}/train/${name}"
+  rm "${out_dir}/train/${name}.shuf"
+done
diff --git a/egs/aishell/tranformer/utils/text2token.py b/egs/aishell/tranformer/utils/text2token.py
new file mode 100755
index 000000000..56c39138f
--- /dev/null
+++ b/egs/aishell/tranformer/utils/text2token.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+
+import argparse
+import codecs
+import re
+import sys
+
+is_python2 = sys.version_info[0] == 2
+
+
+def exist_or_not(i, match_pos):
+    """Look up the matched span that covers position ``i``.
+
+    ``match_pos`` holds ``[start, end]`` index pairs.  Returns ``(start, end)`` of the
+    first pair with ``start <= i < end``, or ``(None, None)`` when there is none.
+    """
+    for pos in match_pos:
+        if pos[0] <= i < pos[1]:
+            return pos[0], pos[1]
+    return None, None
+
+
+def get_parser():
+    """Build the command-line parser for text2token."""
+    parser = argparse.ArgumentParser(
+        description="convert raw text to tokenized text",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--nchar", "-n",
+        default=1,
+        type=int,
+        help="number of characters to split, i.e., \
+        aabb -> a a b b with -n 1 and aa bb with -n 2",
+    )
+    parser.add_argument(
+        "--skip-ncols", "-s", default=0, type=int, help="skip first n columns"
+    )
+    # NOTE(review): "<space>" and "<noise>" look stripped from this patch as markup; confirm.
+    parser.add_argument("--space", default="<space>", type=str, help="space symbol")
+    parser.add_argument(
+        "--non-lang-syms", "-l",
+        default=None,
+        type=str,
+        help="list of non-linguistic symbols, e.g., <noise> etc.",
+    )
+    parser.add_argument("text", type=str, default=False, nargs="?", help="input text")
+    parser.add_argument(
+        "--trans_type",
+        "-t",
+        type=str,
+        default="char",
+        choices=["char", "phn"],
+        help="""Transcript type. char/phn. e.g., for TIMIT FADG0_SI1279 -
+        If trans_type is char,
+        read from SI1279.WRD file -> "bricks are an alternative"
+        Else if trans_type is phn,
+        read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l
+        sil t er n ih sil t ih v sil" """,
+    )
+    return parser
+
+
+def main():
+    """Tokenize the input text line by line and print the result as UTF-8.
+
+    The first ``--skip-ncols`` columns of a line are copied through.  The rest
+    is split on spaces (``--trans_type phn``) or into characters, where the
+    symbols listed in ``--non-lang-syms`` stay in one piece; the pieces are
+    grouped ``--nchar`` at a time and spaces are replaced by ``--space``.
+    """
+    parser = get_parser()
+    args = parser.parse_args()
+
+    rs = []
+    if args.non_lang_syms is not None:
+        with codecs.open(args.non_lang_syms, "r", encoding="utf-8") as f:
+            nls = [x.rstrip() for x in f.readlines()]
+            # A blank line would compile to an empty pattern, which matches with
+            # zero width at every position; such entries are skipped.
+            rs = [re.compile(re.escape(x)) for x in nls if x]
+
+    if args.text:
+        f = codecs.open(args.text, encoding="utf-8")
+    else:
+        f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
+
+    sys.stdout = codecs.getwriter("utf-8")(
+        sys.stdout if is_python2 else sys.stdout.buffer
+    )
+    line = f.readline()
+    n = args.nchar
+    while line:
+        x = line.split()
+        print(" ".join(x[: args.skip_ncols]), end=" ")
+        a = " ".join(x[args.skip_ncols :])
+
+        # get all matched positions; finditer() always moves past a match, so
+        # this scan terminates for every pattern
+        match_pos = []
+        for r in rs:
+            match_pos.extend([m.start(), m.end()] for m in r.finditer(a))
+
+        if args.trans_type == "phn":
+            a = a.split(" ")
+        else:
+            if len(match_pos) > 0:
+                chars = []
+                i = 0
+                while i < len(a):
+                    start_pos, end_pos = exist_or_not(i, match_pos)
+                    if start_pos is not None:
+                        chars.append(a[start_pos:end_pos])
+                        i = end_pos
+                    else:
+                        chars.append(a[i])
+                        i += 1
+                a = chars
+
+        a = [a[j : j + n] for j in range(0, len(a), n)]
+        a_flat = ["".join(z) for z in a]
+
+        a_chars = [z.replace(" ", args.space) for z in a_flat]
+        if args.trans_type == "phn":
+            a_chars = [z.replace("sil", args.space) for z in a_chars]
+        print(" ".join(a_chars))
+        line = f.readline()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/egs/aishell/tranformer/utils/text_tokenize.py b/egs/aishell/tranformer/utils/text_tokenize.py
new file mode 100755
index 000000000..962ea11bc
--- /dev/null
+++ b/egs/aishell/tranformer/utils/text_tokenize.py
@@ -0,0 +1,106 @@
+import re
+import argparse
+
+
+def load_dict(seg_file):
+    """Read ``seg_file`` ("key piece piece ..." per line) into a dict: key -> "piece piece ..."."""
+    seg_dict = {}
+    with open(seg_file, 'r') as infile:
+        for line in infile:
+            s = line.strip().split()
+            if s:  # skip blank lines instead of failing on s[0]
+                seg_dict[s[0]] = " ".join(s[1:])
+    return seg_dict
+
+
+def forward_segment(text, dic):
+    """Segment ``text`` left to right, taking at each position the longest substring
+    that is a key of ``dic``, or the single character there when none is."""
+    word_list = []
+    i = 0
+    while i < len(text):
+        longest_word = text[i]
+        for j in range(i + 1, len(text) + 1):
+            if text[i:j] in dic:  # j only grows, so a later hit is always longer
+                longest_word = text[i:j]
+        word_list.append(longest_word)
+        i += len(longest_word)
+    return word_list
+
+
+def tokenize(txt, seg_dict):
+    """Replace every word of ``txt`` by its ``seg_dict`` entry and join the results with spaces.
+
+    Words that do not start with a character in U+4E00-U+9FA5, A-Z, a-z or 0-9 are dropped.
+    """
+    pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
+    pieces = []
+    for word in txt:
+        if pattern.match(word):
+            # NOTE(review): the out-of-vocabulary placeholder reads "" in this patch and
+            # looks stripped as markup; "<unk>" is assumed - confirm against the token list.
+            pieces.append(seg_dict.get(word, "<unk>"))
+    return " ".join(pieces).strip()
+
+
+def get_parser():
+    """Create the argument parser of the text tokenizer.
+
+    The options are --text-file/-t, --seg-file/-s, --txt-index/-i and
+    --output-dir/-o.  Every option is declared with ``required=True``;
+    ``--txt-index`` is converted to ``int`` and the other three are kept
+    as strings.
+    """
+    parser = argparse.ArgumentParser(
+        description="text tokenize",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    # One entry per option, in declaration order: the flags, then the
+    # keyword arguments that differ from option to option.
+    option_specs = [
+        (
+            ("--text-file", "-t"),
+            {"default": False, "type": str, "help": "input text"},
+        ),
+        (
+            ("--seg-file", "-s"),
+            {"default": False, "type": str, "help": "seg file"},
+        ),
+        (
+            ("--txt-index", "-i"),
+            {"default": 1, "type": int, "help": "txt index"},
+        ),
+        (
+            ("--output-dir", "-o"),
+            {"default": False, "type": str, "help": "output dir"},
+        ),
+    ]
+
+    for flags, keywords in option_specs:
+        parser.add_argument(*flags, required=True, **keywords)
+
+    return parser
+
+
+def main():
+    """Tokenize args.text_file into <output_dir>/text.<i>.txt; token counts go to <output_dir>/len.<i>."""
+    args = get_parser().parse_args()
+    seg_dict = load_dict(args.seg_file)
+    txt_path = "{}/text.{}.txt".format(args.output_dir, args.txt_index)
+    shape_path = "{}/len.{}".format(args.output_dir, args.txt_index)
+    # "with" closes (and thereby flushes) both writers, also when a line raises.
+    with open(args.text_file, 'r') as infile, \
+            open(txt_path, 'w') as txt_writer, open(shape_path, 'w') as shape_writer:
+        for line in infile:
+            s = line.strip().split()
+            if not s:
+                continue  # blank line: there is no utterance id to write
+            text_list = forward_segment("".join(s[1:]).lower(), seg_dict)
+            text = tokenize(text_list, seg_dict)
+            txt_writer.write(s[0] + " " + text + '\n')
+            shape_writer.write(s[0] + " " + str(len(text.split())) + '\n')
+
+if __name__ == '__main__':
+ main()
+
diff --git a/egs/aishell/tranformer/utils/text_tokenize.sh b/egs/aishell/tranformer/utils/text_tokenize.sh
new file mode 100755
index 000000000..6b74fef80
--- /dev/null
+++ b/egs/aishell/tranformer/utils/text_tokenize.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+# Tokenize the text splits <text-dir>/txt/text.JOB.txt (JOB = 1..nj) in
+# parallel, then merge the per-split results into <output-dir>/text and
+# <output-dir>/text_shape.
+
+# Begin configuration section.
+nj=32
+cmd=utils/run.pl
+
+echo "$0 $*"
+
+. utils/parse_options.sh || exit 1;
+
+# Without all four positional arguments the paths below would silently
+# resolve to locations such as /txt.
+if [ $# -ne 4 ]; then
+  echo "Usage: $0 [options] <text-dir> <seg-file> <log-dir> <output-dir>"
+  exit 1;
+fi
+
+# tokenize configuration
+text_dir=$1
+seg_file=$2
+logdir=$3
+output_dir=$4
+
+txt_dir="${output_dir}/txt"
+mkdir -p "${txt_dir}"
+mkdir -p "${logdir}"
+
+# $cmd stays unquoted on purpose so that a command with options still splits.
+$cmd JOB=1:"$nj" "$logdir/text_tokenize.JOB.log" \
+  python utils/text_tokenize.py -t "${text_dir}/txt/text.JOB.txt" \
+  -s "${seg_file}" -i JOB -o "${txt_dir}" \
+  || exit 1;
+
+# concatenate the text files together.
+for n in $(seq "$nj"); do
+  cat "${txt_dir}/text.$n.txt" || exit 1
+done > "${output_dir}/text" || exit 1
+
+# concatenate the per-split token counts in the same order.
+for n in $(seq "$nj"); do
+  cat "${txt_dir}/len.$n" || exit 1
+done > "${output_dir}/text_shape" || exit 1
+
+echo "$0: Succeeded text tokenize"
diff --git a/egs/aishell/tranformer/utils/textnorm_zh.py b/egs/aishell/tranformer/utils/textnorm_zh.py
new file mode 100755
index 000000000..79feb83fd
--- /dev/null
+++ b/egs/aishell/tranformer/utils/textnorm_zh.py
@@ -0,0 +1,834 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+# Authors:
+# 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git)
+# 2019.9 Jiayu DU
+#
+# requirements:
+# - python 3.X
+# notes: python 2.X WILL fail or produce misleading results
+
+import sys, os, argparse, codecs, string, re
+
+# ================================================================================ #
+# basic constant
+# ================================================================================ #
+CHINESE_DIGIS = u'零一二三四五六七八九'
+BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖'
+BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖'
+SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万'
+SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬'
+LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载'
+LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載'
+SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万'
+SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬'
+
+ZERO_ALT = u'〇'
+ONE_ALT = u'幺'
+TWO_ALTS = [u'两', u'兩']
+
+POSITIVE = [u'正', u'正']
+NEGATIVE = [u'负', u'負']
+POINT = [u'点', u'點']
+# PLUS = [u'加', u'加']
+# SIL = [u'杠', u'槓']
+
+FILLER_CHARS = ['呃', '啊']
+ER_WHITELIST = '(儿女|儿子|儿孙|女儿|儿媳|妻儿|' \
+ '胎儿|婴儿|新生儿|婴幼儿|幼儿|少儿|小儿|儿歌|儿童|儿科|托儿所|孤儿|' \
+ '儿戏|儿化|台儿庄|鹿儿岛|正儿八经|吊儿郎当|生儿育女|托儿带女|养儿防老|痴儿呆女|' \
+ '佳儿佳妇|儿怜兽扰|儿无常父|儿不嫌母丑|儿行千里母担忧|儿大不由爷|苏乞儿)'
+
+# 中文数字系统类型
+NUMBERING_TYPES = ['low', 'mid', 'high']
+
+CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \
+ '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)'
+CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'
+COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \
+ '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \
+ '针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \
+ '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \
+ '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \
+ '纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)'
+
+# Punctuation sets are based on the Zhon project (https://github.com/tsroten/zhon.git).
+# They must hold the full-width / CJK code points: the ASCII marks are added
+# separately via string.punctuation where the list is used, and an ASCII quote
+# inside these literals would terminate the string early.
+CHINESE_PUNC_STOP = '！？｡。'
+CHINESE_PUNC_NON_STOP = '＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏'
+CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP
+
+# ================================================================================ #
+# basic class
+# ================================================================================ #
+class ChineseChar(object):
+    """
+    A Chinese character.
+    Every character has a simplified and a traditional form,
+    e.g. simplified = '负', traditional = '負'.
+    Either form can be selected when converting.
+    """
+
+    def __init__(self, simplified, traditional):
+        self.simplified = simplified
+        self.traditional = traditional
+
+    def __str__(self):
+        # Prefer the simplified form, then the traditional one.  __str__ has
+        # to return a str, so fall back to '' (not None) when both are unset;
+        # returning None makes str() raise TypeError.
+        return self.simplified or self.traditional or ''
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class ChineseNumberUnit(ChineseChar):
+    """
+    A Chinese numeric unit (place-value) character.
+    Besides the simplified/traditional forms, every character also has an
+    extra "big" (capital) form, e.g. '陆' and '陸'.
+    """
+
+    def __init__(self, power, simplified, traditional, big_s, big_t):
+        super(ChineseNumberUnit, self).__init__(simplified, traditional)
+        self.power, self.big_s, self.big_t = power, big_s, big_t
+
+    def __str__(self):
+        return '10^{}'.format(self.power)
+
+    @classmethod
+    def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
+        """Build the unit found at position ``index`` of a unit table.
+
+        ``value`` is indexed as ``value[0]`` (simplified) and ``value[1]``
+        (traditional).  The power of ten is derived from ``index``; for large
+        units the formula depends on ``numbering_type``.
+        """
+        plain, trad = value[0], value[1]
+
+        if small_unit:
+            # Small units: power is index + 1 and both big forms use value[1].
+            return ChineseNumberUnit(power=index + 1,
+                                     simplified=plain, traditional=trad, big_s=trad, big_t=trad)
+
+        if numbering_type == NUMBERING_TYPES[0]:
+            exponent = index + 8
+        elif numbering_type == NUMBERING_TYPES[1]:
+            exponent = (index + 2) * 4
+        elif numbering_type == NUMBERING_TYPES[2]:
+            exponent = pow(2, index + 3)
+        else:
+            raise ValueError(
+                'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type))
+
+        return ChineseNumberUnit(power=exponent,
+                                 simplified=plain, traditional=trad, big_s=plain, big_t=trad)
+
+
+class ChineseNumberDigit(ChineseChar):
+    """
+    A Chinese digit character.
+    """
+
+    def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
+        super(ChineseNumberDigit, self).__init__(simplified, traditional)
+        self.value = value
+        self.big_s, self.big_t = big_s, big_t
+        self.alt_s, self.alt_t = alt_s, alt_t
+
+    def __str__(self):
+        digit_value = self.value
+        return str(digit_value)
+
+    @classmethod
+    def create(cls, i, v):
+        """Build the digit with value ``i`` from the 4 forms ``v[0]`` .. ``v[3]``."""
+        simplified, traditional, big_s, big_t = v[0], v[1], v[2], v[3]
+        return ChineseNumberDigit(i, simplified, traditional, big_s, big_t)
+
+
+class ChineseMath(ChineseChar):
+    """
+    A Chinese character used as a math symbol.
+    Stores the matching ASCII ``symbol`` and an optional ``expression``
+    (``create_system`` passes a lambda).
+    """
+
+    def __init__(self, simplified, traditional, symbol, expression=None):
+        super(ChineseMath, self).__init__(simplified, traditional)
+        self.symbol, self.expression = symbol, expression
+        # The "big" forms are the same characters as the plain forms.
+        self.big_s, self.big_t = simplified, traditional
+
+
+CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
+
+
+class NumberSystem(object):
+    """
+    Chinese number system: an empty attribute container
+    (``create_system`` sets ``units``, ``digits`` and ``math`` on it).
+    """
+
+
+class MathSymbol(object):
+    """
+    Math symbols (simplified/traditional) used by the Chinese number system, e.g.
+    positive = ['正', '正']
+    negative = ['负', '負']
+    point = ['点', '點']
+    """
+
+    def __init__(self, positive, negative, point):
+        self.positive, self.negative, self.point = positive, negative, point
+
+    def __iter__(self):
+        # Yield every attribute value currently stored on the instance.
+        yield from self.__dict__.values()
+
+
+# class OtherSymbol(object):
+# """
+# 其他符号
+# """
+#
+# def __init__(self, sil):
+# self.sil = sil
+#
+# def __iter__(self):
+# for v in self.__dict__.values():
+# yield v
+
+
+# ================================================================================ #
+# basic utils
+# ================================================================================ #
+def create_system(numbering_type=NUMBERING_TYPES[1]):
+    """
+    Create and return the number system for the given numbering type (default: mid).
+    NUMBERING_TYPES = ['low', 'mid', 'high']: Chinese numbering system types
+        low:  '兆' = '亿' * '十' = $10^{9}$,  '京' = '兆' * '十', etc.
+        mid:  '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
+        high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.
+    Returns a NumberSystem with `units`, `digits` and `math` set.
+    """
+
+    # chinese number units of '亿' and larger
+    all_larger_units = zip(
+        LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
+    larger_units = [CNU.create(i, v, numbering_type, False)
+                    for i, v in enumerate(all_larger_units)]
+    # chinese number units of '十, 百, 千, 万'
+    all_smaller_units = zip(
+        SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL)
+    smaller_units = [CNU.create(i, v, small_unit=True)
+                     for i, v in enumerate(all_smaller_units)]
+    # digits: each tuple is (simplified, traditional, big simplified, big traditional);
+    # CHINESE_DIGIS is passed for both of the first two positions.
+    chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS,
+                        BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL)
+    digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)]
+    # alternative forms for zero, one and two
+    digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT
+    digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT
+    digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1]
+
+    # symbols
+    positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x)
+    negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x)
+    point_cn = CM(POINT[0], POINT[1], '.', lambda x,
+                  y: float(str(x) + '.' + str(y)))
+    # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y)))
+    system = NumberSystem()
+    # small units come first, so `units` is ordered by increasing position in the tables
+    system.units = smaller_units + larger_units
+    system.digits = digits
+    system.math = MathSymbol(positive_cn, negative_cn, point_cn)
+    # system.symbols = OtherSymbol(sil_cn)
+    return system
+
+
+def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
+    """
+    Convert a Chinese number string into a string of Arabic digits.
+
+    Returns the integer part as a string, followed by '.' and the decimal
+    digits when the input contains a decimal-point character.
+    """
+
+    def get_symbol(char, system):
+        # Look the character up among units, then digits, then math symbols.
+        # Falls through (implicitly returning None) when nothing matches.
+        for u in system.units:
+            if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
+                return u
+        for d in system.digits:
+            if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]:
+                return d
+        for m in system.math:
+            if char in [m.traditional, m.simplified]:
+                return m
+
+    def string2symbols(chinese_string, system):
+        # Split at the first decimal-point form found in the string, then map
+        # every character of both parts to its symbol object.
+        int_string, dec_string = chinese_string, ''
+        for p in [system.math.point.simplified, system.math.point.traditional]:
+            if p in chinese_string:
+                int_string, dec_string = chinese_string.split(p)
+                break
+        return [get_symbol(c, system) for c in int_string], \
+               [get_symbol(c, system) for c in dec_string]
+
+    def correct_symbols(integer_symbols, system):
+        """
+        Normalize abbreviated forms, e.g.
+        一百八 to 一百八十
+        一亿一千三百万 to 一亿 一千万 三百万
+        """
+
+        # A leading unit of power 1 gets the digit one put in front of it.
+        if integer_symbols and isinstance(integer_symbols[0], CNU):
+            if integer_symbols[0].power == 1:
+                integer_symbols = [system.digits[1]] + integer_symbols
+
+        # A trailing digit directly after a unit gets an implied unit one power lower.
+        if len(integer_symbols) > 1:
+            if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU):
+                integer_symbols.append(
+                    CNU(integer_symbols[-2].power - 1, None, None, None, None))
+
+        result = []
+        unit_count = 0
+        for s in integer_symbols:
+            if isinstance(s, CND):
+                result.append(s)
+                unit_count = 0
+            elif isinstance(s, CNU):
+                current_unit = CNU(s.power, None, None, None, None)
+                unit_count += 1
+
+            if unit_count == 1:
+                result.append(current_unit)
+            elif unit_count > 1:
+                # Consecutive units: add the new power to every smaller unit already collected.
+                for i in range(len(result)):
+                    if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power:
+                        result[-i - 1] = CNU(result[-i - 1].power +
+                                             current_unit.power, None, None, None, None)
+        return result
+
+    def compute_value(integer_symbols):
+        """
+        Compute the value.
+        When current unit is larger than previous unit, current unit * all previous units will be used as all previous units.
+        e.g. '两千万' = 2000 * 10000 not 2000 + 10000
+        """
+        value = [0]
+        last_power = 0
+        for s in integer_symbols:
+            if isinstance(s, CND):
+                value[-1] = s.value
+            elif isinstance(s, CNU):
+                value[-1] *= pow(10, s.power)
+                if s.power > last_power:
+                    value[:-1] = list(map(lambda v: v *
+                                                    pow(10, s.power), value[:-1]))
+                    last_power = s.power
+                value.append(0)
+        return sum(value)
+
+    system = create_system(numbering_type)
+    int_part, dec_part = string2symbols(chinese_string, system)
+    int_part = correct_symbols(int_part, system)
+    int_str = str(compute_value(int_part))
+    # NOTE(review): an unrecognized decimal character yields None from get_symbol,
+    # so `d.value` would raise AttributeError here - confirm inputs are pre-validated.
+    dec_str = ''.join([str(d.value) for d in dec_part])
+    if dec_part:
+        return '{0}.{1}'.format(int_str, dec_str)
+    else:
+        return int_str
+
+
+def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False,
+            traditional=False, alt_zero=False, alt_one=False, alt_two=True,
+            use_zeros=True, use_units=True):
+    """
+    Convert a string of Arabic digits (optionally with one '.') into Chinese.
+
+    big / traditional select which character form of each symbol is emitted;
+    alt_zero / alt_one replace the zero / one character by its alternative form;
+    alt_two swaps in the alternative form of two under the conditions checked below;
+    use_units=False reads the integer part digit by digit.
+    NOTE(review): `use_zeros` is not forwarded to get_value, which therefore
+    always runs with its own default - confirm whether that is intended.
+    Raises ValueError when the input contains more than one '.'.
+    """
+
+    def get_value(value_string, use_zeros=True):
+
+        striped_string = value_string.lstrip('0')
+
+        # record nothing if all zeros
+        if not striped_string:
+            return []
+
+        # record one digits
+        elif len(striped_string) == 1:
+            if use_zeros and len(value_string) != len(striped_string):
+                return [system.digits[0], system.digits[int(striped_string)]]
+            else:
+                return [system.digits[int(striped_string)]]
+
+        # recursively record multiple digits
+        else:
+            # largest unit whose power is smaller than the number of digits
+            result_unit = next(u for u in reversed(
+                system.units) if u.power < len(striped_string))
+            result_string = value_string[:-result_unit.power]
+            return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power:])
+
+    system = create_system(numbering_type)
+
+    int_dec = number_string.split('.')
+    if len(int_dec) == 1:
+        int_string = int_dec[0]
+        dec_string = ""
+    elif len(int_dec) == 2:
+        int_string = int_dec[0]
+        dec_string = int_dec[1]
+    else:
+        raise ValueError(
+            "invalid input num string with more than one dot: {}".format(number_string))
+
+    if use_units and len(int_string) > 1:
+        result_symbols = get_value(int_string)
+    else:
+        result_symbols = [system.digits[int(c)] for c in int_string]
+    dec_symbols = [system.digits[int(c)] for c in dec_string]
+    if dec_string:
+        result_symbols += [system.math.point] + dec_symbols
+
+    if alt_two:
+        # a digit "two" whose simplified/traditional forms are the alternative characters
+        liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t,
+                    system.digits[2].big_s, system.digits[2].big_t)
+        for i, v in enumerate(result_symbols):
+            if isinstance(v, CND) and v.value == 2:
+                next_symbol = result_symbols[i +
+                                             1] if i < len(result_symbols) - 1 else None
+                previous_symbol = result_symbols[i - 1] if i > 0 else None
+                # only when followed by a unit and preceded by a unit or nothing,
+                # and neither of those units has power 1
+                if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))):
+                    if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)):
+                        result_symbols[i] = liang
+
+    # if big is True, '两' will not be used and `alt_two` has no impact on output
+    if big:
+        attr_name = 'big_'
+        if traditional:
+            attr_name += 't'
+        else:
+            attr_name += 's'
+    else:
+        if traditional:
+            attr_name = 'traditional'
+        else:
+            attr_name = 'simplified'
+
+    result = ''.join([getattr(s, attr_name) for s in result_symbols])
+
+    # if not use_zeros:
+    #     result = result.strip(getattr(system.digits[0], attr_name))
+
+    if alt_zero:
+        result = result.replace(
+            getattr(system.digits[0], attr_name), system.digits[0].alt_s)
+
+    if alt_one:
+        result = result.replace(
+            getattr(system.digits[1], attr_name), system.digits[1].alt_s)
+
+    # a result that starts with the decimal point gets a leading zero character
+    for i, p in enumerate(POINT):
+        if result.startswith(p):
+            return CHINESE_DIGIS[0] + result
+
+    # ^10, 11, .., 19
+    # drop a leading "one" that is directly followed by the "ten" unit
+    if len(result) >= 2 and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
+                                          SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \
+            result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]:
+        result = result[1:]
+
+    return result
+
+
+# ================================================================================ #
+# different types of rewriters
+# ================================================================================ #
+class Cardinal:
+    """
+    CARDINAL: conversion between a cardinal number string and its Chinese text.
+    """
+
+    def __init__(self, cardinal=None, chntext=None):
+        self.cardinal, self.chntext = cardinal, chntext
+
+    def chntext2cardinal(self):
+        """Return ``chn2num`` applied to the stored Chinese text."""
+        number_string = chn2num(self.chntext)
+        return number_string
+
+    def cardinal2chntext(self):
+        """Return ``num2chn`` applied to the stored number string."""
+        chinese_text = num2chn(self.cardinal)
+        return chinese_text
+
+
+class Digit:
+    """
+    DIGIT: a digit sequence that is read out digit by digit.
+    """
+
+    def __init__(self, digit=None, chntext=None):
+        self.digit, self.chntext = digit, chntext
+
+    # def chntext2digit(self):
+    #     return chn2num(self.chntext)
+
+    def digit2chntext(self):
+        """Read the stored digits one by one: no units, no alternative "two"."""
+        spoken = num2chn(self.digit, alt_two=False, use_units=False)
+        return spoken
+
+
+class TelePhone:
+    """
+    TELEPHONE: a phone number that is read out digit by digit.
+    """
+
+    def __init__(self, telephone=None, raw_chntext=None, chntext=None):
+        self.telephone = telephone
+        self.raw_chntext = raw_chntext
+        self.chntext = chntext
+
+    # def chntext2telephone(self):
+    #     sil_parts = self.raw_chntext.split('<SIL>')
+    #     self.telephone = '-'.join([
+    #         str(chn2num(p)) for p in sil_parts
+    #     ])
+    #     return self.telephone
+
+    def telephone2chntext(self, fixed=False):
+        """Convert ``self.telephone`` to Chinese text and return it.
+
+        A fixed-line number (``fixed=True``) is split on '-'; any other number
+        has leading/trailing '+' removed and is split on whitespace.  Every
+        part is read digit by digit.  ``raw_chntext`` keeps the parts joined
+        by a ``<SIL>`` marker; ``chntext`` is the same text without markers.
+        """
+        if fixed:
+            parts = self.telephone.split('-')
+        else:
+            parts = self.telephone.strip('+').split()
+        # The marker literal must be non-empty: with '' the join drops the part
+        # boundaries and the replace below is a no-op.
+        self.raw_chntext = '<SIL>'.join([
+            num2chn(part, alt_two=False, use_units=False) for part in parts
+        ])
+        self.chntext = self.raw_chntext.replace('<SIL>', '')
+        return self.chntext
+
+
+class Fraction:
+    """
+    FRACTION: conversion between 'numerator/denominator' and its Chinese text.
+    """
+
+    def __init__(self, fraction=None, chntext=None):
+        self.fraction, self.chntext = fraction, chntext
+
+    def chntext2fraction(self):
+        """Chinese text -> 'numerator/denominator' (the Chinese form puts the denominator first)."""
+        denominator, numerator = self.chntext.split('分之')
+        return '/'.join((chn2num(numerator), chn2num(denominator)))
+
+    def fraction2chntext(self):
+        """'numerator/denominator' -> Chinese text with the denominator first."""
+        numerator, denominator = self.fraction.split('/')
+        return '分之'.join((num2chn(denominator), num2chn(numerator)))
+
+
+class Date:
+    """
+    DATE: converts a numeric date such as '1999年2月20日' into Chinese text.
+    """
+
+    def __init__(self, date=None, chntext=None):
+        self.date = date
+        self.chntext = chntext
+
+    def date2chntext(self):
+        """Convert ``self.date`` to Chinese text, store it in ``self.chntext`` and return it.
+
+        The year is read digit by digit; month and day are read as cardinal
+        numbers.  The final character of the day part (its suffix) is kept as is.
+        """
+        date = self.date
+        try:
+            year, other = date.strip().split('年', 1)
+            year = Digit(digit=year).digit2chntext() + '年'
+        except ValueError:
+            # no year part: the whole string is month/day
+            other = date
+            year = ''
+        if other:
+            try:
+                month, day = other.strip().split('月', 1)
+                month = Cardinal(cardinal=month).cardinal2chntext() + '月'
+            except ValueError:
+                # No month part: what is left after the year is the day.  Using
+                # the full date here would feed the year back into the day.
+                day = other
+                month = ''
+            if day:
+                day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
+        else:
+            month = ''
+            day = ''
+        chntext = year + month + day
+        self.chntext = chntext
+        return self.chntext
+
+
+class Money:
+    """
+    MONEY: converts the numbers inside a money expression into Chinese text.
+    """
+
+    def __init__(self, money=None, chntext=None):
+        self.money = money
+        self.chntext = chntext
+
+    # def chntext2money(self):
+    #     return self.money
+
+    def money2chntext(self):
+        """Replace every number in ``self.money`` by its cardinal reading.
+
+        The result is stored in ``self.chntext`` and returned; text without
+        any number is returned unchanged.
+        """
+        pattern = re.compile(r'(\d+(\.\d+)?)')
+        # Substitute each match at its own position in a single pass.  Chained
+        # str.replace calls would also rewrite digits that belong to a later,
+        # longer number (e.g. the '5' inside '15' in '5块15').
+        self.chntext = pattern.sub(
+            lambda m: Cardinal(cardinal=m.group(1)).cardinal2chntext(), self.money)
+        return self.chntext
+
+
+class Percentage:
+    """
+    PERCENTAGE: conversion between 'N%' and the Chinese form '百分之N'.
+    """
+
+    def __init__(self, percentage=None, chntext=None):
+        self.percentage = percentage
+        self.chntext = chntext
+
+    def chntext2percentage(self):
+        """Chinese text -> 'N%'."""
+        text = self.chntext.strip()
+        prefix = '百分之'
+        # str.strip(prefix) treats the prefix as a set of characters and would
+        # also eat the number itself ('百分之一百' -> '一'); remove the literal
+        # prefix once instead.
+        if text.startswith(prefix):
+            text = text[len(prefix):]
+        return chn2num(text) + '%'
+
+    def percentage2chntext(self):
+        """'N%' -> Chinese text with the '百分之' prefix."""
+        return '百分之' + num2chn(self.percentage.strip().strip('%'))
+
+
+def remove_erhua(text, er_whitelist):
+    """
+    Remove the erhua suffix '儿' from words, keeping whitelisted words intact:
+    他女儿在那边儿 -> 他女儿在那边
+
+    ``er_whitelist`` is a regular expression matching the words whose '儿' must stay.
+    """
+
+    keep_words = re.compile(er_whitelist)
+    kept = []
+    remaining = text
+    while True:
+        er_hit = re.search('儿', remaining)
+        if er_hit is None:
+            break
+        word_hit = keep_words.search(remaining)
+        if word_hit is not None and word_hit.start() <= er_hit.start():
+            # A whitelisted word starts at or before this '儿': copy through the end of the word.
+            cut = word_hit.end()
+            kept.append(remaining[:cut])
+        else:
+            # Plain erhua: copy what precedes the '儿' and skip the character itself.
+            kept.append(remaining[:er_hit.start()])
+            cut = er_hit.end()
+        remaining = remaining[cut:]
+
+    return ''.join(kept) + remaining
+
+# ================================================================================ #
+# NSW Normalizer
+# ================================================================================ #
+class NSWNormalizer:
+    """
+    Rewrites non-standard words (dates, money, phone numbers, fractions,
+    percentages and other numbers) in a text into Chinese characters.
+    """
+
+    def __init__(self, raw_text):
+        # '^' and '$' are sentinels so that patterns needing a non-digit
+        # neighbour also match at the very start/end; normalize() strips them.
+        self.raw_text = '^' + raw_text + '$'
+        self.norm_text = ''
+
+    def _particular(self):
+        """Turn '二' between two runs of ASCII letters back into '2' (e.g. O二O -> O2O)."""
+        text = self.norm_text
+        pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('particular')
+            for matcher in matchers:
+                text = text.replace(matcher[0], matcher[1]+'2'+matcher[2], 1)
+        self.norm_text = text
+        return self.norm_text
+
+    def normalize(self):
+        """Apply all rewriters in a fixed order and return the normalized text.
+
+        The order matters: specific patterns (dates, money, phone numbers, ...)
+        run before the generic digit-string and plain-number rules.
+        """
+        text = self.raw_text
+
+        # normalize dates
+        pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
+        matchers = pattern.findall(text)
+        if matchers:
+            #print('date')
+            for matcher in matchers:
+                text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
+
+        # normalize money
+        pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)")
+        matchers = pattern.findall(text)
+        if matchers:
+            #print('money')
+            for matcher in matchers:
+                text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)
+
+        # normalize fixed-line / mobile phone numbers
+        # mobile
+        # http://www.jihaoba.com/news/show/13680
+        # China Mobile:  139, 138, 137, 136, 135, 134, 159, 158, 157, 150, 151, 152, 188, 187, 182, 183, 184, 178, 198
+        # China Unicom:  130, 131, 132, 156, 155, 186, 185, 176
+        # China Telecom: 133, 153, 189, 180, 181, 177
+        pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
+        matchers = pattern.findall(text)
+        if matchers:
+            #print('telephone')
+            for matcher in matchers:
+                text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
+        # fixed line
+        pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('fixed telephone')
+            for matcher in matchers:
+                text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)
+
+        # normalize fractions
+        pattern = re.compile(r"(\d+/\d+)")
+        matchers = pattern.findall(text)
+        if matchers:
+            #print('fraction')
+            for matcher in matchers:
+                text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)
+
+        # normalize percentages
+        # Map the full-width percent sign to ASCII first so the pattern below
+        # also sees it.
+        text = text.replace('％', '%')
+        pattern = re.compile(r"(\d+(\.\d+)?%)")
+        matchers = pattern.findall(text)
+        if matchers:
+            #print('percentage')
+            for matcher in matchers:
+                text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)
+
+        # normalize plain number + quantifier
+        pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
+        matchers = pattern.findall(text)
+        if matchers:
+            #print('cardinal+quantifier')
+            for matcher in matchers:
+                text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
+
+        # normalize digit strings (serial numbers): 4 to 32 digits are read one by one
+        pattern = re.compile(r"(\d{4,32})")
+        matchers = pattern.findall(text)
+        if matchers:
+            #print('digit')
+            for matcher in matchers:
+                text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)
+
+        # normalize remaining plain numbers
+        pattern = re.compile(r"(\d+(\.\d+)?)")
+        matchers = pattern.findall(text)
+        if matchers:
+            #print('cardinal')
+            for matcher in matchers:
+                text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
+
+        self.norm_text = text
+        self._particular()
+
+        return self.norm_text.lstrip('^').rstrip('$')
+
+
+def nsw_test_case(raw_text):
+    """Print one input line and its normalized form, followed by an empty line."""
+    print('I:' + raw_text)
+    normalized = NSWNormalizer(raw_text).normalize()
+    print('O:' + normalized)
+    print('')
+
+
+def nsw_test():
+    """Run the normalizer over a fixed list of sample sentences and print the results."""
+    samples = (
+        '固话:0595-23865596或23880880。',
+        '固话:0595-23865596或23880880。',
+        '手机:+86 19859213959或15659451527。',
+        '分数:32477/76391。',
+        '百分数:80.03%。',
+        '编号:31520181154418。',
+        '纯数:2983.07克或12345.60米。',
+        '日期:1999年2月20日或09年3月15号。',
+        '金钱:12块5,34.5元,20.1万',
+        '特殊:O2O或B2C。',
+        '3456万吨',
+        '2938个',
+        '938',
+        '今天吃了115个小笼包231个馒头',
+        '有62%的概率',
+    )
+    for sample in samples:
+        nsw_test_case(sample)
+
+
+if __name__ == '__main__':
+    #nsw_test()
+
+    def _str2bool(value):
+        """argparse type for textual booleans.
+
+        ``type=bool`` turns every non-empty string (including 'False') into True.
+        """
+        lowered = value.lower()
+        if lowered in ('true', 't', 'yes', 'y', '1'):
+            return True
+        if lowered in ('false', 'f', 'no', 'n', '0', ''):
+            return False
+        raise argparse.ArgumentTypeError('boolean value expected, got {!r}'.format(value))
+
+    p = argparse.ArgumentParser()
+    p.add_argument('ifile', help='input filename, assume utf-8 encoding')
+    p.add_argument('ofile', help='output filename')
+    p.add_argument('--to_upper', action='store_true', help='convert to upper case')
+    p.add_argument('--to_lower', action='store_true', help='convert to lower case')
+    p.add_argument('--has_key', action='store_true', help="input text has Kaldi's key as first field.")
+    p.add_argument('--remove_fillers', type=_str2bool, default=True, help='remove filler chars such as "呃, 啊"')
+    p.add_argument('--remove_erhua', type=_str2bool, default=True, help='remove erhua chars such as "这儿"')
+    p.add_argument('--log_interval', type=int, default=10000, help='log interval in number of processed lines')
+    args = p.parse_args()
+
+    # The two case options exclude each other; check once, before any file is touched.
+    if args.to_upper and args.to_lower:
+        sys.stderr.write('text norm: to_upper OR to_lower?')
+        sys.exit(1)
+
+    # Every CN and EN punctuation mark is replaced by a space.  The table does
+    # not depend on the line, so build it once.
+    old_chars = CHINESE_PUNC_LIST + string.punctuation
+    punc_table = str.maketrans(old_chars, ' ' * len(old_chars), '')
+
+    n = 0
+    # Context managers close both files even when a line fails to normalize.
+    with codecs.open(args.ifile, 'r', 'utf8') as ifile, \
+            codecs.open(args.ofile, 'w+', 'utf8') as ofile:
+        for line in ifile:
+            key = ''
+            text = ''
+            if args.has_key:
+                cols = line.split(maxsplit=1)
+                if not cols:
+                    # Blank line: there is no key to carry over, so skip it.
+                    continue
+                key = cols[0]
+                if len(cols) == 2:
+                    text = cols[1].strip()
+            else:
+                text = line.strip()
+
+            # cases
+            if args.to_upper:
+                text = text.upper()
+            if args.to_lower:
+                text = text.lower()
+
+            # Filler chars removal
+            if args.remove_fillers:
+                for ch in FILLER_CHARS:
+                    text = text.replace(ch, '')
+
+            if args.remove_erhua:
+                text = remove_erhua(text, ER_WHITELIST)
+
+            # NSW(Non-Standard-Word) normalization
+            text = NSWNormalizer(text).normalize()
+
+            # Punctuations removal
+            text = text.translate(punc_table)
+
+            if args.has_key:
+                ofile.write(key + '\t' + text + '\n')
+            else:
+                ofile.write(text + '\n')
+
+            n += 1
+            if n % args.log_interval == 0:
+                sys.stderr.write("text norm: {} lines done.\n".format(n))
+
+    sys.stderr.write("text norm: {} lines done in total.\n".format(n))
diff --git a/egs_modelscope/aishell/paraformer/README.md b/egs_modelscope/aishell/paraformer/README.md
new file mode 100644
index 000000000..48a5621b1
--- /dev/null
+++ b/egs_modelscope/aishell/paraformer/README.md
@@ -0,0 +1,38 @@
+# ModelScope: Paraformer-large Model
+
+## Highlight
+
+### ModelScope: Paraformer-Large Model
+- Fast: Non-autoregressive (NAR) model, the Paraformer can achieve comparable performance to the state-of-the-art AR transformer, with more than 10x speedup.
+- Accurate: SOTA in a lot of public ASR tasks, with a very significant relative improvement, capable of industrial implementation.
+- Convenient: Quickly and easily download Paraformer-large from Modelscope for finetuning and inference.
+ - Support finetuning and inference on AISHELL-1 and AISHELL-2.
+ - Support inference on AISHELL-1, AISHELL-2, Wenetspeech, SpeechIO and other audio.
+
+## How to finetune and infer using a pretrained ModelScope Paraformer-large Model
+
+### Finetune
+- Modify finetune training related parameters in `conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml`
+- Setting parameters in `paraformer_large_finetune.sh`
+ - data_aishell: please set the aishell data path
+ - tag: exp tag
+ - init_model_name: speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch # pre-trained model, download from modelscope during fine-tuning
+- Then run the pipeline to finetune the model downloaded from ModelScope and to run inference after finetuning:
+```sh
+ sh ./paraformer_large_finetune.sh
+```
+
+### Inference
+
+Or you can download the model from ModelScope for inference directly.
+
+- Setting parameters in `paraformer_large_infer.sh`
+ - ori_data: please set the aishell raw data path
+ - data_dir: data output directory
+ - exp_dir: the result path
+ - model_name: speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch # pre-trained model, download from modelscope
+ - test_sets: please set the testsets name
+- Then you can run the pipeline to infer with:
+```sh
+ sh ./paraformer_large_infer.sh
+```
diff --git a/egs_modelscope/aishell/paraformer/RESULTS.md b/egs_modelscope/aishell/paraformer/RESULTS.md
new file mode 100644
index 000000000..516750453
--- /dev/null
+++ b/egs_modelscope/aishell/paraformer/RESULTS.md
@@ -0,0 +1,24 @@
+# Paraformer-Large
+- Model link:
+- Model size: 220M
+- Train config: conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml
+
+# Environments
+- date: `Tue Nov 22 18:48:39 CST 2022`
+- python version: `3.7.12`
+- FunASR version: `0.1.0`
+- pytorch version: `pytorch 1.7.0`
+- Git hash: ``
+- Commit date: ``
+
+# Benchmark Results
+
+## AISHELL-1
+- Decode config: conf/decode_asr_transformer_noctc_1best.yaml
+ - Decode without CTC
+ - Decode without LM
+
+| testset | CER(%)|
+|:---------:|:-----:|
+| dev | 1.75 |
+| test | 1.95 |
diff --git a/egs_modelscope/aishell/paraformer/conf/decode_asr_transformer_noctc_10best_lm_weight_0.15.yaml b/egs_modelscope/aishell/paraformer/conf/decode_asr_transformer_noctc_10best_lm_weight_0.15.yaml
new file mode 100644
index 000000000..22f02d913
--- /dev/null
+++ b/egs_modelscope/aishell/paraformer/conf/decode_asr_transformer_noctc_10best_lm_weight_0.15.yaml
@@ -0,0 +1,6 @@
+beam_size: 10
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.0
+lm_weight: 0.15
diff --git a/egs_modelscope/aishell/paraformer/conf/decode_asr_transformer_noctc_1best.yaml b/egs_modelscope/aishell/paraformer/conf/decode_asr_transformer_noctc_1best.yaml
new file mode 100644
index 000000000..e6231927c
--- /dev/null
+++ b/egs_modelscope/aishell/paraformer/conf/decode_asr_transformer_noctc_1best.yaml
@@ -0,0 +1,6 @@
+beam_size: 1
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.0
+lm_weight: 0.0
diff --git a/egs_modelscope/aishell/paraformer/conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml b/egs_modelscope/aishell/paraformer/conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml
new file mode 100644
index 000000000..e9210f373
--- /dev/null
+++ b/egs_modelscope/aishell/paraformer/conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml
@@ -0,0 +1,91 @@
+# network architecture
+# encoder related
+encoder_conf:
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.1
+
+# decoder related
+decoder_conf:
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.1
+ src_attention_dropout_rate: 0.1
+
+predictor_conf:
+ threshold: 1.0
+ l_order: 1
+ r_order: 1
+ tail_threshold: 0.45
+
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.0
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: true
+ predictor_weight: 1.0
+ predictor_bias: 1
+ sampling_ratio: 0.75
+
+# minibatch related
+# dataset_type: small
+batch_type: length
+batch_bins: 2000
+num_workers: 16
+# dataset_type: large
+dataset_conf:
+ filter_conf:
+ min_length: 10
+ max_length: 250
+ min_token_length: 1
+ max_token_length: 200
+ shuffle: true
+ shuffle_conf:
+ shuffle_size: 10240
+ sort_size: 500
+ batch_conf:
+ batch_type: 'token'
+ batch_size: 6000
+ num_workers: 16
+
+# optimization related
+accum_grad: 1
+grad_clip: 5
+max_epoch: 20
+val_scheduler_criterion:
+ - valid
+ - acc
+best_model_criterion:
+- - valid
+ - acc
+ - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+ lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 30000
+
+specaug: specaug_lfr
+specaug_conf:
+ apply_time_warp: false
+ time_warp_window: 5
+ time_warp_mode: bicubic
+ apply_freq_mask: true
+ freq_mask_width_range:
+ - 0
+ - 30
+ lfr_rate: 6
+ num_freq_mask: 1
+ apply_time_mask: true
+ time_mask_width_range:
+ - 0
+ - 12
+ num_time_mask: 1
+
+unused_parameters: true
+log_interval: 50
+normalize: None
+split_with_space: true
diff --git a/egs_modelscope/aishell/paraformer/local/aishell_data_prep.sh b/egs_modelscope/aishell/paraformer/local/aishell_data_prep.sh
new file mode 100755
index 000000000..83f489b3c
--- /dev/null
+++ b/egs_modelscope/aishell/paraformer/local/aishell_data_prep.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+# Prepare wav.scp and text for the AISHELL train/dev/test sets under
+# <output-path>/data/{train,dev,test}.
+
+#. ./path.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <audio-path> <text-path> <output-path>"
+  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
+  exit 1;
+fi
+
+aishell_audio_dir=$1
+aishell_text=$2/aishell_transcript_v0.8.txt
+output_dir=$3
+
+train_dir=$output_dir/data/local/train
+dev_dir=$output_dir/data/local/dev
+test_dir=$output_dir/data/local/test
+tmp_dir=$output_dir/data/local/tmp
+
+mkdir -p "$train_dir"
+mkdir -p "$dev_dir"
+mkdir -p "$test_dir"
+mkdir -p "$tmp_dir"
+
+# data directory check
+if [ ! -d "$aishell_audio_dir" ] || [ ! -f "$aishell_text" ]; then
+  echo "Error: $0 requires an existing audio directory and a transcript directory containing aishell_transcript_v0.8.txt"
+  exit 1;
+fi
+
+# find wav audio file for train, dev and test resp.
+find "$aishell_audio_dir" -iname "*.wav" > "$tmp_dir/wav.flist"
+n=$(wc -l < "$tmp_dir/wav.flist")
+# A different file count only triggers a warning; processing continues.
+[ $n -ne 141925 ] && \
+  echo Warning: expected 141925 data files, found $n
+
+grep -i "wav/train" "$tmp_dir/wav.flist" > "$train_dir/wav.flist" || exit 1;
+grep -i "wav/dev" "$tmp_dir/wav.flist" > "$dev_dir/wav.flist" || exit 1;
+grep -i "wav/test" "$tmp_dir/wav.flist" > "$test_dir/wav.flist" || exit 1;
+
+rm -r "$tmp_dir"
+
+# Transcriptions preparation
+for dir in "$train_dir" "$dev_dir" "$test_dir"; do
+  echo Preparing $dir transcriptions
+  # utterance id = wav file name without directory and extension
+  sed -e 's/\.wav//' "$dir/wav.flist" | awk -F '/' '{print $NF}' > "$dir/utt.list"
+  paste -d' ' "$dir/utt.list" "$dir/wav.flist" > "$dir/wav.scp_all"
+  utils/filter_scp.pl -f 1 "$dir/utt.list" "$aishell_text" > "$dir/transcripts.txt"
+  # rebuild the utterance list from the transcript ids kept above, then
+  # filter the wav list by it
+  awk '{print $1}' "$dir/transcripts.txt" > "$dir/utt.list"
+  utils/filter_scp.pl -f 1 "$dir/utt.list" "$dir/wav.scp_all" | sort -u > "$dir/wav.scp"
+  sort -u "$dir/transcripts.txt" > "$dir/text"
+done
+
+mkdir -p "$output_dir/data/train" "$output_dir/data/dev" "$output_dir/data/test"
+
+for f in wav.scp text; do
+  cp "$train_dir/$f" "$output_dir/data/train/$f" || exit 1;
+  cp "$dev_dir/$f" "$output_dir/data/dev/$f" || exit 1;
+  cp "$test_dir/$f" "$output_dir/data/test/$f" || exit 1;
+done
+
+echo "$0: AISHELL data preparation succeeded"
+exit 0;
diff --git a/egs_modelscope/aishell/paraformer/modelscope_utils b/egs_modelscope/aishell/paraformer/modelscope_utils
new file mode 120000
index 000000000..fc97768c8
--- /dev/null
+++ b/egs_modelscope/aishell/paraformer/modelscope_utils
@@ -0,0 +1 @@
+../../common/modelscope_utils
\ No newline at end of file
diff --git a/egs_modelscope/aishell/paraformer/paraformer_large_finetune.sh b/egs_modelscope/aishell/paraformer/paraformer_large_finetune.sh
new file mode 100755
index 000000000..a68338fb9
--- /dev/null
+++ b/egs_modelscope/aishell/paraformer/paraformer_large_finetune.sh
@@ -0,0 +1,224 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machines configuration
+CUDA_VISIBLE_DEVICES="0,1" # set gpus, e.g., CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=2
+count=1
+gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding
+njob=4 # the number of jobs for each gpu
+train_cmd=utils/run.pl
+
+# general configuration
+feats_dir="." # feature output directory, for large data
+exp_dir="."
+lang=zh
+dumpdir=dump/fbank
+feats_type=fbank
+token_type=char
+scp=feats.scp
+type=kaldi_ark
+stage=0
+stop_stage=4
+
+# feature configuration
+feats_dim=560
+sample_frequency=16000
+nj=32
+speed_perturb="1.0"
+lfr=True
+lfr_m=7
+lfr_n=6
+
+init_model_name=speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch # pre-trained model, download from modelscope during fine-tuning
+cmvn_file=init_model/${init_model_name}/am.mvn
+seg_file=init_model/${init_model_name}/seg_dict
+vocab=init_model/${init_model_name}/tokens.txt
+
+# data
+data_aishell=
+
+# exp tag
+tag=""
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train
+valid_set=dev
+test_sets="dev test"
+
+asr_config=conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml
+init_param="init_model/${init_model_name}/${init_model_name}"
+
+inference_config=conf/decode_asr_transformer_noctc_1best.yaml
+inference_asr_model=valid.acc.ave_10best.pth
+
+. utils/parse_options.sh || exit 1;
+
+# download model from modelscope
+python modelscope_utils/download_model.py --model_name ${init_model_name}
+
+if [ ! -d ${HOME}/.cache/modelscope/hub/damo/${init_model_name} ]; then
+ echo "${HOME}/.cache/modelscope/hub/damo/${init_model_name} must exist"
+ exit 1
+else
+ if [ -d init_model/${init_model_name} ]; then
+ echo "init_model/${init_model_name} is already exists. if you want to decode again, please delete init_model/${init_model_name} first."
+ else
+ mkdir -p init_model/${init_model_name}
+ cp -r ${HOME}/.cache/modelscope/hub/damo/${init_model_name}/* init_model/${init_model_name}
+ fi
+fi
+
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
+
+# you can set gpu num for decoding here
+gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+inference_nj=$[${ngpu}*${njob}]
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ echo "stage 0: Data preparation"
+ # Data preparation
+ local/aishell_data_prep.sh ${data_aishell}/data_aishell/wav ${data_aishell}/data_aishell/transcript ${feats_dir}
+ for x in train dev test; do
+ cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org
+ paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \
+ > ${feats_dir}/data/${x}/text
+ rm ${feats_dir}/data/${x}/text.org
+ done
+fi
+
+feat_train_dir=${feats_dir}/${dumpdir}/train; mkdir -p ${feat_train_dir}
+feat_dev_dir=${feats_dir}/${dumpdir}/dev; mkdir -p ${feat_dev_dir}
+feat_test_dir=${feats_dir}/${dumpdir}/test; mkdir -p ${feat_test_dir}
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ echo "Feature Generation"
+ # compute fbank features
+ fbankdir=${feats_dir}/fbank
+ utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --speed_perturb ${speed_perturb} \
+ ${feats_dir}/data/train ${exp_dir}/exp/make_fbank/train ${fbankdir}/train
+ utils/fix_data_feat.sh ${fbankdir}/train
+ utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj \
+ ${feats_dir}/data/dev ${exp_dir}/exp/make_fbank/dev ${fbankdir}/dev
+ utils/fix_data_feat.sh ${fbankdir}/dev
+ utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj \
+ ${feats_dir}/data/test ${exp_dir}/exp/make_fbank/test ${fbankdir}/test
+ utils/fix_data_feat.sh ${fbankdir}/test
+
+ echo "apply low_frame_rate and cmvn"
+ [ ! -f ${cmvn_file} ] && echo "$0: cmvn file is required" && exit 1;
+ utils/apply_lfr_and_cmvn.sh --cmd "$train_cmd" --nj $nj \
+ --lfr $lfr --lfr-m $lfr_m --lfr-n $lfr_n \
+ ${fbankdir}/train ${cmvn_file} ${exp_dir}/exp/make_fbank/train ${feat_train_dir}
+ utils/apply_lfr_and_cmvn.sh --cmd "$train_cmd" --nj $nj \
+ --lfr $lfr --lfr-m $lfr_m --lfr-n $lfr_n \
+ ${fbankdir}/dev ${cmvn_file} ${exp_dir}/exp/make_fbank/dev ${feat_dev_dir}
+ utils/apply_lfr_and_cmvn.sh --cmd "$train_cmd" --nj $nj \
+ --lfr $lfr --lfr-m $lfr_m --lfr-n $lfr_n \
+ ${fbankdir}/test ${cmvn_file} ${exp_dir}/exp/make_fbank/test ${feat_test_dir}
+
+ echo "Text Tokenize"
+ # 我爱reading->我 爱 read@@ ing
+ utils/text_tokenize.sh --cmd "$train_cmd" --nj $nj ${fbankdir}/train ${seg_file} ${feat_train_dir}/log ${feat_train_dir}
+ utils/fix_data_feat.sh ${feat_train_dir}
+ utils/text_tokenize.sh --cmd "$train_cmd" --nj $nj ${fbankdir}/dev ${seg_file} ${feat_dev_dir}/log ${feat_dev_dir}
+ utils/fix_data_feat.sh ${feat_dev_dir}
+ cp ${fbankdir}/test/text ${feat_test_dir}
+fi
+
+token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ echo "stage 2: Dictionary Preparation"
+ mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+ cp $vocab ${token_list}
+
+ vocab_size=$(wc -l <${token_list})
+ awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char
+ awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char
+ mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/train
+ mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/dev
+ cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/train
+ cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/dev
+fi
+
+# Training Stage
+world_size=$gpu_num # run on one machine
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # update asr train config.yaml
+ python modelscope_utils/update_config.py --modelscope_config init_model/${init_model_name}/asr_train_config.yaml --finetune_config ${asr_config} --output_config init_model/${init_model_name}/asr_finetune_config.yaml
+ finetune_config=init_model/${init_model_name}/asr_finetune_config.yaml
+
+ mkdir -p ${exp_dir}/exp/${model_dir}/log # -p also creates the experiment dir itself
+ INIT_FILE=$exp_dir/ddp_init
+ if [ -f $INIT_FILE ];then
+ rm -f $INIT_FILE
+ fi
+ init_method=file://$(readlink -f $INIT_FILE)
+ echo "$0: init method is $init_method"
+ train_pids="" # PIDs of the background per-GPU processes; a bare 'wait' would always return 0 and hide their failures
+ for ((i = 0; i < $gpu_num; ++i)); do
+ {
+ rank=$i; local_rank=$i # single machine: global rank and local rank are both the loop index
+ gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
+ asr_train_paraformer.py \
+ --gpu_id $gpu_id \
+ --use_preprocessor true \
+ --token_type $token_type \
+ --token_list $token_list \
+ --train_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${train_set}/${scp},speech,${type} \
+ --train_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${train_set}/text,text,text \
+ --train_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${train_set}/speech_shape \
+ --train_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${train_set}/text_shape.char \
+ --valid_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${valid_set}/${scp},speech,${type} \
+ --valid_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${valid_set}/text,text,text \
+ --valid_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}/speech_shape \
+ --valid_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}/text_shape.char \
+ --resume true \
+ --output_dir ${exp_dir}/exp/${model_dir} \
+ --init_param $init_param \
+ --config $finetune_config \
+ --input_size $feats_dim \
+ --ngpu $gpu_num \
+ --num_worker_count $count \
+ --multiprocessing_distributed true \
+ --dist_init_method $init_method \
+ --dist_world_size $world_size \
+ --dist_rank $rank \
+ --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+ } &
+ train_pids="$train_pids $!"
+ done
+ for pid in $train_pids; do wait $pid || { echo "$0: training process $pid failed, see ${exp_dir}/exp/${model_dir}/log" >&2; exit 1; }; done
+fi
+
+# Testing Stage
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ ./utils/easy_asr_infer.sh \
+ --lang zh \
+ --datadir ${feats_dir} \
+ --feats_type ${feats_type} \
+ --feats_dim ${feats_dim} \
+ --token_type ${token_type} \
+ --gpu_inference ${gpu_inference} \
+ --inference_config "${inference_config}" \
+ --test_sets "${test_sets}" \
+ --token_list $token_list \
+ --asr_exp ${exp_dir}/exp/${model_dir} \
+ --stage 12 \
+ --stop_stage 12 \
+ --scp $scp \
+ --text text \
+ --inference_nj $inference_nj \
+ --njob $njob \
+ --inference_asr_model $inference_asr_model \
+ --gpuid_list $gpuid_list \
+ --mode paraformer
+fi
+
diff --git a/egs_modelscope/aishell/paraformer/paraformer_large_infer.sh b/egs_modelscope/aishell/paraformer/paraformer_large_infer.sh
new file mode 100755
index 000000000..8e2c8f33d
--- /dev/null
+++ b/egs_modelscope/aishell/paraformer/paraformer_large_infer.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+ori_data=
+data_dir=
+exp_dir=
+model_name=speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
+inference_nj= # total number of decoding jobs; derived below unless set with --inference_nj
+gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
+ngpu= # number of gpus, always derived from gpuid_list below
+njob=4 # the number of jobs for each gpu
+gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding
+
+# LM configs
+use_lm=false
+beam_size=1
+lm_weight=0.0
+
+test_sets="dev test"
+
+. utils/parse_options.sh
+
+# Derive the job count after option parsing so overrides of --gpuid_list, --njob and --gpu_inference are honoured.
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+if [ -z "${inference_nj}" ]; then
+ if ${gpu_inference}; then inference_nj=$((ngpu * njob)); else inference_nj=$njob; fi
+fi
+
+aishell_audio_dir=$ori_data/data_aishell/wav
+aishell_text=$ori_data/data_aishell/transcript/aishell_transcript_v0.8.txt
+dev_dir=${data_dir}/aishell/dev
+test_dir=${data_dir}/aishell/test
+tmp_dir=${data_dir}/aishell/tmp
+
+mkdir -p ${dev_dir}
+mkdir -p ${test_dir}
+mkdir -p ${tmp_dir}
+
+find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+
+for dir in $dev_dir $test_dir; do
+ sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+ paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+ utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
+ awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+ utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+ sort -u $dir/transcripts.txt > $dir/text
+done
+
+mkdir -p ${exp_dir}/aishell
+
+modelscope_utils/modelscope_infer.sh \
+ --data_dir ${data_dir}/aishell \
+ --exp_dir ${exp_dir}/aishell \
+ --test_sets "${test_sets}" \
+ --model_name ${model_name} \
+ --inference_nj ${inference_nj} \
+ --gpuid_list ${gpuid_list} \
+ --njob ${njob} \
+ --gpu_inference ${gpu_inference} \
+ --use_lm ${use_lm} \
+ --beam_size ${beam_size} \
+ --lm_weight ${lm_weight}
diff --git a/egs_modelscope/aishell/paraformer/path.sh b/egs_modelscope/aishell/paraformer/path.sh
new file mode 100755
index 000000000..7972642d0
--- /dev/null
+++ b/egs_modelscope/aishell/paraformer/path.sh
@@ -0,0 +1,5 @@
+export FUNASR_DIR=$PWD/../../..
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
diff --git a/egs_modelscope/aishell/paraformer/utils b/egs_modelscope/aishell/paraformer/utils
new file mode 120000
index 000000000..37d976175
--- /dev/null
+++ b/egs_modelscope/aishell/paraformer/utils
@@ -0,0 +1 @@
+../../../egs/aishell/tranformer/utils/
\ No newline at end of file
diff --git a/egs_modelscope/aishell2/paraformer/README.md b/egs_modelscope/aishell2/paraformer/README.md
new file mode 100644
index 000000000..46bd3ad71
--- /dev/null
+++ b/egs_modelscope/aishell2/paraformer/README.md
@@ -0,0 +1,39 @@
+# ModelScope: Paraformer-large Model
+
+## Highlight
+
+### ModelScope: Paraformer-Large Model
+- Fast: Non-autoregressive (NAR) model, the Paraformer can achieve comparable performance to the state-of-the-art AR transformer, with more than 10x speedup.
+- Accurate: SOTA in a lot of public ASR tasks, with a very significant relative improvement, capable of industrial implementation.
+- Convenient: Quickly and easily download Paraformer-large from Modelscope for finetuning and inference.
+ - Support finetuning and inference on AISHELL-1 and AISHELL-2.
+ - Support inference on AISHELL-1, AISHELL-2, Wenetspeech, SpeechIO and other audio.
+
+## How to finetune and infer using a pretrained ModelScope Paraformer-large Model
+
+### Finetune
+- Modify finetune training related parameters in `conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml`
+- Setting parameters in `paraformer_large_finetune.sh`
+ - tr_dir: please set the aishell2 train data path
+ - dev_tst_dir: please set the aishell2 dev/test data path
+ - tag: exp tag
+ - init_model_name: speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch # pre-trained model, download from modelscope during fine-tuning
+- Then run the pipeline to finetune the model downloaded from ModelScope and run inference with the finetuned model:
+```sh
+ bash ./paraformer_large_finetune.sh
+```
+
+### Inference
+
+Or you can download the model from ModelScope for inference directly.
+
+- Setting parameters in `paraformer_large_infer.sh`
+ - ori_data: please set the aishell2 dev/test raw data path
+ - data_dir: data output directory
+ - exp_dir: the result path
+ - model_name: speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch # pre-trained model, download from modelscope
+ - test_sets: please set the testsets name
+- Then you can run the pipeline to infer with:
+```sh
+ bash ./paraformer_large_infer.sh
+```
diff --git a/egs_modelscope/aishell2/paraformer/RESULTS.md b/egs_modelscope/aishell2/paraformer/RESULTS.md
new file mode 100644
index 000000000..a265a749c
--- /dev/null
+++ b/egs_modelscope/aishell2/paraformer/RESULTS.md
@@ -0,0 +1,26 @@
+# Paraformer-Large
+- Model link:
+- Model size: 220M
+- Train config: conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml
+
+# Environments
+- date: `Tue Nov 22 18:48:39 CST 2022`
+- python version: `3.7.12`
+- FunASR version: `0.1.0`
+- pytorch version: `pytorch 1.7.0`
+- Git hash: ``
+- Commit date: ``
+
+# Benchmark Results
+
+## AISHELL-2
+- Decode config: conf/decode_asr_transformer_noctc_1best.yaml
+ - Decode without CTC
+ - Decode without LM
+
+| testset | CER(%)|
+|:------------:|:-----:|
+| dev_ios | 2.80 |
+| test_android | 3.13 |
+| test_ios | 2.85 |
+| test_mic | 3.06 |
diff --git a/egs_modelscope/aishell2/paraformer/conf/decode_asr_transformer_noctc_10best_lm_weight_0.15.yaml b/egs_modelscope/aishell2/paraformer/conf/decode_asr_transformer_noctc_10best_lm_weight_0.15.yaml
new file mode 100644
index 000000000..22f02d913
--- /dev/null
+++ b/egs_modelscope/aishell2/paraformer/conf/decode_asr_transformer_noctc_10best_lm_weight_0.15.yaml
@@ -0,0 +1,6 @@
+beam_size: 10
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.0
+lm_weight: 0.15
diff --git a/egs_modelscope/aishell2/paraformer/conf/decode_asr_transformer_noctc_1best.yaml b/egs_modelscope/aishell2/paraformer/conf/decode_asr_transformer_noctc_1best.yaml
new file mode 100644
index 000000000..e6231927c
--- /dev/null
+++ b/egs_modelscope/aishell2/paraformer/conf/decode_asr_transformer_noctc_1best.yaml
@@ -0,0 +1,6 @@
+beam_size: 1
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.0
+lm_weight: 0.0
diff --git a/egs_modelscope/aishell2/paraformer/conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml b/egs_modelscope/aishell2/paraformer/conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml
new file mode 100644
index 000000000..e9210f373
--- /dev/null
+++ b/egs_modelscope/aishell2/paraformer/conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml
@@ -0,0 +1,91 @@
+# network architecture
+# encoder related
+encoder_conf:
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.1
+
+# decoder related
+decoder_conf:
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.1
+ src_attention_dropout_rate: 0.1
+
+predictor_conf:
+ threshold: 1.0
+ l_order: 1
+ r_order: 1
+ tail_threshold: 0.45
+
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.0
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: true
+ predictor_weight: 1.0
+ predictor_bias: 1
+ sampling_ratio: 0.75
+
+# minibatch related
+# dataset_type: small
+batch_type: length
+batch_bins: 2000
+num_workers: 16
+# dataset_type: large
+dataset_conf:
+ filter_conf:
+ min_length: 10
+ max_length: 250
+ min_token_length: 1
+ max_token_length: 200
+ shuffle: true
+ shuffle_conf:
+ shuffle_size: 10240
+ sort_size: 500
+ batch_conf:
+ batch_type: 'token'
+ batch_size: 6000
+ num_workers: 16
+
+# optimization related
+accum_grad: 1
+grad_clip: 5
+max_epoch: 20
+val_scheduler_criterion:
+ - valid
+ - acc
+best_model_criterion:
+- - valid
+ - acc
+ - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+ lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 30000
+
+specaug: specaug_lfr
+specaug_conf:
+ apply_time_warp: false
+ time_warp_window: 5
+ time_warp_mode: bicubic
+ apply_freq_mask: true
+ freq_mask_width_range:
+ - 0
+ - 30
+ lfr_rate: 6
+ num_freq_mask: 1
+ apply_time_mask: true
+ time_mask_width_range:
+ - 0
+ - 12
+ num_time_mask: 1
+
+unused_parameters: true
+log_interval: 50
+normalize: None
+split_with_space: true
diff --git a/egs_modelscope/aishell2/paraformer/local/aishell2_data_prep.sh b/egs_modelscope/aishell2/paraformer/local/aishell2_data_prep.sh
new file mode 100755
index 000000000..77791f9c1
--- /dev/null
+++ b/egs_modelscope/aishell2/paraformer/local/aishell2_data_prep.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
+# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
+# Apache 2.0
+
+# transform raw AISHELL-2 data to kaldi format
+
+. ./path.sh || exit 1;
+
+tmp=
+dir=
+
+if [ $# != 3 ]; then # exactly three positional arguments: corpus dir, tmp dir, output dir
+ echo "Usage: $0 <corpus-data-dir> <tmp-dir> <output-dir>"
+ echo " $0 /export/AISHELL-2/iOS/train data/local/train data/train"
+ exit 1;
+fi
+
+corpus=$1
+tmp=$2
+dir=$3
+
+echo "prepare_data.sh: Preparing data in $corpus"
+
+mkdir -p $tmp
+mkdir -p $dir
+
+# corpus check
+if [ ! -d $corpus ] || [ ! -f $corpus/wav.scp ] || [ ! -f $corpus/trans.txt ]; then
+ echo "Error: $0 requires wav.scp and trans.txt under $corpus directory."
+ exit 1;
+fi
+
+# validate utt-key list, IC0803W0380 is a bad utterance
+awk '{print $1}' $corpus/wav.scp | grep -v 'IC0803W0380' > $tmp/wav_utt.list
+awk '{print $1}' $corpus/trans.txt > $tmp/trans_utt.list
+utils/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list
+
+# wav.scp
+awk -F'\t' -v path_prefix=$corpus '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp
+utils/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp
+
+# text
+utils/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/text
+
+# copy prepared resources from tmp_dir to target dir
+mkdir -p $dir
+for f in wav.scp text; do
+ cp $tmp/$f $dir/$f || exit 1;
+done
+
+echo "local/prepare_data.sh succeeded"
+exit 0;
diff --git a/egs_modelscope/aishell2/paraformer/modelscope_utils b/egs_modelscope/aishell2/paraformer/modelscope_utils
new file mode 120000
index 000000000..fc97768c8
--- /dev/null
+++ b/egs_modelscope/aishell2/paraformer/modelscope_utils
@@ -0,0 +1 @@
+../../common/modelscope_utils
\ No newline at end of file
diff --git a/egs_modelscope/aishell2/paraformer/paraformer_large_finetune.sh b/egs_modelscope/aishell2/paraformer/paraformer_large_finetune.sh
new file mode 100755
index 000000000..d4b5dde73
--- /dev/null
+++ b/egs_modelscope/aishell2/paraformer/paraformer_large_finetune.sh
@@ -0,0 +1,239 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machines configuration
+CUDA_VISIBLE_DEVICES="0,1" # set gpus, e.g., CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=2
+count=1
+gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding
+njob=4 # the number of jobs for each gpu
+train_cmd=utils/run.pl
+
+# general configuration
+feats_dir="." # feature output directory, for large data
+exp_dir="."
+lang=zh
+dumpdir=dump/fbank
+feats_type=fbank
+token_type=char
+scp=feats.scp
+type=kaldi_ark
+stage=0
+stop_stage=4
+
+# feature configuration
+feats_dim=560
+sample_frequency=16000
+nj=100
+speed_perturb="1.0"
+lfr=True
+lfr_m=7
+lfr_n=6
+
+init_model_name=speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch # pre-trained model, download from modelscope during fine-tuning
+cmvn_file=init_model/${init_model_name}/am.mvn
+seg_file=init_model/${init_model_name}/seg_dict
+vocab=init_model/${init_model_name}/tokens.txt
+
+# data
+tr_dir=
+dev_tst_dir=
+
+# exp tag
+tag=""
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train
+valid_set=dev_ios
+test_sets="dev_ios test_android test_ios test_mic"
+
+asr_config=conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml
+init_param="init_model/${init_model_name}/${init_model_name}"
+
+inference_config=conf/decode_asr_transformer_noctc_1best.yaml
+inference_asr_model=valid.acc.ave_10best.pth
+
+. utils/parse_options.sh || exit 1;
+
+# download model from modelscope
+python modelscope_utils/download_model.py --model_name ${init_model_name}
+
+if [ ! -d ${HOME}/.cache/modelscope/hub/damo/${init_model_name} ]; then
+ echo "${HOME}/.cache/modelscope/hub/damo/${init_model_name} must exist"
+ exit 1
+else
+ if [ -d init_model/${init_model_name} ]; then
+ echo "init_model/${init_model_name} is already exists. if you want to decode again, please delete init_model/${init_model_name} first."
+ else
+ mkdir -p init_model/${init_model_name}
+ cp -r ${HOME}/.cache/modelscope/hub/damo/${init_model_name}/* init_model/${init_model_name}
+ fi
+fi
+
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
+
+# you can set gpu num for decoding here
+gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+inference_nj=$[${ngpu}*${njob}]
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ echo "stage 0: Data preparation"
+ # For training set
+ local/aishell2_data_prep.sh ${tr_dir} ${feats_dir}/data/local/train ${feats_dir}/data/train || exit 1;
+ # For dev and test sets of each recording channel (directory names are lowercased via ${x,,})
+ for x in Android iOS Mic; do
+ local/aishell2_data_prep.sh ${dev_tst_dir}/${x}/dev ${feats_dir}/data/local/dev_${x,,} ${feats_dir}/data/dev_${x,,} || exit 1;
+ local/aishell2_data_prep.sh ${dev_tst_dir}/${x}/test ${feats_dir}/data/local/test_${x,,} ${feats_dir}/data/test_${x,,} || exit 1;
+ done
+ # Normalize text: lowercase Latin letters and remove spaces inside the transcript (utterance id in field 1 is kept as is)
+ for x in train dev_android dev_ios dev_mic test_android test_ios test_mic; do
+ mv ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org
+ paste -d " " <(cut -f 1 ${feats_dir}/data/${x}/text.org) <(cut -f 2- ${feats_dir}/data/${x}/text.org \
+ | tr 'A-Z' 'a-z' | tr -d " ") \
+ > ${feats_dir}/data/${x}/text
+ rm ${feats_dir}/data/${x}/text.org
+ done
+fi
+
+feat_train_dir=${feats_dir}/${dumpdir}/${train_set}; mkdir -p ${feat_train_dir}
+feat_dev_dir=${feats_dir}/${dumpdir}/${valid_set}; mkdir -p ${feat_dev_dir}
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ echo "Feature Generation"
+ # compute fbank features
+ fbankdir=${feats_dir}/fbank
+ utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --speed_perturb ${speed_perturb} \
+ ${feats_dir}/data/train ${exp_dir}/exp/make_fbank/train ${fbankdir}/train
+ utils/fix_data_feat.sh ${fbankdir}/train
+ for x in android ios mic; do
+ utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj \
+ ${feats_dir}/data/dev_${x} ${exp_dir}/exp/make_fbank/dev_${x} ${fbankdir}/dev_${x}
+ utils/fix_data_feat.sh ${fbankdir}/dev_${x}
+ utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj \
+ ${feats_dir}/data/test_${x} ${exp_dir}/exp/make_fbank/test_${x} ${fbankdir}/test_${x}
+ utils/fix_data_feat.sh ${fbankdir}/test_${x}
+ done
+
+ echo "apply low_frame_rate and cmvn"
+ [ ! -f ${cmvn_file} ] && echo "$0: cmvn file is required" && exit 1;
+ utils/apply_lfr_and_cmvn.sh --cmd "$train_cmd" --nj $nj \
+ --lfr $lfr --lfr-m $lfr_m --lfr-n $lfr_n \
+ ${fbankdir}/${train_set} ${cmvn_file} ${exp_dir}/exp/make_fbank/train ${feat_train_dir}
+ utils/apply_lfr_and_cmvn.sh --cmd "$train_cmd" --nj $nj \
+ --lfr $lfr --lfr-m $lfr_m --lfr-n $lfr_n \
+ ${fbankdir}/${valid_set} ${cmvn_file} ${exp_dir}/exp/make_fbank/dev ${feat_dev_dir}
+ for x in android ios mic; do
+ feat_test_dir=${feats_dir}/${dumpdir}/test_${x}; mkdir -p ${feat_test_dir} # -p: an existing dir must not abort a re-run under 'set -e'
+ utils/apply_lfr_and_cmvn.sh --cmd "$train_cmd" --nj $nj \
+ --lfr $lfr --lfr-m $lfr_m --lfr-n $lfr_n \
+ ${fbankdir}/test_${x} ${cmvn_file} ${exp_dir}/exp/make_fbank/test_${x} ${feat_test_dir}
+ done
+
+ echo "Text Tokenize"
+ # 我爱reading->我 爱 read@@ ing
+ utils/text_tokenize.sh --cmd "$train_cmd" --nj $nj ${fbankdir}/${train_set} ${seg_file} ${feat_train_dir}/log ${feat_train_dir}
+ utils/fix_data_feat.sh ${feat_train_dir}
+ utils/text_tokenize.sh --cmd "$train_cmd" --nj $nj ${fbankdir}/${valid_set} ${seg_file} ${feat_dev_dir}/log ${feat_dev_dir}
+ utils/fix_data_feat.sh ${feat_dev_dir}
+ for x in android ios mic; do
+ feat_test_dir=${feats_dir}/${dumpdir}/test_${x}
+ cp ${fbankdir}/test_${x}/text ${feat_test_dir}
+ done
+fi
+
+token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ echo "stage 2: Dictionary Preparation"
+ mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+ cp $vocab ${token_list} # reuse the vocabulary shipped with the pre-trained model
+
+ vocab_size=$(wc -l <${token_list})
+ awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char
+ awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char
+ mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/${train_set} # same dirs the cp targets and the training stage use
+ mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}
+ cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/${train_set}
+ cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}
+fi
+
+# Training Stage
+world_size=$gpu_num # run on one machine
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # update asr train config.yaml
+ python modelscope_utils/update_config.py --modelscope_config init_model/${init_model_name}/asr_train_config.yaml --finetune_config ${asr_config} --output_config init_model/${init_model_name}/asr_finetune_config.yaml
+ finetune_config=init_model/${init_model_name}/asr_finetune_config.yaml
+
+ mkdir -p ${exp_dir}/exp/${model_dir}/log # -p also creates the experiment dir itself
+ INIT_FILE=$exp_dir/ddp_init
+ if [ -f $INIT_FILE ];then
+ rm -f $INIT_FILE
+ fi
+ init_method=file://$(readlink -f $INIT_FILE)
+ echo "$0: init method is $init_method"
+ train_pids="" # PIDs of the background per-GPU processes; a bare 'wait' would always return 0 and hide their failures
+ for ((i = 0; i < $gpu_num; ++i)); do
+ {
+ rank=$i; local_rank=$i # single machine: global rank and local rank are both the loop index
+ gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
+ asr_train_paraformer.py \
+ --gpu_id $gpu_id \
+ --use_preprocessor true \
+ --token_type $token_type \
+ --token_list $token_list \
+ --train_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${train_set}/${scp},speech,${type} \
+ --train_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${train_set}/text,text,text \
+ --train_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${train_set}/speech_shape \
+ --train_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${train_set}/text_shape.char \
+ --valid_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${valid_set}/${scp},speech,${type} \
+ --valid_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${valid_set}/text,text,text \
+ --valid_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}/speech_shape \
+ --valid_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}/text_shape.char \
+ --resume true \
+ --output_dir ${exp_dir}/exp/${model_dir} \
+ --init_param $init_param \
+ --config $finetune_config \
+ --input_size $feats_dim \
+ --ngpu $gpu_num \
+ --num_worker_count $count \
+ --multiprocessing_distributed true \
+ --dist_init_method $init_method \
+ --dist_world_size $world_size \
+ --dist_rank $rank \
+ --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+ } &
+ train_pids="$train_pids $!"
+ done
+ for pid in $train_pids; do wait $pid || { echo "$0: training process $pid failed, see ${exp_dir}/exp/${model_dir}/log" >&2; exit 1; }; done
+fi
+
+# Testing Stage
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ ./utils/easy_asr_infer.sh \
+ --lang zh \
+ --datadir ${feats_dir} \
+ --feats_type ${feats_type} \
+ --feats_dim ${feats_dim} \
+ --token_type ${token_type} \
+ --gpu_inference ${gpu_inference} \
+ --inference_config "${inference_config}" \
+ --test_sets "${test_sets}" \
+ --token_list $token_list \
+ --asr_exp ${exp_dir}/exp/${model_dir} \
+ --stage 12 \
+ --stop_stage 12 \
+ --scp $scp \
+ --text text \
+ --inference_nj $inference_nj \
+ --njob $njob \
+ --inference_asr_model $inference_asr_model \
+ --gpuid_list $gpuid_list \
+ --mode paraformer
+fi
+
diff --git a/egs_modelscope/aishell2/paraformer/paraformer_large_infer.sh b/egs_modelscope/aishell2/paraformer/paraformer_large_infer.sh
new file mode 100755
index 000000000..95b32fc75
--- /dev/null
+++ b/egs_modelscope/aishell2/paraformer/paraformer_large_infer.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+ori_data=
+data_dir=
+exp_dir=
+model_name=speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
+inference_nj= # total number of decoding jobs; derived below unless set with --inference_nj
+gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
+ngpu= # number of gpus, always derived from gpuid_list below
+njob=4 # the number of jobs for each gpu
+gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding
+
+use_lm=false
+beam_size=1
+lm_weight=0.0
+
+test_sets="dev_ios test_android test_ios test_mic"
+
+. utils/parse_options.sh
+
+# Derive the job count after option parsing so overrides of --gpuid_list, --njob and --gpu_inference are honoured.
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+if [ -z "${inference_nj}" ]; then
+ if ${gpu_inference}; then inference_nj=$((ngpu * njob)); else inference_nj=$njob; fi
+fi
+
+for x in Android iOS Mic; do
+ local/aishell2_data_prep.sh ${ori_data}/${x}/dev ${data_dir}/aishell2/local/dev_${x,,} ${data_dir}/aishell2/dev_${x,,} || exit 1;
+ local/aishell2_data_prep.sh ${ori_data}/${x}/test ${data_dir}/aishell2/local/test_${x,,} ${data_dir}/aishell2/test_${x,,} || exit 1;
+done
+for x in dev_android dev_ios dev_mic test_android test_ios test_mic; do
+ mv ${data_dir}/aishell2/${x}/text ${data_dir}/aishell2/${x}/text.org
+ paste -d " " <(cut -f 1 ${data_dir}/aishell2/${x}/text.org) <(cut -f 2- ${data_dir}/aishell2/${x}/text.org \
+ | tr 'A-Z' 'a-z' | tr -d " ") \
+ > ${data_dir}/aishell2/${x}/text
+ rm ${data_dir}/aishell2/${x}/text.org
+done
+
+mkdir -p ${exp_dir}/aishell2
+
+modelscope_utils/modelscope_infer.sh \
+ --data_dir ${data_dir}/aishell2 \
+ --exp_dir ${exp_dir}/aishell2 \
+ --test_sets "${test_sets}" \
+ --model_name ${model_name} \
+ --inference_nj ${inference_nj} \
+ --gpuid_list ${gpuid_list} \
+ --njob ${njob} \
+ --gpu_inference ${gpu_inference} \
+ --use_lm ${use_lm} \
+ --beam_size ${beam_size} \
+ --lm_weight ${lm_weight}
diff --git a/egs_modelscope/aishell2/paraformer/path.sh b/egs_modelscope/aishell2/paraformer/path.sh
new file mode 100755
index 000000000..7972642d0
--- /dev/null
+++ b/egs_modelscope/aishell2/paraformer/path.sh
@@ -0,0 +1,5 @@
+export FUNASR_DIR=$PWD/../../..
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
diff --git a/egs_modelscope/aishell2/paraformer/utils b/egs_modelscope/aishell2/paraformer/utils
new file mode 120000
index 000000000..37d976175
--- /dev/null
+++ b/egs_modelscope/aishell2/paraformer/utils
@@ -0,0 +1 @@
+../../../egs/aishell/tranformer/utils/
\ No newline at end of file
diff --git a/egs_modelscope/common/README.md b/egs_modelscope/common/README.md
new file mode 100644
index 000000000..f2049e2f0
--- /dev/null
+++ b/egs_modelscope/common/README.md
@@ -0,0 +1,27 @@
+# ModelScope Model
+
+## How to finetune and infer using a pretrained ModelScope Model
+
+### Finetune
+- Modify finetune training related parameters in `conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml`
+- Setting parameters in `modelscope_common_finetune.sh`
+ - dataset: the dataset dir needs to include files: train/wav.scp, train/text; optional dev/wav.scp, dev/text, test/wav.scp test/text
+ - tag: exp tag
+ - init_model_name: speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch # pre-trained model, download from modelscope during fine-tuning
+- Then you can run the pipeline to finetune the model downloaded from ModelScope:
+```sh
+ sh ./modelscope_common_finetune.sh
+```
+
+### Inference
+
+Or you can use the pretrained model for inference directly, without finetuning.
+
+- Setting parameters in `modelscope_common_infer.sh`
+ - data_dir: # wav list, ${data_dir}/wav.scp
+ - exp_dir: the result path
+ - model_name: speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch # pre-trained model, download from modelscope
+- Then you can run the pipeline to infer with:
+```sh
+ sh ./modelscope_common_infer.sh
+```
diff --git a/egs_modelscope/common/conf/decode_asr_transformer_noctc_10best_lm_weight_0.15.yaml b/egs_modelscope/common/conf/decode_asr_transformer_noctc_10best_lm_weight_0.15.yaml
new file mode 100644
index 000000000..22f02d913
--- /dev/null
+++ b/egs_modelscope/common/conf/decode_asr_transformer_noctc_10best_lm_weight_0.15.yaml
@@ -0,0 +1,6 @@
+beam_size: 10
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.0
+lm_weight: 0.15
diff --git a/egs_modelscope/common/conf/decode_asr_transformer_noctc_1best.yaml b/egs_modelscope/common/conf/decode_asr_transformer_noctc_1best.yaml
new file mode 100644
index 000000000..e6231927c
--- /dev/null
+++ b/egs_modelscope/common/conf/decode_asr_transformer_noctc_1best.yaml
@@ -0,0 +1,6 @@
+beam_size: 1
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.0
+lm_weight: 0.0
diff --git a/egs_modelscope/common/conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml b/egs_modelscope/common/conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml
new file mode 100644
index 000000000..e9210f373
--- /dev/null
+++ b/egs_modelscope/common/conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml
@@ -0,0 +1,91 @@
+# network architecture
+# encoder related
+encoder_conf:
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.1
+
+# decoder related
+decoder_conf:
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.1
+ src_attention_dropout_rate: 0.1
+
+predictor_conf:
+ threshold: 1.0
+ l_order: 1
+ r_order: 1
+ tail_threshold: 0.45
+
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.0
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: true
+ predictor_weight: 1.0
+ predictor_bias: 1
+ sampling_ratio: 0.75
+
+# minibatch related
+# dataset_type: small
+batch_type: length
+batch_bins: 2000
+num_workers: 16
+# dataset_type: large
+dataset_conf:
+ filter_conf:
+ min_length: 10
+ max_length: 250
+ min_token_length: 1
+ max_token_length: 200
+ shuffle: true
+ shuffle_conf:
+ shuffle_size: 10240
+ sort_size: 500
+ batch_conf:
+ batch_type: 'token'
+ batch_size: 6000
+ num_workers: 16
+
+# optimization related
+accum_grad: 1
+grad_clip: 5
+max_epoch: 20
+val_scheduler_criterion:
+ - valid
+ - acc
+best_model_criterion:
+- - valid
+ - acc
+ - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+ lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 30000
+
+specaug: specaug_lfr
+specaug_conf:
+ apply_time_warp: false
+ time_warp_window: 5
+ time_warp_mode: bicubic
+ apply_freq_mask: true
+ freq_mask_width_range:
+ - 0
+ - 30
+ lfr_rate: 6
+ num_freq_mask: 1
+ apply_time_mask: true
+ time_mask_width_range:
+ - 0
+ - 12
+ num_time_mask: 1
+
+unused_parameters: true
+log_interval: 50
+normalize: None
+split_with_space: true
diff --git a/egs_modelscope/common/modelscope_common_finetune.sh b/egs_modelscope/common/modelscope_common_finetune.sh
new file mode 100755
index 000000000..a43083f0c
--- /dev/null
+++ b/egs_modelscope/common/modelscope_common_finetune.sh
@@ -0,0 +1,230 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machines configuration
+CUDA_VISIBLE_DEVICES="0,1" # set gpus, e.g., CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=2
+count=1
+gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding
+njob=4 # the number of jobs for each gpu
+train_cmd=utils/run.pl
+
+# general configuration
+feats_dir="." #feature output dictionary, for large data
+exp_dir="."
+lang=zh
+dumpdir=dump/fbank
+feats_type=fbank
+token_type=char
+scp=feats.scp
+type=kaldi_ark
+stage=1
+stop_stage=4
+
+# feature configuration
+feats_dim=560
+sample_frequency=16000
+nj=32
+speed_perturb="1.0"
+lfr=True
+lfr_m=7
+lfr_n=6
+
+init_model_name=speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch # pre-trained model, download from modelscope during fine-tuning
+cmvn_file=init_model/${init_model_name}/am.mvn
+seg_file=init_model/${init_model_name}/seg_dict
+vocab=init_model/${init_model_name}/tokens.txt
+
+# data
+dataset= # dataset (include train/wav.scp, train/text, dev/wav.scp, dev/text, optional test/wav.scp test/text)
+
+# exp tag
+tag=""
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train
+valid_set=dev
+test_sets="dev test"
+
+asr_config=conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml
+init_param="init_model/${init_model_name}/${init_model_name}"
+
+inference_config=conf/decode_asr_transformer_noctc_1best.yaml
+inference_asr_model=valid.acc.ave_10best.pth
+
+. utils/parse_options.sh || exit 1;
+
+# download model from modelscope
+python modelscope_utils/download_model.py --model_name ${init_model_name}
+
+if [ ! -d ${HOME}/.cache/modelscope/hub/damo/${init_model_name} ]; then
+ echo "${HOME}/.cache/modelscope/hub/damo/${init_model_name} must exist"
+ exit 1
+else
+ if [ -d init_model/${init_model_name} ]; then
+ echo "init_model/${init_model_name} is already exists. if you want to decode again, please delete init_model/${init_model_name} first."
+ else
+ mkdir -p init_model/${init_model_name}
+ cp -r ${HOME}/.cache/modelscope/hub/damo/${init_model_name}/* init_model/${init_model_name}
+ fi
+fi
+
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
+
+# you can set gpu num for decoding here
+gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+inference_nj=$[${ngpu}*${njob}]
+# Fail early unless the dataset dir and BOTH training files exist (quoted so an empty ${dataset} is rejected too).
+[ ! -d "${dataset}" ] && echo "$0: Training data is required" && exit 1;
+{ [ ! -f "${dataset}/train/wav.scp" ] || [ ! -f "${dataset}/train/text" ]; } && echo "$0: Training data wav.scp or text is not found" && exit 1;
+
+if [ ! -d "${dataset}/dev" ]; then
+ utils/fix_data.sh ${dataset}/train
+ utils/subset_data_dir_tr_cv.sh --dev-num-utt 1000 ${dataset}/train ${dataset}
+fi
+if [ ! -d "${dataset}/test" ]; then
+ test_sets="dev"
+fi
+
+feat_train_dir=${feats_dir}/${dumpdir}/train; mkdir -p ${feat_train_dir}
+feat_dev_dir=${feats_dir}/${dumpdir}/dev; mkdir -p ${feat_dev_dir}
+feat_test_dir=${feats_dir}/${dumpdir}/test; mkdir -p ${feat_test_dir}
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ echo "Feature Generation"
+ # compute fbank features
+ fbankdir=${feats_dir}/fbank
+ utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --speed_perturb ${speed_perturb} \
+ ${dataset}/train ${exp_dir}/exp/make_fbank/train ${fbankdir}/train
+ utils/fix_data_feat.sh ${fbankdir}/train
+ utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj \
+ ${dataset}/dev ${exp_dir}/exp/make_fbank/dev ${fbankdir}/dev
+ utils/fix_data_feat.sh ${fbankdir}/dev
+ if [ -d "${dataset}/test" ]; then
+ utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj \
+ ${dataset}/test ${exp_dir}/exp/make_fbank/test ${fbankdir}/test
+ utils/fix_data_feat.sh ${fbankdir}/test
+ fi
+
+ echo "apply low_frame_rate and cmvn"
+ [ ! -f ${cmvn_file} ] && echo "$0: cmvn file is required" && exit 1;
+ utils/apply_lfr_and_cmvn.sh --cmd "$train_cmd" --nj $nj \
+ --lfr $lfr --lfr-m $lfr_m --lfr-n $lfr_n \
+ ${fbankdir}/train ${cmvn_file} ${exp_dir}/exp/make_fbank/train ${feat_train_dir}
+ utils/apply_lfr_and_cmvn.sh --cmd "$train_cmd" --nj $nj \
+ --lfr $lfr --lfr-m $lfr_m --lfr-n $lfr_n \
+ ${fbankdir}/dev ${cmvn_file} ${exp_dir}/exp/make_fbank/dev ${feat_dev_dir}
+ if [ -d "${dataset}/test" ]; then
+ utils/apply_lfr_and_cmvn.sh --cmd "$train_cmd" --nj $nj \
+ --lfr $lfr --lfr-m $lfr_m --lfr-n $lfr_n \
+ ${fbankdir}/test ${cmvn_file} ${exp_dir}/exp/make_fbank/test ${feat_test_dir}
+ fi
+
+ echo "Text Tokenize"
+ # 我爱reading->我 爱 read@@ ing
+ utils/text_tokenize.sh --cmd "$train_cmd" --nj $nj ${fbankdir}/train ${seg_file} ${feat_train_dir}/log ${feat_train_dir}
+ utils/fix_data_feat.sh ${feat_train_dir}
+ utils/text_tokenize.sh --cmd "$train_cmd" --nj $nj ${fbankdir}/dev ${seg_file} ${feat_dev_dir}/log ${feat_dev_dir}
+ utils/fix_data_feat.sh ${feat_dev_dir}
+ if [ -d "${dataset}/test" ]; then
+ cp ${fbankdir}/test/text ${feat_test_dir}
+ fi
+fi
+
+token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ echo "stage 2: Dictionary Preparation"
+ mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+ cp $vocab ${token_list}
+
+ vocab_size=$(wc -l <${token_list})
+ awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char
+ awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char
+ mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/train
+ mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/dev
+ cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/train
+ cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/dev
+fi
+
+# Training Stage
+world_size=$gpu_num # run on one machine
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # update asr train config.yaml
+ python modelscope_utils/update_config.py --modelscope_config init_model/${init_model_name}/asr_train_config.yaml --finetune_config ${asr_config} --output_config init_model/${init_model_name}/asr_finetune_config.yaml
+ finetune_config=init_model/${init_model_name}/asr_finetune_config.yaml
+
+ mkdir -p ${exp_dir}/exp/${model_dir}
+ mkdir -p ${exp_dir}/exp/${model_dir}/log
+ INIT_FILE=$exp_dir/ddp_init
+ if [ -f $INIT_FILE ];then
+ rm -f $INIT_FILE
+ fi
+ init_method=file://$(readlink -f $INIT_FILE)
+ echo "$0: init method is $init_method"
+ for ((i = 0; i < $gpu_num; ++i)); do
+ {
+ rank=$i
+ local_rank=$i
+ gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
+ asr_train_paraformer.py \
+ --gpu_id $gpu_id \
+ --use_preprocessor true \
+ --token_type $token_type \
+ --token_list $token_list \
+ --train_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${train_set}/${scp},speech,${type} \
+ --train_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${train_set}/text,text,text \
+ --train_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${train_set}/speech_shape \
+ --train_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${train_set}/text_shape.char \
+ --valid_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${valid_set}/${scp},speech,${type} \
+ --valid_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${valid_set}/text,text,text \
+ --valid_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}/speech_shape \
+ --valid_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}/text_shape.char \
+ --resume true \
+ --output_dir ${exp_dir}/exp/${model_dir} \
+ --init_param $init_param \
+ --config $finetune_config \
+ --input_size $feats_dim \
+ --ngpu $gpu_num \
+ --num_worker_count $count \
+ --multiprocessing_distributed true \
+ --dist_init_method $init_method \
+ --dist_world_size $world_size \
+ --dist_rank $rank \
+ --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+ } &
+ done
+ wait
+fi
+
+# Testing Stage
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ ./utils/easy_asr_infer.sh \
+ --lang zh \
+ --datadir ${feats_dir} \
+ --feats_type ${feats_type} \
+ --feats_dim ${feats_dim} \
+ --token_type ${token_type} \
+ --gpu_inference ${gpu_inference} \
+ --inference_config "${inference_config}" \
+ --test_sets "${test_sets}" \
+ --token_list $token_list \
+ --asr_exp ${exp_dir}/exp/${model_dir} \
+ --stage 12 \
+ --stop_stage 12 \
+ --scp $scp \
+ --text text \
+ --inference_nj $inference_nj \
+ --njob $njob \
+ --inference_asr_model $inference_asr_model \
+ --gpuid_list $gpuid_list \
+ --mode paraformer
+fi
+
diff --git a/egs_modelscope/common/modelscope_common_infer.sh b/egs_modelscope/common/modelscope_common_infer.sh
new file mode 100755
index 000000000..12b2cbcb2
--- /dev/null
+++ b/egs_modelscope/common/modelscope_common_infer.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+model_name=speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch # pre-trained model, download from modelscope
+data_dir= # wav list, ${data_dir}/wav.scp
+exp_dir="exp"
+gpuid_list="0,1"
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+njob=4
+gpu_inference=true
+decode_cmd=utils/run.pl
+
+# LM configs (declared before option parsing so command-line values are not overwritten by these defaults)
+use_lm=false
+beam_size=1
+lm_weight=0.0
+
+. utils/parse_options.sh
+
+if ${gpu_inference}; then
+    inference_nj=$[${ngpu}*${njob}]
+    _ngpu=1
+else
+    inference_nj=${njob}
+    _ngpu=0
+fi
+
+python modelscope_utils/download_model.py \
+ --model_name ${model_name}
+
+if [ -d ${exp_dir} ]; then
+ echo "${exp_dir} is already exists. if you want to decode again, please delete ${exp_dir} first."
+ exit 1
+else
+ mkdir -p ${exp_dir}/${model_name}
+ cp ${HOME}/.cache/modelscope/hub/damo/${model_name}/* ${exp_dir}/${model_name}/. -r
+ _dir=${exp_dir}/decode_asr
+ _logdir=${_dir}/logdir
+ mkdir -p "${_dir}"
+ mkdir -p "${_logdir}"
+fi
+
+for n in $(seq "${inference_nj}"); do
+ split_scps+=" ${_logdir}/keys.${n}.scp"
+done
+# shellcheck disable=SC2086
+utils/split_scp.pl "${data_dir}/wav.scp" ${split_scps}
+
+if "${use_lm}"; then
+ cp ${exp_dir}/${model_name}/decode_asr_transformer.yaml ${exp_dir}/${model_name}/decode_asr_transformer.yaml.back
+ cp ${exp_dir}/${model_name}/decode_asr_transformer_wav.yaml ${exp_dir}/${model_name}/decode_asr_transformer_wav.yaml.back
+ sed -i "s#beam_size: [0-9]*#beam_size: `echo $beam_size`#g" ${exp_dir}/${model_name}/decode_asr_transformer.yaml
+ sed -i "s#beam_size: [0-9]*#beam_size: `echo $beam_size`#g" ${exp_dir}/${model_name}/decode_asr_transformer_wav.yaml
+ sed -i "s#lm_weight: 0.[0-9]*#lm_weight: `echo $lm_weight`#g" ${exp_dir}/${model_name}/decode_asr_transformer.yaml
+ sed -i "s#lm_weight: 0.[0-9]*#lm_weight: `echo $lm_weight`#g" ${exp_dir}/${model_name}/decode_asr_transformer_wav.yaml
+fi
+
+echo "Decoding started... log: '${_logdir}/asr_inference.*.log'"
+# shellcheck disable=SC2086
+${decode_cmd} --max-jobs-run "${inference_nj}" JOB=1:"${inference_nj}" "${_logdir}"/asr_inference.JOB.log \
+ python -m funasr.bin.modelscope_infer \
+ --local_model_path ${exp_dir}/${model_name} \
+ --wav_list ${_logdir}/keys.JOB.scp \
+ --output_file ${_logdir}/text.JOB \
+ --gpuid_list ${gpuid_list} \
+ --njob ${njob} \
+ --ngpu ${_ngpu} \
+
+ for i in $(seq ${inference_nj}); do
+ cat ${_logdir}/text.${i}
+ done | sort -k1 >${_dir}/text
+# Restore the decode configs only if they were backed up above (use_lm=true); otherwise the mv fails and `set -e` aborts.
+if "${use_lm}"; then mv ${exp_dir}/${model_name}/decode_asr_transformer.yaml.back ${exp_dir}/${model_name}/decode_asr_transformer.yaml; fi
+if "${use_lm}"; then mv ${exp_dir}/${model_name}/decode_asr_transformer_wav.yaml.back ${exp_dir}/${model_name}/decode_asr_transformer_wav.yaml; fi
+
diff --git a/egs_modelscope/common/modelscope_common_infer_after_finetune.sh b/egs_modelscope/common/modelscope_common_infer_after_finetune.sh
new file mode 100755
index 000000000..00dd28336
--- /dev/null
+++ b/egs_modelscope/common/modelscope_common_infer_after_finetune.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+pretrained_model_name=speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch # pre-trained model, download from modelscope
+data_dir= # wav list, ${data_dir}/wav.scp
+finetune_model_name= # fine-tuning model name
+finetune_exp_dir= # fine-tuning model experiment result path
+gpuid_list="0,1"
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+njob=4
+gpu_inference=true
+decode_cmd=utils/run.pl
+
+. utils/parse_options.sh
+
+# Parallel decoding jobs: njob per GPU when decoding on GPU, otherwise njob in total; _ngpu is later passed as --ngpu.
+if ${gpu_inference}; then
+    inference_nj=$((ngpu * njob))
+    _ngpu=1
+else
+    inference_nj=${njob}
+    _ngpu=0
+fi
+
+if [ ! -d ${HOME}/.cache/modelscope/hub/damo/${pretrained_model_name} ]; then
+ echo "${HOME}/.cache/modelscope/hub/damo/${pretrained_model_name} must exist."
+ exit 1
+else
+ exp_dir=${finetune_exp_dir}/${finetune_model_name}.modelscope
+ mkdir -p $exp_dir
+ cp ${finetune_exp_dir}/${finetune_model_name} ${exp_dir}/${finetune_model_name}.modelscope
+ cp ${HOME}/.cache/modelscope/hub/damo/${pretrained_model_name}/* ${exp_dir}/. -r
+fi
+
+_dir=${exp_dir}/decode_asr
+_logdir=${_dir}/logdir
+# Refuse to overwrite earlier decoding results, as the message states and as the sibling infer scripts do.
+if [ -d "${_dir}" ]; then
+    echo "${_dir} is already exists. if you want to decode again, please delete ${_dir} first."
+    exit 1
+fi
+mkdir -p "${_logdir}" # ${_logdir} lives under ${_dir}, so this creates both
+
+for n in $(seq "${inference_nj}"); do
+ split_scps+=" ${_logdir}/keys.${n}.scp"
+done
+# shellcheck disable=SC2086
+utils/split_scp.pl "${data_dir}/wav.scp" ${split_scps}
+
+echo "Decoding started... log: '${_logdir}/asr_inference.*.log'"
+# shellcheck disable=SC2086
+${decode_cmd} --max-jobs-run "${inference_nj}" JOB=1:"${inference_nj}" "${_logdir}"/asr_inference.JOB.log \
+ python -m funasr.bin.modelscope_infer \
+ --local_model_path ${exp_dir} \
+ --wav_list ${_logdir}/keys.JOB.scp \
+ --output_file ${_logdir}/text.JOB \
+ --gpuid_list ${gpuid_list} \
+ --njob ${njob} \
+ --ngpu ${_ngpu} \
+
+ for i in $(seq ${inference_nj}); do
+ cat ${_logdir}/text.${i}
+ done | sort -k1 >${_dir}/text
\ No newline at end of file
diff --git a/egs_modelscope/common/modelscope_utils/download_model.py b/egs_modelscope/common/modelscope_utils/download_model.py
new file mode 100755
index 000000000..5d5f70dd1
--- /dev/null
+++ b/egs_modelscope/common/modelscope_utils/download_model.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+import argparse
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(
+ description="download model configs",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+ parser.add_argument("--model_name",
+ type=str,
+ default="speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+ help="model name in modelscope")
+ args = parser.parse_args()
+
+ inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/{}'.format(args.model_name),
+ model_revision='v1.0.0')
diff --git a/egs_modelscope/common/modelscope_utils/modelscope_infer.sh b/egs_modelscope/common/modelscope_utils/modelscope_infer.sh
new file mode 100755
index 000000000..1a56dce98
--- /dev/null
+++ b/egs_modelscope/common/modelscope_utils/modelscope_infer.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+data_dir=
+exp_dir=
+model_name=
+inference_nj=32
+gpuid_list="0,1,2,3"
+njob=32
+gpu_inference=true
+
+test_sets="dev test"
+decode_cmd=utils/run.pl
+
+# LM configs
+use_lm=false
+beam_size=1
+lm_weight=0.0
+
+. utils/parse_options.sh
+
+if ${gpu_inference}; then
+ _ngpu=1
+else
+ _ngpu=0
+fi
+
+# download model from modelscope
+python modelscope_utils/download_model.py \
+ --model_name ${model_name}
+
+modelscope_dir=${HOME}/.cache/modelscope/hub/damo/${model_name}
+
+
+for dset in ${test_sets}; do
+ _dir=${exp_dir}/${model_name}/decode_asr/${dset}
+ _logdir=${_dir}/logdir
+ _data=${data_dir}/${dset}
+ if [ -d ${_dir} ]; then
+ echo "${_dir} is already exists. if you want to decode again, please delete ${_dir} first."
+ exit 1
+ else
+ mkdir -p "${_dir}"
+ mkdir -p "${_logdir}"
+ fi
+
+ if "${use_lm}"; then
+     # Back up each config only once: on later test sets the file is already edited, and copying it again would replace the pristine backup that is restored after the loop.
+     for _cfg in decode_asr_transformer.yaml decode_asr_transformer_wav.yaml; do
+         [ -f ${modelscope_dir}/${_cfg}.back ] || cp ${modelscope_dir}/${_cfg} ${modelscope_dir}/${_cfg}.back
+         sed -i "s#beam_size: [0-9]*#beam_size: `echo $beam_size`#g" ${modelscope_dir}/${_cfg}
+         sed -i "s#lm_weight: 0.[0-9]*#lm_weight: `echo $lm_weight`#g" ${modelscope_dir}/${_cfg}
+     done
+ fi
+
+ # Start from an empty list for every test set; appending across sets would pass earlier sets' key files to split_scp.pl as extra outputs.
+ split_scps=""
+ for n in $(seq "${inference_nj}"); do split_scps+=" ${_logdir}/keys.${n}.scp"; done
+ # shellcheck disable=SC2086
+ utils/split_scp.pl "${data_dir}/${dset}/wav.scp" ${split_scps}
+
+ echo "Decoding started... log: '${_logdir}/asr_inference.*.log'"
+ # shellcheck disable=SC2086
+ ${decode_cmd} --max-jobs-run "${inference_nj}" JOB=1:"${inference_nj}" "${_logdir}"/asr_inference.JOB.log \
+ python -m funasr.bin.modelscope_infer \
+ --model_name ${model_name} \
+ --wav_list ${_logdir}/keys.JOB.scp \
+ --output_file ${_logdir}/text.JOB \
+ --gpuid_list ${gpuid_list} \
+ --njob ${njob} \
+ --ngpu ${_ngpu} \
+
+ for i in $(seq ${inference_nj}); do
+ cat ${_logdir}/text.${i}
+ done | sort -k1 >${_dir}/text
+
+ python utils/proce_text.py ${_dir}/text ${_dir}/text.proc
+ python utils/proce_text.py ${_data}/text ${_data}/text.proc
+ python utils/compute_wer.py ${_data}/text.proc ${_dir}/text.proc ${_dir}/text.cer
+ tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
+ cat ${_dir}/text.cer.txt
+done
+
+if "${use_lm}"; then
+ mv ${modelscope_dir}/decode_asr_transformer.yaml.back ${modelscope_dir}/decode_asr_transformer.yaml
+ mv ${modelscope_dir}/decode_asr_transformer_wav.yaml.back ${modelscope_dir}/decode_asr_transformer_wav.yaml
+fi
diff --git a/egs_modelscope/common/modelscope_utils/update_config.py b/egs_modelscope/common/modelscope_utils/update_config.py
new file mode 100644
index 000000000..88466edcd
--- /dev/null
+++ b/egs_modelscope/common/modelscope_utils/update_config.py
@@ -0,0 +1,41 @@
+import yaml
+import argparse
+
+def update_dct(fin_configs, root):
+    """Recursively overlay `root` onto `fin_configs` (modified in place) and return it; an empty `root` yields {}."""
+    if root == {}:
+        return {}
+    for root_key, root_value in root.items():
+        if isinstance(root_value, dict) and isinstance(fin_configs.get(root_key), dict):
+            fin_configs[root_key] = update_dct(fin_configs[root_key], root_value)
+        else:  # plain value override, or a section the base config lacks / holds as a non-dict
+            fin_configs[root_key] = root_value
+    return fin_configs
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(
+ description="update configs",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+ parser.add_argument("--modelscope_config",
+ type=str,
+ help="modelscope config file")
+ parser.add_argument("--finetune_config",
+ type=str,
+ help="finetune config file")
+ parser.add_argument("--output_config",
+ type=str,
+ help="output config file")
+ args = parser.parse_args()
+
+ with open(args.modelscope_config) as f:
+ modelscope_configs = yaml.safe_load(f)
+
+ with open(args.finetune_config) as f:
+ finetune_configs = yaml.safe_load(f)
+
+ # update configs, e.g., lr, batch_size, ...
+ modelscope_configs = update_dct(modelscope_configs, finetune_configs)
+
+ with open(args.output_config, "w") as f:
+ yaml.dump(modelscope_configs, f, indent=4)
diff --git a/egs_modelscope/common/path.sh b/egs_modelscope/common/path.sh
new file mode 100755
index 000000000..c340218c2
--- /dev/null
+++ b/egs_modelscope/common/path.sh
@@ -0,0 +1,5 @@
+export FUNASR_DIR=$PWD/../..
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
diff --git a/egs_modelscope/common/utils b/egs_modelscope/common/utils
new file mode 120000
index 000000000..cbef564a5
--- /dev/null
+++ b/egs_modelscope/common/utils
@@ -0,0 +1 @@
+../../egs/aishell/tranformer/utils/
\ No newline at end of file
diff --git a/egs_modelscope/speechio/paraformer/README.md b/egs_modelscope/speechio/paraformer/README.md
new file mode 100644
index 000000000..669185f6e
--- /dev/null
+++ b/egs_modelscope/speechio/paraformer/README.md
@@ -0,0 +1,24 @@
+# ModelScope: Paraformer-large Model
+
+## Highlight
+
+### ModelScope: Paraformer-Large Model
+- Fast: Non-autoregressive (NAR) model, the Paraformer can achieve comparable performance to the state-of-the-art AR transformer, with more than 10x speedup.
+- Accurate: SOTA in a lot of public ASR tasks, with a very significant relative improvement, capable of industrial implementation.
+- Convenient: Quickly and easily download Paraformer-large from Modelscope for finetuning and inference.
+ - Support finetuning and inference on AISHELL-1 and AISHELL-2.
+ - Support inference on AISHELL-1, AISHELL-2, Wenetspeech, SpeechIO and other audio.
+
+## How to infer using a pretrained ModelScope Paraformer-large Model
+
+### Inference
+- Setting parameters in `paraformer_large_infer.sh`
+ - ori_data: please set the speechio raw data path
+ - data_dir: data output directory
+ - exp_dir: the result path
+ - model_name: speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch # base model, download from modelscope
+ - test_sets: please set the testsets name
+- Then you can run the pipeline to infer with:
+```sh
+ sh ./paraformer_large_infer.sh
+```
diff --git a/egs_modelscope/speechio/paraformer/RESULTS.md b/egs_modelscope/speechio/paraformer/RESULTS.md
new file mode 100644
index 000000000..9938e74fe
--- /dev/null
+++ b/egs_modelscope/speechio/paraformer/RESULTS.md
@@ -0,0 +1,42 @@
+# Paraformer-Large
+- Model link:
+- Model size: 220M
+- Train config: conf/train_asr_paraformer_sanm_50e_16d_2048_512_lfr6.yaml
+
+# Environments
+- date: `Tue Nov 22 18:48:39 CST 2022`
+- python version: `3.7.12`
+- FunASR version: `0.1.0`
+- pytorch version: `pytorch 1.7.0`
+- Git hash: ``
+- Commit date: ``
+
+# Benchmark Results
+
+
+## SpeechIO TIOBE
+- Decode config 1: conf/decode_asr_transformer_noctc_1best.yaml
+ - Decode without CTC
+ - Decode without LM
+- Decode config 2: conf/decode_asr_transformer_noctc_10best_lm_weight_0.15.yaml
+ - Decode without CTC
+ - Decode with Transformer-LM
+ - LM weight: 0.15
+
+| testset | w/o LM | w/ LM |
+|:------------------:|:----:|:----:|
+|SPEECHIO_ASR_ZH00001| 0.49 | 0.35 |
+|SPEECHIO_ASR_ZH00002| 3.23 | 2.86 |
+|SPEECHIO_ASR_ZH00003| 1.13 | 0.80 |
+|SPEECHIO_ASR_ZH00004| 1.33 | 1.10 |
+|SPEECHIO_ASR_ZH00005| 1.41 | 1.18 |
+|SPEECHIO_ASR_ZH00006| 5.25 | 4.85 |
+|SPEECHIO_ASR_ZH00007| 5.51 | 4.97 |
+|SPEECHIO_ASR_ZH00008| 3.69 | 3.18 |
+|SPEECHIO_ASR_ZH00009| 3.02 | 2.78 |
+|SPEECHIO_ASR_ZH00010| 3.35 | 2.99 |
+|SPEECHIO_ASR_ZH00011| 1.54 | 1.25 |
+|SPEECHIO_ASR_ZH00012| 2.06 | 1.68 |
+|SPEECHIO_ASR_ZH00013| 2.57 | 2.25 |
+|SPEECHIO_ASR_ZH00014| 3.86 | 3.08 |
+|SPEECHIO_ASR_ZH00015| 3.34 | 2.67 |
diff --git a/egs_modelscope/speechio/paraformer/modelscope_utils b/egs_modelscope/speechio/paraformer/modelscope_utils
new file mode 120000
index 000000000..fc97768c8
--- /dev/null
+++ b/egs_modelscope/speechio/paraformer/modelscope_utils
@@ -0,0 +1 @@
+../../common/modelscope_utils
\ No newline at end of file
diff --git a/egs_modelscope/speechio/paraformer/paraformer_large_infer.sh b/egs_modelscope/speechio/paraformer/paraformer_large_infer.sh
new file mode 100755
index 000000000..bcf8c331c
--- /dev/null
+++ b/egs_modelscope/speechio/paraformer/paraformer_large_infer.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+ori_data=
+data_dir=
+exp_dir=
+model_name=speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
+inference_nj=32
+gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+njob=4 # the number of jobs for each gpu
+gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding
+
+if ${gpu_inference}; then
+ inference_nj=$[${ngpu}*${njob}]
+else
+ inference_nj=$njob
+fi
+
+# LM configs
+use_lm=false
+beam_size=1
+lm_weight=0.0
+
+test_sets="SPEECHIO_ASR_ZH00001 SPEECHIO_ASR_ZH00002 SPEECHIO_ASR_ZH00003 SPEECHIO_ASR_ZH00004 SPEECHIO_ASR_ZH00005 SPEECHIO_ASR_ZH00006 SPEECHIO_ASR_ZH00007 SPEECHIO_ASR_ZH00008 SPEECHIO_ASR_ZH00009 SPEECHIO_ASR_ZH00010 SPEECHIO_ASR_ZH00011 SPEECHIO_ASR_ZH00012 SPEECHIO_ASR_ZH00013 SPEECHIO_ASR_ZH00014 SPEECHIO_ASR_ZH00015"
+
+. utils/parse_options.sh
+
+for tset_name in ${test_sets}; do
+ test_dir=${data_dir}/speechio/${tset_name}
+ mkdir -p ${test_dir}
+ find ${ori_data}/${tset_name} -iname "*.wav" > ${test_dir}/wav.flist
+ sed -e 's/\.wav//' ${test_dir}/wav.flist | awk -F '/' '{print $NF}' > ${test_dir}/utt.list
+ paste -d' ' ${test_dir}/utt.list ${test_dir}/wav.flist > ${test_dir}/wav.scp
+ cp ${ori_data}/${tset_name}/trans.txt ${test_dir}/text
+ sed -i "s/\t/ /g" ${test_dir}/text
+done
+
+mkdir -p ${exp_dir}/speechio
+
+modelscope_utils/modelscope_infer.sh \
+ --data_dir ${data_dir}/speechio \
+ --exp_dir ${exp_dir}/speechio \
+ --test_sets "${test_sets}" \
+ --model_name ${model_name} \
+ --inference_nj ${inference_nj} \
+ --gpuid_list ${gpuid_list} \
+ --njob ${njob} \
+ --gpu_inference ${gpu_inference} \
+ --use_lm ${use_lm} \
+ --beam_size ${beam_size} \
+ --lm_weight ${lm_weight}
+
+# SpeechIO TIOBE textnorm
+for tset_name in ${test_sets}; do
+ echo "$0 --> Normalizing REF text ..."
+ ./utils/textnorm_zh.py \
+ --has_key --to_upper \
+ ${ori_data}/${tset_name}/trans.txt \
+ ${data_dir}/speechio/${tset_name}/ref.txt
+
+ cp ${exp_dir}/speechio/${model_name}/decode_asr/${tset_name}/text.proc ${exp_dir}/speechio/${model_name}/decode_asr/${tset_name}/raw_rec.txt
+ sed -i "s#