import argparse from typing import Callable from typing import Collection from typing import Dict from typing import List from typing import Optional from typing import Tuple import numpy as np import torch from funasr.datasets.collate_fn import CommonCollateFn from funasr.datasets.preprocessor import CommonPreprocessor from funasr.layers.abs_normalize import AbsNormalize from funasr.layers.global_mvn import GlobalMVN from funasr.layers.utterance_mvn import UtteranceMVN from funasr.models.data2vec import Data2VecPretrainModel from funasr.models.encoder.abs_encoder import AbsEncoder from funasr.models.encoder.data2vec_encoder import Data2VecEncoder from funasr.models.frontend.abs_frontend import AbsFrontend from funasr.models.frontend.default import DefaultFrontend from funasr.models.frontend.windowing import SlidingWindow from funasr.models.preencoder.abs_preencoder import AbsPreEncoder from funasr.models.preencoder.sinc import LightweightSincConvs from funasr.models.specaug.abs_specaug import AbsSpecAug from funasr.models.specaug.specaug import SpecAug from funasr.tasks.abs_task import AbsTask from funasr.tokenizer.phoneme_tokenizer import g2p_choices from funasr.torch_utils.initialize import initialize from funasr.train.class_choices import ClassChoices from funasr.train.trainer import Trainer from funasr.utils.types import float_or_none from funasr.utils.types import int_or_none from funasr.utils.types import str2bool from funasr.utils.types import str_or_none frontend_choices = ClassChoices( name="frontend", classes=dict(default=DefaultFrontend, sliding_window=SlidingWindow), type_check=AbsFrontend, default="default", ) specaug_choices = ClassChoices( name="specaug", classes=dict(specaug=SpecAug), type_check=AbsSpecAug, default=None, optional=True, ) normalize_choices = ClassChoices( "normalize", classes=dict( global_mvn=GlobalMVN, utterance_mvn=UtteranceMVN, ), type_check=AbsNormalize, default=None, optional=True, ) preencoder_choices = ClassChoices( name="preencoder", classes=dict( sinc=LightweightSincConvs, ), type_check=AbsPreEncoder, default=None, optional=True, ) encoder_choices = ClassChoices( "encoder", classes=dict( data2vec_encoder=Data2VecEncoder, ), type_check=AbsEncoder, default="data2vec_encoder", ) model_choices = ClassChoices( "model", classes=dict( data2vec=Data2VecPretrainModel, ), default="data2vec", ) class Data2VecTask(AbsTask): # If you need more than one optimizers, change this value num_optimizers: int = 1 # Add variable objects configurations class_choices_list = [ # --frontend and --frontend_conf frontend_choices, # --specaug and --specaug_conf specaug_choices, # --normalize and --normalize_conf normalize_choices, # --preencoder and --preencoder_conf preencoder_choices, # --encoder and --encoder_conf encoder_choices, # --model and --model_conf model_choices, ] # If you need to modify train() or eval() procedures, change Trainer class here trainer = Trainer @classmethod def add_task_arguments(cls, parser: argparse.ArgumentParser): group = parser.add_argument_group(description="Task related") # NOTE(kamo): add_arguments(..., required=True) can't be used # to provide --print_config mode. Instead of it, do as group.add_argument( "--token_list", type=str_or_none, default=None, help="A text mapping int-id to token", ) group.add_argument( "--init", type=lambda x: str_or_none(x.lower()), default=None, help="The initialization method", choices=[ "chainer", "xavier_uniform", "xavier_normal", "kaiming_uniform", "kaiming_normal", None, ], ) group.add_argument( "--input_size", type=int_or_none, default=None, help="The number of input dimension of the feature", ) group = parser.add_argument_group(description="Preprocess related") group.add_argument( "--use_preprocessor", type=str2bool, default=True, help="Apply preprocessing to data or not", ) group.add_argument( "--token_type", type=str, default=None, choices=["bpe", "char", "word", "phn"], help="The text will be tokenized " "in the specified level token", ) group.add_argument( "--feats_type", type=str, default='fbank', help="feats type, e.g. fbank, wav, ark_wav(needed to be scale normalization)", ) group.add_argument( "--bpemodel", type=str_or_none, default=None, help="The model file of sentencepiece", ) parser.add_argument( "--non_linguistic_symbols", type=str_or_none, help="non_linguistic_symbols file path", ) parser.add_argument( "--cleaner", type=str_or_none, choices=[None, "tacotron", "jaconv", "vietnamese"], default=None, help="Apply text cleaning", ) parser.add_argument( "--g2p", type=str_or_none, choices=g2p_choices, default=None, help="Specify g2p method if --token_type=phn", ) parser.add_argument( "--speech_volume_normalize", type=float_or_none, default=None, help="Scale the maximum amplitude to the given value.", ) parser.add_argument( "--rir_scp", type=str_or_none, default=None, help="The file path of rir scp file.", ) parser.add_argument( "--rir_apply_prob", type=float, default=1.0, help="THe probability for applying RIR convolution.", ) parser.add_argument( "--noise_scp", type=str_or_none, default=None, help="The file path of noise scp file.", ) parser.add_argument( "--noise_apply_prob", type=float, default=1.0, help="The probability applying Noise adding.", ) parser.add_argument( "--noise_db_range", type=str, default="13_15", help="The range of noise decibel level.", ) parser.add_argument( "--pred_masked_weight", type=float, default=1.0, help="weight for predictive loss for masked frames", ) parser.add_argument( "--pred_nomask_weight", type=float, default=0.0, help="weight for predictive loss for unmasked frames", ) parser.add_argument( "--loss_weights", type=float, default=0.0, help="weights for additional loss terms (not first one)", ) for class_choices in cls.class_choices_list: # Append -- and --_conf. # e.g. --encoder and --encoder_conf class_choices.add_arguments(group) @classmethod def build_collate_fn( cls, args: argparse.Namespace, train: bool ) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: return CommonCollateFn(clipping=True) @classmethod def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: if args.use_preprocessor: retval = CommonPreprocessor( train=train, bpemodel=args.bpemodel, non_linguistic_symbols=args.non_linguistic_symbols, text_cleaner=args.cleaner, g2p_type=args.g2p, # NOTE(kamo): Check attribute existence for backward compatibility rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None, rir_apply_prob=args.rir_apply_prob if hasattr(args, "rir_apply_prob") else 1.0, noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None, noise_apply_prob=args.noise_apply_prob if hasattr(args, "noise_apply_prob") else 1.0, noise_db_range=args.noise_db_range if hasattr(args, "noise_db_range") else "13_15", speech_volume_normalize=args.speech_volume_normalize if hasattr(args, "rir_scp") else None, ) else: retval = None return retval @classmethod def required_data_names( cls, train: bool = True, inference: bool = False ) -> Tuple[str, ...]: # for pre-training retval = ("speech",) return retval @classmethod def optional_data_names( cls, train: bool = True, inference: bool = False ) -> Tuple[str, ...]: retval = () return retval @classmethod def build_model(cls, args: argparse.Namespace): # 1. frontend if args.input_size is None: # Extract features in the model frontend_class = frontend_choices.get_class(args.frontend) frontend = frontend_class(**args.frontend_conf) input_size = frontend.output_size() else: # Give features from data-loader args.frontend = None args.frontend_conf = {} frontend = None input_size = args.input_size # 2. Data augmentation for spectrogram if args.specaug is not None: specaug_class = specaug_choices.get_class(args.specaug) specaug = specaug_class(**args.specaug_conf) else: specaug = None # 3. Normalization layer if args.normalize is not None: normalize_class = normalize_choices.get_class(args.normalize) normalize = normalize_class(**args.normalize_conf) else: normalize = None # 4. Pre-encoder input block # NOTE(kan-bayashi): Use getattr to keep the compatibility if getattr(args, "preencoder", None) is not None: preencoder_class = preencoder_choices.get_class(args.preencoder) preencoder = preencoder_class(**args.preencoder_conf) input_size = preencoder.output_size() else: preencoder = None # 5. Encoder encoder_class = encoder_choices.get_class(args.encoder) encoder = encoder_class( input_size=input_size, **args.encoder_conf, ) # 6. Build model try: model_class = model_choices.get_class(args.model) except AttributeError: model_class = model_choices.get_class("data2vec") model = model_class( frontend=frontend, specaug=specaug, normalize=normalize, preencoder=preencoder, encoder=encoder, ) # 7. Initialize if args.init is not None: initialize(model, args.init) return model