Resample "sound" inputs to a configurable sample rate while loading.

A new dest_sample_rate option (default 16000) is added to ESPnetDataset,
sound_loader and SoundScpReader. abs_task.py fills it from
args.frontend_conf["fs"] when that value is available and falls back to
16000 otherwise. SoundScpReader.__getitem__ now reads audio with
librosa.load(sr=dest_sample_rate) instead of soundfile.read.

sound_loader forwards its dest_sample_rate argument to SoundScpReader
rather than a literal 16000, so the configured rate actually takes effect.

NOTE(review): the non-normalized branch passes dtype=self.dtype (np.int16
by default) to librosa.load; confirm the pinned librosa version accepts
integer dtypes.
NOTE(review): mono=not self.always_2d is presumably not equivalent to
soundfile's always_2d (channel handling and array layout may differ);
confirm this is intended.

diff --git a/funasr/datasets/dataset.py b/funasr/datasets/dataset.py
index 2af93d0bc..d1777a385 100644
--- a/funasr/datasets/dataset.py
+++ b/funasr/datasets/dataset.py
@@ -107,7 +107,7 @@ class H5FileWrapper:
         return value[()]
 
 
-def sound_loader(path, float_dtype=None):
+def sound_loader(path, dest_sample_rate=16000, float_dtype=None):
     # The file is as follows:
     # utterance_id_A /some/where/a.wav
     # utterance_id_B /some/where/a.flac
@@ -115,7 +115,7 @@ def sound_loader(path, float_dtype=None):
     # NOTE(kamo): SoundScpReader doesn't support pipe-fashion
     # like Kaldi e.g. "cat a.wav |".
     # NOTE(kamo): The audio signal is normalized to [-1,1] range.
-    loader = SoundScpReader(path, normalize=True, always_2d=False)
+    loader = SoundScpReader(path, dest_sample_rate=dest_sample_rate, normalize=True, always_2d=False)
 
     # SoundScpReader.__getitem__() returns Tuple[int, ndarray],
     # but ndarray is desired, so Adapter class is inserted here
@@ -139,7 +139,7 @@ def rand_int_loader(filepath, loader_type):
 DATA_TYPES = {
     "sound": dict(
         func=sound_loader,
-        kwargs=["float_dtype"],
+        kwargs=["dest_sample_rate", "float_dtype"],
         help="Audio format types which supported by sndfile wav, flac, etc."
         "\n\n"
         " utterance_id_a a.wav\n"
@@ -282,6 +282,7 @@ class ESPnetDataset(AbsDataset):
         int_dtype: str = "long",
         max_cache_size: Union[float, int, str] = 0.0,
         max_cache_fd: int = 0,
+        dest_sample_rate: int = 16000,
     ):
         assert check_argument_types()
         if len(path_name_type_list) == 0:
@@ -295,6 +296,7 @@ class ESPnetDataset(AbsDataset):
         self.float_dtype = float_dtype
         self.int_dtype = int_dtype
         self.max_cache_fd = max_cache_fd
+        self.dest_sample_rate = dest_sample_rate
 
         self.loader_dict = {}
         self.debug_info = {}
@@ -335,6 +337,8 @@ class ESPnetDataset(AbsDataset):
                     for key2 in dic["kwargs"]:
                         if key2 == "loader_type":
                             kwargs["loader_type"] = loader_type
+                        elif key2 == "dest_sample_rate" and loader_type == "sound":
+                            kwargs["dest_sample_rate"] = self.dest_sample_rate
                         elif key2 == "float_dtype":
                             kwargs["float_dtype"] = self.float_dtype
                         elif key2 == "int_dtype":
diff --git a/funasr/fileio/sound_scp.py b/funasr/fileio/sound_scp.py
index 459369efb..dc872b047 100644
--- a/funasr/fileio/sound_scp.py
+++ b/funasr/fileio/sound_scp.py
@@ -4,6 +4,7 @@ from typing import Union
 
 import numpy as np
 import soundfile
+import librosa
 from typeguard import check_argument_types
 
 from funasr.fileio.read_text import read_2column_text
@@ -30,6 +31,7 @@ class SoundScpReader(collections.abc.Mapping):
         dtype=np.int16,
         always_2d: bool = False,
         normalize: bool = False,
+        dest_sample_rate: int = 16000,
     ):
         assert check_argument_types()
         self.fname = fname
@@ -37,15 +39,18 @@ class SoundScpReader(collections.abc.Mapping):
         self.always_2d = always_2d
         self.normalize = normalize
         self.data = read_2column_text(fname)
+        self.dest_sample_rate = dest_sample_rate
 
     def __getitem__(self, key):
         wav = self.data[key]
         if self.normalize:
             # soundfile.read normalizes data to [-1,1] if dtype is not given
-            array, rate = soundfile.read(wav, always_2d=self.always_2d)
+            array, rate = librosa.load(
+                wav, sr=self.dest_sample_rate, mono=not self.always_2d
+            )
         else:
-            array, rate = soundfile.read(
-                wav, dtype=self.dtype, always_2d=self.always_2d
+            array, rate = librosa.load(
+                wav, sr=self.dest_sample_rate, mono=not self.always_2d, dtype=self.dtype
             )
 
         return rate, array
diff --git a/funasr/tasks/abs_task.py b/funasr/tasks/abs_task.py
index 723a67cfb..e0884cef6 100644
--- a/funasr/tasks/abs_task.py
+++ b/funasr/tasks/abs_task.py
@@ -1576,6 +1576,7 @@ class AbsTask(ABC):
             preprocess=iter_options.preprocess_fn,
             max_cache_size=iter_options.max_cache_size,
             max_cache_fd=iter_options.max_cache_fd,
+            dest_sample_rate=(getattr(args, "frontend_conf", None) or {}).get("fs", 16000),
         )
         cls.check_task_requirements(
             dataset, args.allow_variable_data_keys, train=iter_options.train