mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
Add finetune resampling function under small data type.
This commit is contained in:
parent
716ea81703
commit
4afdd97df4
@ -107,7 +107,7 @@ class H5FileWrapper:
|
|||||||
return value[()]
|
return value[()]
|
||||||
|
|
||||||
|
|
||||||
def sound_loader(path, float_dtype=None):
|
def sound_loader(path, dest_sample_rate=16000, float_dtype=None):
|
||||||
# The file is as follows:
|
# The file is as follows:
|
||||||
# utterance_id_A /some/where/a.wav
|
# utterance_id_A /some/where/a.wav
|
||||||
# utterance_id_B /some/where/a.flac
|
# utterance_id_B /some/where/a.flac
|
||||||
@ -115,7 +115,7 @@ def sound_loader(path, float_dtype=None):
|
|||||||
# NOTE(kamo): SoundScpReader doesn't support pipe-fashion
|
# NOTE(kamo): SoundScpReader doesn't support pipe-fashion
|
||||||
# like Kaldi e.g. "cat a.wav |".
|
# like Kaldi e.g. "cat a.wav |".
|
||||||
# NOTE(kamo): The audio signal is normalized to [-1,1] range.
|
# NOTE(kamo): The audio signal is normalized to [-1,1] range.
|
||||||
loader = SoundScpReader(path, normalize=True, always_2d=False)
|
loader = SoundScpReader(path, dest_sample_rate=16000, normalize=True, always_2d=False)
|
||||||
|
|
||||||
# SoundScpReader.__getitem__() returns Tuple[int, ndarray],
|
# SoundScpReader.__getitem__() returns Tuple[int, ndarray],
|
||||||
# but ndarray is desired, so Adapter class is inserted here
|
# but ndarray is desired, so Adapter class is inserted here
|
||||||
@ -139,7 +139,7 @@ def rand_int_loader(filepath, loader_type):
|
|||||||
DATA_TYPES = {
|
DATA_TYPES = {
|
||||||
"sound": dict(
|
"sound": dict(
|
||||||
func=sound_loader,
|
func=sound_loader,
|
||||||
kwargs=["float_dtype"],
|
kwargs=["dest_sample_rate","float_dtype"],
|
||||||
help="Audio format types which supported by sndfile wav, flac, etc."
|
help="Audio format types which supported by sndfile wav, flac, etc."
|
||||||
"\n\n"
|
"\n\n"
|
||||||
" utterance_id_a a.wav\n"
|
" utterance_id_a a.wav\n"
|
||||||
@ -282,6 +282,7 @@ class ESPnetDataset(AbsDataset):
|
|||||||
int_dtype: str = "long",
|
int_dtype: str = "long",
|
||||||
max_cache_size: Union[float, int, str] = 0.0,
|
max_cache_size: Union[float, int, str] = 0.0,
|
||||||
max_cache_fd: int = 0,
|
max_cache_fd: int = 0,
|
||||||
|
dest_sample_rate: int = 16000,
|
||||||
):
|
):
|
||||||
assert check_argument_types()
|
assert check_argument_types()
|
||||||
if len(path_name_type_list) == 0:
|
if len(path_name_type_list) == 0:
|
||||||
@ -295,6 +296,7 @@ class ESPnetDataset(AbsDataset):
|
|||||||
self.float_dtype = float_dtype
|
self.float_dtype = float_dtype
|
||||||
self.int_dtype = int_dtype
|
self.int_dtype = int_dtype
|
||||||
self.max_cache_fd = max_cache_fd
|
self.max_cache_fd = max_cache_fd
|
||||||
|
self.dest_sample_rate = dest_sample_rate
|
||||||
|
|
||||||
self.loader_dict = {}
|
self.loader_dict = {}
|
||||||
self.debug_info = {}
|
self.debug_info = {}
|
||||||
@ -335,6 +337,8 @@ class ESPnetDataset(AbsDataset):
|
|||||||
for key2 in dic["kwargs"]:
|
for key2 in dic["kwargs"]:
|
||||||
if key2 == "loader_type":
|
if key2 == "loader_type":
|
||||||
kwargs["loader_type"] = loader_type
|
kwargs["loader_type"] = loader_type
|
||||||
|
elif key2 == "dest_sample_rate" and loader_type=="sound":
|
||||||
|
kwargs["dest_sample_rate"] = self.dest_sample_rate
|
||||||
elif key2 == "float_dtype":
|
elif key2 == "float_dtype":
|
||||||
kwargs["float_dtype"] = self.float_dtype
|
kwargs["float_dtype"] = self.float_dtype
|
||||||
elif key2 == "int_dtype":
|
elif key2 == "int_dtype":
|
||||||
|
|||||||
@ -4,6 +4,7 @@ from typing import Union
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import soundfile
|
import soundfile
|
||||||
|
import librosa
|
||||||
from typeguard import check_argument_types
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
from funasr.fileio.read_text import read_2column_text
|
from funasr.fileio.read_text import read_2column_text
|
||||||
@ -30,6 +31,7 @@ class SoundScpReader(collections.abc.Mapping):
|
|||||||
dtype=np.int16,
|
dtype=np.int16,
|
||||||
always_2d: bool = False,
|
always_2d: bool = False,
|
||||||
normalize: bool = False,
|
normalize: bool = False,
|
||||||
|
dest_sample_rate: int = 16000,
|
||||||
):
|
):
|
||||||
assert check_argument_types()
|
assert check_argument_types()
|
||||||
self.fname = fname
|
self.fname = fname
|
||||||
@ -37,15 +39,18 @@ class SoundScpReader(collections.abc.Mapping):
|
|||||||
self.always_2d = always_2d
|
self.always_2d = always_2d
|
||||||
self.normalize = normalize
|
self.normalize = normalize
|
||||||
self.data = read_2column_text(fname)
|
self.data = read_2column_text(fname)
|
||||||
|
self.dest_sample_rate = dest_sample_rate
|
||||||
|
|
||||||
def __getitem__(self, key):
|
def __getitem__(self, key):
|
||||||
wav = self.data[key]
|
wav = self.data[key]
|
||||||
if self.normalize:
|
if self.normalize:
|
||||||
# soundfile.read normalizes data to [-1,1] if dtype is not given
|
# soundfile.read normalizes data to [-1,1] if dtype is not given
|
||||||
array, rate = soundfile.read(wav, always_2d=self.always_2d)
|
array, rate = librosa.load(
|
||||||
|
wav, sr=self.dest_sample_rate, mono=not self.always_2d
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
array, rate = soundfile.read(
|
array, rate = librosa.load(
|
||||||
wav, dtype=self.dtype, always_2d=self.always_2d
|
wav, sr=self.dest_sample_rate, mono=not self.always_2d, dtype=self.dtype
|
||||||
)
|
)
|
||||||
|
|
||||||
return rate, array
|
return rate, array
|
||||||
|
|||||||
@ -1576,6 +1576,7 @@ class AbsTask(ABC):
|
|||||||
preprocess=iter_options.preprocess_fn,
|
preprocess=iter_options.preprocess_fn,
|
||||||
max_cache_size=iter_options.max_cache_size,
|
max_cache_size=iter_options.max_cache_size,
|
||||||
max_cache_fd=iter_options.max_cache_fd,
|
max_cache_fd=iter_options.max_cache_fd,
|
||||||
|
dest_sample_rate=args.frontend_conf["fs"],
|
||||||
)
|
)
|
||||||
cls.check_task_requirements(
|
cls.check_task_requirements(
|
||||||
dataset, args.allow_variable_data_keys, train=iter_options.train
|
dataset, args.allow_variable_data_keys, train=iter_options.train
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user