mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
batch
This commit is contained in:
parent
03040b04e2
commit
a883f2342a
@ -71,7 +71,7 @@ class EspnetStyleBatchSampler(DistributedSampler):
|
||||
self.max_token_length = kwargs.get("max_token_length", 2048)
|
||||
self.min_token_length = kwargs.get("min_token_length", 0)
|
||||
self.length_scale_source = kwargs.get("length_scale_source", 1.0)
|
||||
self.start_step = 0
|
||||
self.start_step = start_step
|
||||
if self.start_step > 0:
|
||||
logging.info(f"Warning, start_step > 0, dataloader start from step: {self.start_step}")
|
||||
# super().__init__(dataset, num_replicas=num_replicas, rank=rank,
|
||||
@ -146,7 +146,10 @@ class EspnetStyleBatchSampler(DistributedSampler):
|
||||
start_idx = self.rank * batches_per_rank
|
||||
end_idx = start_idx + batches_per_rank
|
||||
rank_batches = buffer_batches[start_idx + self.start_step : end_idx]
|
||||
|
||||
if self.start_step > 0:
|
||||
logging.info(
|
||||
f"Warning, rank: {self.rank}, dataloader start from step: {self.start_step}, batch_num_before: {end_idx-start_idx}, now: {len(rank_batches)}"
|
||||
)
|
||||
# Return an iterator over the batches for the current rank
|
||||
return iter(rank_batches)
|
||||
|
||||
|
||||
@ -35,7 +35,7 @@ class IndexDSJsonlRankFull(torch.utils.data.Dataset):
|
||||
with open(path, encoding="utf-8") as fin:
|
||||
file_list_all = fin.readlines()
|
||||
|
||||
num_per_slice = (len(file_list_all) - 1) // data_split_num + 1
|
||||
num_per_slice = (len(file_list_all) - 1) // data_split_num + 1 # 16
|
||||
file_list = file_list_all[
|
||||
data_split_i * num_per_slice : (data_split_i + 1) * num_per_slice
|
||||
]
|
||||
|
||||
@ -50,8 +50,8 @@ def update_data(lines, i):
|
||||
sample_num = len(waveform)
|
||||
source_len = int(sample_num / 16000 * 1000 / 10)
|
||||
source_len_old = data["source_len"]
|
||||
if source_len_old != source_len:
|
||||
print(f"wav: {wav_path}, old: {source_len_old}, new: {source_len}")
|
||||
if (source_len_old - source_len) > 100 or (source_len - source_len_old) > 100:
|
||||
print(f"old: {source_len_old}, new: {source_len}, wav: {wav_path}")
|
||||
data["source_len"] = source_len
|
||||
jsonl_line = json.dumps(data, ensure_ascii=False)
|
||||
lines[i] = jsonl_line
|
||||
|
||||
Loading…
Reference in New Issue
Block a user