funasr v2 setup

This commit is contained in:
游雁 2023-11-21 22:36:52 +08:00
parent 8b9c53cfc3
commit 58440d9373
4 changed files with 104 additions and 1 deletions

View File

View File

@ -0,0 +1,60 @@
import torch
class BatchSampler(torch.utils.data.BatchSampler):
def __init__(self, dataset=None, args=None, drop_last=True, ):
self.drop_last = drop_last
self.pre_idx = -1
self.dataset = dataset
self.batch_size_type = args.batch_size_type
self.batch_size = args.batch_size
self.sort_size = args.sort_size
self.max_length_token = args.max_length_token
self.total_samples = len(dataset)
def __len__(self):
return self.total_samples
def __iter__(self):
batch = []
max_token = 0
num_sample = 0
iter_num = (self.total_samples-1) // self.sort_size + 1
for iter in range(self.pre_idx + 1, iter_num):
datalen_with_index = []
for i in range(self.sort_size):
idx = iter * self.sort_size + i
if idx >= self.total_samples:
continue
if self.batch_size_type == "example":
sample_len_cur = 1
else:
idx_map = self.dataset.shuffle_idx[idx]
# prompt = self.dataset.indexed_dataset[idx_map]["prompt"]
sample_len_cur = self.dataset.indexed_dataset[idx_map]["source_len"] + \
self.dataset.indexed_dataset[idx_map]["target_len"]
datalen_with_index.append([idx, sample_len_cur])
datalen_with_index_sort = sorted(datalen_with_index, key=lambda x: x[1])
for item in datalen_with_index_sort:
idx, sample_len_cur = item
if sample_len_cur > self.max_length_token:
continue
max_token_cur = max(max_token, sample_len_cur)
max_token_padding = (1 + num_sample) * max_token_cur
if max_token_padding <= self.batch_size:
batch.append(idx)
max_token = max_token_cur
num_sample += 1
else:
yield batch
max_token = sample_len_cur
num_sample = 1
batch = [idx]

View File

@ -0,0 +1,43 @@
import torch
import json
import torch.distributed as dist
class AudioDatasetJsonl(torch.utils.data.Dataset):
def __init__(self, path, data_parallel_rank=0, data_parallel_size=1):
super().__init__()
data_parallel_size = dist.get_world_size()
contents = []
with open(path, encoding='utf-8') as fin:
for line in fin:
data = json.loads(line.strip())
if "text" in data: # for sft
self.contents.append(data['text'])
if "source" in data: # for speech lab pretrain
prompt = data["prompt"]
source = data["source"]
target = data["target"]
source_len = data["source_len"]
target_len = data["target_len"]
contents.append({"source": source,
"prompt": prompt,
"target": target,
"source_len": source_len,
"target_len": target_len,
}
)
self.contents = []
total_num = len(contents)
num_per_rank = total_num // data_parallel_size
rank = dist.get_rank()
# import ipdb; ipdb.set_trace()
self.contents = contents[rank * num_per_rank:(rank + 1) * num_per_rank]
def __len__(self):
return len(self.contents)
def __getitem__(self, index):
return self.contents[index]

View File

@ -10,7 +10,7 @@ import humanfriendly
import torch
from funasr.models.frontend.abs_frontend import AbsFrontend
from funasr.modules.frontends.frontend import Frontend
from funasr.models.frontend.frontends_utils.frontend import Frontend
from funasr.modules.nets_utils import pad_list
from funasr.utils.get_default_kwargs import get_default_kwargs