This commit is contained in:
游雁 2024-09-26 11:52:05 +08:00
parent 7ce917b596
commit aaa0325322

View File

@@ -16,11 +16,11 @@ class OpenAIIndexDSJsonl(torch.utils.data.Dataset): # torch.utils.data.Dataset
def __init__(self, path: str, **kwargs): def __init__(self, path: str, **kwargs):
super().__init__() super().__init__()
self.max_source_length = kwargs.get("max_source_length", 3000) self.max_source_length = kwargs.get("max_source_length", 6000)
self.min_source_length = kwargs.get("min_source_length", 0) # self.min_source_length = kwargs.get("min_source_length", 0)
self.max_target_length = kwargs.get("max_target_length", 2048) self.max_target_length = kwargs.get("max_target_length", 2048)
self.min_target_length = kwargs.get("min_target_length", 0) # self.min_target_length = kwargs.get("min_target_length", 0)
self.max_token_length = kwargs.get("max_token_length", 2200) # self.max_token_length = kwargs.get("max_token_length", 2200)
is_training = kwargs.get("is_training", True) is_training = kwargs.get("is_training", True)
if not (path.endswith(".jsonl") or path.endswith(".json")): if not (path.endswith(".jsonl") or path.endswith(".json")):
@@ -53,16 +53,17 @@ class OpenAIIndexDSJsonl(torch.utils.data.Dataset): # torch.utils.data.Dataset
data = data_dict["messages"] data = data_dict["messages"]
speech_length = data_dict.get("speech_length", -1) // 8 speech_length = data_dict.get("speech_length", -1) // 8
text_length = data_dict.get("text_length", 0) text_length = data_dict.get("text_length", 0)
if speech_length > self.max_source_length: if speech_length * 8 > self.max_source_length:
logging.info( logging.info(
f"speech_length: {speech_length} > {self.max_source_length}, drop it" f"speech_length: {speech_length*8} > {self.max_source_length}, drop it: {data_dict}"
) )
continue continue
if text_length > self.max_target_length: if text_length > self.max_target_length:
logging.info(
f"text_length: {text_length} > {self.max_target_length}, drop it: {data_dict}"
)
continue continue
self.max_target_length = kwargs.get("max_target_length", 2048)
system, user, assistant = [], [], [] system, user, assistant = [], [], []
for i, item in enumerate(data): for i, item in enumerate(data):
role = item["role"] role = item["role"]