游雁 2024-02-20 15:22:13 +08:00
parent df9d3438da
commit 362a6de3c8
5 changed files with 33 additions and 16 deletions

View File

@@ -94,13 +94,16 @@ scheduler_conf:
 dataset: AudioDataset
 dataset_conf:
     index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: RankFullLocalShuffleBatchSampler
     batch_type: example # example or length
-    batch_size: 1 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
+    batch_size: 32 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
     max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length
-    buffer_size: 500
+    buffer_size: 1024
     shuffle: True
     num_workers: 4
+    preprocessor_speech: SpeechPreprocessSpeedPerturb
+    preprocessor_speech_conf:
+        speed_perturb: [0.9, 1.0, 1.1]
     tokenizer: CharTokenizer
     tokenizer_conf:
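The added preprocessor_speech block turns on speed perturbation, resampling each utterance at a randomly chosen factor of 0.9, 1.0, or 1.1 during training. As a rough illustration of what such a preprocessor does (a minimal sketch using torchaudio sox effects and a fake 16 kHz mono waveform; this is not FunASR's SpeechPreprocessSpeedPerturb implementation):

import random

import torch
import torchaudio

def speed_perturb(waveform: torch.Tensor, sample_rate: int,
                  factors=(0.9, 1.0, 1.1)) -> torch.Tensor:
    # Pick one factor per utterance, as the speed_perturb: [0.9, 1.0, 1.1] list suggests.
    factor = random.choice(factors)
    if factor == 1.0:
        return waveform
    # sox "speed" changes duration (and pitch); "rate" resamples back to sample_rate.
    perturbed, _ = torchaudio.sox_effects.apply_effects_tensor(
        waveform, sample_rate,
        [["speed", str(factor)], ["rate", str(sample_rate)]],
    )
    return perturbed

wav = torch.randn(1, 16000)  # fake 1-second, 16 kHz, mono waveform
print(speed_perturb(wav, 16000).shape)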

View File

@@ -94,13 +94,16 @@ scheduler_conf:
 dataset: AudioDataset
 dataset_conf:
     index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: RankFullLocalShuffleBatchSampler
     batch_type: example # example or length
-    batch_size: 1 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
+    batch_size: 32 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
     max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length
-    buffer_size: 500
+    buffer_size: 1024
     shuffle: True
     num_workers: 4
+    preprocessor_speech: SpeechPreprocessSpeedPerturb
+    preprocessor_speech_conf:
+        speed_perturb: [0.9, 1.0, 1.1]
     tokenizer: CharTokenizer
     tokenizer_conf:
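The sampler change and the larger buffer_size work together: a local-shuffle batch sampler keeps a window of buffer_size samples, shuffles within that window, and emits batch_size samples at a time. The sketch below shows that general buffering idea only; it is not the actual RankFullLocalShuffleBatchSampler, whose rank-aware behaviour is internal to FunASR.

import random
from typing import Iterable, Iterator, List

def local_shuffle_batches(indices: Iterable[int], batch_size: int = 32,
                          buffer_size: int = 1024, seed: int = 0) -> Iterator[List[int]]:
    # Hold up to buffer_size sample indices, shuffle inside that window,
    # then emit fixed-size batches (batch_type: example).
    rng = random.Random(seed)
    buffer: List[int] = []
    for idx in indices:
        buffer.append(idx)
        if len(buffer) >= buffer_size:
            rng.shuffle(buffer)
            while len(buffer) >= batch_size:
                yield buffer[:batch_size]
                buffer = buffer[batch_size:]
    rng.shuffle(buffer)  # flush whatever is left at the end of the epoch
    for i in range(0, len(buffer), batch_size):
        yield buffer[i:i + batch_size]

for batch in local_shuffle_batches(range(100), batch_size=32, buffer_size=64):
    print(len(batch))  # 32, 32, 32, 4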

View File

@@ -109,9 +109,14 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
   echo "log_file: ${log_file}"
   gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-  torchrun \
-  --nnodes 1 \
-  --nproc_per_node ${gpu_num} \
+  # torchrun \
+  # --nnodes 1 \
+  # --nproc_per_node ${gpu_num}
+  cmd="python"
+  if [ ${gpu_num} -gt 1 ]; then
+    cmd="torchrun --nnodes 1 --nproc_per_node ${gpu_num}"
+  fi
+  ${cmd} \
   ../../../funasr/bin/train.py \
   --config-path "${workspace}/conf" \
   --config-name "${config}" \
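This makes the launcher conditional: with a single visible GPU the script runs train.py directly under python, while more than one GPU switches to torchrun with one process per GPU. A rough Python equivalent of that selection logic (a hypothetical helper shown only to mirror the shell branch above):

import os
import shlex

def build_launch_cmd(train_script: str = "../../../funasr/bin/train.py") -> list:
    # Count devices listed in CUDA_VISIBLE_DEVICES, e.g. "0,1" -> 2.
    visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
    gpu_num = len([d for d in visible.split(",") if d.strip()])
    if gpu_num > 1:
        return ["torchrun", "--nnodes", "1", f"--nproc_per_node={gpu_num}", train_script]
    return ["python", train_script]

print(shlex.join(build_launch_cmd()))  # e.g. python ../../../funasr/bin/train.py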

View File

@@ -94,13 +94,16 @@ scheduler_conf:
 dataset: AudioDataset
 dataset_conf:
     index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: RankFullLocalShuffleBatchSampler
     batch_type: example # example or length
-    batch_size: 1 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
+    batch_size: 32 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
     max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length
-    buffer_size: 500
+    buffer_size: 1024
     shuffle: True
     num_workers: 4
+    preprocessor_speech: SpeechPreprocessSpeedPerturb
+    preprocessor_speech_conf:
+        speed_perturb: [0.9, 1.0, 1.1]
     tokenizer: CharTokenizer
     tokenizer_conf:

View File

@@ -88,13 +88,16 @@ scheduler_conf:
 dataset: AudioDataset
 dataset_conf:
     index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: RankFullLocalShuffleBatchSampler
     batch_type: example # example or length
-    batch_size: 1 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
+    batch_size: 32 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
     max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length
-    buffer_size: 500
+    buffer_size: 1024
     shuffle: True
-    num_workers: 0
+    num_workers: 4
+    preprocessor_speech: SpeechPreprocessSpeedPerturb
+    preprocessor_speech_conf:
+        speed_perturb: [0.9, 1.0, 1.1]
     tokenizer: CharTokenizer
     tokenizer_conf:
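For context, batch_size, shuffle, and num_workers in dataset_conf play the same role as the standard PyTorch DataLoader arguments, and raising num_workers from 0 to 4 moves batch preparation into background worker processes. A generic illustration with a placeholder dataset (plain torch.utils.data, not FunASR's own loader wiring):

import torch
from torch.utils.data import DataLoader, Dataset

class ToyAudioDataset(Dataset):
    # Placeholder dataset standing in for AudioDataset / IndexDSJsonl.
    def __len__(self):
        return 256
    def __getitem__(self, i):
        return torch.randn(16000), i  # fake 1-second waveform and a dummy label

if __name__ == "__main__":
    loader = DataLoader(
        ToyAudioDataset(),
        batch_size=32,   # batch_size: 32 with batch_type: example
        shuffle=True,    # shuffle: True
        num_workers=4,   # num_workers: 0 -> 4 enables parallel data loading
    )
    waveforms, labels = next(iter(loader))
    print(waveforms.shape)  # torch.Size([32, 16000])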