mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-04 06:34:20 +00:00
minor updates
This commit is contained in:
parent
d29efb7345
commit
817413f899
@ -370,4 +370,73 @@ if [ $stage -le 15 ] && [ $stop_stage -ge 15 ]; then
|
|||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 16 ] && [ $stop_stage -ge 16 ]; then
  log "Stage 16: Prepare LM data"

  # Produces data/lm_{training,dev,test}_data/ text files consumed below.
  ./prepare_lm_data.sh

  for vocab_size in "${vocab_sizes[@]}"; do
    lang_dir=data/lang_bpe_${vocab_size}
    out_dir=data/lm_training_bpe_${vocab_size}
    # -p so that re-running this stage does not abort when the
    # directory already exists (plain `mkdir` fails in that case).
    mkdir -p "$out_dir"

    # Tokenize each split with this vocab's BPE model and store the
    # result as a .pt archive for LM training.
    ./local/prepare_lm_training_data.py \
      --bpe-model "$lang_dir/bpe.model" \
      --lm-data ./data/lm_training_data/lm_training_text \
      --lm-archive "$out_dir/lm_training_data.pt"

    ./local/prepare_lm_training_data.py \
      --bpe-model "$lang_dir/bpe.model" \
      --lm-data ./data/lm_dev_data/lm_dev_text \
      --lm-archive "$out_dir/lm_dev_data.pt"

    ./local/prepare_lm_training_data.py \
      --bpe-model "$lang_dir/bpe.model" \
      --lm-data ./data/lm_test_data/lm_test_text \
      --lm-archive "$out_dir/lm_test_data.pt"
  done
fi
|
||||||
|
|
||||||
|
if [ $stage -le 17 ] && [ $stop_stage -ge 17 ]; then
  log "Stage 17: Sort LM data"
  # Sort sentences by length so the LM dataloader can batch
  # similarly-sized sentences together; also emits length statistics.
  for vocab_size in "${vocab_sizes[@]}"; do
    out_dir=data/lm_training_bpe_${vocab_size}

    ./local/sort_lm_training_data.py \
      --in-lm-data "$out_dir/lm_training_data.pt" \
      --out-lm-data "$out_dir/sorted_lm_data.pt" \
      --out-statistics "$out_dir/statistics.txt"

    ./local/sort_lm_training_data.py \
      --in-lm-data "$out_dir/lm_dev_data.pt" \
      --out-lm-data "$out_dir/sorted_lm_data-dev.pt" \
      --out-statistics "$out_dir/statistics-dev.txt"

    ./local/sort_lm_training_data.py \
      --in-lm-data "$out_dir/lm_test_data.pt" \
      --out-lm-data "$out_dir/sorted_lm_data-test.pt" \
      --out-statistics "$out_dir/statistics-test.txt"
  done
fi
|
||||||
|
|
||||||
|
if [ $stage -le 18 ] && [ $stop_stage -ge 18 ]; then
  log "Stage 18: Train RNN LM model"
  for vocab_size in "${vocab_sizes[@]}"; do
    out_dir=data/lm_training_bpe_${vocab_size}
    # Train one RNN LM per BPE vocab size on the sorted archives from
    # stage 17; checkpoints go to rnnlm_bpe_${vocab_size}/exp.
    python ../../../icefall/rnn_lm/train.py \
      --start-epoch 0 \
      --world-size 1 \
      --use-fp16 0 \
      --embedding-dim 2048 \
      --hidden-dim 2048 \
      --num-layers 3 \
      --batch-size 400 \
      --exp-dir "rnnlm_bpe_${vocab_size}/exp" \
      --lm-data "$out_dir/sorted_lm_data.pt" \
      --lm-data-valid "$out_dir/sorted_lm_data-dev.pt" \
      --vocab-size "$vocab_size" \
      --master-port 12345
  done
fi
|
||||||
|
|
||||||
|
@ -31,6 +31,7 @@ from pathlib import Path
|
|||||||
import k2
|
import k2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
def get_args():
|
def get_args():
|
||||||
@ -87,7 +88,7 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
cur = None
|
cur = None
|
||||||
for i in range(num_sentences):
|
for i in tqdm(range(num_sentences)):
|
||||||
word_ids = sorted_sentences[i]
|
word_ids = sorted_sentences[i]
|
||||||
token_ids = words2bpe[word_ids]
|
token_ids = words2bpe[word_ids]
|
||||||
if isinstance(token_ids, k2.RaggedTensor):
|
if isinstance(token_ids, k2.RaggedTensor):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user