From 2bca7032afb0d5b9eb60f7bcf3bc15ad1e8d8a83 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Thu, 1 Dec 2022 15:57:43 +0800
Subject: [PATCH] Update RNNLM training scripts (#720)

* Update RNNLM training scripts

* Fix a typo

* Fix CI
---
 .github/workflows/run-ptb-rnn-lm.yml         | 67 ++++++++++++++++++++
 egs/librispeech/ASR/local/train_bpe_model.py |  4 ++
 egs/ptb/LM/prepare.sh                        | 38 ++++++-----
 egs/ptb/LM/rnn_lm                            |  1 +
 egs/ptb/LM/train-rnn-lm.sh                   | 67 ++++++++++++++++++++
 icefall/rnn_lm/compute_perplexity.py         |  2 +-
 icefall/rnn_lm/dataset.py                    |  4 +-
 icefall/rnn_lm/train.py                      | 10 +--
 8 files changed, 170 insertions(+), 23 deletions(-)
 create mode 100644 .github/workflows/run-ptb-rnn-lm.yml
 create mode 120000 egs/ptb/LM/rnn_lm
 create mode 100755 egs/ptb/LM/train-rnn-lm.sh

diff --git a/.github/workflows/run-ptb-rnn-lm.yml b/.github/workflows/run-ptb-rnn-lm.yml
new file mode 100644
index 000000000..8ebc2e79b
--- /dev/null
+++ b/.github/workflows/run-ptb-rnn-lm.yml
@@ -0,0 +1,67 @@
+name: run-ptb-rnn-lm-training
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [labeled]
+
+  schedule:
+    # minute (0-59)
+    # hour (0-23)
+    # day of the month (1-31)
+    # month (1-12)
+    # day of the week (0-6)
+    # nightly build at 15:50 UTC time every day
+    - cron: "50 15 * * *"
+
+jobs:
+  run_ptb_rnn_lm_training:
+    if: github.event.label.name == 'ready' || github.event.label.name == 'rnnlm' || github.event_name == 'push' || github.event_name == 'schedule'
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.8"]
+
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+          cache-dependency-path: '**/requirements-ci.txt'
+
+      - name: Install Python dependencies
+        run: |
+          grep -v '^#' ./requirements-ci.txt | grep -v kaldifst | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
+
+      - name: Prepare data
+        shell: bash
+        run: |
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          cd egs/ptb/LM
+          ./prepare.sh
+
+      - name: Run training
+        shell: bash
+        run: |
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          cd egs/ptb/LM
+          ./train-rnn-lm.sh --world-size 1 --num-epochs 5 --use-epoch 4 --use-avg 2
+
+      - name: Upload pretrained models
+        uses: actions/upload-artifact@v2
+        if: github.event.label.name == 'ready' || github.event.label.name == 'rnnlm' || github.event_name == 'push' || github.event_name == 'schedule'
+        with:
+          name: python-${{ matrix.python-version }}-ubuntu-rnn-lm-ptb
+          path: egs/ptb/LM/my-rnnlm-exp/
diff --git a/egs/librispeech/ASR/local/train_bpe_model.py b/egs/librispeech/ASR/local/train_bpe_model.py
index 42aba9572..7f6f47e16 100755
--- a/egs/librispeech/ASR/local/train_bpe_model.py
+++ b/egs/librispeech/ASR/local/train_bpe_model.py
@@ -89,6 +89,10 @@ def main():
             bos_id=-1,
             eos_id=-1,
         )
+    else:
+        print(f"{model_file} exists - skipping")
+        return
+
     shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
diff --git a/egs/ptb/LM/prepare.sh b/egs/ptb/LM/prepare.sh
index 91c3c667a..69fab999a 100755
--- a/egs/ptb/LM/prepare.sh
+++ b/egs/ptb/LM/prepare.sh
@@ -22,9 +22,9 @@ dl_dir=$PWD/download
 # if the array contains xxx, yyy
 vocab_sizes=(
   500
-  1000
-  2000
-  5000
+  # 1000
+  # 2000
+  # 5000
 )
 
 # All files generated by this script are saved in "data".
@@ -42,11 +42,14 @@ log "dl_dir: $dl_dir"
 
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
   log "Stage -1: Download data"
+
+  # Caution: The downloaded data has already been normalized for LM training.
+
   if [ ! -f $dl_dir/.complete ]; then
-    url=https://raw.githubusercontent.com/townie/PTB-dataset-from-Tomas-Mikolov-s-webpage/master/data/
-    wget --no-verbose --directory-prefix $dl_dir $url/ptb.train.txt
-    wget --no-verbose --directory-prefix $dl_dir $url/ptb.valid.txt
-    wget --no-verbose --directory-prefix $dl_dir $url/ptb.test.txt
+    url=http://raw.githubusercontent.com/townie/PTB-dataset-from-Tomas-Mikolov-s-webpage/master/data
+    wget --directory-prefix $dl_dir $url/ptb.train.txt
+    wget --directory-prefix $dl_dir $url/ptb.valid.txt
+    wget --directory-prefix $dl_dir $url/ptb.test.txt
     touch $dl_dir/.complete
   fi
 fi
@@ -54,11 +57,15 @@ fi
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Train BPE model"
 
+  # Caution: You have to use the same bpe model for training your acoustic model
+  # Caution: You have to use the same bpe model for training your acoustic model
+  # Caution: You have to use the same bpe model for training your acoustic model
+
   for vocab_size in ${vocab_sizes[@]}; do
-    out_dir=data/bpe_${vocab_size}
-    mkdir -p $out_dir
+    lang_dir=data/lang_bpe_${vocab_size}
+    mkdir -p $lang_dir
     ./local/train_bpe_model.py \
-      --out-dir $out_dir \
+      --lang-dir $lang_dir \
       --vocab-size $vocab_size \
       --transcript $dl_dir/ptb.train.txt
   done
@@ -69,20 +76,21 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   # Note: ptb.train.txt has already been normalized
 
   for vocab_size in ${vocab_sizes[@]}; do
-    out_dir=data/bpe_${vocab_size}
+    lang_dir=data/lang_bpe_${vocab_size}
+    out_dir=data/lm_training_bpe_${vocab_size}
     mkdir -p $out_dir
 
     ./local/prepare_lm_training_data.py \
-      --bpe-model $out_dir/bpe.model \
+      --bpe-model $lang_dir/bpe.model \
       --lm-data $dl_dir/ptb.train.txt \
       --lm-archive $out_dir/lm_data.pt
 
     ./local/prepare_lm_training_data.py \
-      --bpe-model $out_dir/bpe.model \
+      --bpe-model $lang_dir/bpe.model \
       --lm-data $dl_dir/ptb.valid.txt \
       --lm-archive $out_dir/lm_data-valid.pt
 
     ./local/prepare_lm_training_data.py \
-      --bpe-model $out_dir/bpe.model \
+      --bpe-model $lang_dir/bpe.model \
       --lm-data $dl_dir/ptb.test.txt \
       --lm-archive $out_dir/lm_data-test.pt
   done
@@ -98,7 +106,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   # in a sentence.
 
   for vocab_size in ${vocab_sizes[@]}; do
-    out_dir=data/bpe_${vocab_size}
+    out_dir=data/lm_training_bpe_${vocab_size}
     mkdir -p $out_dir
     ./local/sort_lm_training_data.py \
       --in-lm-data $out_dir/lm_data.pt \
diff --git a/egs/ptb/LM/rnn_lm b/egs/ptb/LM/rnn_lm
new file mode 120000
index 000000000..87f29771e
--- /dev/null
+++ b/egs/ptb/LM/rnn_lm
@@ -0,0 +1 @@
+../../../icefall/rnn_lm
\ No newline at end of file
diff --git a/egs/ptb/LM/train-rnn-lm.sh b/egs/ptb/LM/train-rnn-lm.sh
new file mode 100755
index 000000000..29c609ee1
--- /dev/null
+++ b/egs/ptb/LM/train-rnn-lm.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+
+# Please run ./prepare.sh first
+
+stage=-1
+stop_stage=100
+
+# Number of GPUs to use for training
+world_size=1
+
+# Number of epochs to train
+num_epochs=20
+
+# Use this epoch for computing ppl
+use_epoch=19
+
+# number of models to average for computing ppl
+use_avg=2
+
+exp_dir=./my-rnnlm-exp
+
+. shared/parse_options.sh || exit 1
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "Training RNN LM"
+
+  ./rnn_lm/train.py \
+    --exp-dir $exp_dir \
+    --start-epoch 0 \
+    --num-epochs $num_epochs \
+    --world-size $world_size \
+    --use-fp16 0 \
+    --vocab-size 500 \
+    \
+    --lm-data ./data/lm_training_bpe_500/sorted_lm_data.pt \
+    --lm-data-valid ./data/lm_training_bpe_500/sorted_lm_data-valid.pt \
+    \
+    --embedding-dim 800 \
+    --hidden-dim 200 \
+    --num-layers 2 \
+    --tie-weights false \
+    --batch-size 50
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Computing perplexity"
+
+  ./rnn_lm/compute_perplexity.py \
+    --exp-dir $exp_dir \
+    --epoch $use_epoch \
+    --avg $use_avg \
+    --vocab-size 500 \
+    \
+    --lm-data ./data/lm_training_bpe_500/sorted_lm_data-test.pt \
+    \
+    --embedding-dim 800 \
+    --hidden-dim 200 \
+    --num-layers 2 \
+    --tie-weights false \
+    --batch-size 50
+fi
diff --git a/icefall/rnn_lm/compute_perplexity.py b/icefall/rnn_lm/compute_perplexity.py
index 550801a8f..f75a89590 100755
--- a/icefall/rnn_lm/compute_perplexity.py
+++ b/icefall/rnn_lm/compute_perplexity.py
@@ -20,7 +20,7 @@ Usage:
   ./rnn_lm/compute_perplexity.py \
     --epoch 4 \
     --avg 2 \
-    --lm-data ./data/bpe_500/sorted_lm_data-test.pt
+    --lm-data ./data/lm_training_bpe_500/sorted_lm_data-test.pt
 
 """
diff --git a/icefall/rnn_lm/dataset.py b/icefall/rnn_lm/dataset.py
index 4bf982503..53be53f64 100644
--- a/icefall/rnn_lm/dataset.py
+++ b/icefall/rnn_lm/dataset.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
+# Copyright (c) 2021 Xiaomi Corporation (authors: Daniel Povey, Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -194,7 +194,7 @@ def get_dataloader(
         batch_size=params.batch_size,
     )
     if is_distributed:
-        sampler = DistributedSampler(dataset, shuffle=True, drop_last=False)
+        sampler = DistributedSampler(dataset, shuffle=True, drop_last=True)
     else:
         sampler = None
 
diff --git a/icefall/rnn_lm/train.py b/icefall/rnn_lm/train.py
index 3ba5bfbee..803da99d6 100755
--- a/icefall/rnn_lm/train.py
+++ b/icefall/rnn_lm/train.py
@@ -24,7 +24,7 @@ Usage:
         --use-fp16 0 \
         --embedding-dim 800 \
         --hidden-dim 200 \
-        --num-layers 2\
+        --num-layers 2 \
         --batch-size 400
 
 """
@@ -83,7 +83,7 @@ def get_parser():
     parser.add_argument(
         "--num-epochs",
         type=int,
-        default=10,
+        default=30,
         help="Number of epochs to train.",
     )
@@ -110,14 +110,14 @@ def get_parser():
     parser.add_argument(
         "--use-fp16",
         type=str2bool,
-        default=False,
+        default=True,
         help="Whether to use half precision training.",
     )
 
     parser.add_argument(
         "--batch-size",
         type=int,
-        default=50,
+        default=400,
     )
 
     parser.add_argument(
@@ -165,7 +165,7 @@ def get_parser():
     parser.add_argument(
         "--tie-weights",
        type=str2bool,
-        default=False,
+        default=True,
         help="""True to share the weights between the input embedding layer
         and the last output linear layer
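
For context on the --tie-weights default that the last hunk flips to True:
tying reuses the input embedding matrix as the output projection, which
removes a large block of parameters and usually helps perplexity, but the
two matrices can only be shared when the embedding dimension equals the
hidden dimension, since nn.Embedding.weight and nn.Linear.weight must have
the same shape. Below is a minimal PyTorch sketch of the mechanism, not
icefall's actual RnnLmModel; the class name TinyRnnLm and its arguments
are hypothetical, for illustration only.

import torch
import torch.nn as nn


class TinyRnnLm(nn.Module):
    """Toy RNN LM illustrating weight tying (hypothetical, not icefall's model)."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, tie_weights):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.output = nn.Linear(hidden_dim, vocab_size)
        if tie_weights:
            # embedding.weight is (vocab_size, embedding_dim) and
            # output.weight is (vocab_size, hidden_dim), so sharing them
            # requires embedding_dim == hidden_dim.
            assert embedding_dim == hidden_dim, (embedding_dim, hidden_dim)
            self.output.weight = self.embedding.weight

    def forward(self, tokens):
        embedded = self.embedding(tokens)  # (batch, time, embedding_dim)
        rnn_out, _ = self.rnn(embedded)    # (batch, time, hidden_dim)
        return self.output(rnn_out)        # (batch, time, vocab_size)


model = TinyRnnLm(vocab_size=500, embedding_dim=200, hidden_dim=200,
                  num_layers=2, tie_weights=True)
logits = model(torch.randint(0, 500, (4, 12)))
print(logits.shape)  # torch.Size([4, 12, 500])

This shape constraint is also why train-rnn-lm.sh above passes
--tie-weights false explicitly: it uses --embedding-dim 800 together with
--hidden-dim 200, shapes that cannot be shared.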