From 2bca7032afb0d5b9eb60f7bcf3bc15ad1e8d8a83 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Thu, 1 Dec 2022 15:57:43 +0800
Subject: [PATCH] Update RNNLM training scripts (#720)

* Update RNNLM training scripts

* Fix a typo

* Fix CI
---
 .github/workflows/run-ptb-rnn-lm.yml         | 67 ++++++++++++++++++++
 egs/librispeech/ASR/local/train_bpe_model.py |  4 ++
 egs/ptb/LM/prepare.sh                        | 38 ++++++-----
 egs/ptb/LM/rnn_lm                            |  1 +
 egs/ptb/LM/train-rnn-lm.sh                   | 67 ++++++++++++++++++++
 icefall/rnn_lm/compute_perplexity.py         |  2 +-
 icefall/rnn_lm/dataset.py                    |  4 +-
 icefall/rnn_lm/train.py                      | 10 +--
 8 files changed, 170 insertions(+), 23 deletions(-)
 create mode 100644 .github/workflows/run-ptb-rnn-lm.yml
 create mode 120000 egs/ptb/LM/rnn_lm
 create mode 100755 egs/ptb/LM/train-rnn-lm.sh

diff --git a/.github/workflows/run-ptb-rnn-lm.yml b/.github/workflows/run-ptb-rnn-lm.yml
new file mode 100644
index 000000000..8ebc2e79b
--- /dev/null
+++ b/.github/workflows/run-ptb-rnn-lm.yml
@@ -0,0 +1,67 @@
+name: run-ptb-rnn-lm-training
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [labeled]
+
+  schedule:
+    # minute (0-59)
+    # hour (0-23)
+    # day of the month (1-31)
+    # month (1-12)
+    # day of the week (0-6)
+    # nightly build at 15:50 UTC time every day
+    - cron: "50 15 * * *"
+
+jobs:
+  run_ptb_rnn_lm_training:
+    if: github.event.label.name == 'ready' || github.event.label.name == 'rnnlm' || github.event_name == 'push' || github.event_name == 'schedule'
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.8"]
+
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+          cache-dependency-path: '**/requirements-ci.txt'
+
+      - name: Install Python dependencies
+        run: |
+          grep -v '^#' ./requirements-ci.txt | grep -v kaldifst | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
+
+      - name: Prepare data
+        shell: bash
+        run: |
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          cd egs/ptb/LM
+          ./prepare.sh
+
+      - name: Run training
+        shell: bash
+        run: |
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          cd egs/ptb/LM
+          ./train-rnn-lm.sh --world-size 1 --num-epochs 5 --use-epoch 4 --use-avg 2
+
+      - name: Upload pretrained models
+        uses: actions/upload-artifact@v2
+        if: github.event.label.name == 'ready' || github.event.label.name == 'rnnlm' || github.event_name == 'push' || github.event_name == 'schedule'
+        with:
+          name: python-${{ matrix.python-version }}-ubuntu-rnn-lm-ptb
+          path: egs/ptb/LM/my-rnnlm-exp/
diff --git a/egs/librispeech/ASR/local/train_bpe_model.py b/egs/librispeech/ASR/local/train_bpe_model.py
index 42aba9572..7f6f47e16 100755
--- a/egs/librispeech/ASR/local/train_bpe_model.py
+++ b/egs/librispeech/ASR/local/train_bpe_model.py
@@ -89,6 +89,10 @@ def main():
             bos_id=-1,
             eos_id=-1,
         )
+    else:
+        print(f"{model_file} exists - skipping")
+        return
+
     shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
diff --git a/egs/ptb/LM/prepare.sh b/egs/ptb/LM/prepare.sh
index 91c3c667a..69fab999a 100755
--- a/egs/ptb/LM/prepare.sh
+++ b/egs/ptb/LM/prepare.sh
@@ -22,9 +22,9 @@ dl_dir=$PWD/download
 # if the array contains xxx, yyy
 vocab_sizes=(
   500
-  1000
-  2000
-  5000
+  # 1000
+  # 2000
+  # 5000
 )
 
 # All files generated by this script are saved in "data".
@@ -42,11 +42,14 @@ log "dl_dir: $dl_dir"
 
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
   log "Stage -1: Download data"
+
+  # Caution: The downloaded data has already been normalized for LM training.
+
   if [ ! -f $dl_dir/.complete ]; then
-    url=https://raw.githubusercontent.com/townie/PTB-dataset-from-Tomas-Mikolov-s-webpage/master/data/
-    wget --no-verbose --directory-prefix $dl_dir $url/ptb.train.txt
-    wget --no-verbose --directory-prefix $dl_dir $url/ptb.valid.txt
-    wget --no-verbose --directory-prefix $dl_dir $url/ptb.test.txt
+    url=http://raw.githubusercontent.com/townie/PTB-dataset-from-Tomas-Mikolov-s-webpage/master/data
+    wget --directory-prefix $dl_dir $url/ptb.train.txt
+    wget --directory-prefix $dl_dir $url/ptb.valid.txt
+    wget --directory-prefix $dl_dir $url/ptb.test.txt
     touch $dl_dir/.complete
   fi
 fi
@@ -54,11 +57,15 @@ fi
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Train BPE model"
 
+  # Caution: You have to use the same bpe model for training your acoustic model
+  # Caution: You have to use the same bpe model for training your acoustic model
+  # Caution: You have to use the same bpe model for training your acoustic model
+
   for vocab_size in ${vocab_sizes[@]}; do
-    out_dir=data/bpe_${vocab_size}
-    mkdir -p $out_dir
+    lang_dir=data/lang_bpe_${vocab_size}
+    mkdir -p $lang_dir
     ./local/train_bpe_model.py \
-      --out-dir $out_dir \
+      --lang-dir $lang_dir \
       --vocab-size $vocab_size \
       --transcript $dl_dir/ptb.train.txt
   done
@@ -69,20 +76,21 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   # Note: ptb.train.txt has already been normalized
 
   for vocab_size in ${vocab_sizes[@]}; do
-    out_dir=data/bpe_${vocab_size}
+    lang_dir=data/lang_bpe_${vocab_size}
+    out_dir=data/lm_training_bpe_${vocab_size}
     mkdir -p $out_dir
 
     ./local/prepare_lm_training_data.py \
-      --bpe-model $out_dir/bpe.model \
+      --bpe-model $lang_dir/bpe.model \
       --lm-data $dl_dir/ptb.train.txt \
       --lm-archive $out_dir/lm_data.pt
 
     ./local/prepare_lm_training_data.py \
-      --bpe-model $out_dir/bpe.model \
+      --bpe-model $lang_dir/bpe.model \
       --lm-data $dl_dir/ptb.valid.txt \
       --lm-archive $out_dir/lm_data-valid.pt
 
     ./local/prepare_lm_training_data.py \
-      --bpe-model $out_dir/bpe.model \
+      --bpe-model $lang_dir/bpe.model \
       --lm-data $dl_dir/ptb.test.txt \
       --lm-archive $out_dir/lm_data-test.pt
   done
@@ -98,7 +106,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   # in a sentence.
 
   for vocab_size in ${vocab_sizes[@]}; do
-    out_dir=data/bpe_${vocab_size}
+    out_dir=data/lm_training_bpe_${vocab_size}
     mkdir -p $out_dir
     ./local/sort_lm_training_data.py \
       --in-lm-data $out_dir/lm_data.pt \
diff --git a/egs/ptb/LM/rnn_lm b/egs/ptb/LM/rnn_lm
new file mode 120000
index 000000000..87f29771e
--- /dev/null
+++ b/egs/ptb/LM/rnn_lm
@@ -0,0 +1 @@
+../../../icefall/rnn_lm
\ No newline at end of file
diff --git a/egs/ptb/LM/train-rnn-lm.sh b/egs/ptb/LM/train-rnn-lm.sh
new file mode 100755
index 000000000..29c609ee1
--- /dev/null
+++ b/egs/ptb/LM/train-rnn-lm.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+
+# Please run ./prepare.sh first
+
+stage=-1
+stop_stage=100
+
+# Number of GPUs to use for training
+world_size=1
+
+# Number of epochs to train
+num_epochs=20
+
+# Use this epoch for computing ppl
+use_epoch=19
+
+# number of models to average for computing ppl
+use_avg=2
+
+exp_dir=./my-rnnlm-exp
+
+. shared/parse_options.sh || exit 1
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "Training RNN LM"
+
+  ./rnn_lm/train.py \
+    --exp-dir $exp_dir \
+    --start-epoch 0 \
+    --num-epochs $num_epochs \
+    --world-size $world_size \
+    --use-fp16 0 \
+    --vocab-size 500 \
+    \
+    --lm-data ./data/lm_training_bpe_500/sorted_lm_data.pt \
+    --lm-data-valid ./data/lm_training_bpe_500/sorted_lm_data-valid.pt \
+    \
+    --embedding-dim 800 \
+    --hidden-dim 200 \
+    --num-layers 2 \
+    --tie-weights false \
+    --batch-size 50
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Computing perplexity"
+
+  ./rnn_lm/compute_perplexity.py \
+    --exp-dir $exp_dir \
+    --epoch $use_epoch \
+    --avg $use_avg \
+    --vocab-size 500 \
+    \
+    --lm-data ./data/lm_training_bpe_500/sorted_lm_data-test.pt \
+    \
+    --embedding-dim 800 \
+    --hidden-dim 200 \
+    --num-layers 2 \
+    --tie-weights false \
+    --batch-size 50
+fi
diff --git a/icefall/rnn_lm/compute_perplexity.py b/icefall/rnn_lm/compute_perplexity.py
index 550801a8f..f75a89590 100755
--- a/icefall/rnn_lm/compute_perplexity.py
+++ b/icefall/rnn_lm/compute_perplexity.py
@@ -20,7 +20,7 @@ Usage:
   ./rnn_lm/compute_perplexity.py \
     --epoch 4 \
     --avg 2 \
-    --lm-data ./data/bpe_500/sorted_lm_data-test.pt
+    --lm-data ./data/lm_training_bpe_500/sorted_lm_data-test.pt
 
 """
diff --git a/icefall/rnn_lm/dataset.py b/icefall/rnn_lm/dataset.py
index 4bf982503..53be53f64 100644
--- a/icefall/rnn_lm/dataset.py
+++ b/icefall/rnn_lm/dataset.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
+# Copyright (c) 2021 Xiaomi Corporation (authors: Daniel Povey, Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -194,7 +194,7 @@ def get_dataloader(
         batch_size=params.batch_size,
     )
     if is_distributed:
-        sampler = DistributedSampler(dataset, shuffle=True, drop_last=False)
+        sampler = DistributedSampler(dataset, shuffle=True, drop_last=True)
     else:
         sampler = None
 
diff --git a/icefall/rnn_lm/train.py b/icefall/rnn_lm/train.py
index 3ba5bfbee..803da99d6 100755
--- a/icefall/rnn_lm/train.py
+++ b/icefall/rnn_lm/train.py
@@ -24,7 +24,7 @@ Usage:
         --use-fp16 0 \
         --embedding-dim 800 \
         --hidden-dim 200 \
-        --num-layers 2\
+        --num-layers 2 \
         --batch-size 400
 
 """
@@ -83,7 +83,7 @@ def get_parser():
     parser.add_argument(
         "--num-epochs",
         type=int,
-        default=10,
+        default=30,
         help="Number of epochs to train.",
     )
@@ -110,14 +110,14 @@ def get_parser():
     parser.add_argument(
         "--use-fp16",
         type=str2bool,
-        default=False,
+        default=True,
         help="Whether to use half precision training.",
     )
 
     parser.add_argument(
         "--batch-size",
         type=int,
-        default=50,
+        default=400,
     )
 
     parser.add_argument(
@@ -165,7 +165,7 @@ def get_parser():
     parser.add_argument(
         "--tie-weights",
        type=str2bool,
-        default=False,
+        default=True,
         help="""True to share the weights between the input embedding layer
         and the last output linear layer
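
For context on the --tie-weights default that the last hunk flips to True:
tying reuses the input embedding matrix as the output projection, which
removes a large block of parameters and usually helps perplexity, but the
two matrices can only be shared when the embedding dimension equals the
hidden dimension, since nn.Embedding.weight and nn.Linear.weight must have
the same shape. Below is a minimal PyTorch sketch of the mechanism, not
icefall's actual RnnLmModel; the class name TinyRnnLm and its arguments
are hypothetical, for illustration only.

import torch
import torch.nn as nn


class TinyRnnLm(nn.Module):
    """Toy RNN LM illustrating weight tying (hypothetical, not icefall's model)."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, tie_weights):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.output = nn.Linear(hidden_dim, vocab_size)
        if tie_weights:
            # embedding.weight is (vocab_size, embedding_dim) and
            # output.weight is (vocab_size, hidden_dim), so sharing them
            # requires embedding_dim == hidden_dim.
            assert embedding_dim == hidden_dim, (embedding_dim, hidden_dim)
            self.output.weight = self.embedding.weight

    def forward(self, tokens):
        embedded = self.embedding(tokens)  # (batch, time, embedding_dim)
        rnn_out, _ = self.rnn(embedded)    # (batch, time, hidden_dim)
        return self.output(rnn_out)        # (batch, time, vocab_size)


model = TinyRnnLm(vocab_size=500, embedding_dim=200, hidden_dim=200,
                  num_layers=2, tie_weights=True)
logits = model(torch.randint(0, 500, (4, 12)))
print(logits.shape)  # torch.Size([4, 12, 500])

This shape constraint is also why train-rnn-lm.sh above passes
--tie-weights false explicitly: it uses --embedding-dim 800 together with
--hidden-dim 200, shapes that cannot be shared.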