Minor fixes.

2025-08-09 18:12:19 +00:00 · 2021-08-16 17:39:31 +08:00 · 2021-08-16 17:39:31 +08:00 · 9c2e378476
commit 9c2e378476
parent 12a2fd023e
4 changed files with 120 additions and 63 deletions
--- a/egs/librispeech/ASR/local/compile_hlg.py
+++ b/egs/librispeech/ASR/local/compile_hlg.py
@ -1,18 +1,18 @@
 #!/usr/bin/env python3

 """
-This script compiles HLG from
+This script takes as input lang_dir and generates HLG from

-    - H, the ctc topology, built from tokens contained in lexicon.txt
-    - L, the lexicon, built from L_disambig.pt
+    - H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt
+    - L, the lexicon, built from lang_dir/L_disambig.pt

        Caution: We use a lexicon that contains disambiguation symbols

    - G, the LM, built from data/lm/G_3_gram.fst.txt

-The generated HLG is saved in data/lm/HLG.pt (phone based)
-or data/lm/HLG_bpe.pt (BPE based)
+The generated HLG is saved in lang_dir/HLG.pt
 """
+import argparse
 import logging
 from pathlib import Path

@ -22,11 +22,23 @@ import torch
 from icefall.lexicon import Lexicon


+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        """,
+    )
+
+    return parser.parse_args()
+
+
 def compile_HLG(lang_dir: str) -> k2.Fsa:
    """
    Args:
      lang_dir:
-        The language directory, e.g., data/lang_phone or data/lang_bpe.
+        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.

    Return:
      An FSA representing HLG.
@ -104,17 +116,18 @@ def compile_HLG(lang_dir: str) -> k2.Fsa:


 def main():
-    for d in ["data/lang_phone", "data/lang_bpe"]:
-        d = Path(d)
-        logging.info(f"Processing {d}")
+    args = get_args()
+    lang_dir = Path(args.lang_dir)

-        if (d / "HLG.pt").is_file():
-            logging.info(f"{d}/HLG.pt already exists - skipping")
-            continue
+    if (lang_dir / "HLG.pt").is_file():
+        logging.info(f"{lang_dir}/HLG.pt already exists - skipping")
+        return

-        HLG = compile_HLG(d)
-        logging.info(f"Saving HLG.pt to {d}")
-        torch.save(HLG.as_dict(), f"{d}/HLG.pt")
+    logging.info(f"Processing {lang_dir}")
+
+    HLG = compile_HLG(lang_dir)
+    logging.info(f"Saving HLG.pt to {lang_dir}")
+    torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt")


 if __name__ == "__main__":
--- a/egs/librispeech/ASR/local/prepare_lang_bpe.py
+++ b/egs/librispeech/ASR/local/prepare_lang_bpe.py
@ -3,12 +3,13 @@
 # Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)

 """
-This script takes as inputs the following two files:

-    - data/lang_bpe/bpe.model,
-    - data/lang_bpe/words.txt
+This script takes as input `lang_dir`, which should contain:

-and generates the following files in the directory data/lang_bpe:
+    - lang_dir/bpe.model,
+    - lang_dir/words.txt
+
+and generates the following files in the directory `lang_dir`:

    - lexicon.txt
    - lexicon_disambig.txt
@ -17,6 +18,7 @@ and generates the following files in the directory data/lang_bpe:
    - tokens.txt
 """

+import argparse
 from pathlib import Path
 from typing import Dict, List, Tuple

@ -141,8 +143,22 @@ def generate_lexicon(
    return lexicon, token2id


+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        It should contain the bpe.model and words.txt
+        """,
+    )
+
+    return parser.parse_args()
+
+
 def main():
-    lang_dir = Path("data/lang_bpe")
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
    model_file = lang_dir / "bpe.model"

    word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
@ -189,15 +205,6 @@ def main():
    torch.save(L.as_dict(), lang_dir / "L.pt")
    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")

-    if False:
-        # Just for debugging, will remove it
-        L.labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
-        L.aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
-        L_disambig.labels_sym = L.labels_sym
-        L_disambig.aux_labels_sym = L.aux_labels_sym
-        L.draw(lang_dir / "L.svg", title="L")
-        L_disambig.draw(lang_dir / "L_disambig.svg", title="L_disambig")
-

 if __name__ == "__main__":
    main()
--- a/egs/librispeech/ASR/local/train_bpe_model.py
+++ b/egs/librispeech/ASR/local/train_bpe_model.py
@ -1,10 +1,5 @@
 #!/usr/bin/env python3

-"""
-This script takes as input "data/lang/bpe/train.txt"
-and generates "data/lang/bpe/bep.model".
-"""
-
 # You can install sentencepiece via:
 #
 #  pip install sentencepiece
@ -14,17 +9,41 @@ and generates "data/lang/bpe/bep.model".
 #
 # Please install a version >=0.1.96

+import argparse
 import shutil
 from pathlib import Path

 import sentencepiece as spm


+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        It should contain the training corpus: train.txt.
+        The generated bpe.model is saved to this directory.
+        """,
+    )
+    parser.add_argument(
+        "--vocab-size",
+        type=int,
+        help="Vocabulary size for BPE training",
+    )
+
+    return parser.parse_args()
+
+
 def main():
+    args = get_args()
+    vocab_size = args.vocab_size
+    lang_dir = Path(args.lang_dir)
+
    model_type = "unigram"
-    vocab_size = 5000
-    model_prefix = f"data/lang_bpe/{model_type}_{vocab_size}"
-    train_text = "data/lang_bpe/train.txt"
+
+    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
+    train_text = f"{lang_dir}/train.txt"
    character_coverage = 1.0
    input_sentence_size = 100000000

@ -49,10 +68,7 @@ def main():
            eos_id=-1,
        )

-    sp = spm.SentencePieceProcessor(model_file=str(model_file))
-    vocab_size = sp.vocab_size()
-
-    shutil.copyfile(model_file, "data/lang_bpe/bpe.model")
+    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")


 if __name__ == "__main__":
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@ -25,7 +25,7 @@ stop_stage=100
 #        - librispeech-vocab.txt
 #        - librispeech-lexicon.txt
 #
-#  - $do_dir/musan
+#  - $dl_dir/musan
 #      This directory contains the following directories downloaded from
 #       http://www.openslr.org/17/
 #
@ -36,8 +36,15 @@ dl_dir=$PWD/download

 . shared/parse_options.sh || exit 1

+# Vocabulary sizes for the SentencePiece BPE models.
+# For every size listed in the array, this script generates
+# a directory data/lang_bpe_<size>, e.g., data/lang_bpe_5000.
+vocab_sizes=(
+  5000
+)

-# All generated files by this script are saved in "data"
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
 mkdir -p data

 log() {
@ -50,6 +57,7 @@ log "dl_dir: $dl_dir"

 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  log "stage -1: Download LM"
+  [ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
  ./local/download_lm.py --out-dir=$dl_dir/lm
 fi

@ -118,28 +126,34 @@ fi

 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "State 6: Prepare BPE based lang"
-  mkdir -p data/lang_bpe
-  # We reuse words.txt from phone based lexicon
-  # so that the two can share G.pt later.
-  cp data/lang_phone/words.txt data/lang_bpe/

-  if [ ! -f data/lang_bpe/train.txt ]; then
-    log "Generate data for BPE training"
-    files=$(
-      find "data/LibriSpeech/train-clean-100" -name "*.trans.txt"
-      find "data/LibriSpeech/train-clean-360" -name "*.trans.txt"
-      find "data/LibriSpeech/train-other-500" -name "*.trans.txt"
-    )
-    for f in ${files[@]}; do
-      cat $f | cut -d " " -f 2-
-    done > data/lang_bpe/train.txt
-  fi
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    mkdir -p $lang_dir
+    # We reuse words.txt from phone based lexicon
+    # so that the two can share G.pt later.
+    cp data/lang_phone/words.txt $lang_dir

-  python3 ./local/train_bpe_model.py
+    if [ ! -f $lang_dir/train.txt ]; then
+      log "Generate data for BPE training"
+      files=$(
+        find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt"
+        find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt"
+        find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt"
+      )
+      for f in ${files[@]}; do
+        cat $f | cut -d " " -f 2-
+      done > $lang_dir/train.txt
+    fi

-  if [ ! -f data/lang_bpe/L_disambig.pt ]; then
-    ./local/prepare_lang_bpe.py
-  fi
+    ./local/train_bpe_model.py \
+      --lang-dir $lang_dir \
+      --vocab-size $vocab_size
+
+    if [ ! -f $lang_dir/L_disambig.pt ]; then
+      ./local/prepare_lang_bpe.py --lang-dir $lang_dir
+    fi
+  done
 fi

 if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
@ -169,5 +183,12 @@ fi

 if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
  log "Stage 8: Compile HLG"
-  python3 ./local/compile_hlg.py
+  ./local/compile_hlg.py --lang-dir data/lang_phone
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    ./local/compile_hlg.py --lang-dir $lang_dir
+  done
 fi
+
+cd data && ln -sfv lang_bpe_5000 lang_bpe