diff --git a/egs/librispeech/ASR/local/compile_hlg.py b/egs/librispeech/ASR/local/compile_hlg.py
index b30402161..9f28bb74d 100755
--- a/egs/librispeech/ASR/local/compile_hlg.py
+++ b/egs/librispeech/ASR/local/compile_hlg.py
@@ -1,18 +1,18 @@
 #!/usr/bin/env python3
 
 """
-This script compiles HLG from
+This script takes as input lang_dir and generates HLG from
 
-    - H, the ctc topology, built from tokens contained in lexicon.txt
-    - L, the lexicon, built from L_disambig.pt
+    - H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt
+    - L, the lexicon, built from lang_dir/L_disambig.pt
 
         Caution: We use a lexicon that contains disambiguation symbols
 
     - G, the LM, built from data/lm/G_3_gram.fst.txt
 
-The generated HLG is saved in data/lm/HLG.pt (phone based)
-or data/lm/HLG_bpe.pt (BPE based)
+The generated HLG is saved in $lang_dir/HLG.pt
 """
+import argparse
 import logging
 from pathlib import Path
 
@@ -22,11 +22,23 @@ import torch
 from icefall.lexicon import Lexicon
 
 
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        """,
+    )
+
+    return parser.parse_args()
+
+
 def compile_HLG(lang_dir: str) -> k2.Fsa:
     """
     Args:
       lang_dir:
-        The language directory, e.g., data/lang_phone or data/lang_bpe.
+        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
 
     Return:
       An FSA representing HLG.
@@ -104,17 +116,18 @@ def compile_HLG(lang_dir: str) -> k2.Fsa:
 
 
 def main():
-    for d in ["data/lang_phone", "data/lang_bpe"]:
-        d = Path(d)
-        logging.info(f"Processing {d}")
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
 
-        if (d / "HLG.pt").is_file():
-            logging.info(f"{d}/HLG.pt already exists - skipping")
-            continue
+    if (lang_dir / "HLG.pt").is_file():
+        logging.info(f"{lang_dir}/HLG.pt already exists - skipping")
+        return
 
-        HLG = compile_HLG(d)
-        logging.info(f"Saving HLG.pt to {d}")
-        torch.save(HLG.as_dict(), f"{d}/HLG.pt")
+    logging.info(f"Processing {lang_dir}")
+
+    HLG = compile_HLG(lang_dir)
+    logging.info(f"Saving HLG.pt to {lang_dir}")
+    torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt")
 
 
 if __name__ == "__main__":
diff --git a/egs/librispeech/ASR/local/prepare_lang_bpe.py b/egs/librispeech/ASR/local/prepare_lang_bpe.py
index e31220d9b..68b8db966 100755
--- a/egs/librispeech/ASR/local/prepare_lang_bpe.py
+++ b/egs/librispeech/ASR/local/prepare_lang_bpe.py
@@ -3,12 +3,13 @@
 # Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
 
 """
-This script takes as inputs the following two files:
-    - data/lang_bpe/bpe.model,
-    - data/lang_bpe/words.txt
+This script takes as input `lang_dir`, which should contain::
 
-and generates the following files in the directory data/lang_bpe:
+    - lang_dir/bpe.model,
+    - lang_dir/words.txt
+
+and generates the following files in the directory `lang_dir`:
 
     - lexicon.txt
     - lexicon_disambig.txt
@@ -17,6 +18,7 @@ and generates the following files in the directory data/lang_bpe:
     - tokens.txt
 """
 
+import argparse
 from pathlib import Path
 from typing import Dict, List, Tuple
 
@@ -141,8 +143,22 @@ def generate_lexicon(
     return lexicon, token2id
 
 
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        It should contain the bpe.model and words.txt
+        """,
+    )
+
+    return parser.parse_args()
+
+
 def main():
-    lang_dir = Path("data/lang_bpe")
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
     model_file = lang_dir / "bpe.model"
 
     word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
@@ -189,15 +205,6 @@
     torch.save(L.as_dict(), lang_dir / "L.pt")
     torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
 
-    if False:
-        # Just for debugging, will remove it
-        L.labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
-        L.aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
-        L_disambig.labels_sym = L.labels_sym
-        L_disambig.aux_labels_sym = L.aux_labels_sym
-        L.draw(lang_dir / "L.svg", title="L")
-        L_disambig.draw(lang_dir / "L_disambig.svg", title="L_disambig")
-
 
 if __name__ == "__main__":
     main()
diff --git a/egs/librispeech/ASR/local/train_bpe_model.py b/egs/librispeech/ASR/local/train_bpe_model.py
index 59746ad9a..9872a7c6a 100755
--- a/egs/librispeech/ASR/local/train_bpe_model.py
+++ b/egs/librispeech/ASR/local/train_bpe_model.py
@@ -1,10 +1,5 @@
 #!/usr/bin/env python3
 
-"""
-This script takes as input "data/lang/bpe/train.txt"
-and generates "data/lang/bpe/bep.model".
-"""
-
 # You can install sentencepiece via:
 #
 #  pip install sentencepiece
@@ -14,17 +9,41 @@ and generates "data/lang/bpe/bep.model".
 #
 # Please install a version >=0.1.96
 
+import argparse
 import shutil
 from pathlib import Path
 
 import sentencepiece as spm
 
 
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        It should contain the training corpus: train.txt.
+        The generated bpe.model is saved to this directory.
+        """,
+    )
+    parser.add_argument(
+        "--vocab-size",
+        type=int,
+        help="Vocabulary size for BPE training",
+    )
+
+    return parser.parse_args()
+
+
 def main():
+    args = get_args()
+    vocab_size = args.vocab_size
+    lang_dir = Path(args.lang_dir)
+
     model_type = "unigram"
-    vocab_size = 5000
-    model_prefix = f"data/lang_bpe/{model_type}_{vocab_size}"
-    train_text = "data/lang_bpe/train.txt"
+
+    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
+    train_text = f"{lang_dir}/train.txt"
     character_coverage = 1.0
     input_sentence_size = 100000000
 
@@ -49,10 +68,7 @@
         eos_id=-1,
     )
 
-    sp = spm.SentencePieceProcessor(model_file=str(model_file))
-    vocab_size = sp.vocab_size()
-
-    shutil.copyfile(model_file, "data/lang_bpe/bpe.model")
+    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
 
 
 if __name__ == "__main__":
diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index ae676b199..f06e013f6 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -25,7 +25,7 @@ stop_stage=100
 #      - librispeech-vocab.txt
 #      - librispeech-lexicon.txt
 #
-#  - $do_dir/musan
+#  - $dl_dir/musan
 #      This directory contains the following directories downloaded from
 #       http://www.openslr.org/17/
 #
@@ -36,8 +36,15 @@ dl_dir=$PWD/download
 
 . shared/parse_options.sh || exit 1
 
+# vocab sizes for sentencepiece models.
+# It will generate data/lang_bpe_xxx and data/lang_bpe_yyy
+# if the array contains xxx and yyy.
+vocab_sizes=(
+  5000
+)
 
-# All generated files by this script are saved in "data"
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
 mkdir -p data
 
 log() {
@@ -50,6 +57,7 @@ log "dl_dir: $dl_dir"
 
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
   log "stage -1: Download LM"
+  [ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
   ./local/download_lm.py --out-dir=$dl_dir/lm
 fi
 
@@ -118,28 +126,34 @@ fi
 
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "State 6: Prepare BPE based lang"
-  mkdir -p data/lang_bpe
-  # We reuse words.txt from phone based lexicon
-  # so that the two can share G.pt later.
-  cp data/lang_phone/words.txt data/lang_bpe/
-
-  if [ ! -f data/lang_bpe/train.txt ]; then
-    log "Generate data for BPE training"
-    files=$(
-      find "data/LibriSpeech/train-clean-100" -name "*.trans.txt"
-      find "data/LibriSpeech/train-clean-360" -name "*.trans.txt"
-      find "data/LibriSpeech/train-other-500" -name "*.trans.txt"
-    )
-    for f in ${files[@]}; do
-      cat $f | cut -d " " -f 2-
-    done > data/lang_bpe/train.txt
-  fi
+  log "Stage 6: Prepare BPE based lang"
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    mkdir -p $lang_dir
+    # We reuse words.txt from phone based lexicon
+    # so that the two can share G.pt later.
+    cp data/lang_phone/words.txt $lang_dir
 
-  python3 ./local/train_bpe_model.py
+    if [ ! -f $lang_dir/train.txt ]; then
+      log "Generate data for BPE training"
+      files=$(
+        find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt"
+        find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt"
+        find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt"
+      )
+      for f in ${files[@]}; do
+        cat $f | cut -d " " -f 2-
+      done > $lang_dir/train.txt
+    fi
 
-  if [ ! -f data/lang_bpe/L_disambig.pt ]; then
-    ./local/prepare_lang_bpe.py
-  fi
+    ./local/train_bpe_model.py \
+      --lang-dir $lang_dir \
+      --vocab-size $vocab_size
+
+    if [ ! -f $lang_dir/L_disambig.pt ]; then
+      ./local/prepare_lang_bpe.py --lang-dir $lang_dir
+    fi
+  done
 fi
 
 if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
@@ -169,5 +183,12 @@ fi
 
 if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
   log "Stage 8: Compile HLG"
-  python3 ./local/compile_hlg.py
+  ./local/compile_hlg.py --lang-dir data/lang_phone
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    ./local/compile_hlg.py --lang-dir $lang_dir
+  done
 fi
+
+cd data && ln -sfv lang_bpe_5000 lang_bpe
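
Usage sketch (not part of the patch; the paths and the 5000 vocab size mirror the recipe defaults above): after this change each script operates on one lang dir, so for a single vocab size the three steps can also be run by hand, e.g.

    vocab_size=5000
    lang_dir=data/lang_bpe_${vocab_size}

    ./local/train_bpe_model.py --lang-dir $lang_dir --vocab-size $vocab_size
    ./local/prepare_lang_bpe.py --lang-dir $lang_dir
    ./local/compile_hlg.py --lang-dir $lang_dir

The trailing "ln -sfv lang_bpe_5000 lang_bpe" keeps data/lang_bpe as an alias for the default 5000-token directory, so that existing consumers of the old path keep working.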