From 12a2fd023ed1c0da59873f70ac5c644c28d56e9b Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Thu, 12 Aug 2021 12:44:04 +0800
Subject: [PATCH 1/3] Add doc about installation and usage (#7)

* Add readme.
* Add TOC.
* fix typos
* Minor fixes after review.
---
 README.md                                   |  61 +++++++-
 egs/librispeech/ASR/README.md               | 139 ++++++------
 egs/librispeech/ASR/conformer_ctc/train.py  |  26 +++-
 .../ASR/conformer_ctc/transformer.py        |  19 +--
 egs/librispeech/ASR/tdnn_lstm_ctc/README.md |  22 +--
 requirements.txt                            |   1 +
 6 files changed, 134 insertions(+), 134 deletions(-)

diff --git a/README.md b/README.md
index 9ffd34b6d..91c1f67a9 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,60 @@
-Working in progress.
+
+# Table of Contents
+
+- [Installation](#installation)
+  * [Install k2](#install-k2)
+  * [Install lhotse](#install-lhotse)
+  * [Install icefall](#install-icefall)
+- [Run recipes](#run-recipes)
+
+## Installation
+
+`icefall` depends on [k2][k2] for FSA operations and [lhotse][lhotse] for
+data preparation. To use `icefall`, you have to install its dependencies first.
+The following subsections describe how to set up the environment.
+
+CAUTION: There are various ways to set up the environment. What we describe
+here is just one of them.
+
+### Install k2
+
+Please refer to [k2's installation documentation][k2-install] to install k2.
+If you have any issues installing k2, please open an issue at
+<https://github.com/k2-fsa/k2/issues>.
+
+### Install lhotse
+
+Please refer to [lhotse's installation documentation][lhotse-install] to install
+lhotse.
+
+### Install icefall
+
+`icefall` is a set of Python scripts. All you need to do is set
+the environment variable `PYTHONPATH`:
+
+```bash
+cd $HOME/open-source
+git clone https://github.com/k2-fsa/icefall
+cd icefall
+pip install -r requirements.txt
+export PYTHONPATH=$HOME/open-source/icefall:$PYTHONPATH
+```
+
+To verify `icefall` was installed successfully, you can run:
+
+```bash
+python3 -c "import icefall; print(icefall.__file__)"
+```
+
+It should print the path to `icefall`.
+
+## Run recipes
+
+At present, only the LibriSpeech recipe is provided. Please
+follow [egs/librispeech/ASR/README.md][LibriSpeech] to run it.
+
+[LibriSpeech]: egs/librispeech/ASR/README.md
+[k2-install]: https://k2.readthedocs.io/en/latest/installation/index.html#
+[k2]: https://github.com/k2-fsa/k2
+[lhotse]: https://github.com/lhotse-speech/lhotse
+[lhotse-install]: https://lhotse.readthedocs.io/en/latest/getting-started.html#installation
diff --git a/egs/librispeech/ASR/README.md b/egs/librispeech/ASR/README.md
index 45c9ef4de..30778ed05 100644
--- a/egs/librispeech/ASR/README.md
+++ b/egs/librispeech/ASR/README.md
@@ -1,121 +1,64 @@
-Run `./prepare.sh` to prepare the data.
+## Data preparation

-Run `./xxx_train.py` (to be added) to train a model.
-
-## Conformer-CTC
-Results of the pre-trained model from
-``
-are given below
-
-### HLG - no LM rescoring
-
-(output beam size is 8)
-
-#### 1-best decoding
+If you want to use `./prepare.sh` to download everything for you,
+you can just run

```
-[test-clean-no_rescore] %WER 3.15% [1656 / 52576, 127 ins, 377 del, 1152 sub ]
-[test-other-no_rescore] %WER 7.03% [3682 / 52343, 220 ins, 1024 del, 2438 sub ]
+./prepare.sh
```

-#### n-best decoding
-
-For n=100,
+If you have pre-downloaded the LibriSpeech dataset, please
+read `./prepare.sh` and modify it to point to the location
+of your dataset so that it won't re-download it. After modification,
+please run

```
-[test-clean-no_rescore-100] %WER 3.15% [1656 / 52576, 127 ins, 377 del, 1152 sub ]
-[test-other-no_rescore-100] %WER 7.14% [3737 / 52343, 275 ins, 1020 del, 2442 sub ]
+./prepare.sh
```

-For n=200,
+The script `./prepare.sh` prepares features, lexicon, LMs, etc.
+All generated files are saved in the folder `./data`.
+
+**HINT:** `./prepare.sh` supports options `--stage` and `--stop-stage`.
+
+## TDNN-LSTM CTC training
+
+The folder `tdnn_lstm_ctc` contains scripts for CTC training
+with TDNN-LSTM models.
+
+Pre-configured parameters for training and decoding are set in the function
+`get_params()` within `tdnn_lstm_ctc/train.py`
+and `tdnn_lstm_ctc/decode.py`.
+
+Parameters that can be passed from the command line can be found by running

```
-[test-clean-no_rescore-200] %WER 3.16% [1660 / 52576, 125 ins, 378 del, 1157 sub ]
-[test-other-no_rescore-200] %WER 7.04% [3684 / 52343, 228 ins, 1012 del, 2444 sub ]
+./tdnn_lstm_ctc/train.py --help
+./tdnn_lstm_ctc/decode.py --help
```

-### HLG - with LM rescoring
-
-#### Whole lattice rescoring
+If you have 4 GPUs on a machine and want to use GPUs 0, 2 and 3 for
+multi-GPU training, you can run

```
-[test-clean-lm_scale_0.8] %WER 2.77% [1456 / 52576, 150 ins, 210 del, 1096 sub ]
-[test-other-lm_scale_0.8] %WER 6.23% [3262 / 52343, 246 ins, 635 del, 2381 sub ]
+export CUDA_VISIBLE_DEVICES="0,2,3"
+./tdnn_lstm_ctc/train.py \
+  --master-port 12345 \
+  --world-size 3
```

-WERs of different LM scales are:
+If you want to decode by averaging checkpoints `epoch-8.pt`,
+`epoch-9.pt` and `epoch-10.pt`, you can run

```
-For test-clean, WER of different settings are:
-lm_scale_0.8 2.77 best for test-clean
-lm_scale_0.9 2.87
-lm_scale_1.0 3.06
-lm_scale_1.1 3.34
-lm_scale_1.2 3.71
-lm_scale_1.3 4.18
-lm_scale_1.4 4.8
-lm_scale_1.5 5.48
-lm_scale_1.6 6.08
-lm_scale_1.7 6.79
-lm_scale_1.8 7.49
-lm_scale_1.9 8.14
-lm_scale_2.0 8.82
-
-For test-other, WER of different settings are:
-lm_scale_0.8 6.23 best for test-other
-lm_scale_0.9 6.37
-lm_scale_1.0 6.62
-lm_scale_1.1 6.99
-lm_scale_1.2 7.46
-lm_scale_1.3 8.13
-lm_scale_1.4 8.84
-lm_scale_1.5 9.61
-lm_scale_1.6 10.32
-lm_scale_1.7 11.17
-lm_scale_1.8 12.12
-lm_scale_1.9 12.93
-lm_scale_2.0 13.77
+./tdnn_lstm_ctc/decode.py \
+  --epoch 10 \
+  --avg 3
```

-#### n-best LM rescoring
+## Conformer CTC training

-n = 100
-
-```
-[test-clean-lm_scale_0.8] %WER 2.79% [1469 / 52576, 149 ins, 212 del, 1108 sub ]
-[test-other-lm_scale_0.8] %WER 6.36% [3329 / 52343, 259 ins, 666 del, 2404 sub ]
-```
-
-WERs of different LM scales are:
-
-```
-For test-clean, WER of different settings are:
-lm_scale_0.8 2.79 best for test-clean
-lm_scale_0.9 2.89
-lm_scale_1.0 3.03
-lm_scale_1.1 3.28
-lm_scale_1.2 3.52
-lm_scale_1.3 3.78
-lm_scale_1.4 4.04
-lm_scale_1.5 4.24
-lm_scale_1.6 4.45
-lm_scale_1.7 4.58
-lm_scale_1.8 4.7
-lm_scale_1.9 4.8
-lm_scale_2.0 4.92
-For test-other, WER of different settings are:
-lm_scale_0.8 6.36 best for test-other
-lm_scale_0.9 6.45
-lm_scale_1.0 6.64
-lm_scale_1.1 6.92
-lm_scale_1.2 7.25
-lm_scale_1.3 7.59
-lm_scale_1.4 7.88
-lm_scale_1.5 8.13
-lm_scale_1.6 8.36
-lm_scale_1.7 8.54
-lm_scale_1.8 8.71
-lm_scale_1.9 8.88
-lm_scale_2.0 9.02
-```
+The folder `conformer_ctc` contains scripts for CTC training
+with conformer models. The steps for running training and
+decoding are similar to those for `tdnn_lstm_ctc`.
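For intuition about the `--avg 3` option mentioned in the README above: decoding combines the parameters of the last few checkpoints into a single averaged model. A minimal sketch of such checkpoint averaging is shown below; it assumes each `epoch-*.pt` file stores the weights under a `"model"` key and uses an illustrative experiment directory, so the exact layout used by the icefall scripts may differ.

```python
# Sketch of checkpoint averaging (illustrative only).
# Assumption: each checkpoint is a dict with a "model" entry holding a state_dict;
# the real decode.py may load and combine checkpoints differently.
import torch


def average_checkpoints(filenames):
    n = len(filenames)
    # Load the first checkpoint on the CPU and accumulate the rest into it.
    avg = torch.load(filenames[0], map_location="cpu")["model"]
    for f in filenames[1:]:
        state = torch.load(f, map_location="cpu")["model"]
        for k in avg:
            avg[k] += state[k]
    for k in avg:
        if avg[k].is_floating_point():
            avg[k] /= n
        else:
            # Integer buffers (e.g., counters) are floor-divided.
            avg[k] //= n
    return avg


# Example: the checkpoints selected by "--epoch 10 --avg 3"
# (the exp directory name here is hypothetical).
averaged = average_checkpoints(
    [f"tdnn_lstm_ctc/exp/epoch-{i}.pt" for i in (8, 9, 10)]
)
```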
diff --git a/egs/librispeech/ASR/conformer_ctc/train.py b/egs/librispeech/ASR/conformer_ctc/train.py index 552db81ec..645757ebc 100755 --- a/egs/librispeech/ASR/conformer_ctc/train.py +++ b/egs/librispeech/ASR/conformer_ctc/train.py @@ -16,6 +16,7 @@ import torch.nn as nn from conformer import Conformer from lhotse.utils import fix_random_seed from torch.nn.parallel import DistributedDataParallel as DDP +from torch.nn.utils import clip_grad_norm_ from torch.utils.tensorboard import SummaryWriter from transformer import Noam @@ -114,7 +115,9 @@ def get_params() -> AttributeDict: - log_interval: Print training loss if batch_idx % log_interval` is 0 - - valid_interval: Run validation if batch_idx % valid_interval` is 0 + - valid_interval: Run validation if batch_idx % valid_interval is 0 + + - reset_interval: Reset statistics if batch_idx % reset_interval is 0 - beam_size: It is used in k2.ctc_loss @@ -124,19 +127,20 @@ def get_params() -> AttributeDict: """ params = AttributeDict( { - "exp_dir": Path("conformer_ctc/exp"), + "exp_dir": Path("conformer_ctc/exp_new"), "lang_dir": Path("data/lang_bpe"), "feature_dim": 80, - "weight_decay": 0.0, + "weight_decay": 1e-6, "subsampling_factor": 4, "start_epoch": 0, - "num_epochs": 50, + "num_epochs": 20, "best_train_loss": float("inf"), "best_valid_loss": float("inf"), "best_train_epoch": -1, "best_valid_epoch": -1, "batch_idx_train": 0, "log_interval": 10, + "reset_interval": 200, "valid_interval": 3000, "beam_size": 10, "reduction": "sum", @@ -440,6 +444,8 @@ def train_one_epoch( tot_att_loss = 0.0 tot_frames = 0.0 # sum of frames over all batches + params.tot_loss = 0.0 + params.tot_frames = 0.0 for batch_idx, batch in enumerate(train_dl): params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) @@ -457,6 +463,7 @@ def train_one_epoch( optimizer.zero_grad() loss.backward() + clip_grad_norm_(model.parameters(), 5.0, 2.0) optimizer.step() loss_cpu = loss.detach().cpu().item() @@ -468,6 +475,9 @@ def train_one_epoch( tot_ctc_loss += ctc_loss_cpu tot_att_loss += att_loss_cpu + params.tot_frames += params.train_frames + params.tot_loss += loss_cpu + tot_avg_loss = tot_loss / tot_frames tot_avg_ctc_loss = tot_ctc_loss / tot_frames tot_avg_att_loss = tot_att_loss / tot_frames @@ -516,6 +526,12 @@ def train_one_epoch( tot_avg_loss, params.batch_idx_train, ) + if batch_idx > 0 and batch_idx % params.reset_interval == 0: + tot_loss = 0.0 # sum of losses over all batches + tot_ctc_loss = 0.0 + tot_att_loss = 0.0 + + tot_frames = 0.0 # sum of frames over all batches if batch_idx > 0 and batch_idx % params.valid_interval == 0: compute_validation_loss( @@ -551,7 +567,7 @@ def train_one_epoch( params.batch_idx_train, ) - params.train_loss = tot_loss / tot_frames + params.train_loss = params.tot_loss / params.tot_frames if params.train_loss < params.best_train_loss: params.best_train_epoch = params.cur_epoch diff --git a/egs/librispeech/ASR/conformer_ctc/transformer.py b/egs/librispeech/ASR/conformer_ctc/transformer.py index a974be4e0..51c77b220 100644 --- a/egs/librispeech/ASR/conformer_ctc/transformer.py +++ b/egs/librispeech/ASR/conformer_ctc/transformer.py @@ -4,12 +4,9 @@ import math from typing import Dict, List, Optional, Tuple -import k2 import torch import torch.nn as nn from subsampling import Conv2dSubsampling, VggSubsampling - -from icefall.utils import get_texts from torch.nn.utils.rnn import pad_sequence # Note: TorchScript requires Dict/List/etc. to be fully typed. 
@@ -274,9 +271,11 @@ class Transformer(nn.Module): device ) - # TODO: Use eos_id as ignore_id. - # tgt_key_padding_mask = decoder_padding_mask(ys_in_pad, ignore_id=eos_id) - tgt_key_padding_mask = decoder_padding_mask(ys_in_pad) + tgt_key_padding_mask = decoder_padding_mask(ys_in_pad, ignore_id=eos_id) + # TODO: Use length information to create the decoder padding mask + # We set the first column to False since the first column in ys_in_pad + # contains sos_id, which is the same as eos_id in our current setting. + tgt_key_padding_mask[:, 0] = False tgt = self.decoder_embed(ys_in_pad) # (N, T) -> (N, T, C) tgt = self.decoder_pos(tgt) @@ -339,9 +338,11 @@ class Transformer(nn.Module): device ) - # TODO: Use eos_id as ignore_id. - # tgt_key_padding_mask = decoder_padding_mask(ys_in_pad, ignore_id=eos_id) - tgt_key_padding_mask = decoder_padding_mask(ys_in_pad) + tgt_key_padding_mask = decoder_padding_mask(ys_in_pad, ignore_id=eos_id) + # TODO: Use length information to create the decoder padding mask + # We set the first column to False since the first column in ys_in_pad + # contains sos_id, which is the same as eos_id in our current setting. + tgt_key_padding_mask[:, 0] = False tgt = self.decoder_embed(ys_in_pad) # (B, T) -> (B, T, F) tgt = self.decoder_pos(tgt) diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/README.md b/egs/librispeech/ASR/tdnn_lstm_ctc/README.md index 401f3e319..df98a0e11 100644 --- a/egs/librispeech/ASR/tdnn_lstm_ctc/README.md +++ b/egs/librispeech/ASR/tdnn_lstm_ctc/README.md @@ -1,22 +1,2 @@ -## (To be filled in) -It will contain: - -- How to run -- WERs - -```bash -cd $PWD/.. - -./prepare.sh - -./tdnn_lstm_ctc/train.py -``` - -If you have 4 GPUs and want to use GPU 1 and GPU 3 for DDP training, -you can do the following: - -``` -export CUDA_VISIBLE_DEVICES="1,3" -./tdnn_lstm_ctc/train.py --world-size=2 -``` +Will add results later. diff --git a/requirements.txt b/requirements.txt index a54edf118..710048fed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ kaldilm kaldialign sentencepiece>=0.1.96 +tensorboard From 1c3b13c7eb7502be417c32ddb6bde64f48c241c5 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 16 Aug 2021 19:01:25 +0800 Subject: [PATCH 2/3] Minor fixes. 
(#9) --- egs/librispeech/ASR/local/compile_hlg.py | 43 +++++++----- egs/librispeech/ASR/local/prepare_lang_bpe.py | 35 ++++++---- egs/librispeech/ASR/local/train_bpe_model.py | 40 ++++++++---- egs/librispeech/ASR/prepare.sh | 65 ++++++++++++------- 4 files changed, 120 insertions(+), 63 deletions(-) diff --git a/egs/librispeech/ASR/local/compile_hlg.py b/egs/librispeech/ASR/local/compile_hlg.py index b30402161..9f28bb74d 100755 --- a/egs/librispeech/ASR/local/compile_hlg.py +++ b/egs/librispeech/ASR/local/compile_hlg.py @@ -1,18 +1,18 @@ #!/usr/bin/env python3 """ -This script compiles HLG from +This script takes as input lang_dir and generates HLG from - - H, the ctc topology, built from tokens contained in lexicon.txt - - L, the lexicon, built from L_disambig.pt + - H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt + - L, the lexicon, built from lang_dir/L_disambig.pt Caution: We use a lexicon that contains disambiguation symbols - G, the LM, built from data/lm/G_3_gram.fst.txt -The generated HLG is saved in data/lm/HLG.pt (phone based) -or data/lm/HLG_bpe.pt (BPE based) +The generated HLG is saved in $lang_dir/HLG.pt """ +import argparse import logging from pathlib import Path @@ -22,11 +22,23 @@ import torch from icefall.lexicon import Lexicon +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + """, + ) + + return parser.parse_args() + + def compile_HLG(lang_dir: str) -> k2.Fsa: """ Args: lang_dir: - The language directory, e.g., data/lang_phone or data/lang_bpe. + The language directory, e.g., data/lang_phone or data/lang_bpe_5000. Return: An FSA representing HLG. @@ -104,17 +116,18 @@ def compile_HLG(lang_dir: str) -> k2.Fsa: def main(): - for d in ["data/lang_phone", "data/lang_bpe"]: - d = Path(d) - logging.info(f"Processing {d}") + args = get_args() + lang_dir = Path(args.lang_dir) - if (d / "HLG.pt").is_file(): - logging.info(f"{d}/HLG.pt already exists - skipping") - continue + if (lang_dir / "HLG.pt").is_file(): + logging.info(f"{lang_dir}/HLG.pt already exists - skipping") + return - HLG = compile_HLG(d) - logging.info(f"Saving HLG.pt to {d}") - torch.save(HLG.as_dict(), f"{d}/HLG.pt") + logging.info(f"Processing {lang_dir}") + + HLG = compile_HLG(lang_dir) + logging.info(f"Saving HLG.pt to {lang_dir}") + torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt") if __name__ == "__main__": diff --git a/egs/librispeech/ASR/local/prepare_lang_bpe.py b/egs/librispeech/ASR/local/prepare_lang_bpe.py index e31220d9b..68b8db966 100755 --- a/egs/librispeech/ASR/local/prepare_lang_bpe.py +++ b/egs/librispeech/ASR/local/prepare_lang_bpe.py @@ -3,12 +3,13 @@ # Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang) """ -This script takes as inputs the following two files: - - data/lang_bpe/bpe.model, - - data/lang_bpe/words.txt +This script takes as input `lang_dir`, which should contain:: -and generates the following files in the directory data/lang_bpe: + - lang_dir/bpe.model, + - lang_dir/words.txt + +and generates the following files in the directory `lang_dir`: - lexicon.txt - lexicon_disambig.txt @@ -17,6 +18,7 @@ and generates the following files in the directory data/lang_bpe: - tokens.txt """ +import argparse from pathlib import Path from typing import Dict, List, Tuple @@ -141,8 +143,22 @@ def generate_lexicon( return lexicon, token2id +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output 
directory. + It should contain the bpe.model and words.txt + """, + ) + + return parser.parse_args() + + def main(): - lang_dir = Path("data/lang_bpe") + args = get_args() + lang_dir = Path(args.lang_dir) model_file = lang_dir / "bpe.model" word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt") @@ -189,15 +205,6 @@ def main(): torch.save(L.as_dict(), lang_dir / "L.pt") torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt") - if False: - # Just for debugging, will remove it - L.labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt") - L.aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt") - L_disambig.labels_sym = L.labels_sym - L_disambig.aux_labels_sym = L.aux_labels_sym - L.draw(lang_dir / "L.svg", title="L") - L_disambig.draw(lang_dir / "L_disambig.svg", title="L_disambig") - if __name__ == "__main__": main() diff --git a/egs/librispeech/ASR/local/train_bpe_model.py b/egs/librispeech/ASR/local/train_bpe_model.py index 59746ad9a..9872a7c6a 100755 --- a/egs/librispeech/ASR/local/train_bpe_model.py +++ b/egs/librispeech/ASR/local/train_bpe_model.py @@ -1,10 +1,5 @@ #!/usr/bin/env python3 -""" -This script takes as input "data/lang/bpe/train.txt" -and generates "data/lang/bpe/bep.model". -""" - # You can install sentencepiece via: # # pip install sentencepiece @@ -14,17 +9,41 @@ and generates "data/lang/bpe/bep.model". # # Please install a version >=0.1.96 +import argparse import shutil from pathlib import Path import sentencepiece as spm +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + It should contain the training corpus: train.txt. + The generated bpe.model is saved to this directory. + """, + ) + parser.add_argument( + "--vocab-size", + type=int, + help="Vocabulary size for BPE training", + ) + + return parser.parse_args() + + def main(): + args = get_args() + vocab_size = args.vocab_size + lang_dir = Path(args.lang_dir) + model_type = "unigram" - vocab_size = 5000 - model_prefix = f"data/lang_bpe/{model_type}_{vocab_size}" - train_text = "data/lang_bpe/train.txt" + + model_prefix = f"{lang_dir}/{model_type}_{vocab_size}" + train_text = f"{lang_dir}/train.txt" character_coverage = 1.0 input_sentence_size = 100000000 @@ -49,10 +68,7 @@ def main(): eos_id=-1, ) - sp = spm.SentencePieceProcessor(model_file=str(model_file)) - vocab_size = sp.vocab_size() - - shutil.copyfile(model_file, "data/lang_bpe/bpe.model") + shutil.copyfile(model_file, f"{lang_dir}/bpe.model") if __name__ == "__main__": diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh index ae676b199..f06e013f6 100755 --- a/egs/librispeech/ASR/prepare.sh +++ b/egs/librispeech/ASR/prepare.sh @@ -25,7 +25,7 @@ stop_stage=100 # - librispeech-vocab.txt # - librispeech-lexicon.txt # -# - $do_dir/musan +# - $dl_dir/musan # This directory contains the following directories downloaded from # http://www.openslr.org/17/ # @@ -36,8 +36,15 @@ dl_dir=$PWD/download . shared/parse_options.sh || exit 1 +# vocab size for sentence piece models. +# It will generate data/lang_bpe_xxx, +# data/lang_bpe_yyy if the array contains xxx, yyy +vocab_sizes=( + 5000 +) -# All generated files by this script are saved in "data" +# All files generated by this script are saved in "data". +# You can safely remove "data" and rerun this script to regenerate it. mkdir -p data log() { @@ -50,6 +57,7 @@ log "dl_dir: $dl_dir" if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then log "stage -1: Download LM" + [ ! 
-e $dl_dir/lm ] && mkdir -p $dl_dir/lm ./local/download_lm.py --out-dir=$dl_dir/lm fi @@ -118,28 +126,34 @@ fi if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then log "State 6: Prepare BPE based lang" - mkdir -p data/lang_bpe - # We reuse words.txt from phone based lexicon - # so that the two can share G.pt later. - cp data/lang_phone/words.txt data/lang_bpe/ - if [ ! -f data/lang_bpe/train.txt ]; then - log "Generate data for BPE training" - files=$( - find "data/LibriSpeech/train-clean-100" -name "*.trans.txt" - find "data/LibriSpeech/train-clean-360" -name "*.trans.txt" - find "data/LibriSpeech/train-other-500" -name "*.trans.txt" - ) - for f in ${files[@]}; do - cat $f | cut -d " " -f 2- - done > data/lang_bpe/train.txt - fi + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/lang_bpe_${vocab_size} + mkdir -p $lang_dir + # We reuse words.txt from phone based lexicon + # so that the two can share G.pt later. + cp data/lang_phone/words.txt $lang_dir - python3 ./local/train_bpe_model.py + if [ ! -f $lang_dir/train.txt ]; then + log "Generate data for BPE training" + files=$( + find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt" + find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt" + find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt" + ) + for f in ${files[@]}; do + cat $f | cut -d " " -f 2- + done > $lang_dir/train.txt + fi - if [ ! -f data/lang_bpe/L_disambig.pt ]; then - ./local/prepare_lang_bpe.py - fi + ./local/train_bpe_model.py \ + --lang-dir $lang_dir \ + --vocab-size $vocab_size + + if [ ! -f $lang_dir/L_disambig.pt ]; then + ./local/prepare_lang_bpe.py --lang-dir $lang_dir + fi + done fi if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then @@ -169,5 +183,12 @@ fi if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then log "Stage 8: Compile HLG" - python3 ./local/compile_hlg.py + ./local/compile_hlg.py --lang-dir data/lang_phone + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/lang_bpe_${vocab_size} + ./local/compile_hlg.py --lang-dir $lang_dir + done fi + +cd data && ln -sfv lang_bpe_5000 lang_bpe From caa0b9e9425af27e0c6211048acb55a76ed5d315 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 19 Aug 2021 14:54:01 +0800 Subject: [PATCH 3/3] Fix an error in displaying decoding process. 
(#12) --- egs/librispeech/ASR/conformer_ctc/decode.py | 11 +++++------ egs/librispeech/ASR/tdnn_lstm_ctc/decode.py | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/egs/librispeech/ASR/conformer_ctc/decode.py b/egs/librispeech/ASR/conformer_ctc/decode.py index 889a0a474..0722cd582 100755 --- a/egs/librispeech/ASR/conformer_ctc/decode.py +++ b/egs/librispeech/ASR/conformer_ctc/decode.py @@ -284,7 +284,6 @@ def decode_dataset( results = [] num_cuts = 0 - tot_num_cuts = len(dl.dataset.cuts) results = defaultdict(list) for batch_idx, batch in enumerate(dl): @@ -314,9 +313,7 @@ def decode_dataset( if batch_idx % 100 == 0: logging.info( - f"batch {batch_idx}, cuts processed until now is " - f"{num_cuts}/{tot_num_cuts} " - f"({float(num_cuts)/tot_num_cuts*100:.6f}%)" + f"batch {batch_idx}, cuts processed until now is {num_cuts}" ) return results @@ -399,7 +396,9 @@ def main(): sos_id = graph_compiler.sos_id eos_id = graph_compiler.eos_id - HLG = k2.Fsa.from_dict(torch.load(f"{params.lang_dir}/HLG.pt")) + HLG = k2.Fsa.from_dict( + torch.load(f"{params.lang_dir}/HLG.pt", map_location="cpu") + ) HLG = HLG.to(device) assert HLG.requires_grad is False @@ -430,7 +429,7 @@ def main(): torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt") else: logging.info("Loading pre-compiled G_4_gram.pt") - d = torch.load(params.lm_dir / "G_4_gram.pt") + d = torch.load(params.lm_dir / "G_4_gram.pt", map_location="cpu") G = k2.Fsa.from_dict(d).to(device) if params.method in ["whole-lattice-rescoring", "attention-decoder"]: diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/decode.py b/egs/librispeech/ASR/tdnn_lstm_ctc/decode.py index 137fa795c..9a1aad579 100755 --- a/egs/librispeech/ASR/tdnn_lstm_ctc/decode.py +++ b/egs/librispeech/ASR/tdnn_lstm_ctc/decode.py @@ -236,7 +236,6 @@ def decode_dataset( results = [] num_cuts = 0 - tot_num_cuts = len(dl.dataset.cuts) results = defaultdict(list) for batch_idx, batch in enumerate(dl): @@ -264,9 +263,7 @@ def decode_dataset( if batch_idx % 100 == 0: logging.info( - f"batch {batch_idx}, cuts processed until now is " - f"{num_cuts}/{tot_num_cuts} " - f"({float(num_cuts)/tot_num_cuts*100:.6f}%)" + f"batch {batch_idx}, cuts processed until now is {num_cuts}" ) return results @@ -328,7 +325,9 @@ def main(): logging.info(f"device: {device}") - HLG = k2.Fsa.from_dict(torch.load("data/lang_phone/HLG.pt")) + HLG = k2.Fsa.from_dict( + torch.load("data/lang_phone/HLG.pt", map_location="cpu") + ) HLG = HLG.to(device) assert HLG.requires_grad is False @@ -355,7 +354,7 @@ def main(): torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt") else: logging.info("Loading pre-compiled G_4_gram.pt") - d = torch.load(params.lm_dir / "G_4_gram.pt") + d = torch.load(params.lm_dir / "G_4_gram.pt", map_location="cpu") G = k2.Fsa.from_dict(d).to(device) if params.method == "whole-lattice-rescoring":
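A note on the `map_location="cpu"` additions in this last patch: `torch.save` records the device of each tensor, so a checkpoint or FSA saved from a GPU would otherwise be deserialized back onto that same GPU, which can fail or waste memory on a machine with a different GPU layout. The pattern used above, deserializing on the CPU first and then moving to the chosen device, can be sketched as follows (the file path is just an example and assumes `compile_hlg.py` has already produced `data/lang_phone/HLG.pt`):

```python
# Illustrative sketch of the "load on CPU, then move" pattern used in decode.py.
import k2
import torch

device = torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu")

# Deserialize on the CPU regardless of the device the file was saved from ...
d = torch.load("data/lang_phone/HLG.pt", map_location="cpu")
# ... then place the FSA on the device actually used for decoding.
HLG = k2.Fsa.from_dict(d).to(device)
```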