mirror of https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00

WIP: Add BPE training code.

parent bd69e4be32
commit acc63a9172

egs/librispeech/ASR/conformer_ctc/train.py (new executable file, 602 lines)
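The new training script is driven by argparse; only --world-size, --master-port, and --tensorboard are exposed so far, and DDP support is still a TODO in the code below. A typical single-GPU invocation would presumably be `./conformer_ctc/train.py --world-size 1` from inside egs/librispeech/ASR (the invocation itself is not shown in the commit).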
@@ -0,0 +1,602 @@
#!/usr/bin/env python3

# This is just at the very beginning ...

import argparse
import logging
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, Optional

import k2
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from conformer import Conformer
from transformer import Noam

from lhotse.utils import fix_random_seed
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.utils import clip_grad_value_
from torch.optim.lr_scheduler import StepLR
from torch.utils.tensorboard import SummaryWriter

from icefall.checkpoint import load_checkpoint
from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
from icefall.dataset.librispeech import LibriSpeechAsrDataModule
from icefall.dist import cleanup_dist, setup_dist
from icefall.bpe_graph_compiler import BpeCtcTrainingGraphCompiler
from icefall.lexicon import Lexicon
from icefall.utils import (
    AttributeDict,
    encode_supervisions,
    setup_logger,
    str2bool,
)


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--world-size",
        type=int,
        default=1,
        help="Number of GPUs for DDP training.",
    )

    parser.add_argument(
        "--master-port",
        type=int,
        default=12354,
        help="Master port to use for DDP training.",
    )

    parser.add_argument(
        "--tensorboard",
        type=str2bool,
        default=True,
        help="Should various information be logged in tensorboard.",
    )

    # TODO: add extra arguments and support DDP training.
    # Currently, only single GPU training is implemented. Will add
    # DDP training once single GPU training is finished.
    return parser


def get_params() -> AttributeDict:
    """Return a dict containing training parameters.

    All training related parameters that are not passed from the commandline
    are saved in the variable `params`.

    Commandline options are merged into `params` after they are parsed, so
    you can also access them via `params`.

    Explanation of options saved in `params`:

      - exp_dir: It specifies the directory where all training related
                 files, e.g., checkpoints, logs, etc., are saved

      - lang_dir: It contains language related input files such as
                  "lexicon.txt"

      - lr: It specifies the initial learning rate

      - feature_dim: The model input dim. It has to match the one used
                     in computing features.

      - weight_decay: The weight decay for the optimizer.

      - subsampling_factor: The subsampling factor for the model.

      - start_epoch: If it is not zero, load checkpoint `start_epoch-1`
                     and continue training from that checkpoint.

      - num_epochs: Number of epochs to train.

      - best_train_loss: Best training loss so far. It is used to select
                         the model that has the lowest training loss. It is
                         updated during the training.

      - best_valid_loss: Best validation loss so far. It is used to select
                         the model that has the lowest validation loss. It is
                         updated during the training.

      - best_train_epoch: It is the epoch that has the best training loss.

      - best_valid_epoch: It is the epoch that has the best validation loss.

      - batch_idx_train: Used for writing statistics to tensorboard. It
                         contains the number of batches trained so far across
                         epochs.

      - log_interval: Print training loss if `batch_idx % log_interval` is 0

      - valid_interval: Run validation if `batch_idx % valid_interval` is 0

      - beam_size: It is used in k2.ctc_loss

      - reduction: It is used in k2.ctc_loss

      - use_double_scores: It is used in k2.ctc_loss
    """
    params = AttributeDict(
        {
            "exp_dir": Path("conformer_ctc/exp"),
            "lang_dir": Path("data/lang/bpe"),
            "feature_dim": 80,
            "weight_decay": 0.0,
            "subsampling_factor": 4,
            "start_epoch": 0,
            "num_epochs": 10,
            "best_train_loss": float("inf"),
            "best_valid_loss": float("inf"),
            "best_train_epoch": -1,
            "best_valid_epoch": -1,
            "batch_idx_train": 0,
            "log_interval": 10,
            "valid_interval": 1000,
            "beam_size": 10,
            "reduction": "sum",
            "use_double_scores": True,
            #
            "accum_grad": 1,
            "att_rate": 0.7,
            "attention_dim": 512,
            "nhead": 8,
            "num_decoder_layers": 6,
            "is_espnet_structure": True,
            "mmi_loss": False,
            "use_feat_batchnorm": True,
            "lr_factor": 5.0,
            "warm_step": 80000,
        }
    )

    return params


def load_checkpoint_if_available(
    params: AttributeDict,
    model: nn.Module,
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
) -> Optional[Dict[str, Any]]:
    """Load checkpoint from file.

    If params.start_epoch is positive, it will load the checkpoint from
    `params.start_epoch - 1`. Otherwise, this function does nothing.

    Apart from loading state dict for `model`, `optimizer` and `scheduler`,
    it also updates `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
    and `best_valid_loss` in `params`.

    Args:
      params:
        The return value of :func:`get_params`.
      model:
        The training model.
      optimizer:
        The optimizer that we are using.
      scheduler:
        The learning rate scheduler we are using.
    Returns:
      Return the saved checkpoint as a dict, or None if no checkpoint
      was loaded.
    """
    if params.start_epoch <= 0:
        return None

    filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
    saved_params = load_checkpoint(
        filename, model=model, optimizer=optimizer, scheduler=scheduler,
    )

    keys = [
        "best_train_epoch",
        "best_valid_epoch",
        "batch_idx_train",
        "best_train_loss",
        "best_valid_loss",
    ]
    for k in keys:
        params[k] = saved_params[k]

    return saved_params


def save_checkpoint(
    params: AttributeDict,
    model: nn.Module,
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.

    Args:
      params:
        It is returned by :func:`get_params`.
      model:
        The training model.
    """
    if rank != 0:
        return
    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
    save_checkpoint_impl(
        filename=filename,
        model=model,
        params=params,
        optimizer=optimizer,
        scheduler=scheduler,
        rank=rank,
    )

    if params.best_train_epoch == params.cur_epoch:
        best_train_filename = params.exp_dir / "best-train-loss.pt"
        copyfile(src=filename, dst=best_train_filename)

    if params.best_valid_epoch == params.cur_epoch:
        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
        copyfile(src=filename, dst=best_valid_filename)


def compute_loss(
    params: AttributeDict,
    model: nn.Module,
    batch: dict,
    graph_compiler: BpeCtcTrainingGraphCompiler,
    is_training: bool,
):
    """
    Compute CTC loss given the model and its inputs.

    Args:
      params:
        Parameters for training. See :func:`get_params`.
      model:
        The model for training. It is an instance of Conformer in our case.
      batch:
        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
        for the content in it.
      graph_compiler:
        It is used to build a decoding graph from a ctc topo and training
        transcript. The training transcript is contained in the given `batch`,
        while the ctc topo is built when this compiler is instantiated.
      is_training:
        True for training. False for validation. When it is True, this
        function enables autograd during computation; when it is False, it
        disables autograd.
    """
    device = graph_compiler.device
    feature = batch["inputs"]
    # at entry, feature is [N, T, C]
    feature = feature.permute(0, 2, 1)  # now feature is [N, C, T]
    assert feature.ndim == 3
    feature = feature.to(device)

    supervisions = batch["supervisions"]
    with torch.set_grad_enabled(is_training):
        nnet_output, encoder_memory, memory_mask = model(feature, supervisions)
        # nnet_output is [N, C, T]
        nnet_output = nnet_output.permute(0, 2, 1)  # [N, C, T] -> [N, T, C]

        # NOTE: We need `encode_supervisions` to sort sequences with
        # different duration in decreasing order, required by
        # `k2.intersect_dense` called in `k2.ctc_loss`
        supervision_segments, texts = encode_supervisions(
            supervisions, subsampling_factor=params.subsampling_factor
        )

        token_ids = graph_compiler.texts_to_ids(texts)

        decoding_graph = graph_compiler.compile(token_ids)

        dense_fsa_vec = k2.DenseFsaVec(
            nnet_output,
            supervision_segments,
            allow_truncate=params.subsampling_factor - 1,
        )

        ctc_loss = k2.ctc_loss(
            decoding_graph=decoding_graph,
            dense_fsa_vec=dense_fsa_vec,
            output_beam=params.beam_size,
            reduction=params.reduction,
            use_double_scores=params.use_double_scores,
        )

        if params.att_rate != 0.0:
            att_loss = model.decoder_forward(
                encoder_memory,
                memory_mask,
                token_ids=token_ids,
                sos_id=graph_compiler.sos_id,
                eos_id=graph_compiler.eos_id,
            )
            loss = (1.0 - params.att_rate) * ctc_loss + params.att_rate * att_loss
        else:
            loss = ctc_loss

    # train_frames and valid_frames are used for printing.
    if is_training:
        params.train_frames = supervision_segments[:, 2].sum().item()
    else:
        params.valid_frames = supervision_segments[:, 2].sum().item()

    assert loss.requires_grad == is_training

    return loss


def compute_validation_loss(
    params: AttributeDict,
    model: nn.Module,
    graph_compiler: BpeCtcTrainingGraphCompiler,
    valid_dl: torch.utils.data.DataLoader,
    world_size: int = 1,
) -> None:
    """Run the validation process. The validation loss
    is saved in `params.valid_loss`.
    """
    model.eval()

    tot_loss = 0.0
    tot_frames = 0.0
    for batch_idx, batch in enumerate(valid_dl):
        loss = compute_loss(
            params=params,
            model=model,
            batch=batch,
            graph_compiler=graph_compiler,
            is_training=False,
        )
        assert loss.requires_grad is False

        loss_cpu = loss.detach().cpu().item()
        tot_loss += loss_cpu
        tot_frames += params.valid_frames

    if world_size > 1:
        s = torch.tensor([tot_loss, tot_frames], device=loss.device)
        dist.all_reduce(s, op=dist.ReduceOp.SUM)
        s = s.cpu().tolist()
        tot_loss = s[0]
        tot_frames = s[1]

    params.valid_loss = tot_loss / tot_frames

    if params.valid_loss < params.best_valid_loss:
        params.best_valid_epoch = params.cur_epoch
        params.best_valid_loss = params.valid_loss


def train_one_epoch(
    params: AttributeDict,
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    graph_compiler: BpeCtcTrainingGraphCompiler,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
) -> None:
    """Train the model for one epoch.

    The training loss from the mean of all frames is saved in
    `params.train_loss`. It runs the validation process every
    `params.valid_interval` batches.

    Args:
      params:
        It is returned by :func:`get_params`.
      model:
        The model for training.
      optimizer:
        The optimizer we are using.
      graph_compiler:
        It is used to convert transcripts to FSAs.
      train_dl:
        Dataloader for the training dataset.
      valid_dl:
        Dataloader for the validation dataset.
      tb_writer:
        Writer to write log messages to tensorboard.
      world_size:
        Number of nodes in DDP training. If it is 1, DDP is disabled.
    """
    model.train()

    tot_loss = 0.0  # sum of losses over all batches
    tot_frames = 0.0  # sum of frames over all batches
    for batch_idx, batch in enumerate(train_dl):
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        loss = compute_loss(
            params=params,
            model=model,
            batch=batch,
            graph_compiler=graph_compiler,
            is_training=True,
        )

        # NOTE: We use reduction==sum and loss is computed over utterances
        # in the batch and there is no normalization to it so far.

        optimizer.zero_grad()
        loss.backward()
        clip_grad_value_(model.parameters(), 5.0)
        optimizer.step()

        loss_cpu = loss.detach().cpu().item()

        tot_frames += params.train_frames
        tot_loss += loss_cpu
        tot_avg_loss = tot_loss / tot_frames

        if batch_idx % params.log_interval == 0:
            logging.info(
                f"Epoch {params.cur_epoch}, batch {batch_idx}, "
                f"batch avg loss {loss_cpu/params.train_frames:.4f}, "
                f"total avg loss: {tot_avg_loss:.4f}, "
                f"batch size: {batch_size}"
            )

        if batch_idx > 0 and batch_idx % params.valid_interval == 0:
            compute_validation_loss(
                params=params,
                model=model,
                graph_compiler=graph_compiler,
                valid_dl=valid_dl,
                world_size=world_size,
            )
            model.train()
            logging.info(
                f"Epoch {params.cur_epoch}, valid loss {params.valid_loss:.4f},"
                f" best valid loss: {params.best_valid_loss:.4f} "
                f"best valid epoch: {params.best_valid_epoch}"
            )

    params.train_loss = tot_loss / tot_frames

    if params.train_loss < params.best_train_loss:
        params.best_train_epoch = params.cur_epoch
        params.best_train_loss = params.train_loss


def run(rank, world_size, args):
    """
    Args:
      rank:
        It is a value between 0 and `world_size-1`, which is
        passed automatically by `mp.spawn()` in :func:`main`.
        The node with rank 0 is responsible for saving checkpoints.
      world_size:
        Number of GPUs for DDP training.
      args:
        The return value of get_parser().parse_args()
    """
    params = get_params()
    params.update(vars(args))

    fix_random_seed(42)
    if world_size > 1:
        setup_dist(rank, world_size, params.master_port)

    setup_logger(f"{params.exp_dir}/log/log-train")
    logging.info("Training started")
    logging.info(params)

    if args.tensorboard and rank == 0:
        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
    else:
        tb_writer = None

    lexicon = Lexicon(params.lang_dir)
    max_token_id = max(lexicon.tokens)
    num_classes = max_token_id + 1  # +1 for the blank

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", rank)

    graph_compiler = BpeCtcTrainingGraphCompiler(
        params.lang_dir,
        device=device,
        sos_token="<sos/eos>",
        eos_token="<sos/eos>",
    )

    logging.info("About to create model")
    model = Conformer(
        num_features=params.feature_dim,
        nhead=params.nhead,
        d_model=params.attention_dim,
        num_classes=num_classes,
        subsampling_factor=params.subsampling_factor,
        num_decoder_layers=params.num_decoder_layers,
        vgg_frontend=False,
        is_espnet_structure=params.is_espnet_structure,
        mmi_loss=params.mmi_loss,
        use_feat_batchnorm=params.use_feat_batchnorm,
    )

    checkpoints = load_checkpoint_if_available(params=params, model=model)

    model.to(device)
    if world_size > 1:
        model = DDP(model, device_ids=[rank])

    optimizer = Noam(
        model.parameters(),
        model_size=params.attention_dim,
        factor=params.lr_factor,
        warm_step=params.warm_step,
        weight_decay=params.weight_decay,
    )

    if checkpoints:
        optimizer.load_state_dict(checkpoints["optimizer"])

    librispeech = LibriSpeechAsrDataModule(args)
    train_dl = librispeech.train_dataloaders()
    valid_dl = librispeech.valid_dataloaders()

    for epoch in range(params.start_epoch, params.num_epochs):
        train_dl.sampler.set_epoch(epoch)

        cur_lr = optimizer._rate
        if tb_writer is not None:
            tb_writer.add_scalar(
                "train/learning_rate", cur_lr, params.batch_idx_train
            )
            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)

        if rank == 0:
            logging.info("epoch {}, learning rate {}".format(epoch, cur_lr))

        params.cur_epoch = epoch

        train_one_epoch(
            params=params,
            model=model,
            optimizer=optimizer,
            graph_compiler=graph_compiler,
            train_dl=train_dl,
            valid_dl=valid_dl,
            tb_writer=tb_writer,
            world_size=world_size,
        )

        save_checkpoint(
            params=params, model=model, optimizer=optimizer, rank=rank,
        )

    logging.info("Done!")

    if world_size > 1:
        torch.distributed.barrier()
        cleanup_dist()


def main():
    parser = get_parser()
    LibriSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()

    world_size = args.world_size
    assert world_size >= 1
    if world_size > 1:
        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
    else:
        run(rank=0, world_size=1, args=args)


if __name__ == "__main__":
    main()
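For readers new to k2, the CTC core of compute_loss() in the file above boils down to the following self-contained sketch. The network output is faked with random log-probs and the token IDs are made up; the graph construction mirrors what the BPE CTC training graph compiler is expected to do, rather than quoting icefall's implementation.

# Minimal sketch of the CTC loss path in compute_loss(); all data is fake.
import k2
import torch

N, T, C = 2, 100, 500  # batch size, frames after subsampling, vocab size
nnet_output = torch.randn(N, T, C).log_softmax(dim=-1)

# Rows are (fsa_index, start_frame, num_frames), int32, sorted by
# num_frames in decreasing order, as required by k2.intersect_dense.
supervision_segments = torch.tensor(
    [[0, 0, 100], [1, 0, 80]], dtype=torch.int32
)
dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision_segments)

token_ids = [[23, 58, 491], [7, 12]]  # hypothetical BPE IDs per utterance
ctc_topo = k2.ctc_topo(max_token=C - 1)
transcript_fsa = k2.linear_fsa(token_ids)
decoding_graph = k2.compose(
    ctc_topo, transcript_fsa, treat_epsilons_specially=False
)
decoding_graph = k2.connect(decoding_graph)

loss = k2.ctc_loss(
    decoding_graph=decoding_graph,
    dense_fsa_vec=dense_fsa_vec,
    output_beam=10,
    reduction="sum",
    use_double_scores=True,
)
print(loss)  # summed negative log-likelihood over both utterances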
egs/librispeech/ASR/conformer_ctc/transformer.py

@@ -189,6 +189,8 @@ class Transformer(nn.Module):
         supervision: Supervisions = None,
         graph_compiler: object = None,
         token_ids: List[int] = None,
+        sos_id: Optional[int] = None,
+        eos_id: Optional[int] = None,
     ) -> Tensor:
         """
         Args:
@@ -197,6 +199,8 @@ class Transformer(nn.Module):
             supervision: Supervison in lhotse format, get from batch['supervisions']
             graph_compiler: use graph_compiler.L_inv (Its labels are words, while its aux_labels are phones)
                 , graph_compiler.words and graph_compiler.oov
+            sos_id: sos token id
+            eos_id: eos token id

         Returns:
             Tensor: Decoder loss.
@@ -206,18 +210,9 @@ class Transformer(nn.Module):
                 supervision, graph_compiler.lexicon.words, graph_compiler.oov
             )
             ys_in_pad, ys_out_pad = add_sos_eos(
-                batch_text,
-                graph_compiler.L_inv,
-                self.decoder_num_class - 1,
-                self.decoder_num_class - 1,
+                batch_text, graph_compiler.L_inv, sos_id, eos_id,
             )
         elif token_ids is not None:
-            # speical token ids:
-            # <blank> 0
-            # <UNK> 1
-            # <sos/eos> self.decoder_num_class - 1
-            sos_id = self.decoder_num_class - 1
-            eos_id = self.decoder_num_class - 1
             _sos = torch.tensor([sos_id])
             _eos = torch.tensor([eos_id])
             ys_in = [
@@ -259,7 +254,12 @@ class Transformer(nn.Module):
         return decoder_loss

     def decoder_nll(
-        self, x: Tensor, encoder_mask: Tensor, token_ids: List[List[int]] = None
+        self,
+        x: Tensor,
+        encoder_mask: Tensor,
+        token_ids: List[List[int]],
+        sos_id: int,
+        eos_id: int,
     ) -> Tensor:
         """
         Args:
@@ -273,12 +273,6 @@ class Transformer(nn.Module):
         # The common part between this fuction and decoder_forward could be
         # extracted as a seperated function.
         if token_ids is not None:
-            # speical token ids:
-            # <blank> 0
-            # <UNK> 1
-            # <sos/eos> self.decoder_num_class - 1
-            sos_id = self.decoder_num_class - 1
-            eos_id = self.decoder_num_class - 1
             _sos = torch.tensor([sos_id])
             _eos = torch.tensor([eos_id])
             ys_in = [
@@ -866,7 +860,8 @@ class LabelSmoothingLoss(nn.Module):
         target = target.masked_fill(ignore, 0)  # avoid -1 index
         true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
         kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
-        denom = total if self.normalize_length else batch_size
+        # denom = total if self.normalize_length else batch_size
+        denom = total if self.normalize_length else 1
         return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom


@@ -983,8 +978,8 @@ def generate_square_subsequent_mask(sz: int) -> Tensor:
 def add_sos_eos(
     ys: List[List[int]],
     lexicon: k2.Fsa,
-    sos: int,
-    eos: int,
+    sos_id: int,
+    eos_id: int,
     ignore_id: int = -1,
 ) -> Tuple[Tensor, Tensor]:
     """Add <sos> and <eos> labels.
@@ -992,8 +987,8 @@ def add_sos_eos(
     Args:
         ys: batch of unpadded target sequences
         lexicon: Its labels are words, while its aux_labels are phones.
-        sos: index of <sos>
-        eos: index of <eos>
+        sos_id: index of <sos>
+        eos_id: index of <eos>
         ignore_id: index of padding

     Returns:
@@ -1001,8 +996,8 @@ def add_sos_eos(
         Tensor: Output of transformer decoder. padded tensor of dimention (batch_size, max_length).
     """

-    _sos = torch.tensor([sos])
-    _eos = torch.tensor([eos])
+    _sos = torch.tensor([sos_id])
+    _eos = torch.tensor([eos_id])
     ys = get_hierarchical_targets(ys, lexicon)
     ys_in = [torch.cat([_sos, y], dim=0) for y in ys]
     ys_out = [torch.cat([y, _eos], dim=0) for y in ys]
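The sos_id/eos_id plumbing added to decoder_forward() and decoder_nll() above replaces the hard-coded `self.decoder_num_class - 1` blocks. What it builds before padding, in miniature (the IDs are illustrative only):

# Toy version of the sos/eos handling shown in the diff above.
import torch

sos_id = eos_id = 499  # <sos/eos> shares a single ID in this setup
token_ids = [[23, 58, 491], [7, 12]]

_sos = torch.tensor([sos_id])
_eos = torch.tensor([eos_id])
ys = [torch.tensor(utt) for utt in token_ids]
ys_in = [torch.cat([_sos, y], dim=0) for y in ys]   # decoder inputs
ys_out = [torch.cat([y, _eos], dim=0) for y in ys]  # decoder targets

Note also the LabelSmoothingLoss change: with normalize_length False, the summed KL divergence is now divided by 1 instead of batch_size, presumably to keep it on the same un-normalized scale as the "sum"-reduction CTC loss it is mixed with.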
egs/librispeech/ASR/local/compile_hlg.py

@@ -3,7 +3,7 @@
 """
 This script compiles HLG from

-    - H, the ctc topology, built from phones contained in lexicon.txt
+    - H, the ctc topology, built from tokens contained in lexicon.txt
     - L, the lexicon, built from L_disambig.pt

 Caution: We use a lexicon that contains disambiguation symbols
@@ -13,6 +13,7 @@ This script compiles HLG from
 The generated HLG is saved in data/lm/HLG.pt (phone based)
 or data/lm/HLG_bpe.pt (BPE based)
 """
+import logging
 from pathlib import Path

 import k2
@@ -32,44 +33,44 @@ def compile_HLG(lang_dir: str) -> k2.Fsa:
     """
     lexicon = Lexicon(lang_dir)
     max_token_id = max(lexicon.tokens)
-    print(f"Building ctc_topo. max_token_id: {max_token_id}")
+    logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
     H = k2.ctc_topo(max_token_id)
     L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))

     if Path("data/lm/G_3_gram.pt").is_file():
-        print("Loading pre-compiled G_3_gram")
+        logging.info("Loading pre-compiled G_3_gram")
         d = torch.load("data/lm/G_3_gram.pt")
         G = k2.Fsa.from_dict(d)
     else:
-        print("Loading G_3_gram.fst.txt")
+        logging.info("Loading G_3_gram.fst.txt")
         with open("data/lm/G_3_gram.fst.txt") as f:
             G = k2.Fsa.from_openfst(f.read(), acceptor=False)
             torch.save(G.as_dict(), "G_3_gram.pt")

-    first_token_disambig_id = lexicon.phones["#0"]
-    first_word_disambig_id = lexicon.words["#0"]
+    first_token_disambig_id = lexicon.token_table["#0"]
+    first_word_disambig_id = lexicon.word_table["#0"]

     L = k2.arc_sort(L)
     G = k2.arc_sort(G)

-    print("Intersecting L and G")
+    logging.info("Intersecting L and G")
     LG = k2.compose(L, G)
-    print(f"LG shape: {LG.shape}")
+    logging.info(f"LG shape: {LG.shape}")

-    print("Connecting LG")
+    logging.info("Connecting LG")
     LG = k2.connect(LG)
-    print(f"LG shape after k2.connect: {LG.shape}")
+    logging.info(f"LG shape after k2.connect: {LG.shape}")

-    print(type(LG.aux_labels))
-    print("Determinizing LG")
+    logging.info(type(LG.aux_labels))
+    logging.info("Determinizing LG")

     LG = k2.determinize(LG)
-    print(type(LG.aux_labels))
+    logging.info(type(LG.aux_labels))

-    print("Connecting LG after k2.determinize")
+    logging.info("Connecting LG after k2.determinize")
     LG = k2.connect(LG)

-    print("Removing disambiguation symbols on LG")
+    logging.info("Removing disambiguation symbols on LG")

     LG.labels[LG.labels >= first_token_disambig_id] = 0

@@ -77,27 +78,27 @@ def compile_HLG(lang_dir: str) -> k2.Fsa:
     LG.aux_labels.values()[LG.aux_labels.values() >= first_word_disambig_id] = 0

     LG = k2.remove_epsilon(LG)
-    print(f"LG shape after k2.remove_epsilon: {LG.shape}")
+    logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}")

     LG = k2.connect(LG)
     LG.aux_labels = k2.ragged.remove_values_eq(LG.aux_labels, 0)

-    print("Arc sorting LG")
+    logging.info("Arc sorting LG")
     LG = k2.arc_sort(LG)

-    print("Composing H and LG")
+    logging.info("Composing H and LG")
     # CAUTION: The name of the inner_labels is fixed
     # to `tokens`. If you want to change it, please
     # also change other places in icefall that are using
     # it.
     HLG = k2.compose(H, LG, inner_labels="tokens")

-    print("Connecting LG")
+    logging.info("Connecting LG")
     HLG = k2.connect(HLG)

-    print("Arc sorting LG")
+    logging.info("Arc sorting LG")
     HLG = k2.arc_sort(HLG)
-    print(f"HLG.shape: {HLG.shape}")
+    logging.info(f"HLG.shape: {HLG.shape}")

     return HLG

@@ -106,10 +107,10 @@ def phone_based_HLG():
     if Path("data/lm/HLG.pt").is_file():
         return

-    print("Compiling phone based HLG")
+    logging.info("Compiling phone based HLG")
     HLG = compile_HLG("data/lang")

-    print("Saving HLG.pt to data/lm")
+    logging.info("Saving HLG.pt to data/lm")
     torch.save(HLG.as_dict(), "data/lm/HLG.pt")


@@ -117,9 +118,9 @@ def bpe_based_HLG():
     if Path("data/lm/HLG_bpe.pt").is_file():
         return

-    print("Compiling BPE based HLG")
+    logging.info("Compiling BPE based HLG")
     HLG = compile_HLG("data/lang/bpe")
-    print("Saving HLG_bpe.pt to data/lm")
+    logging.info("Saving HLG_bpe.pt to data/lm")
     torch.save(HLG.as_dict(), "data/lm/HLG_bpe.pt")


@@ -129,4 +130,10 @@ def main():


 if __name__ == "__main__":
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
     main()
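A hedged sketch of how a compiled graph like HLG_bpe.pt would typically be loaded back for decoding; this usage is not part of the commit:

# Load a compiled HLG back into a k2.Fsa (decode-time usage, illustrative).
import k2
import torch

d = torch.load("data/lm/HLG_bpe.pt")
HLG = k2.Fsa.from_dict(d)
HLG = HLG.to(torch.device("cuda", 0) if torch.cuda.is_available() else "cpu")
# HLG.labels carry token IDs and HLG.aux_labels carry word IDs; the inner
# labels saved under the name `tokens` (see the CAUTION comment above) are
# what downstream decoding code reads.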
egs/librispeech/ASR/local/prepare_lang.py

@@ -4,13 +4,13 @@

 """
 This script takes as input a lexicon file "data/lang/lexicon.txt"
-consisting of words and phones and does the following:
+consisting of words and tokens (i.e., phones) and does the following:

 1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt

-2. Generate phones.txt, the phones table mapping a phone to a unique integer.
+2. Generate tokens.txt, the token table mapping a token to a unique integer.

-3. Generate words.txt, the words table mapping a word to a unique integer.
+3. Generate words.txt, the word table mapping a word to a unique integer.

 4. Generate L.pt, in k2 format. It can be loaded by

@@ -29,62 +29,11 @@ from typing import Any, Dict, List, Tuple
 import k2
 import torch

+from icefall.lexicon import read_lexicon, write_lexicon

 Lexicon = List[Tuple[str, List[str]]]


-def read_lexicon(filename: str) -> Lexicon:
-    """Read a lexicon.txt in `filename`.
-
-    Each line in the lexicon contains "word p1 p2 p3 ...".
-    That is, the first field is a word and the remaining
-    fields are phones. Fields are separated by space(s).
-
-    Args:
-      filename:
-        Path to the lexicon.txt
-
-    Returns:
-      A list of tuples., e.g., [('w', ['p1', 'p2']), ('w1', ['p3, 'p4'])]
-    """
-    ans = []
-
-    with open(filename, "r", encoding="utf-8") as f:
-        whitespace = re.compile("[ \t]+")
-        for line in f:
-            a = whitespace.split(line.strip(" \t\r\n"))
-            if len(a) == 0:
-                continue
-
-            if len(a) < 2:
-                print(f"Found bad line {line} in lexicon file {filename}")
-                print("Every line is expected to contain at least 2 fields")
-                sys.exit(1)
-            word = a[0]
-            if word == "<eps>":
-                print(f"Found bad line {line} in lexicon file {filename}")
-                print("<eps> should not be a valid word")
-                sys.exit(1)
-
-            prons = a[1:]
-            ans.append((word, prons))
-
-    return ans
-
-
-def write_lexicon(filename: str, lexicon: Lexicon) -> None:
-    """Write a lexicon to a file.
-
-    Args:
-      filename:
-        Path to the lexicon file to be generated.
-      lexicon:
-        It can be the return value of :func:`read_lexicon`.
-    """
-    with open(filename, "w", encoding="utf-8") as f:
-        for word, prons in lexicon:
-            f.write(f"{word} {' '.join(prons)}\n")
-
-
 def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
     """Write a symbol to ID mapping to a file.

@@ -105,18 +54,18 @@ def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
         f.write(f"{sym} {i}\n")


-def get_phones(lexicon: Lexicon) -> List[str]:
-    """Get phones from a lexicon.
+def get_tokens(lexicon: Lexicon) -> List[str]:
+    """Get tokens from a lexicon.

     Args:
       lexicon:
         It is the return value of :func:`read_lexicon`.
     Returns:
-      Return a list of unique phones.
+      Return a list of unique tokens.
     """
     ans = set()
-    for _, prons in lexicon:
-        ans.update(prons)
+    for _, tokens in lexicon:
+        ans.update(tokens)
     sorted_ans = sorted(list(ans))
     return sorted_ans

@@ -138,8 +87,8 @@ def get_words(lexicon: Lexicon) -> List[str]:


 def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]:
-    """It adds pseudo-phone disambiguation symbols #1, #2 and so on
-    at the ends of phones to ensure that all pronunciations are different,
+    """It adds pseudo-token disambiguation symbols #1, #2 and so on
+    at the ends of tokens to ensure that all pronunciations are different,
     and that none is a prefix of another.

     See also add_lex_disambig.pl from kaldi.
@@ -151,30 +100,30 @@ def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]:
       Return a tuple with two elements:

        - The output lexicon with disambiguation symbols
-       - The ID of the max disambiguation symbols that appears
+       - The ID of the max disambiguation symbol that appears
         in the lexicon
     """

-    # (1) Work out the count of each phone-sequence in the
+    # (1) Work out the count of each token-sequence in the
     # lexicon.
     count = defaultdict(int)
-    for _, prons in lexicon:
-        count[" ".join(prons)] += 1
+    for _, tokens in lexicon:
+        count[" ".join(tokens)] += 1

-    # (2) For each left sub-sequence of each phone-sequence, note down
+    # (2) For each left sub-sequence of each token-sequence, note down
     # that it exists (for identifying prefixes of longer strings).
     issubseq = defaultdict(int)
-    for _, prons in lexicon:
-        prons = prons.copy()
-        prons.pop()
-        while prons:
-            issubseq[" ".join(prons)] = 1
-            prons.pop()
+    for _, tokens in lexicon:
+        tokens = tokens.copy()
+        tokens.pop()
+        while tokens:
+            issubseq[" ".join(tokens)] = 1
+            tokens.pop()

     # (3) For each entry in the lexicon:
-    # if the phone sequence is unique and is not a
+    # if the token sequence is unique and is not a
     # prefix of another word, no disambig symbol.
-    # Else output #1, or #2, #3, ... if the same phone-seq
+    # Else output #1, or #2, #3, ... if the same token-seq
     # has already been assigned a disambig symbol.
     ans = []

@@ -183,14 +132,14 @@ def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]:
     max_disambig = first_allowed_disambig - 1
     last_used_disambig_symbol_of = defaultdict(int)

-    for word, prons in lexicon:
-        phnseq = " ".join(prons)
-        assert phnseq != ""
-        if issubseq[phnseq] == 0 and count[phnseq] == 1:
-            ans.append((word, prons))
+    for word, tokens in lexicon:
+        tokenseq = " ".join(tokens)
+        assert tokenseq != ""
+        if issubseq[tokenseq] == 0 and count[tokenseq] == 1:
+            ans.append((word, tokens))
             continue

-        cur_disambig = last_used_disambig_symbol_of[phnseq]
+        cur_disambig = last_used_disambig_symbol_of[tokenseq]
         if cur_disambig == 0:
             cur_disambig = first_allowed_disambig
         else:
@@ -198,9 +147,9 @@ def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]:

         if cur_disambig > max_disambig:
             max_disambig = cur_disambig
-        last_used_disambig_symbol_of[phnseq] = cur_disambig
-        phnseq += f" #{cur_disambig}"
-        ans.append((word, phnseq.split()))
+        last_used_disambig_symbol_of[tokenseq] = cur_disambig
+        tokenseq += f" #{cur_disambig}"
+        ans.append((word, tokenseq.split()))
     return ans, max_disambig


@@ -217,7 +166,7 @@ def generate_id_map(symbols: List[str]) -> Dict[str, int]:


 def add_self_loops(
-    arcs: List[List[Any]], disambig_phone: int, disambig_word: int
+    arcs: List[List[Any]], disambig_token: int, disambig_word: int
 ) -> List[List[Any]]:
     """Adds self-loops to states of an FST to propagate disambiguation symbols
     through it. They are added on each state with non-epsilon output symbols
@@ -228,12 +177,15 @@ def add_self_loops(
     This function uses k2 style FSTs and it does not need to add self-loops
     to the final state.

+    The input label of a self-loop is `disambig_token`, while the output
+    label is `disambig_word`.
+
     Args:
       arcs:
         A list-of-list. The sublist contains
         `[src_state, dest_state, label, aux_label, score]`
-      disambig_phone:
-        It is the phone ID of the symbol `#0`.
+      disambig_token:
+        It is the token ID of the symbol `#0`.
       disambig_word:
         It is the word ID of the symbol `#0`.

@@ -248,37 +200,38 @@ def add_self_loops(

     ans = []
     for s in states_needs_self_loops:
-        ans.append([s, s, disambig_phone, disambig_word, 0])
+        ans.append([s, s, disambig_token, disambig_word, 0])

     return arcs + ans


 def lexicon_to_fst(
     lexicon: Lexicon,
-    phone2id: Dict[str, int],
+    token2id: Dict[str, int],
     word2id: Dict[str, int],
-    sil_phone: str = "SIL",
+    sil_token: str = "SIL",
     sil_prob: float = 0.5,
     need_self_loops: bool = False,
 ) -> k2.Fsa:
     """Convert a lexicon to an FST (in k2 format) with optional silence at
-    the beginning and end of the word.
+    the beginning and end of each word.

     Args:
       lexicon:
         The input lexicon. See also :func:`read_lexicon`
-      phone2id:
-        A dict mapping phones to IDs.
+      token2id:
+        A dict mapping tokens to IDs.
       word2id:
         A dict mapping words to IDs.
-      sil_phone:
-        The silence phone.
+      sil_token:
+        The silence token.
       sil_prob:
         The probability for adding a silence at the beginning and end
         of the word.
       need_self_loops:
         If True, add self-loop to states with non-epsilon output symbols
-        on at least one arc out of the state.
+        on at least one arc out of the state. The input label for this
+        self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
     Returns:
       Return an instance of `k2.Fsa` representing the given lexicon.
     """
@@ -294,48 +247,44 @@ def lexicon_to_fst(
     next_state = 3  # the next un-allocated state, will be incremented as we go.
     arcs = []

-    assert phone2id["<eps>"] == 0
+    assert token2id["<eps>"] == 0
     assert word2id["<eps>"] == 0

     eps = 0

-    sil_phone = phone2id[sil_phone]
+    sil_token = token2id[sil_token]

     arcs.append([start_state, loop_state, eps, eps, no_sil_score])
     arcs.append([start_state, sil_state, eps, eps, sil_score])
-    arcs.append([sil_state, loop_state, sil_phone, eps, 0])
+    arcs.append([sil_state, loop_state, sil_token, eps, 0])

-    for word, prons in lexicon:
-        assert len(prons) > 0, f"{word} has no pronunciations"
+    for word, tokens in lexicon:
+        assert len(tokens) > 0, f"{word} has no pronunciations"
         cur_state = loop_state

         word = word2id[word]
-        prons = [phone2id[i] for i in prons]
+        tokens = [token2id[i] for i in tokens]

-        for i in range(len(prons) - 1):
-            if i == 0:
-                arcs.append([cur_state, next_state, prons[i], word, 0])
-            else:
-                arcs.append([cur_state, next_state, prons[i], eps, 0])
+        for i in range(len(tokens) - 1):
+            w = word if i == 0 else eps
+            arcs.append([cur_state, next_state, tokens[i], w, 0])

             cur_state = next_state
             next_state += 1

-        # now for the last phone of this word
+        # now for the last token of this word
         # It has two out-going arcs, one to the loop state,
         # the other one to the sil_state.
-        i = len(prons) - 1
+        i = len(tokens) - 1
         w = word if i == 0 else eps
-        arcs.append([cur_state, loop_state, prons[i], w, no_sil_score])
-        arcs.append([cur_state, sil_state, prons[i], w, sil_score])
+        arcs.append([cur_state, loop_state, tokens[i], w, no_sil_score])
+        arcs.append([cur_state, sil_state, tokens[i], w, sil_score])

     if need_self_loops:
-        disambig_phone = phone2id["#0"]
+        disambig_token = token2id["#0"]
         disambig_word = word2id["#0"]
         arcs = add_self_loops(
-            arcs,
-            disambig_phone=disambig_phone,
-            disambig_word=disambig_word,
+            arcs, disambig_token=disambig_token, disambig_word=disambig_word,
         )

     final_state = next_state
@@ -354,22 +303,22 @@ def lexicon_to_fst(
 def main():
     out_dir = Path("data/lang")
     lexicon_filename = out_dir / "lexicon.txt"
-    sil_phone = "SIL"
+    sil_token = "SIL"
     sil_prob = 0.5

     lexicon = read_lexicon(lexicon_filename)
-    phones = get_phones(lexicon)
+    tokens = get_tokens(lexicon)
     words = get_words(lexicon)

     lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)

     for i in range(max_disambig + 1):
         disambig = f"#{i}"
-        assert disambig not in phones
-        phones.append(f"#{i}")
+        assert disambig not in tokens
+        tokens.append(f"#{i}")

-    assert "<eps>" not in phones
-    phones = ["<eps>"] + phones
+    assert "<eps>" not in tokens
+    tokens = ["<eps>"] + tokens

     assert "<eps>" not in words
     assert "#0" not in words
@@ -378,26 +327,26 @@ def main():

     words = ["<eps>"] + words + ["#0", "<s>", "</s>"]

-    phone2id = generate_id_map(phones)
+    token2id = generate_id_map(tokens)
     word2id = generate_id_map(words)

-    write_mapping(out_dir / "phones.txt", phone2id)
+    write_mapping(out_dir / "tokens.txt", token2id)
     write_mapping(out_dir / "words.txt", word2id)
     write_lexicon(out_dir / "lexicon_disambig.txt", lexicon_disambig)

     L = lexicon_to_fst(
         lexicon,
-        phone2id=phone2id,
+        token2id=token2id,
         word2id=word2id,
-        sil_phone=sil_phone,
+        sil_token=sil_token,
         sil_prob=sil_prob,
     )

     L_disambig = lexicon_to_fst(
         lexicon_disambig,
-        phone2id=phone2id,
+        token2id=token2id,
         word2id=word2id,
-        sil_phone=sil_phone,
+        sil_token=sil_token,
         sil_prob=sil_prob,
         need_self_loops=True,
     )
@@ -406,7 +355,7 @@ def main():

     if False:
         # Just for debugging, will remove it
-        L.labels_sym = k2.SymbolTable.from_file(out_dir / "phones.txt")
+        L.labels_sym = k2.SymbolTable.from_file(out_dir / "tokens.txt")
         L.aux_labels_sym = k2.SymbolTable.from_file(out_dir / "words.txt")
         L_disambig.labels_sym = L.labels_sym
         L_disambig.aux_labels_sym = L.aux_labels_sym
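A worked toy example of add_disambig_symbols() from the script above may help: "a b c" is also a prefix of another entry, and "a b" occurs twice, so both need disambiguation symbols, while "a b c d" needs none.

# Hypothetical 4-entry lexicon fed to add_disambig_symbols().
lexicon = [
    ("hello", ["a", "b", "c"]),
    ("world", ["a", "b", "c", "d"]),
    ("foo", ["a", "b"]),
    ("bar", ["a", "b"]),
]
# add_disambig_symbols(lexicon) returns:
#   [("hello", ["a", "b", "c", "#1"]),   # prefix of "world"'s tokens
#    ("world", ["a", "b", "c", "d"]),    # unique, not a prefix: unchanged
#    ("foo", ["a", "b", "#1"]),          # duplicated token sequence
#    ("bar", ["a", "b", "#2"])]          # same sequence, next symbol
# together with max_disambig = 2.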
@ -3,9 +3,9 @@
|
|||||||
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
|
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
This script takes as inputs the following files:
|
This script takes as inputs the following two files:
|
||||||
|
|
||||||
- data/lang/bpe/bpe.model,
|
- data/lang/bpe/bpe.model,
|
||||||
- data/lang/bpe/tokens.txt (will remove it),
|
|
||||||
- data/lang/bpe/words.txt
|
- data/lang/bpe/words.txt
|
||||||
|
|
||||||
and generates the following files in the directory data/lang/bpe:
|
and generates the following files in the directory data/lang/bpe:
|
||||||
@ -14,11 +14,11 @@ and generates the following files in the directory data/lang/bpe:
|
|||||||
- lexicon_disambig.txt
|
- lexicon_disambig.txt
|
||||||
- L.pt
|
- L.pt
|
||||||
- L_disambig.pt
|
- L_disambig.pt
|
||||||
- phones.txt
|
- tokens.txt
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
import k2
|
import k2
|
||||||
import sentencepiece as spm
|
import sentencepiece as spm
|
||||||
@ -28,6 +28,7 @@ from prepare_lang import (
|
|||||||
add_disambig_symbols,
|
add_disambig_symbols,
|
||||||
add_self_loops,
|
add_self_loops,
|
||||||
write_lexicon,
|
write_lexicon,
|
||||||
|
write_mapping,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -48,48 +49,46 @@ def lexicon_to_fst_no_sil(
         A dict mapping words to IDs.
       need_self_loops:
         If True, add self-loop to states with non-epsilon output symbols
-        on at least one arc out of the state.
+        on at least one arc out of the state. The input label for this
+        self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
     Returns:
       Return an instance of `k2.Fsa` representing the given lexicon.
     """
     loop_state = 0  # words enter and leave from here
-    next_state = 1  # the next un-allocated state, will be incremented as we go.
+    next_state = 1  # the next un-allocated state, will be incremented as we go
 
     arcs = []
 
-    assert token2id["<blank>"] == 0
+    # The blank symbol <blk> is defined in local/train_bpe_model.py
+    assert token2id["<blk>"] == 0
     assert word2id["<eps>"] == 0
 
     eps = 0
 
-    for word, prons in lexicon:
-        assert len(prons) > 0, f"{word} has no pronunciations"
+    for word, pieces in lexicon:
+        assert len(pieces) > 0, f"{word} has no word pieces"
         cur_state = loop_state
 
         word = word2id[word]
-        prons = [token2id[i] for i in prons]
+        pieces = [token2id[i] for i in pieces]
 
-        for i in range(len(prons) - 1):
-            if i == 0:
-                arcs.append([cur_state, next_state, prons[i], word, 0])
-            else:
-                arcs.append([cur_state, next_state, prons[i], eps, 0])
+        for i in range(len(pieces) - 1):
+            w = word if i == 0 else eps
+            arcs.append([cur_state, next_state, pieces[i], w, 0])
 
             cur_state = next_state
             next_state += 1
 
-        # now for the last phone of this word
-        i = len(prons) - 1
+        # now for the last piece of this word
+        i = len(pieces) - 1
         w = word if i == 0 else eps
-        arcs.append([cur_state, loop_state, prons[i], w, 0])
+        arcs.append([cur_state, loop_state, pieces[i], w, 0])
 
     if need_self_loops:
-        disambig_phone = token2id["#0"]
+        disambig_token = token2id["#0"]
         disambig_word = word2id["#0"]
         arcs = add_self_loops(
-            arcs,
-            disambig_phone=disambig_phone,
-            disambig_word=disambig_word,
+            arcs, disambig_token=disambig_token, disambig_word=disambig_word,
         )
 
     final_state = next_state
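To make the arc-building loop above concrete, here is a minimal self-contained walk-through. The token/word tables and the word HELLO are made up for illustration; they are not from the commit:

# Toy re-run of the arc-building loop above; all IDs are hypothetical.
token2id = {"<blk>": 0, "▁HE": 3, "LL": 4, "O": 5}
word2id = {"<eps>": 0, "HELLO": 1}

loop_state, next_state, eps = 0, 1, 0
arcs = []
for word, pieces in [("HELLO", ["▁HE", "LL", "O"])]:
    cur_state = loop_state
    word_id = word2id[word]
    piece_ids = [token2id[p] for p in pieces]
    for i in range(len(piece_ids) - 1):
        # the word label is emitted on the first piece; later arcs output eps
        w = word_id if i == 0 else eps
        arcs.append([cur_state, next_state, piece_ids[i], w, 0])
        cur_state, next_state = next_state, next_state + 1
    i = len(piece_ids) - 1
    w = word_id if i == 0 else eps
    arcs.append([cur_state, loop_state, piece_ids[i], w, 0])

print(arcs)  # [[0, 1, 3, 1, 0], [1, 2, 4, 0, 0], [2, 0, 5, 0, 0]]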
@@ -105,7 +104,9 @@ def lexicon_to_fst_no_sil(
     return fsa
 
 
-def generate_lexicon(model_file: str, words: List[str]) -> Lexicon:
+def generate_lexicon(
+    model_file: str, words: List[str]
+) -> Tuple[Lexicon, Dict[str, int]]:
     """Generate a lexicon from a BPE model.
 
     Args:
@@ -114,8 +115,10 @@ def generate_lexicon(
       words:
         A list of strings representing words.
     Returns:
-      Return a dict whose keys are words and values are the corresponding
-      word pieces.
+      Return a tuple with two elements:
+        - A dict whose keys are words and values are the corresponding
+          word pieces.
+        - A dict representing the token symbol table, mapping from tokens to IDs.
     """
     sp = spm.SentencePieceProcessor()
     sp.load(str(model_file))
@@ -126,8 +129,14 @@ def generate_lexicon(
     for word, pieces in zip(words, words_pieces):
         lexicon.append((word, pieces))
 
-    lexicon.append(("<UNK>", ["<UNK>"]))
-    return lexicon
+    # The OOV word is <UNK>
+    lexicon.append(("<UNK>", [sp.id_to_piece(sp.unk_id())]))
+
+    token2id: Dict[str, int] = dict()
+    for i in range(sp.vocab_size()):
+        token2id[sp.id_to_piece(i)] = i
+
+    return lexicon, token2id
 
 
 def main():
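For reference, a short sketch of what generate_lexicon computes, assuming a trained model exists at data/lang/bpe/bpe.model (the example words and pieces are illustrative and vary by model):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/lang/bpe/bpe.model")

words = ["HELLO", "WORLD"]
words_pieces = sp.encode(words, out_type=str)  # e.g. [['▁HE', 'LL', 'O'], ['▁WORLD']]
lexicon = list(zip(words, words_pieces))

# token2id enumerates every piece in the model, including <blk> at ID 0
token2id = {sp.id_to_piece(i): i for i in range(sp.vocab_size())}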
@@ -143,34 +152,28 @@ def main():
         if w in words:
             words.remove(w)
 
-    lexicon = generate_lexicon(model_file, words)
+    lexicon, token_sym_table = generate_lexicon(model_file, words)
 
-    # TODO(fangjun): Remove tokens.txt and generate it from the model directly.
-    #
-    # We are using it since the IDs we are using in tokens.txt is
-    # different from the one contained in the model
-    token_sym_table = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
-
     lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
 
+    next_token_id = max(token_sym_table.values()) + 1
     for i in range(max_disambig + 1):
         disambig = f"#{i}"
         assert disambig not in token_sym_table
-        token_sym_table.add(f"#{i}")
+        token_sym_table[disambig] = next_token_id
+        next_token_id += 1
 
     word_sym_table.add("#0")
     word_sym_table.add("<s>")
     word_sym_table.add("</s>")
 
-    token_sym_table.to_file(lang_dir / "phones.txt")
+    write_mapping(lang_dir / "tokens.txt", token_sym_table)
 
     write_lexicon(lang_dir / "lexicon.txt", lexicon)
     write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
 
     L = lexicon_to_fst_no_sil(
-        lexicon,
-        token2id=token_sym_table,
-        word2id=word_sym_table,
+        lexicon, token2id=token_sym_table, word2id=word_sym_table,
     )
 
     L_disambig = lexicon_to_fst_no_sil(
@@ -184,7 +187,7 @@ def main():
 
     if False:
         # Just for debugging, will remove it
-        L.labels_sym = k2.SymbolTable.from_file(lang_dir / "phones.txt")
+        L.labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
         L.aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
         L_disambig.labels_sym = L.labels_sym
         L_disambig.aux_labels_sym = L.aux_labels_sym
egs/librispeech/ASR/local/train_bpe_model.py (new executable file, 60 lines):

@@ -0,0 +1,60 @@
#!/usr/bin/env python3

"""
This script takes as input "data/lang/bpe/train.txt"
and generates "data/lang/bpe/bpe.model".
"""

# You can install sentencepiece via:
#
#   pip install sentencepiece
#
# Due to an issue reported in
# https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
#
# please install a version >= 0.1.96

from pathlib import Path

import sentencepiece as spm

import shutil


def main():
    model_type = "unigram"
    vocab_size = 5000
    model_prefix = f"data/lang/bpe/{model_type}_{vocab_size}"
    train_text = "data/lang/bpe/train.txt"
    character_coverage = 1.0
    input_sentence_size = 100000000

    user_defined_symbols = ["<blk>", "<sos/eos>"]
    unk_id = len(user_defined_symbols)
    # Note: unk_id is fixed to 2.
    # If you change it, you should also change other
    # places that are using it.

    model_file = Path(model_prefix + ".model")
    if not model_file.is_file():
        spm.SentencePieceTrainer.train(
            input=train_text,
            vocab_size=vocab_size,
            model_type=model_type,
            model_prefix=model_prefix,
            input_sentence_size=input_sentence_size,
            character_coverage=character_coverage,
            user_defined_symbols=user_defined_symbols,
            unk_id=unk_id,
            bos_id=-1,
            eos_id=-1,
        )

    sp = spm.SentencePieceProcessor(model_file=str(model_file))
    vocab_size = sp.vocab_size()

    shutil.copyfile(model_file, "data/lang/bpe/bpe.model")


if __name__ == "__main__":
    main()
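A quick sanity check (a sketch, assuming the defaults above): user-defined symbols are assigned the first piece IDs, so <blk> should be 0, <sos/eos> should be 1, and the unknown token should be 2, as the unk_id comment promises.

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="data/lang/bpe/bpe.model")
assert sp.piece_to_id("<blk>") == 0
assert sp.piece_to_id("<sos/eos>") == 1
assert sp.unk_id() == 2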
egs/librispeech/ASR/prepare.sh:

@@ -10,14 +10,20 @@ stop_stage=100
 
 mkdir -p data
 
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
-  echo "stage -1: Download LM"
+  log "stage -1: Download LM"
   mkdir -p data/lm
   ./local/download_lm.py
 fi
 
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-  echo "stage 0: Download data"
+  log "stage 0: Download data"
 
   # If you have pre-downloaded it to /path/to/LibriSpeech,
   # you can create a symlink
@@ -49,7 +55,7 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
 fi
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  echo "Stage 1: Prepare librispeech manifest"
+  log "Stage 1: Prepare librispeech manifest"
   # We assume that you have downloaded the librispeech corpus
   # to data/LibriSpeech
   mkdir -p data/manifests
@@ -57,7 +63,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 fi
 
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  echo "Stage 2: Prepare musan manifest"
+  log "Stage 2: Prepare musan manifest"
   # We assume that you have downloaded the musan corpus
   # to data/musan
   mkdir -p data/manifests
@@ -65,19 +71,19 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
 fi
 
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  echo "Stage 3: Compute fbank for librispeech"
+  log "Stage 3: Compute fbank for librispeech"
   mkdir -p data/fbank
   ./local/compute_fbank_librispeech.py
 fi
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  echo "Stage 4: Compute fbank for musan"
+  log "Stage 4: Compute fbank for musan"
   mkdir -p data/fbank
   ./local/compute_fbank_musan.py
 fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  echo "Stage 5: Prepare phone based lang"
+  log "Stage 5: Prepare phone based lang"
   # TODO: add BPE based lang
   mkdir -p data/lang
 
@@ -85,21 +91,37 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
     cat - data/lm/librispeech-lexicon.txt |
     sort | uniq > data/lang/lexicon.txt
 
-  ./local/prepare_lang.py
+  if [ ! -f data/lang/L_disambig.pt ]; then
+    ./local/prepare_lang.py
+  fi
 fi
 
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  echo "State 6: Prepare BPE based lang"
+  log "Stage 6: Prepare BPE based lang"
   mkdir -p data/lang/bpe
   cp data/lang/words.txt data/lang/bpe/
 
+  if [ ! -f data/lang/bpe/train.txt ]; then
+    log "Generate data for BPE training"
+    files=$(
+      find "data/LibriSpeech/train-clean-100" -name "*.trans.txt"
+      find "data/LibriSpeech/train-clean-360" -name "*.trans.txt"
+      find "data/LibriSpeech/train-other-500" -name "*.trans.txt"
+    )
+    for f in ${files[@]}; do
+      cat $f | cut -d " " -f 2-
+    done > data/lang/bpe/train.txt
+  fi
+
+  python3 ./local/train_bpe_model.py
+
   if [ ! -f data/lang/bpe/L_disambig.pt ]; then
     ./local/prepare_lang_bpe.py
   fi
 fi
 
 if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-  echo "Stage 7: Prepare G"
+  log "Stage 7: Prepare G"
   # We assume you have installed kaldilm; if not, please install
   # it using: pip install kaldilm
 
@@ -123,6 +145,6 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
 fi
 
 if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-  echo "Stage 8: Compile HLG"
+  log "Stage 8: Compile HLG"
   python3 ./local/compile_hlg.py
 fi
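The stage-6 loop above builds data/lang/bpe/train.txt by dropping the utterance ID from every *.trans.txt line. In Python terms (the sample line is illustrative):

# Each LibriSpeech *.trans.txt line is "<utt-id> TRANSCRIPT IN CAPS".
line = "1089-134686-0000 HE HOPED THERE WOULD BE STEW FOR DINNER"
text = line.split(" ", maxsplit=1)[1]  # same effect as `cut -d " " -f 2-`
print(text)  # HE HOPED THERE WOULD BE STEW FOR DINNER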
decode.py:

@@ -72,7 +72,7 @@ def get_params() -> AttributeDict:
             #  - nbest
             #  - nbest-rescoring
             #  - whole-lattice-rescoring
-            "method": "whole-lattice-rescoring",
+            "method": "1best",
             # num_paths is used when method is "nbest" and "nbest-rescoring"
             "num_paths": 30,
         }
@@ -173,7 +173,7 @@ def decode_one_batch(
         )
         key = f"no_rescore-{params.num_paths}"
         hyps = get_texts(best_path)
-        hyps = [[lexicon.words[i] for i in ids] for ids in hyps]
+        hyps = [[lexicon.word_table[i] for i in ids] for ids in hyps]
         return {key: hyps}
 
     assert params.method in ["nbest-rescoring", "whole-lattice-rescoring"]
@@ -196,7 +196,7 @@ def decode_one_batch(
     ans = dict()
     for lm_scale_str, best_path in best_path_dict.items():
         hyps = get_texts(best_path)
-        hyps = [[lexicon.words[i] for i in ids] for ids in hyps]
+        hyps = [[lexicon.word_table[i] for i in ids] for ids in hyps]
         ans[lm_scale_str] = hyps
     return ans
icefall/bpe_graph_compiler.py (new file, 74 lines):

@@ -0,0 +1,74 @@
from pathlib import Path
from typing import List, Union

import k2
import sentencepiece as spm
import torch


class BpeCtcTrainingGraphCompiler(object):
    def __init__(
        self,
        lang_dir: Path,
        device: Union[str, torch.device] = "cpu",
        sos_token: str = "<sos/eos>",
        eos_token: str = "<sos/eos>",
    ) -> None:
        """
        Args:
          lang_dir:
            This directory is expected to contain the following files:

            - bpe.model
            - words.txt
          device:
            It indicates CPU or CUDA.
          sos_token:
            The word piece that represents sos.
          eos_token:
            The word piece that represents eos.
        """
        lang_dir = Path(lang_dir)
        model_file = lang_dir / "bpe.model"
        sp = spm.SentencePieceProcessor()
        sp.load(str(model_file))
        self.sp = sp
        self.word_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
        self.device = device

        self.sos_id = self.sp.piece_to_id(sos_token)
        self.eos_id = self.sp.piece_to_id(eos_token)

        assert self.sos_id != self.sp.unk_id()
        assert self.eos_id != self.sp.unk_id()

    def texts_to_ids(self, texts: List[str]) -> List[List[int]]:
        """Convert a list of texts to a list-of-list of piece IDs.

        Args:
          texts:
            It is a list of strings. Each string consists of space(s)
            separated words. An example containing two strings is given below:

                ['HELLO ICEFALL', 'HELLO k2']
        Returns:
          Return a list-of-list of piece IDs.
        """
        return self.sp.encode(texts, out_type=int)

    def compile(
        self, piece_ids: List[List[int]], modified: bool = False,
    ) -> k2.Fsa:
        """Build a ctc graph from a list-of-list of piece IDs.

        Args:
          piece_ids:
            It is a list-of-list of integer IDs.
          modified:
            See :func:`k2.ctc_graph` for its meaning.
        Return:
          Return an FsaVec, which is the result of composing a
          CTC topology with linear FSAs constructed from the given
          piece IDs.
        """
        return k2.ctc_graph(piece_ids, modified=modified, device=self.device)
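A minimal usage sketch for the class above, assuming data/lang/bpe has already been prepared by prepare.sh:

from icefall.bpe_graph_compiler import BpeCtcTrainingGraphCompiler

compiler = BpeCtcTrainingGraphCompiler("data/lang/bpe")
piece_ids = compiler.texts_to_ids(["HELLO ICEFALL", "HELLO k2"])
graphs = compiler.compile(piece_ids)  # FsaVec: one CTC training graph per utterance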
icefall/graph_compiler.py:

@@ -8,10 +8,7 @@ from icefall.lexicon import Lexicon
 
 class CtcTrainingGraphCompiler(object):
     def __init__(
-        self,
-        lexicon: Lexicon,
-        device: torch.device,
-        oov: str = "<UNK>",
+        self, lexicon: Lexicon, device: torch.device, oov: str = "<UNK>",
     ):
         """
         Args:
@@ -26,11 +23,11 @@ class CtcTrainingGraphCompiler(object):
         L_inv = lexicon.L_inv.to(device)
         assert L_inv.requires_grad is False
 
-        assert oov in lexicon.words
+        assert oov in lexicon.word_table
 
         self.L_inv = k2.arc_sort(L_inv)
-        self.oov_id = lexicon.words[oov]
-        self.words = lexicon.words
+        self.oov_id = lexicon.word_table[oov]
+        self.word_table = lexicon.word_table
 
         max_token_id = max(lexicon.tokens)
         ctc_topo = k2.ctc_topo(max_token_id, modified=False)
@@ -90,8 +87,8 @@ class CtcTrainingGraphCompiler(object):
         for text in texts:
             word_ids = []
             for word in text.split(" "):
-                if word in self.words:
-                    word_ids.append(self.words[word])
+                if word in self.word_table:
+                    word_ids.append(self.word_table[word])
                 else:
                     word_ids.append(self.oov_id)
             word_ids_list.append(word_ids)
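The loop above is a plain word-to-ID lookup with an OOV fallback; a toy sketch of the same logic (the table and sentence are made up):

word_table = {"<eps>": 0, "foo": 1, "bar": 2, "<UNK>": 3}
oov_id = word_table["<UNK>"]

text = "foo baz bar"  # "baz" is out of vocabulary
word_ids = [word_table.get(w, oov_id) for w in text.split(" ")]
print(word_ids)  # [1, 3, 2]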
icefall/lexicon.py:

@@ -1,12 +1,65 @@
 import logging
 import re
+import sys
 from pathlib import Path
-from typing import List
+from typing import List, Tuple, Union
 
 import k2
 import torch
 
 
+def read_lexicon(filename: str) -> List[Tuple[str, List[str]]]:
+    """Read a lexicon from `filename`.
+
+    Each line in the lexicon contains "word p1 p2 p3 ...".
+    That is, the first field is a word and the remaining
+    fields are tokens. Fields are separated by space(s).
+
+    Args:
+      filename:
+        Path to the lexicon.txt
+
+    Returns:
+      A list of tuples, e.g., [('w', ['p1', 'p2']), ('w1', ['p3', 'p4'])]
+    """
+    ans = []
+
+    with open(filename, "r", encoding="utf-8") as f:
+        whitespace = re.compile("[ \t]+")
+        for line in f:
+            a = whitespace.split(line.strip(" \t\r\n"))
+            if len(a) == 0:
+                continue
+
+            if len(a) < 2:
+                print(f"Found bad line {line} in lexicon file {filename}")
+                print("Every line is expected to contain at least 2 fields")
+                sys.exit(1)
+            word = a[0]
+            if word == "<eps>":
+                print(f"Found bad line {line} in lexicon file {filename}")
+                print("<eps> should not be a valid word")
+                sys.exit(1)
+
+            tokens = a[1:]
+            ans.append((word, tokens))
+
+    return ans
+
+
+def write_lexicon(filename: str, lexicon: List[Tuple[str, List[str]]]) -> None:
+    """Write a lexicon to a file.
+
+    Args:
+      filename:
+        Path to the lexicon file to be generated.
+      lexicon:
+        It can be the return value of :func:`read_lexicon`.
+    """
+    with open(filename, "w", encoding="utf-8") as f:
+        for word, tokens in lexicon:
+            f.write(f"{word} {' '.join(tokens)}\n")
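A tiny round-trip sketch for these two helpers, with toy data and an assumed-writable temp path:

from icefall.lexicon import read_lexicon, write_lexicon

lexicon = [("HELLO", ["▁HE", "LL", "O"]), ("WORLD", ["▁WORLD"])]
write_lexicon("/tmp/lexicon.txt", lexicon)
assert read_lexicon("/tmp/lexicon.txt") == lexicon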
 
 
 class Lexicon(object):
     """Phone based lexicon.
 
@@ -14,14 +67,14 @@ class Lexicon(object):
     """
 
     def __init__(
-        self, lang_dir: Path, disambig_pattern: str = re.compile(r"^#\d+$")
+        self, lang_dir: Path, disambig_pattern: str = re.compile(r"^#\d+$"),
     ):
         """
         Args:
           lang_dir:
             Path to the lang directory. It is expected to contain the following
             files:
-            - phones.txt
+            - tokens.txt
             - words.txt
             - L.pt
         The above files are produced by the script `prepare.sh`. You
@@ -30,11 +83,11 @@ class Lexicon(object):
             It contains the pattern for disambiguation symbols.
         """
         lang_dir = Path(lang_dir)
-        self.phones = k2.SymbolTable.from_file(lang_dir / "phones.txt")
-        self.words = k2.SymbolTable.from_file(lang_dir / "words.txt")
+        self.token_table = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
+        self.word_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
 
         if (lang_dir / "Linv.pt").exists():
-            logging.info("Loading pre-compiled Linv.pt")
+            logging.info(f"Loading pre-compiled {lang_dir}/Linv.pt")
             L_inv = k2.Fsa.from_dict(torch.load(lang_dir / "Linv.pt"))
         else:
             logging.info("Converting L.pt to Linv.pt")
@@ -49,18 +102,92 @@ class Lexicon(object):
 
     @property
     def tokens(self) -> List[int]:
-        """Return a list of phone IDs excluding those from
+        """Return a list of token IDs excluding those from
         disambiguation symbols.
 
         Caution:
-          0 is not a phone ID so it is excluded from the return value.
+          0 is not a token ID so it is excluded from the return value.
         """
-        symbols = self.phones.symbols
+        symbols = self.token_table.symbols
         ans = []
         for s in symbols:
             if not self.disambig_pattern.match(s):
-                ans.append(self.phones[s])
+                ans.append(self.token_table[s])
         if 0 in ans:
             ans.remove(0)
         ans.sort()
         return ans
 
 
+class BpeLexicon(Lexicon):
+    def __init__(
+        self, lang_dir: Path, disambig_pattern: str = re.compile(r"^#\d+$"),
+    ):
+        """
+        Refer to the help information in Lexicon.__init__.
+        """
+        super().__init__(lang_dir=lang_dir, disambig_pattern=disambig_pattern)
+
+        self.ragged_lexicon = self.convert_lexicon_to_ragged(
+            lang_dir / "lexicon.txt"
+        )
+
+    def convert_lexicon_to_ragged(self, filename: str) -> k2.RaggedInt:
+        """Read a BPE lexicon from file and convert it to a
+        k2 ragged tensor.
+
+        Args:
+          filename:
+            Filename of the BPE lexicon, e.g., data/lang/bpe/lexicon.txt
+        Returns:
+          A k2 ragged tensor with two axes [word][piece_id].
+        """
+        disambig_id = self.word_table["#0"]
+        # We reuse the same words.txt from the phone based lexicon
+        # so that we can share the same G.fst. Here, we have to
+        # exclude some words present only in the phone based lexicon.
+        excluded_words = ["<eps>", "!SIL", "<SPOKEN_NOISE>"]
+
+        # epsilon is not a word, but it occupies one position
+        #
+        row_splits = [0]
+        token_ids = []
+
+        lexicon = read_lexicon(filename)
+        lexicon = dict(lexicon)
+
+        for i in range(disambig_id):
+            w = self.word_table[i]
+            if w in excluded_words:
+                row_splits.append(row_splits[-1])
+                continue
+            pieces = lexicon[w]
+            piece_ids = [self.token_table[k] for k in pieces]
+
+            row_splits.append(row_splits[-1] + len(piece_ids))
+            token_ids.extend(piece_ids)
+
+        cached_tot_size = row_splits[-1]
+        row_splits = torch.tensor(row_splits, dtype=torch.int32)
+
+        shape = k2.ragged.create_ragged_shape2(
+            row_splits=row_splits, cached_tot_size=cached_tot_size
+        )
+        values = torch.tensor(token_ids, dtype=torch.int32)
+
+        return k2.RaggedInt(shape, values)
+
+    def words_to_piece_ids(self, words: List[str]) -> k2.RaggedInt:
+        """Convert a list of words to a ragged tensor containing
+        word piece IDs.
+        """
+        word_ids = [self.word_table[w] for w in words]
+        word_ids = torch.tensor(word_ids, dtype=torch.int32)
+
+        ragged, _ = k2.ragged.index(
+            self.ragged_lexicon,
+            indexes=word_ids,
+            need_value_indexes=False,
+            axis=0,
+        )
+        return ragged
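To visualize the ragged layout built above (toy numbers; word 0 is <eps>, which is excluded and therefore owns an empty row):

# word i owns token_ids[row_splits[i]:row_splits[i + 1]]
row_splits = [0, 0, 3, 4]  # <eps> -> [], word 1 -> three pieces, word 2 -> one piece
token_ids = [3, 4, 5, 7]

for i in range(len(row_splits) - 1):
    print(i, token_ids[row_splits[i]:row_splits[i + 1]])
# 0 []
# 1 [3, 4, 5]
# 2 [7]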
test/test_bpe_graph_compiler.py (new executable file, 25 lines):

@@ -0,0 +1,25 @@
#!/usr/bin/env python3

# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)

from icefall.bpe_graph_compiler import BpeCtcTrainingGraphCompiler
from icefall.lexicon import BpeLexicon
from pathlib import Path


def test():
    lang_dir = Path("data/lang/bpe")
    if not lang_dir.is_dir():
        return
    # TODO: generate data for testing

    compiler = BpeCtcTrainingGraphCompiler(lang_dir)
    ids = compiler.texts_to_ids(["HELLO", "WORLD ZZZ"])
    fsa = compiler.compile(ids)

    lexicon = BpeLexicon(lang_dir)
    ids0 = lexicon.words_to_piece_ids(["HELLO"])
    assert ids[0] == ids0.values().tolist()

    ids1 = lexicon.words_to_piece_ids(["WORLD", "ZZZ"])
    assert ids[1] == ids1.values().tolist()
test/test_checkpoint.py:

@@ -41,7 +41,8 @@ def test_load_checkpoints(checkpoints1):
     m.p2 = nn.Parameter(torch.Tensor([0, 0]))
     params = load_checkpoint(checkpoints1, m)
     assert torch.allclose(m.p1, torch.Tensor([10.0, 20]))
-    assert params == {"a": 10, "b": 20}
+    assert params["a"] == 10
+    assert params["b"] == 20
 
 
 def test_average_checkpoints(checkpoints1, checkpoints2):
test/test_graph_compiler.py:

@@ -81,8 +81,8 @@ def lexicon():
         """
     )
     ans = Lexicon.__new__(Lexicon)
-    ans.phones = L.labels_sym
-    ans.words = L.aux_labels_sym
+    ans.token_table = L.labels_sym
+    ans.word_table = L.aux_labels_sym
     ans.L_inv = k2.arc_sort(L.invert_())
     ans.disambig_pattern = re.compile(r"^#\d+$")
 
@@ -107,11 +107,11 @@ class TestCtcTrainingGraphCompiler(object):
         aux_labels1 = fsa[1].aux_labels[:-1]
         aux_labels1 = aux_labels1[aux_labels1 != 0].tolist()
 
-        labels0 = [lexicon.phones[i] for i in labels0]
-        labels1 = [lexicon.phones[i] for i in labels1]
+        labels0 = [lexicon.token_table[i] for i in labels0]
+        labels1 = [lexicon.token_table[i] for i in labels1]
 
-        aux_labels0 = [lexicon.words[i] for i in aux_labels0]
-        aux_labels1 = [lexicon.words[i] for i in aux_labels1]
+        aux_labels0 = [lexicon.word_table[i] for i in aux_labels0]
+        aux_labels1 = [lexicon.word_table[i] for i in aux_labels1]
 
         assert labels0 == ["b", "a", "r", "f", "o", "o"]
         assert aux_labels0 == ["bar", "foo"]
@@ -129,11 +129,11 @@ class TestCtcTrainingGraphCompiler(object):
         input2 = ["b", "b", "a", "a", "a", "<blk>", "<blk>", "z", "z"]
         input2 += ["<blk>", "<blk>", "SPN", "SPN", "<blk>", "<blk>"]
 
-        lexicon.phones._id2sym[0] == "<blk>"
-        lexicon.phones._sym2id["<blk>"] = 0
+        lexicon.token_table._id2sym[0] = "<blk>"
+        lexicon.token_table._sym2id["<blk>"] = 0
 
-        input1 = [lexicon.phones[i] for i in input1]
-        input2 = [lexicon.phones[i] for i in input2]
+        input1 = [lexicon.token_table[i] for i in input1]
+        input2 = [lexicon.token_table[i] for i in input2]
 
         fsa1 = k2.linear_fsa(input1)
         fsa2 = k2.linear_fsa(input2)
@@ -147,14 +147,14 @@ class TestCtcTrainingGraphCompiler(object):
 
         aux_labels0 = lattice[0].aux_labels[:-1]
         aux_labels0 = aux_labels0[aux_labels0 != 0].tolist()
-        aux_labels0 = [lexicon.words[i] for i in aux_labels0]
+        aux_labels0 = [lexicon.word_table[i] for i in aux_labels0]
         assert aux_labels0 == ["bar", "foo"]
 
         aux_labels1 = lattice[1].aux_labels[:-1]
         aux_labels1 = aux_labels1[aux_labels1 != 0].tolist()
-        aux_labels1 = [lexicon.words[i] for i in aux_labels1]
+        aux_labels1 = [lexicon.word_table[i] for i in aux_labels1]
         assert aux_labels1 == ["baz", "<UNK>"]
 
         texts = get_texts(lattice)
-        texts = [[lexicon.words[i] for i in words] for words in texts]
+        texts = [[lexicon.word_table[i] for i in words] for words in texts]
         assert texts == [["bar", "foo"], ["baz", "<UNK>"]]
test/test_lexicon.py:

@@ -1,10 +1,12 @@
 #!/usr/bin/env python3
 
+from pathlib import Path
+
 import k2
 import pytest
 import torch
 
-from icefall.lexicon import Lexicon
+from icefall.lexicon import BpeLexicon, Lexicon
 
 
 @pytest.fixture
@@ -47,7 +49,7 @@ def lang_dir(tmp_path):
         num_aux_labels=1,
     )
 
-    with open(tmp_path / "phones.txt", "w") as f:
+    with open(tmp_path / "tokens.txt", "w") as f:
         f.write(phone2id)
     with open(tmp_path / "words.txt", "w") as f:
         f.write(word2id)
@@ -60,3 +62,16 @@ def lang_dir(tmp_path):
 def test_lexicon(lang_dir):
     lexicon = Lexicon(lang_dir)
     assert lexicon.tokens == list(range(1, 8))
+
+
+def test_bpe_lexicon():
+    lang_dir = Path("data/lang/bpe")
+    if not lang_dir.is_dir():
+        return
+    # TODO: Generate test data for BpeLexicon
+
+    lexicon = BpeLexicon(lang_dir)
+    words = ["<UNK>", "HELLO", "ZZZZ", "WORLD"]
+    ids = lexicon.words_to_piece_ids(words)
+    print(ids)
+    print([lexicon.token_table[i] for i in ids.values().tolist()])