remove unused scripts and soft link common scripts

2022-05-13 10:11:37 -04:00 · 2022-05-13 10:11:37 -04:00 · d4a8648a0c
commit d4a8648a0c
parent 2381ba544d
10 changed files with 33 additions and 1837 deletions
--- a/egs/spgispeech/ASR/local/compile_hlg.py
+++ b/egs/spgispeech/ASR/local/compile_hlg.py
@ -1,152 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-"""
-This script takes as input lang_dir and generates HLG from
-    - H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt
-    - L, the lexicon, built from lang_dir/L_disambig.pt
-        Caution: We use a lexicon that contains disambiguation symbols
-    - G, the LM, built from data/lm/G_3_gram.fst.txt
-The generated HLG is saved in $lang_dir/HLG.pt
-"""
-import argparse
-import logging
-from pathlib import Path
-
-import k2
-import torch
-
-from icefall.lexicon import Lexicon
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--lang-dir",
-        type=str,
-        help="""Input and output directory.
-        """,
-    )
-
-    return parser.parse_args()
-
-
-def compile_HLG(lang_dir: str) -> k2.Fsa:
-    """
-    Args:
-      lang_dir:
-        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
-    Return:
-      An FSA representing HLG.
-    """
-    lexicon = Lexicon(lang_dir)
-    max_token_id = max(lexicon.tokens)
-    logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
-    H = k2.ctc_topo(max_token_id)
-    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
-
-    if Path("data/lm/G_3_gram.pt").is_file():
-        logging.info("Loading pre-compiled G_3_gram")
-        d = torch.load("data/lm/G_3_gram.pt")
-        G = k2.Fsa.from_dict(d)
-    else:
-        logging.info("Loading G_3_gram.fst.txt")
-        with open("data/lm/G_3_gram.fst.txt") as f:
-            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
-            torch.save(G.as_dict(), "data/lm/G_3_gram.pt")
-
-    first_token_disambig_id = lexicon.token_table["#0"]
-    first_word_disambig_id = lexicon.word_table["#0"]
-
-    L = k2.arc_sort(L)
-    G = k2.arc_sort(G)
-
-    logging.info("Intersecting L and G")
-    LG = k2.compose(L, G)
-    logging.info(f"LG shape: {LG.shape}")
-
-    logging.info("Connecting LG")
-    LG = k2.connect(LG)
-    logging.info(f"LG shape after k2.connect: {LG.shape}")
-
-    logging.info(type(LG.aux_labels))
-    logging.info("Determinizing LG")
-
-    LG = k2.determinize(LG)
-    logging.info(type(LG.aux_labels))
-
-    logging.info("Connecting LG after k2.determinize")
-    LG = k2.connect(LG)
-
-    logging.info("Removing disambiguation symbols on LG")
-
-    LG.labels[LG.labels >= first_token_disambig_id] = 0
-    # See https://github.com/k2-fsa/k2/issues/874
-    # for why we need to set LG.properties to None
-    LG.__dict__["_properties"] = None
-
-    assert isinstance(LG.aux_labels, k2.RaggedTensor)
-    LG.aux_labels.values[LG.aux_labels.values >= first_word_disambig_id] = 0
-
-    LG = k2.remove_epsilon(LG)
-    logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}")
-
-    LG = k2.connect(LG)
-    LG.aux_labels = LG.aux_labels.remove_values_eq(0)
-
-    logging.info("Arc sorting LG")
-    LG = k2.arc_sort(LG)
-
-    logging.info("Composing H and LG")
-    # CAUTION: The name of the inner_labels is fixed
-    # to `tokens`. If you want to change it, please
-    # also change other places in icefall that are using
-    # it.
-    HLG = k2.compose(H, LG, inner_labels="tokens")
-
-    logging.info("Connecting LG")
-    HLG = k2.connect(HLG)
-
-    logging.info("Arc sorting LG")
-    HLG = k2.arc_sort(HLG)
-    logging.info(f"HLG.shape: {HLG.shape}")
-
-    return HLG
-
-
-def main():
-    args = get_args()
-    lang_dir = Path(args.lang_dir)
-
-    if (lang_dir / "HLG.pt").is_file():
-        logging.info(f"{lang_dir}/HLG.pt already exists - skipping")
-        return
-
-    logging.info(f"Processing {lang_dir}")
-
-    HLG = compile_HLG(lang_dir)
-    logging.info(f"Saving HLG.pt to {lang_dir}")
-    torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt")
-
-
-if __name__ == "__main__":
-    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-
-    logging.basicConfig(format=formatter, level=logging.INFO)
-
-    main()
--- a/egs/spgispeech/ASR/local/compile_hlg.py
+++ b/egs/spgispeech/ASR/local/compile_hlg.py
@ -0,0 +1 @@
+../../../librispeech/ASR/local/compile_hlg.py
--- a/egs/spgispeech/ASR/local/convert_transcript_words_to_tokens.py
+++ b/egs/spgispeech/ASR/local/convert_transcript_words_to_tokens.py
@ -1,107 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
-"""
-Convert a transcript file containing words to a corpus file containing tokens
-for LM training with the help of a lexicon.
-
-If the lexicon contains phones, the resulting LM will be a phone LM; If the
-lexicon contains word pieces, the resulting LM will be a word piece LM.
-
-If a word has multiple pronunciations, the one that appears first in the lexicon
-is kept; others are removed.
-
-If the input transcript is:
-
-    hello zoo world hello
-    world zoo
-    foo zoo world hellO
-
-and if the lexicon is
-
-    <UNK> SPN
-    hello h e l l o 2
-    hello h e l l o
-    world w o r l d
-    zoo z o o
-
-Then the output is
-
-    h e l l o 2 z o o w o r l d h e l l o 2
-    w o r l d z o o
-    SPN z o o w o r l d SPN
-"""
-
-import argparse
-from pathlib import Path
-from typing import Dict, List
-
-from generate_unique_lexicon import filter_multiple_pronunications
-
-from icefall.lexicon import read_lexicon
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--transcript",
-        type=str,
-        help="The input transcript file."
-        "We assume that the transcript file consists of "
-        "lines. Each line consists of space separated words.",
-    )
-    parser.add_argument("--lexicon", type=str, help="The input lexicon file.")
-    parser.add_argument(
-        "--oov", type=str, default="<UNK>", help="The OOV word."
-    )
-
-    return parser.parse_args()
-
-
-def process_line(
-    lexicon: Dict[str, List[str]], line: str, oov_token: str
-) -> None:
-    """
-    Args:
-      lexicon:
-        A dict containing pronunciations. Its keys are words and values
-        are pronunciations (i.e., tokens).
-      line:
-        A line of transcript consisting of space(s) separated words.
-      oov_token:
-        The pronunciation of the oov word if a word in `line` is not present
-        in the lexicon.
-    Returns:
-      Return None.
-    """
-    s = ""
-    words = line.strip().split()
-    for i, w in enumerate(words):
-        tokens = lexicon.get(w, oov_token)
-        s += " ".join(tokens)
-        s += " "
-    print(s.strip())
-
-
-def main():
-    args = get_args()
-    assert Path(args.lexicon).is_file()
-    assert Path(args.transcript).is_file()
-    assert len(args.oov) > 0
-
-    # Only the first pronunciation of a word is kept
-    lexicon = filter_multiple_pronunications(read_lexicon(args.lexicon))
-
-    lexicon = dict(lexicon)
-
-    assert args.oov in lexicon
-
-    oov_token = lexicon[args.oov]
-
-    with open(args.transcript) as f:
-        for line in f:
-            process_line(lexicon=lexicon, line=line, oov_token=oov_token)
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/spgispeech/ASR/local/download_lm.py
+++ b/egs/spgispeech/ASR/local/download_lm.py
@ -1,97 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-"""
-This file downloads the following LibriSpeech LM files:
-
-    - 3-gram.pruned.1e-7.arpa.gz
-    - 4-gram.arpa.gz
-    - librispeech-vocab.txt
-    - librispeech-lexicon.txt
-
-from http://www.openslr.org/resources/11
-and save them in the user provided directory.
-
-Files are not re-downloaded if they already exist.
-
-Usage:
-    ./local/download_lm.py --out-dir ./download/lm
-"""
-
-import argparse
-import gzip
-import logging
-import os
-import shutil
-from pathlib import Path
-
-from lhotse.utils import urlretrieve_progress
-from tqdm.auto import tqdm
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--out-dir", type=str, help="Output directory.")
-
-    args = parser.parse_args()
-    return args
-
-
-def main(out_dir: str):
-    url = "http://www.openslr.org/resources/11"
-    out_dir = Path(out_dir)
-
-    files_to_download = (
-        "3-gram.pruned.1e-7.arpa.gz",
-        "4-gram.arpa.gz",
-        "librispeech-vocab.txt",
-        "librispeech-lexicon.txt",
-    )
-
-    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
-        filename = out_dir / f
-        if filename.is_file() is False:
-            urlretrieve_progress(
-                f"{url}/{f}",
-                filename=filename,
-                desc=f"Downloading {filename}",
-            )
-        else:
-            logging.info(f"{filename} already exists - skipping")
-
-        if ".gz" in str(filename):
-            unzipped = Path(os.path.splitext(filename)[0])
-            if unzipped.is_file() is False:
-                with gzip.open(filename, "rb") as f_in:
-                    with open(unzipped, "wb") as f_out:
-                        shutil.copyfileobj(f_in, f_out)
-            else:
-                logging.info(f"{unzipped} already exist - skipping")
-
-
-if __name__ == "__main__":
-    formatter = (
-        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-    )
-
-    logging.basicConfig(format=formatter, level=logging.INFO)
-
-    args = get_args()
-    logging.info(f"out_dir: {args.out_dir}")
-
-    main(out_dir=args.out_dir)
--- a/egs/spgispeech/ASR/local/generate_unique_lexicon.py
+++ b/egs/spgispeech/ASR/local/generate_unique_lexicon.py
@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-This file takes as input a lexicon.txt and output a new lexicon,
-in which each word has a unique pronunciation.
-
-The way to do this is to keep only the first pronunciation of a word
-in lexicon.txt.
-"""
-
-
-import argparse
-import logging
-from pathlib import Path
-from typing import List, Tuple
-
-from icefall.lexicon import read_lexicon, write_lexicon
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--lang-dir",
-        type=str,
-        help="""Input and output directory.
-        It should contain a file lexicon.txt.
-        This file will generate a new file uniq_lexicon.txt
-        in it.
-        """,
-    )
-
-    return parser.parse_args()
-
-
-def filter_multiple_pronunications(
-    lexicon: List[Tuple[str, List[str]]]
-) -> List[Tuple[str, List[str]]]:
-    """Remove multiple pronunciations of words from a lexicon.
-
-    If a word has more than one pronunciation in the lexicon, only
-    the first one is kept, while other pronunciations are removed
-    from the lexicon.
-
-    Args:
-      lexicon:
-        The input lexicon, containing a list of (word, [p1, p2, ..., pn]),
-        where "p1, p2, ..., pn" are the pronunciations of the "word".
-    Returns:
-      Return a new lexicon where each word has a unique pronunciation.
-    """
-    seen = set()
-    ans = []
-
-    for word, tokens in lexicon:
-        if word in seen:
-            continue
-        seen.add(word)
-        ans.append((word, tokens))
-    return ans
-
-
-def main():
-    args = get_args()
-    lang_dir = Path(args.lang_dir)
-
-    lexicon_filename = lang_dir / "lexicon.txt"
-
-    in_lexicon = read_lexicon(lexicon_filename)
-
-    out_lexicon = filter_multiple_pronunications(in_lexicon)
-
-    write_lexicon(lang_dir / "uniq_lexicon.txt", out_lexicon)
-
-    logging.info(f"Number of entries in lexicon.txt: {len(in_lexicon)}")
-    logging.info(f"Number of entries in uniq_lexicon.txt: {len(out_lexicon)}")
-
-
-if __name__ == "__main__":
-    formatter = (
-        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-    )
-
-    logging.basicConfig(format=formatter, level=logging.INFO)
-
-    main()
--- a/egs/spgispeech/ASR/local/prepare_lang.py
+++ b/egs/spgispeech/ASR/local/prepare_lang.py
@ -1,413 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-"""
-This script takes as input a lexicon file "data/lang_phone/lexicon.txt"
-consisting of words and tokens (i.e., phones) and does the following:
-
-1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt
-
-2. Generate tokens.txt, the token table mapping a token to a unique integer.
-
-3. Generate words.txt, the word table mapping a word to a unique integer.
-
-4. Generate L.pt, in k2 format. It can be loaded by
-
-        d = torch.load("L.pt")
-        lexicon = k2.Fsa.from_dict(d)
-
-5. Generate L_disambig.pt, in k2 format.
-"""
-import argparse
-import math
-from collections import defaultdict
-from pathlib import Path
-from typing import Any, Dict, List, Tuple
-
-import k2
-import torch
-
-from icefall.lexicon import read_lexicon, write_lexicon
-from icefall.utils import str2bool
-
-Lexicon = List[Tuple[str, List[str]]]
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--lang-dir",
-        type=str,
-        help="""Input and output directory.
-        It should contain a file lexicon.txt.
-        Generated files by this script are saved into this directory.
-        """,
-    )
-
-    parser.add_argument(
-        "--debug",
-        type=str2bool,
-        default=False,
-        help="""True for debugging, which will generate
-        a visualization of the lexicon FST.
-
-        Caution: If your lexicon contains hundreds of thousands
-        of lines, please set it to False!
-        """,
-    )
-
-    return parser.parse_args()
-
-
-def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
-    """Write a symbol to ID mapping to a file.
-
-    Note:
-      No need to implement `read_mapping` as it can be done
-      through :func:`k2.SymbolTable.from_file`.
-
-    Args:
-      filename:
-        Filename to save the mapping.
-      sym2id:
-        A dict mapping symbols to IDs.
-    Returns:
-      Return None.
-    """
-    with open(filename, "w", encoding="utf-8") as f:
-        for sym, i in sym2id.items():
-            f.write(f"{sym} {i}\n")
-
-
-def get_tokens(lexicon: Lexicon) -> List[str]:
-    """Get tokens from a lexicon.
-
-    Args:
-      lexicon:
-        It is the return value of :func:`read_lexicon`.
-    Returns:
-      Return a list of unique tokens.
-    """
-    ans = set()
-    for _, tokens in lexicon:
-        ans.update(tokens)
-    sorted_ans = sorted(list(ans))
-    return sorted_ans
-
-
-def get_words(lexicon: Lexicon) -> List[str]:
-    """Get words from a lexicon.
-
-    Args:
-      lexicon:
-        It is the return value of :func:`read_lexicon`.
-    Returns:
-      Return a list of unique words.
-    """
-    ans = set()
-    for word, _ in lexicon:
-        ans.add(word)
-    sorted_ans = sorted(list(ans))
-    return sorted_ans
-
-
-def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]:
-    """It adds pseudo-token disambiguation symbols #1, #2 and so on
-    at the ends of tokens to ensure that all pronunciations are different,
-    and that none is a prefix of another.
-
-    See also add_lex_disambig.pl from kaldi.
-
-    Args:
-      lexicon:
-        It is returned by :func:`read_lexicon`.
-    Returns:
-      Return a tuple with two elements:
-
-        - The output lexicon with disambiguation symbols
-        - The ID of the max disambiguation symbol that appears
-          in the lexicon
-    """
-
-    # (1) Work out the count of each token-sequence in the
-    # lexicon.
-    count = defaultdict(int)
-    for _, tokens in lexicon:
-        count[" ".join(tokens)] += 1
-
-    # (2) For each left sub-sequence of each token-sequence, note down
-    # that it exists (for identifying prefixes of longer strings).
-    issubseq = defaultdict(int)
-    for _, tokens in lexicon:
-        tokens = tokens.copy()
-        tokens.pop()
-        while tokens:
-            issubseq[" ".join(tokens)] = 1
-            tokens.pop()
-
-    # (3) For each entry in the lexicon:
-    # if the token sequence is unique and is not a
-    # prefix of another word, no disambig symbol.
-    # Else output #1, or #2, #3, ... if the same token-seq
-    # has already been assigned a disambig symbol.
-    ans = []
-
-    # We start with #1 since #0 has its own purpose
-    first_allowed_disambig = 1
-    max_disambig = first_allowed_disambig - 1
-    last_used_disambig_symbol_of = defaultdict(int)
-
-    for word, tokens in lexicon:
-        tokenseq = " ".join(tokens)
-        assert tokenseq != ""
-        if issubseq[tokenseq] == 0 and count[tokenseq] == 1:
-            ans.append((word, tokens))
-            continue
-
-        cur_disambig = last_used_disambig_symbol_of[tokenseq]
-        if cur_disambig == 0:
-            cur_disambig = first_allowed_disambig
-        else:
-            cur_disambig += 1
-
-        if cur_disambig > max_disambig:
-            max_disambig = cur_disambig
-        last_used_disambig_symbol_of[tokenseq] = cur_disambig
-        tokenseq += f" #{cur_disambig}"
-        ans.append((word, tokenseq.split()))
-    return ans, max_disambig
-
-
-def generate_id_map(symbols: List[str]) -> Dict[str, int]:
-    """Generate ID maps, i.e., map a symbol to a unique ID.
-
-    Args:
-      symbols:
-        A list of unique symbols.
-    Returns:
-      A dict containing the mapping between symbols and IDs.
-    """
-    return {sym: i for i, sym in enumerate(symbols)}
-
-
-def add_self_loops(
-    arcs: List[List[Any]], disambig_token: int, disambig_word: int
-) -> List[List[Any]]:
-    """Adds self-loops to states of an FST to propagate disambiguation symbols
-    through it. They are added on each state with non-epsilon output symbols
-    on at least one arc out of the state.
-
-    See also fstaddselfloops.pl from Kaldi. One difference is that
-    Kaldi uses OpenFst style FSTs and it has multiple final states.
-    This function uses k2 style FSTs and it does not need to add self-loops
-    to the final state.
-
-    The input label of a self-loop is `disambig_token`, while the output
-    label is `disambig_word`.
-
-    Args:
-      arcs:
-        A list-of-list. The sublist contains
-        `[src_state, dest_state, label, aux_label, score]`
-      disambig_token:
-        It is the token ID of the symbol `#0`.
-      disambig_word:
-        It is the word ID of the symbol `#0`.
-
-    Return:
-      Return new `arcs` containing self-loops.
-    """
-    states_needs_self_loops = set()
-    for arc in arcs:
-        src, dst, ilabel, olabel, score = arc
-        if olabel != 0:
-            states_needs_self_loops.add(src)
-
-    ans = []
-    for s in states_needs_self_loops:
-        ans.append([s, s, disambig_token, disambig_word, 0])
-
-    return arcs + ans
-
-
-def lexicon_to_fst(
-    lexicon: Lexicon,
-    token2id: Dict[str, int],
-    word2id: Dict[str, int],
-    sil_token: str = "SIL",
-    sil_prob: float = 0.5,
-    need_self_loops: bool = False,
-) -> k2.Fsa:
-    """Convert a lexicon to an FST (in k2 format) with optional silence at
-    the beginning and end of each word.
-
-    Args:
-      lexicon:
-        The input lexicon. See also :func:`read_lexicon`
-      token2id:
-        A dict mapping tokens to IDs.
-      word2id:
-        A dict mapping words to IDs.
-      sil_token:
-        The silence token.
-      sil_prob:
-        The probability for adding a silence at the beginning and end
-        of the word.
-      need_self_loops:
-        If True, add self-loop to states with non-epsilon output symbols
-        on at least one arc out of the state. The input label for this
-        self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
-    Returns:
-      Return an instance of `k2.Fsa` representing the given lexicon.
-    """
-    assert sil_prob > 0.0 and sil_prob < 1.0
-    # CAUTION: we use score, i.e, negative cost.
-    sil_score = math.log(sil_prob)
-    no_sil_score = math.log(1.0 - sil_prob)
-
-    start_state = 0
-    loop_state = 1  # words enter and leave from here
-    sil_state = 2  # words terminate here when followed by silence; this state
-    # has a silence transition to loop_state.
-    next_state = 3  # the next un-allocated state, will be incremented as we go.
-    arcs = []
-
-    assert token2id["<eps>"] == 0
-    assert word2id["<eps>"] == 0
-
-    eps = 0
-
-    sil_token = token2id[sil_token]
-
-    arcs.append([start_state, loop_state, eps, eps, no_sil_score])
-    arcs.append([start_state, sil_state, eps, eps, sil_score])
-    arcs.append([sil_state, loop_state, sil_token, eps, 0])
-
-    for word, tokens in lexicon:
-        assert len(tokens) > 0, f"{word} has no pronunciations"
-        cur_state = loop_state
-
-        word = word2id[word]
-        tokens = [token2id[i] for i in tokens]
-
-        for i in range(len(tokens) - 1):
-            w = word if i == 0 else eps
-            arcs.append([cur_state, next_state, tokens[i], w, 0])
-
-            cur_state = next_state
-            next_state += 1
-
-        # now for the last token of this word
-        # It has two out-going arcs, one to the loop state,
-        # the other one to the sil_state.
-        i = len(tokens) - 1
-        w = word if i == 0 else eps
-        arcs.append([cur_state, loop_state, tokens[i], w, no_sil_score])
-        arcs.append([cur_state, sil_state, tokens[i], w, sil_score])
-
-    if need_self_loops:
-        disambig_token = token2id["#0"]
-        disambig_word = word2id["#0"]
-        arcs = add_self_loops(
-            arcs,
-            disambig_token=disambig_token,
-            disambig_word=disambig_word,
-        )
-
-    final_state = next_state
-    arcs.append([loop_state, final_state, -1, -1, 0])
-    arcs.append([final_state])
-
-    arcs = sorted(arcs, key=lambda arc: arc[0])
-    arcs = [[str(i) for i in arc] for arc in arcs]
-    arcs = [" ".join(arc) for arc in arcs]
-    arcs = "\n".join(arcs)
-
-    fsa = k2.Fsa.from_str(arcs, acceptor=False)
-    return fsa
-
-
-def main():
-    args = get_args()
-    lang_dir = Path(args.lang_dir)
-    lexicon_filename = lang_dir / "lexicon.txt"
-    sil_token = "SIL"
-    sil_prob = 0.5
-
-    lexicon = read_lexicon(lexicon_filename)
-    tokens = get_tokens(lexicon)
-    words = get_words(lexicon)
-
-    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
-
-    for i in range(max_disambig + 1):
-        disambig = f"#{i}"
-        assert disambig not in tokens
-        tokens.append(f"#{i}")
-
-    assert "<eps>" not in tokens
-    tokens = ["<eps>"] + tokens
-
-    assert "<eps>" not in words
-    assert "#0" not in words
-    assert "<s>" not in words
-    assert "</s>" not in words
-
-    words = ["<eps>"] + words + ["#0", "<s>", "</s>"]
-
-    token2id = generate_id_map(tokens)
-    word2id = generate_id_map(words)
-
-    write_mapping(lang_dir / "tokens.txt", token2id)
-    write_mapping(lang_dir / "words.txt", word2id)
-    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
-
-    L = lexicon_to_fst(
-        lexicon,
-        token2id=token2id,
-        word2id=word2id,
-        sil_token=sil_token,
-        sil_prob=sil_prob,
-    )
-
-    L_disambig = lexicon_to_fst(
-        lexicon_disambig,
-        token2id=token2id,
-        word2id=word2id,
-        sil_token=sil_token,
-        sil_prob=sil_prob,
-        need_self_loops=True,
-    )
-    torch.save(L.as_dict(), lang_dir / "L.pt")
-    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
-
-    if args.debug:
-        labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
-        aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
-
-        L.labels_sym = labels_sym
-        L.aux_labels_sym = aux_labels_sym
-        L.draw(f"{lang_dir / 'L.svg'}", title="L.pt")
-
-        L_disambig.labels_sym = labels_sym
-        L_disambig.aux_labels_sym = aux_labels_sym
-        L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", title="L_disambig.pt")
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/spgispeech/ASR/local/prepare_lang.py
+++ b/egs/spgispeech/ASR/local/prepare_lang.py
@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang.py
--- a/egs/spgispeech/ASR/local/prepare_lang_bpe.py
+++ b/egs/spgispeech/ASR/local/prepare_lang_bpe.py
@ -1,246 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)
-
-"""
-This script takes as input `lang_dir`, which should contain::
-    - lang_dir/bpe.model,
-    - lang_dir/words.txt
-and generates the following files in the directory `lang_dir`:
-    - lexicon.txt
-    - lexicon_disambig.txt
-    - L.pt
-    - L_disambig.pt
-    - tokens.txt
-"""
-
-import argparse
-from pathlib import Path
-from typing import Dict, List, Tuple
-
-import k2
-import sentencepiece as spm
-import torch
-from prepare_lang_g2pen import (
-    Lexicon,
-    add_disambig_symbols,
-    add_self_loops,
-    write_lexicon,
-    write_mapping,
-)
-
-from icefall.utils import str2bool
-
-
-def lexicon_to_fst_no_sil(
-    lexicon: Lexicon,
-    token2id: Dict[str, int],
-    word2id: Dict[str, int],
-    need_self_loops: bool = False,
-) -> k2.Fsa:
-    """Convert a lexicon to an FST (in k2 format).
-    Args:
-      lexicon:
-        The input lexicon. See also :func:`read_lexicon`
-      token2id:
-        A dict mapping tokens to IDs.
-      word2id:
-        A dict mapping words to IDs.
-      need_self_loops:
-        If True, add self-loop to states with non-epsilon output symbols
-        on at least one arc out of the state. The input label for this
-        self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
-    Returns:
-      Return an instance of `k2.Fsa` representing the given lexicon.
-    """
-    loop_state = 0  # words enter and leave from here
-    next_state = 1  # the next un-allocated state, will be incremented as we go
-
-    arcs = []
-
-    # The blank symbol <blk> is defined in local/train_bpe_model.py
-    assert token2id["<blk>"] == 0
-    assert word2id["<eps>"] == 0
-
-    eps = 0
-
-    for word, pieces in lexicon:
-        assert len(pieces) > 0, f"{word} has no pronunciations"
-        cur_state = loop_state
-
-        word = word2id[word]
-        pieces = [token2id[i] for i in pieces]
-
-        for i in range(len(pieces) - 1):
-            w = word if i == 0 else eps
-            arcs.append([cur_state, next_state, pieces[i], w, 0])
-
-            cur_state = next_state
-            next_state += 1
-
-        # now for the last piece of this word
-        i = len(pieces) - 1
-        w = word if i == 0 else eps
-        arcs.append([cur_state, loop_state, pieces[i], w, 0])
-
-    if need_self_loops:
-        disambig_token = token2id["#0"]
-        disambig_word = word2id["#0"]
-        arcs = add_self_loops(
-            arcs,
-            disambig_token=disambig_token,
-            disambig_word=disambig_word,
-        )
-
-    final_state = next_state
-    arcs.append([loop_state, final_state, -1, -1, 0])
-    arcs.append([final_state])
-
-    arcs = sorted(arcs, key=lambda arc: arc[0])
-    arcs = [[str(i) for i in arc] for arc in arcs]
-    arcs = [" ".join(arc) for arc in arcs]
-    arcs = "\n".join(arcs)
-
-    fsa = k2.Fsa.from_str(arcs, acceptor=False)
-    return fsa
-
-
-def generate_lexicon(
-    model_file: str, words: List[str]
-) -> Tuple[Lexicon, Dict[str, int]]:
-    """Generate a lexicon from a BPE model.
-    Args:
-      model_file:
-        Path to a sentencepiece model.
-      words:
-        A list of strings representing words.
-    Returns:
-      Return a tuple with two elements:
-        - A dict whose keys are words and values are the corresponding
-          word pieces.
-        - A dict representing the token symbol, mapping from tokens to IDs.
-    """
-    sp = spm.SentencePieceProcessor()
-    sp.load(str(model_file))
-
-    words_pieces: List[List[str]] = sp.encode(words, out_type=str)
-
-    lexicon = []
-    for word, pieces in zip(words, words_pieces):
-        lexicon.append((word, pieces))
-
-    # The OOV word is <UNK>
-    lexicon.append(("[UNK]", [sp.id_to_piece(sp.unk_id())]))
-
-    token2id: Dict[str, int] = dict()
-    for i in range(sp.vocab_size()):
-        token2id[sp.id_to_piece(i)] = i
-
-    return lexicon, token2id
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--lang-dir",
-        type=str,
-        help="""Input and output directory.
-        It should contain the bpe.model and words.txt
-        """,
-    )
-
-    parser.add_argument(
-        "--debug",
-        type=str2bool,
-        default=False,
-        help="""True for debugging, which will generate
-        a visualization of the lexicon FST.
-        Caution: If your lexicon contains hundreds of thousands
-        of lines, please set it to False!
-        See "test/test_bpe_lexicon.py" for usage.
-        """,
-    )
-
-    return parser.parse_args()
-
-
-def main():
-    args = get_args()
-    lang_dir = Path(args.lang_dir)
-    model_file = lang_dir / "bpe.model"
-
-    word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
-
-    words = word_sym_table.symbols
-
-    excluded = ["<eps>", "!SIL", "<SPOKEN_NOISE>", "[UNK]", "#0", "<s>", "</s>"]
-    for w in excluded:
-        if w in words:
-            words.remove(w)
-
-    lexicon, token_sym_table = generate_lexicon(model_file, words)
-
-    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
-
-    next_token_id = max(token_sym_table.values()) + 1
-    for i in range(max_disambig + 1):
-        disambig = f"#{i}"
-        assert disambig not in token_sym_table
-        token_sym_table[disambig] = next_token_id
-        next_token_id += 1
-
-    word_sym_table.add("#0")
-    word_sym_table.add("<s>")
-    word_sym_table.add("</s>")
-
-    write_mapping(lang_dir / "tokens.txt", token_sym_table)
-
-    write_lexicon(lang_dir / "lexicon.txt", lexicon)
-    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
-
-    L = lexicon_to_fst_no_sil(
-        lexicon,
-        token2id=token_sym_table,
-        word2id=word_sym_table,
-    )
-
-    L_disambig = lexicon_to_fst_no_sil(
-        lexicon_disambig,
-        token2id=token_sym_table,
-        word2id=word_sym_table,
-        need_self_loops=True,
-    )
-    torch.save(L.as_dict(), lang_dir / "L.pt")
-    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
-
-    if args.debug:
-        labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
-        aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
-
-        L.labels_sym = labels_sym
-        L.aux_labels_sym = aux_labels_sym
-        L.draw(f"{lang_dir / 'L.svg'}", title="L.pt")
-
-        L_disambig.labels_sym = labels_sym
-        L_disambig.aux_labels_sym = aux_labels_sym
-        L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", title="L_disambig.pt")
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/spgispeech/ASR/local/prepare_lang_bpe.py
+++ b/egs/spgispeech/ASR/local/prepare_lang_bpe.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang_bpe.py
--- a/egs/spgispeech/ASR/local/prepare_lang_g2pen.py
+++ b/egs/spgispeech/ASR/local/prepare_lang_g2pen.py
@@ -1,473 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-"""
-This script takes as input a wors.txt file "data/lang_phone/words.txt"
-consisting of words and their IDs and creates a lexicon with g2p_en python package
-(it's CMUdict based). It also creates rest of the files typically expected in a lang 
-dir, including L.pt and Linv.pt.
-"""
-import argparse
-import math
-from collections import defaultdict
-from pathlib import Path
-from typing import Any, Dict, List, Tuple
-
-import k2
-import torch
-from g2p_en import G2p
-from tqdm import tqdm
-
-from icefall.lexicon import read_lexicon, write_lexicon
-from icefall.utils import str2bool
-
-Lexicon = List[Tuple[str, List[str]]]
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--lang-dir",
-        type=str,
-        help="""Input and output directory.
-        It should contain a file words.txt.
-        Generated files by this script are saved into this directory.
-        """,
-    )
-
-    parser.add_argument(
-        "--debug",
-        type=str2bool,
-        default=False,
-        help="""True for debugging, which will generate
-        a visualization of the lexicon FST.
-        Caution: If your lexicon contains hundreds of thousands
-        of lines, please set it to False!
-        """,
-    )
-
-    return parser.parse_args()
-
-
-def get_g2p_sym2int():
-
-    # These symbols are removed from from g2p_en's vocabulary
-    excluded_symbols = [
-        "<pad>",
-        "<s>",
-        "</s>",
-        "<unk>",
-    ]
-
-    symbols = [p for p in sorted(G2p().phonemes) if p not in excluded_symbols]
-    # reserve 0 and 1 for blank and sos/eos/pad tokens
-    # symbols start at index 2
-    sym2int = {
-        "<eps>": 0,
-        "SIL": 1,
-        "UNK": 2,
-        "LAUGHTER": 3,
-        "SIGH": 4,
-        "COUGH": 5,
-        "VOCALIZED-NOISE": 6,
-        "BREATH": 7,
-        "LIPSMACK": 8,
-        "SNEEZE": 9,
-        "NOISE": 10,
-        **{sym: idx for idx, sym in enumerate(symbols, start=11)},
-    }
-    return sym2int
-
-
-def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
-    """Write a symbol to ID mapping to a file.
-    Note:
-      No need to implement `read_mapping` as it can be done
-      through :func:`k2.SymbolTable.from_file`.
-    Args:
-      filename:
-        Filename to save the mapping.
-      sym2id:
-        A dict mapping symbols to IDs.
-    Returns:
-      Return None.
-    """
-    with open(filename, "w", encoding="utf-8") as f:
-        for sym, i in sym2id.items():
-            f.write(f"{sym} {i}\n")
-
-
-def get_tokens(lexicon: Lexicon) -> List[str]:
-    """Get tokens from a lexicon.
-    Args:
-      lexicon:
-        It is the return value of :func:`read_lexicon`.
-    Returns:
-      Return a list of unique tokens.
-    """
-    ans = set()
-    for _, tokens in lexicon:
-        ans.update(tokens)
-    sorted_ans = sorted(list(ans))
-    return sorted_ans
-
-
-def get_words(lexicon: Lexicon) -> List[str]:
-    """Get words from a lexicon.
-    Args:
-      lexicon:
-        It is the return value of :func:`read_lexicon`.
-    Returns:
-      Return a list of unique words.
-    """
-    ans = set()
-    for word, _ in lexicon:
-        ans.add(word)
-    sorted_ans = sorted(list(ans))
-    return sorted_ans
-
-
-def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]:
-    """It adds pseudo-token disambiguation symbols #1, #2 and so on
-    at the ends of tokens to ensure that all pronunciations are different,
-    and that none is a prefix of another.
-    See also add_lex_disambig.pl from kaldi.
-    Args:
-      lexicon:
-        It is returned by :func:`read_lexicon`.
-    Returns:
-      Return a tuple with two elements:
-        - The output lexicon with disambiguation symbols
-        - The ID of the max disambiguation symbol that appears
-          in the lexicon
-    """
-
-    # (1) Work out the count of each token-sequence in the
-    # lexicon.
-    count = defaultdict(int)
-    for _, tokens in lexicon:
-        count[" ".join(tokens)] += 1
-
-    # (2) For each left sub-sequence of each token-sequence, note down
-    # that it exists (for identifying prefixes of longer strings).
-    issubseq = defaultdict(int)
-    for _, tokens in lexicon:
-        tokens = tokens.copy()
-        tokens.pop()
-        while tokens:
-            issubseq[" ".join(tokens)] = 1
-            tokens.pop()
-
-    # (3) For each entry in the lexicon:
-    # if the token sequence is unique and is not a
-    # prefix of another word, no disambig symbol.
-    # Else output #1, or #2, #3, ... if the same token-seq
-    # has already been assigned a disambig symbol.
-    ans = []
-
-    # We start with #1 since #0 has its own purpose
-    first_allowed_disambig = 1
-    max_disambig = first_allowed_disambig - 1
-    last_used_disambig_symbol_of = defaultdict(int)
-
-    for word, tokens in lexicon:
-        tokenseq = " ".join(tokens)
-        assert tokenseq != ""
-        if issubseq[tokenseq] == 0 and count[tokenseq] == 1:
-            ans.append((word, tokens))
-            continue
-
-        cur_disambig = last_used_disambig_symbol_of[tokenseq]
-        if cur_disambig == 0:
-            cur_disambig = first_allowed_disambig
-        else:
-            cur_disambig += 1
-
-        if cur_disambig > max_disambig:
-            max_disambig = cur_disambig
-        last_used_disambig_symbol_of[tokenseq] = cur_disambig
-        tokenseq += f" #{cur_disambig}"
-        ans.append((word, tokenseq.split()))
-    return ans, max_disambig
-
-
-def generate_id_map(symbols: List[str]) -> Dict[str, int]:
-    """Generate ID maps, i.e., map a symbol to a unique ID.
-    Args:
-      symbols:
-        A list of unique symbols.
-    Returns:
-      A dict containing the mapping between symbols and IDs.
-    """
-    return {sym: i for i, sym in enumerate(symbols)}
-
-
-def add_self_loops(
-    arcs: List[List[Any]], disambig_token: int, disambig_word: int
-) -> List[List[Any]]:
-    """Adds self-loops to states of an FST to propagate disambiguation symbols
-    through it. They are added on each state with non-epsilon output symbols
-    on at least one arc out of the state.
-    See also fstaddselfloops.pl from Kaldi. One difference is that
-    Kaldi uses OpenFst style FSTs and it has multiple final states.
-    This function uses k2 style FSTs and it does not need to add self-loops
-    to the final state.
-    The input label of a self-loop is `disambig_token`, while the output
-    label is `disambig_word`.
-    Args:
-      arcs:
-        A list-of-list. The sublist contains
-        `[src_state, dest_state, label, aux_label, score]`
-      disambig_token:
-        It is the token ID of the symbol `#0`.
-      disambig_word:
-        It is the word ID of the symbol `#0`.
-    Return:
-      Return new `arcs` containing self-loops.
-    """
-    states_needs_self_loops = set()
-    for arc in arcs:
-        src, dst, ilabel, olabel, score = arc
-        if olabel != 0:
-            states_needs_self_loops.add(src)
-
-    ans = []
-    for s in states_needs_self_loops:
-        ans.append([s, s, disambig_token, disambig_word, 0])
-
-    return arcs + ans
-
-
-def lexicon_to_fst(
-    lexicon: Lexicon,
-    token2id: Dict[str, int],
-    word2id: Dict[str, int],
-    sil_token: str = "SIL",
-    sil_prob: float = 0.5,
-    need_self_loops: bool = False,
-) -> k2.Fsa:
-    """Convert a lexicon to an FST (in k2 format) with optional silence at
-    the beginning and end of each word.
-    Args:
-      lexicon:
-        The input lexicon. See also :func:`read_lexicon`
-      token2id:
-        A dict mapping tokens to IDs.
-      word2id:
-        A dict mapping words to IDs.
-      sil_token:
-        The silence token.
-      sil_prob:
-        The probability for adding a silence at the beginning and end
-        of the word.
-      need_self_loops:
-        If True, add self-loop to states with non-epsilon output symbols
-        on at least one arc out of the state. The input label for this
-        self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
-    Returns:
-      Return an instance of `k2.Fsa` representing the given lexicon.
-    """
-    assert sil_prob > 0.0 and sil_prob < 1.0
-    # CAUTION: we use score, i.e, negative cost.
-    sil_score = math.log(sil_prob)
-    no_sil_score = math.log(1.0 - sil_prob)
-
-    start_state = 0
-    loop_state = 1  # words enter and leave from here
-    sil_state = 2  # words terminate here when followed by silence; this state
-    # has a silence transition to loop_state.
-    next_state = 3  # the next un-allocated state, will be incremented as we go.
-    arcs = []
-
-    assert token2id["<eps>"] == 0
-    assert word2id["<eps>"] == 0
-
-    eps = 0
-
-    sil_token = token2id[sil_token]
-
-    arcs.append([start_state, loop_state, eps, eps, no_sil_score])
-    arcs.append([start_state, sil_state, eps, eps, sil_score])
-    arcs.append([sil_state, loop_state, sil_token, eps, 0])
-
-    for word, tokens in lexicon:
-        assert len(tokens) > 0, f"{word} has no pronunciations"
-        cur_state = loop_state
-
-        word = word2id[word]
-        tokens = [token2id[i] for i in tokens]
-
-        for i in range(len(tokens) - 1):
-            w = word if i == 0 else eps
-            arcs.append([cur_state, next_state, tokens[i], w, 0])
-
-            cur_state = next_state
-            next_state += 1
-
-        # now for the last token of this word
-        # It has two out-going arcs, one to the loop state,
-        # the other one to the sil_state.
-        i = len(tokens) - 1
-        w = word if i == 0 else eps
-        arcs.append([cur_state, loop_state, tokens[i], w, no_sil_score])
-        arcs.append([cur_state, sil_state, tokens[i], w, sil_score])
-
-    if need_self_loops:
-        disambig_token = token2id["#0"]
-        disambig_word = word2id["#0"]
-        arcs = add_self_loops(
-            arcs,
-            disambig_token=disambig_token,
-            disambig_word=disambig_word,
-        )
-
-    final_state = next_state
-    arcs.append([loop_state, final_state, -1, -1, 0])
-    arcs.append([final_state])
-
-    arcs = sorted(arcs, key=lambda arc: arc[0])
-    arcs = [[str(i) for i in arc] for arc in arcs]
-    arcs = [" ".join(arc) for arc in arcs]
-    arcs = "\n".join(arcs)
-
-    fsa = k2.Fsa.from_str(arcs, acceptor=False)
-    return fsa
-
-
-def main():
-    args = get_args()
-    lang_dir = Path(args.lang_dir)
-    vocab_filename = lang_dir / "words.txt"
-    lexicon_filename = lang_dir / "lexicon.txt"
-    sil_token = "SIL"
-    sil_prob = 0.5
-    special_symbols = [
-        "[UNK]",
-        "[BREATH]",
-        "[COUGH]",
-        "[LAUGHTER]",
-        "[LIPSMACK]",
-        "[NOISE]",
-        "[SIGH]",
-        "[SNEEZE]",
-        "[VOCALIZED-NOISE]",
-    ]
-
-    g2p = G2p()
-    token2id = get_g2p_sym2int()
-
-    vocab = sorted(
-        [
-            l.split()[0]
-            for l in vocab_filename.read_text().splitlines()
-            if l.strip() and not l.startswith(("!", "[", "<", "#"))
-        ]
-    )
-    print("First ten words from the vocabulary:")
-    print(vocab[:10])
-
-    if not lexicon_filename.is_file():
-        lexicon = [
-            ("!SIL", [sil_token]),
-        ]
-        for symbol in special_symbols:
-            lexicon.append((symbol, [symbol[1:-1]]))
-        lexicon += [
-            (
-                word,
-                [
-                    phn
-                    for phn in g2p(word)
-                    if phn
-                    not in (
-                        "'",
-                        " ",
-                        "-",
-                        ",",
-                    )  # g2p_en has these symbols as phones
-                ],
-            )
-            for word in tqdm(vocab, desc="Processing vocab with G2P")
-        ]
-        lexicon = [entry for entry in lexicon if entry[1]]  # filter empty prons
-        print(lexicon[:10])
-
-        write_lexicon(lexicon_filename, lexicon)
-    else:
-        lexicon = read_lexicon(lexicon_filename)
-
-    tokens = get_tokens(lexicon)
-
-    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
-
-    for i in range(max_disambig + 1):
-        disambig = f"#{i}"
-        assert disambig not in tokens
-        tokens.append(disambig)
-        token2id[disambig] = max(token2id.values()) + 1
-
-    print("Tokens in the lexicon:")
-    print(tokens)
-
-    # sort by ID
-    token2id = dict(sorted(token2id.items(), key=lambda tpl: tpl[1]))
-    print(token2id)
-    word2id = {"<eps>": 0}
-    word2id.update(
-        {word: int(id_) for id_, (word, pron) in enumerate(lexicon, start=1)}
-    )
-    for symbol in ["<s>", "</s>", "#0"]:
-        word2id[symbol] = len(word2id)
-
-    write_mapping(lang_dir / "tokens.txt", token2id)
-    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
-
-    L = lexicon_to_fst(
-        lexicon,
-        token2id=token2id,
-        word2id=word2id,
-        sil_token=sil_token,
-        sil_prob=sil_prob,
-    )
-
-    L_disambig = lexicon_to_fst(
-        lexicon_disambig,
-        token2id=token2id,
-        word2id=word2id,
-        sil_token=sil_token,
-        sil_prob=sil_prob,
-        need_self_loops=True,
-    )
-    torch.save(L.as_dict(), lang_dir / "L.pt")
-    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
-
-    if args.debug:
-        labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
-        aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
-
-        L.labels_sym = labels_sym
-        L.aux_labels_sym = aux_labels_sym
-        L.draw(f"{lang_dir / 'L.svg'}", title="L.pt")
-
-        L_disambig.labels_sym = labels_sym
-        L_disambig.aux_labels_sym = aux_labels_sym
-        L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", title="L_disambig.pt")
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/spgispeech/ASR/local/test_prepare_lang.py
+++ b/egs/spgispeech/ASR/local/test_prepare_lang.py
@@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)
-
-import os
-import tempfile
-
-import k2
-from prepare_lang import (
-    add_disambig_symbols,
-    generate_id_map,
-    get_phones,
-    get_words,
-    lexicon_to_fst,
-    read_lexicon,
-    write_lexicon,
-    write_mapping,
-)
-
-
-def generate_lexicon_file() -> str:
-    fd, filename = tempfile.mkstemp()
-    os.close(fd)
-    s = """
-    !SIL SIL
-    <SPOKEN_NOISE> SPN
-    <UNK> SPN
-    f f
-    a a
-    foo f o o
-    bar b a r
-    bark b a r k
-    food f o o d
-    food2 f o o d
-    fo  f o
-    """.strip()
-    with open(filename, "w") as f:
-        f.write(s)
-    return filename
-
-
-def test_read_lexicon(filename: str):
-    lexicon = read_lexicon(filename)
-    phones = get_phones(lexicon)
-    words = get_words(lexicon)
-    print(lexicon)
-    print(phones)
-    print(words)
-    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
-    print(lexicon_disambig)
-    print("max disambig:", f"#{max_disambig}")
-
-    phones = ["<eps>", "SIL", "SPN"] + phones
-    for i in range(max_disambig + 1):
-        phones.append(f"#{i}")
-    words = ["<eps>"] + words
-
-    phone2id = generate_id_map(phones)
-    word2id = generate_id_map(words)
-
-    print(phone2id)
-    print(word2id)
-
-    write_mapping("phones.txt", phone2id)
-    write_mapping("words.txt", word2id)
-
-    write_lexicon("a.txt", lexicon)
-    write_lexicon("a_disambig.txt", lexicon_disambig)
-
-    fsa = lexicon_to_fst(lexicon, phone2id=phone2id, word2id=word2id)
-    fsa.labels_sym = k2.SymbolTable.from_file("phones.txt")
-    fsa.aux_labels_sym = k2.SymbolTable.from_file("words.txt")
-    fsa.draw("L.pdf", title="L")
-
-    fsa_disambig = lexicon_to_fst(
-        lexicon_disambig, phone2id=phone2id, word2id=word2id
-    )
-    fsa_disambig.labels_sym = k2.SymbolTable.from_file("phones.txt")
-    fsa_disambig.aux_labels_sym = k2.SymbolTable.from_file("words.txt")
-    fsa_disambig.draw("L_disambig.pdf", title="L_disambig")
-
-
-def main():
-    filename = generate_lexicon_file()
-    test_read_lexicon(filename)
-    os.remove(filename)
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/spgispeech/ASR/local/train_bpe_model.py
+++ b/egs/spgispeech/ASR/local/train_bpe_model.py
@@ -1,98 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# You can install sentencepiece via:
-#
-#  pip install sentencepiece
-#
-# Due to an issue reported in
-# https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
-#
-# Please install a version >=0.1.96
-
-import argparse
-import shutil
-from pathlib import Path
-
-import sentencepiece as spm
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--lang-dir",
-        type=str,
-        help="""Input and output directory.
-        It should contain the training corpus: transcript_words.txt.
-        The generated bpe.model is saved to this directory.
-        """,
-    )
-
-    parser.add_argument(
-        "--transcript",
-        type=str,
-        help="Training transcript.",
-    )
-
-    parser.add_argument(
-        "--vocab-size",
-        type=int,
-        help="Vocabulary size for BPE training",
-    )
-
-    return parser.parse_args()
-
-
-def main():
-    args = get_args()
-    vocab_size = args.vocab_size
-    lang_dir = Path(args.lang_dir)
-
-    model_type = "unigram"
-
-    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
-    train_text = args.transcript
-    character_coverage = 1.0
-    input_sentence_size = 100000000
-
-    user_defined_symbols = ["<blk>", "<sos/eos>"]
-    unk_id = len(user_defined_symbols)
-    # Note: unk_id is fixed to 2.
-    # If you change it, you should also change other
-    # places that are using it.
-
-    model_file = Path(model_prefix + ".model")
-    if not model_file.is_file():
-        spm.SentencePieceTrainer.train(
-            input=train_text,
-            vocab_size=vocab_size,
-            model_type=model_type,
-            model_prefix=model_prefix,
-            input_sentence_size=input_sentence_size,
-            character_coverage=character_coverage,
-            user_defined_symbols=user_defined_symbols,
-            unk_id=unk_id,
-            bos_id=-1,
-            eos_id=-1,
-        )
-
-    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/spgispeech/ASR/local/train_bpe_model.py
+++ b/egs/spgispeech/ASR/local/train_bpe_model.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/train_bpe_model.py
--- a/egs/spgispeech/ASR/prepare.sh
+++ b/egs/spgispeech/ASR/prepare.sh
@@ -124,51 +124,35 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
 fi

 if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-  log "Stage 8: Prepare lexicon using g2p_en"
-  lang_dir=data/lang_phone
-  mkdir -p $lang_dir
-
-  # Add special words to words.txt
-  echo "<eps> 0" > $lang_dir/words.txt
-  echo "!SIL 1" >> $lang_dir/words.txt
-  echo "[UNK] 2" >> $lang_dir/words.txt
-
-  # Add regular words to words.txt
-  gunzip -c data/manifests/cuts_train_raw.jsonl.gz \
-    | jq '.supervisions[0].text' \
-    | sed 's:"::g' \
-    | sed 's: :\n:g' \
-    | sort \
-    | uniq \
-    | sed '/^$/d' \
-    | awk '{print $0,NR+2}' \
-    >> $lang_dir/words.txt
-
-  # Add remaining special word symbols expected by LM scripts.
-  num_words=$(cat $lang_dir/words.txt | wc -l)
-  echo "<s> ${num_words}" >> $lang_dir/words.txt
-  num_words=$(cat $lang_dir/words.txt | wc -l)
-  echo "</s> ${num_words}" >> $lang_dir/words.txt
-  num_words=$(cat $lang_dir/words.txt | wc -l)
-  echo "#0 ${num_words}" >> $lang_dir/words.txt
-
-  if [ ! -f $lang_dir/L_disambig.pt ]; then
-    # We use g2pen, which was trained on CMUdict and looks it up before
-    # resorting to an LSTM G2P model.
-    pip install g2p_en
-    ./local/prepare_lang_g2pen.py --lang-dir $lang_dir
-  fi
-fi
-
-if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
-  log "Stage 9: Prepare BPE based lang"
+  log "Stage 8: Prepare BPE based lang"

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}
    mkdir -p $lang_dir
-    # We reuse words.txt from phone based lexicon
-    # so that the two can share G.pt later.
-    cp data/lang_phone/words.txt $lang_dir
+
+    # Add special words to words.txt
+    echo "<eps> 0" > $lang_dir/words.txt
+    echo "!SIL 1" >> $lang_dir/words.txt
+    echo "[UNK] 2" >> $lang_dir/words.txt
+
+    # Add regular words to words.txt
+    gunzip -c data/manifests/cuts_train_raw.jsonl.gz \
+      | jq '.supervisions[0].text' \
+      | sed 's:"::g' \
+      | sed 's: :\n:g' \
+      | sort \
+      | uniq \
+      | sed '/^$/d' \
+      | awk '{print $0,NR+2}' \
+      >> $lang_dir/words.txt
+
+    # Add remaining special word symbols expected by LM scripts.
+    num_words=$(cat $lang_dir/words.txt | wc -l)
+    echo "<s> ${num_words}" >> $lang_dir/words.txt
+    num_words=$(cat $lang_dir/words.txt | wc -l)
+    echo "</s> ${num_words}" >> $lang_dir/words.txt
+    num_words=$(cat $lang_dir/words.txt | wc -l)
+    echo "#0 ${num_words}" >> $lang_dir/words.txt

    ./local/train_bpe_model.py \
      --lang-dir $lang_dir \
@@ -181,8 +165,8 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
  done
 fi

-if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
-  log "Stage 10: Train LM"
+if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
+  log "Stage 9: Train LM"
  lm_dir=data/lm

  if [ ! -f $lm_dir/G.arpa ]; then
@@ -201,8 +185,8 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
  fi
 fi

-if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
-  log "Stage 11: Compile HLG"
+if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
+  log "Stage 10: Compile HLG"
  ./local/compile_hlg.py --lang-dir data/lang_phone

  for vocab_size in ${vocab_sizes[@]}; do
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/compile_hlg.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/train_bpe_model.py