from local

2023-01-19 11:27:19 +09:00 · 2023-01-19 11:27:19 +09:00 · 37c77aef37
commit 37c77aef37
parent b8efe4ad77
851 changed files with 39677 additions and 0 deletions
--- a/egs/LJSpeech/ASR/add_alignments.sh
+++ b/egs/LJSpeech/ASR/add_alignments.sh
@ -0,0 +1,12 @@
 #!/usr/bin/env bash
 # Run local/add_alignment_librispeech.py with the three directories
 # configured below.
 set -euo pipefail
 # Passed as --alignments-dir.
 alignments_dir=data/alignment
 # Passed as --cuts-in-dir.
 cuts_in_dir=data/fbank
 # Passed as --cuts-out-dir.
 cuts_out_dir=data/fbank_ali
 python3 ./local/add_alignment_librispeech.py \
  --alignments-dir "${alignments_dir}" \
  --cuts-in-dir "${cuts_in_dir}" \
  --cuts-out-dir "${cuts_out_dir}"
--- a/egs/LJSpeech/ASR/distillation_with_hubert.sh
+++ b/egs/LJSpeech/ASR/distillation_with_hubert.sh
@ -0,0 +1,207 @@
 #!/usr/bin/env bash
 #
 # A short introduction about distillation framework.
 #
 # A typical traditional distillation method is
 # Loss(teacher embedding, student embedding).
 #
 # Compared to that, the proposed distillation framework consists of two main steps:
 # codebook indexes = quantizer.encode(teacher embedding)
 # Loss(codebook indexes, student embedding)
 #
 # Things worth mentioning:
 # 1. The float type teacher embedding is quantized into a sequence of
 #    8-bit integer codebook indexes.
 # 2. a middle layer 36(1-based) out of total 48 layers is used to extract
 #    teacher embeddings.
 # 3. a middle layer 6(1-based) out of total 6 layers is used to extract
 #    student embeddings.
 #
 # To directly download the extracted codebook indexes for model distillation, you can
 # set stage=2, stop_stage=4, use_extracted_codebook=True
 #
 # To start from scratch, you can
 # set stage=0, stop_stage=4, use_extracted_codebook=False
 # First and last stage to run.
 stage=0
 stop_stage=4
 # GPUs made available to this script; at least one GPU is required.
 # The environment variable "CUDA_VISIBLE_DEVICES" MUST be set even if you
 # only have ONE GPU: CodebookIndexExtractor needs it to determine the number
 # of jobs used to extract codebook indexes in parallel.
 # With a single GPU:
 # export CUDA_VISIBLE_DEVICES="0"
 #
 # With GPUs 0,1,2,3 available:
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 # Experiment directory shared by all stages.
 exp_dir=./pruned_transducer_stateless6/exp
 mkdir -p "${exp_dir}"
 # full_libri is either "True" or "False":
 #   "True"  -> use the full librispeech dataset for distillation
 #   "False" -> use the train-clean-100 subset for distillation
 full_libri=False
 # use_extracted_codebook is either "True" or "False":
 #   "True"  -> stage 0 and stage 1 are skipped and the already extracted
 #     codebook indexes are downloaded for distillation
 #   "False" -> start from scratch
 use_extracted_codebook=False
 # teacher_model_id is one of:
 #   "hubert_xtralarge_ll60k_finetune_ls960" -> fine-tuned model, the one currently used.
 #   "hubert_xtralarge_ll60k" -> pretrained model without fine-tuning
 teacher_model_id=hubert_xtralarge_ll60k_finetune_ls960
 log() {
  # Print the given message prefixed with a timestamp and the caller's
  # file name, line number and function name.
  # (This helper originates from espnet.)
  local caller_file="${BASH_SOURCE[1]}"
  caller_file="${caller_file##*/}"
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  echo -e "${stamp} (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ] && [ "$use_extracted_codebook" != "True" ]; then
  log "Stage 0: Download HuBERT model"
  # Preparation stage.
  # Install fairseq according to:
  # https://github.com/pytorch/fairseq
  # when testing this code:
  # commit 806855bf660ea748ed7ffb42fe8dcc881ca3aca0 is used.
  #
  # importlib.util is a submodule and has to be imported explicitly; a bare
  # "import importlib" does not guarantee that it is loaded.
  has_fairseq=$(python3 -c "import importlib.util; print(importlib.util.find_spec('fairseq') is not None)")
  # Anything other than 'True' (including an empty result when the probe
  # itself failed) is treated as "not installed".
  if [ "$has_fairseq" != 'True' ]; then
    log "Please install fairseq before running following stages"
    exit 1
  fi
  # Install quantization toolkit:
  # pip install git+https://github.com/k2-fsa/multi_quantization.git
  # or
  # pip install multi_quantization
  has_quantization=$(python3 -c "import importlib.util; print(importlib.util.find_spec('multi_quantization') is not None)")
  if [ "$has_quantization" != 'True' ]; then
    log "Please install multi_quantization before running following stages"
    exit 1
  fi
  log "Download HuBERT model."
  # Parameters about model.
  hubert_model_dir=${exp_dir}/hubert_models
  hubert_model=${hubert_model_dir}/${teacher_model_id}.pt
  mkdir -p "${hubert_model_dir}"
  # For more models refer to: https://github.com/pytorch/fairseq/tree/main/examples/hubert
  if [ -f "${hubert_model}" ]; then
    log "HuBERT model alread exists."
  else
    # -c resumes a partially downloaded file; -P selects the target directory.
    wget -c "https://dl.fbaipublicfiles.com/hubert/${teacher_model_id}.pt" -P "${hubert_model_dir}"
    # The original passed a stray "wget" word here, which wget treated as a
    # second URL to fetch.
    wget -c https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt -P "${hubert_model_dir}"
  fi
 fi
 # All following stages read features from ./data/fbank.
 if [ ! -d ./data/fbank ]; then
  log "This script assumes ./data/fbank is already generated by prepare.sh"
  exit 1
 fi
 if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ] && [ "$use_extracted_codebook" != "True" ]; then
  log "Stage 1: Verify that the downloaded HuBERT model is correct."
  # This stage is not directly used by codebook indexes extraction.
  # It is a way to "prove" that inference with the downloaded HuBERT model
  # is done correctly: the WERs should look normal.
  # Expected WERs:
  # [test-clean-ctc_greedy_search] %WER 2.04% [1075 / 52576, 92 ins, 104 del, 879 sub ]
  # [test-other-ctc_greedy_search] %WER 3.71% [1942 / 52343, 152 ins, 126 del, 1664 sub ]
  ./pruned_transducer_stateless6/hubert_decode.py --exp-dir "$exp_dir"
 fi
 if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  # Stage 2: obtain the codebook indexes, either by downloading the published
  # ones (use_extracted_codebook=True) or by extracting them from scratch.
  #
  # Analysis of disk usage:
  # With num_codebooks==8, each teacher embedding is quantized into
  # a sequence of eight 8-bit integers, i.e. only eight bytes are needed.
  # Training dataset including clean-100h with speed perturb 0.9 and 1.1 has 300 hours.
  # The output frame rate of HuBERT is 50 per second.
  # Theoretically, 412M = 300 * 3600 * 50 * 8 / 1024 / 1024 is needed.
  # The actual size of all "*.h5" files storing codebook indexes is 450M.
  # I think the extra "38M" usage is some meta information.
  # Time consumption analysis:
  # For quantizer training data (teacher embedding) extraction, only 1000 utts from clean-100 are used.
  # Together with quantizer training, no more than 20 minutes will be used.
  #
  # For codebook indexes extraction,
  # with two NVIDIA A100 GPUs, around three hours are needed to process 300 hours of training data,
  # i.e. clean-100 with speed perturb 0.9 and 1.1.
  # GPU usage:
  # During extraction of the quantizer's training data (teacher embedding) and its training,
  # only the first ONE GPU is used.
  # During codebook indexes extraction, ALL GPUs set by CUDA_VISIBLE_DEVICES are used.
  if [ "$use_extracted_codebook" == "True" ]; then
    if [ "$teacher_model_id" != "hubert_xtralarge_ll60k_finetune_ls960" ]; then
      log "Currently we only uploaded codebook indexes from teacher model hubert_xtralarge_ll60k_finetune_ls960"
      exit 1
    fi
    mkdir -p "$exp_dir/vq"
    codebook_dir=$exp_dir/vq/$teacher_model_id
    # Create the directory the variable points to; the original omitted the
    # "$" and created a literal directory named "codebook_dir" instead.
    mkdir -p "$codebook_dir"
    codebook_download_dir=$exp_dir/download_codebook
    if [ -d "$codebook_download_dir" ]; then
      log "$codebook_download_dir exists, you should remove it first."
      exit 1
    fi
    log "Downloading extracted codebook indexes to $codebook_download_dir"
    # Make sure you have git-lfs installed (https://git-lfs.github.com)
    git lfs install
    git clone https://huggingface.co/Zengwei/pruned_transducer_stateless6_hubert_xtralarge_ll60k_finetune_ls960 "$codebook_download_dir"
    # Manifests go to data/vq_fbank, the "*.h5" files to $codebook_dir/splits4.
    mkdir -p data/vq_fbank
    mv "$codebook_download_dir"/*.jsonl.gz data/vq_fbank/
    mkdir -p "$codebook_dir/splits4"
    mv "$codebook_download_dir"/*.h5 "$codebook_dir/splits4/"
    log "Remove $codebook_download_dir"
    # ":?" aborts instead of running "rm -rf" on an empty path.
    rm -rf -- "${codebook_download_dir:?}"
  fi
  # The extraction script is invoked in both modes and is told which one
  # applies via --use-extracted-codebook.
  ./pruned_transducer_stateless6/extract_codebook_index.py \
    --full-libri "$full_libri" \
    --exp-dir "$exp_dir" \
    --embedding-layer 36 \
    --num-utts 1000 \
    --num-codebooks 8 \
    --max-duration 100 \
    --teacher-model-id "$teacher_model_id" \
    --use-extracted-codebook "$use_extracted_codebook"
 fi
 if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  # Example training script.
  # Note: it's better to set spec-aug-time-warp-factor=-1
  #
  # WORLD_SIZE is the number of comma-separated entries in
  # CUDA_VISIBLE_DEVICES.
  WORLD_SIZE=$(echo "${CUDA_VISIBLE_DEVICES}" | awk '{n=split($1, _, ","); print n}')
  ./pruned_transducer_stateless6/train.py \
    --manifest-dir ./data/vq_fbank \
    --master-port 12359 \
    --full-libri "$full_libri" \
    --spec-aug-time-warp-factor -1 \
    --max-duration 300 \
    --world-size "${WORLD_SIZE}" \
    --num-epochs 20 \
    --exp-dir "$exp_dir" \
    --enable-distillation True
 fi
 if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  # Decode with the model trained in the previous stage.
  # Results should be similar to:
  # errs-test-clean-beam_size_4-epoch-20-avg-10-beam-4.txt:%WER = 5.67
  # errs-test-other-beam_size_4-epoch-20-avg-10-beam-4.txt:%WER = 15.60
  ./pruned_transducer_stateless6/decode.py \
    --decoding-method "modified_beam_search" \
    --epoch 20 \
    --avg 10 \
    --max-duration 200 \
    --exp-dir "$exp_dir" \
    --enable-distillation True
 fi
--- a/egs/LJSpeech/ASR/generate-lm.sh
+++ b/egs/LJSpeech/ASR/generate-lm.sh
@ -0,0 +1,20 @@
 #!/usr/bin/env bash
 # For each n-gram order below, build $lang_dir/<n>gram.arpa with
 # ./shared/make_kn_lm.py and convert it to $lang_dir/<n>gram.fst.txt with
 # kaldilm. Outputs that already exist are not regenerated.
 lang_dir=data/lang_bpe_500
 # Becomes 1 as soon as any conversion fails; used as the exit status.
 status=0
 for ngram in 2 3 5; do
  arpa="$lang_dir/${ngram}gram.arpa"
  fst_txt="$lang_dir/${ngram}gram.fst.txt"
  if [ ! -f "$arpa" ]; then
    ./shared/make_kn_lm.py \
      -ngram-order "${ngram}" \
      -text "$lang_dir/transcript_tokens.txt" \
      -lm "$arpa"
  fi
  if [ ! -f "$fst_txt" ]; then
    # Write to a temporary file first: redirecting straight to the final
    # name would leave an empty/partial file behind when kaldilm fails, and
    # the existence check above would then skip it on every later run.
    if python3 -m kaldilm \
      --read-symbol-table="$lang_dir/tokens.txt" \
      --disambig-symbol='#0' \
      --max-order="${ngram}" \
      "$arpa" > "$fst_txt.tmp"; then
      mv -- "$fst_txt.tmp" "$fst_txt"
    else
      rm -f -- "$fst_txt.tmp"
      echo "$0: failed to convert $arpa to $fst_txt" >&2
      status=1
    fi
  fi
 done
 exit "$status"
--- a/egs/LJSpeech/ASR/local/init.py
+++ b/egs/LJSpeech/ASR/local/init.py
--- a/egs/LJSpeech/ASR/local/add_alignment_librispeech.py
+++ b/egs/LJSpeech/ASR/local/add_alignment_librispeech.py
@ -0,0 +1,190 @@
 #!/usr/bin/env python3
 # Copyright    2022  Xiaomi Corp.        (authors: Zengwei Yao)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file adds alignments from https://github.com/CorentinJ/librispeech-alignments  # noqa
 to the existing fbank features dir (e.g., data/fbank)
 and save cuts to a new dir (e.g., data/fbank_ali).
 """
 import argparse
 import logging
 import zipfile
 from pathlib import Path
 from typing import List
 from lhotse import CutSet, load_manifest_lazy
 from lhotse.recipes.librispeech import parse_alignments
 from lhotse.utils import is_module_available
 # Default URL used by download_alignments() to fetch the alignment archive.
 LIBRISPEECH_ALIGNMENTS_URL = (
    "https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
 )
 # Default list of dataset parts processed by add_alignment().
 DATASET_PARTS = [
    "dev-clean",
    "dev-other",
    "test-clean",
    "test-other",
    "train-clean-100",
    "train-clean-360",
    "train-other-500",
 ]
 def get_parser():
    """Build the command-line parser of this script.

    Returns:
      An ``argparse.ArgumentParser`` that accepts ``--alignments-dir``,
      ``--cuts-in-dir`` and ``--cuts-out-dir``; defaults are shown in the
      generated help text.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    # (flag, default, help) of every option; all options are plain strings.
    options = (
        ("--alignments-dir", "data/alignment", "The dir to save alignments."),
        (
            "--cuts-in-dir",
            "data/fbank",
            "The dir of the existing cuts without alignments.",
        ),
        (
            "--cuts-out-dir",
            "data/fbank_ali",
            "The dir to save the new cuts with alignments",
        ),
    )
    for flag, default, help_text in options:
        parser.add_argument(flag, type=str, default=default, help=help_text)
    return parser
 def download_alignments(
    target_dir: str, alignments_url: str = LIBRISPEECH_ALIGNMENTS_URL
 ):
    """
    Fetch the alignment archive (unless already present) and unpack it.

    A marker file ``.ali_completed`` inside ``target_dir`` records a finished
    extraction; when it exists the function returns immediately.

    Note: If you can not access drive.google.com, you could download the file
    `LibriSpeech-Alignments.zip` from huggingface:
    https://huggingface.co/Zengwei/librispeech-alignments
    and extract the zip file manually.

    Args:
      target_dir:
        The dir to save alignments. It is created if missing.
      alignments_url:
        The URL of alignments.
    """
    # Modified from https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/librispeech.py  # noqa
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    done_marker = target_dir / ".ali_completed"
    if done_marker.is_file():
        logging.info("The alignment files already exist.")
        return

    archive = target_dir / "LibriSpeech-Alignments.zip"
    if not archive.is_file():
        # gdown is only needed when the archive has to be downloaded.
        assert is_module_available(
            "gdown"
        ), 'To download LibriSpeech alignments, please install "pip install gdown"'  # noqa
        import gdown

        gdown.download(alignments_url, output=str(archive))

    with zipfile.ZipFile(str(archive)) as zf:
        zf.extractall(path=target_dir)
        # Reached only when extraction did not raise.
        done_marker.touch()
 def add_alignment(
    alignments_dir: str,
    cuts_in_dir: str = "data/fbank",
    cuts_out_dir: str = "data/fbank_ali",
    dataset_parts: List[str] = DATASET_PARTS,
 ):
    """
    Attach word alignments to the supervisions of existing cuts.

    For every part, ``librispeech_cuts_{part}.jsonl.gz`` is read from
    ``cuts_in_dir`` and written, with alignments, under the same name to
    ``cuts_out_dir``. Parts whose input is missing or whose output already
    exists are skipped.

    Args:
      alignments_dir:
        The dir of the alignments.
      cuts_in_dir:
        The dir of the existing cuts.
      cuts_out_dir:
        The dir to save the new cuts with alignments.
      dataset_parts:
        Librispeech parts to add alignments.
    """
    ali_root = Path(alignments_dir) / "LibriSpeech"
    in_dir = Path(cuts_in_dir)
    out_dir = Path(cuts_out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for part in dataset_parts:
        logging.info(f"Processing {part}")
        manifest_name = f"librispeech_cuts_{part}.jsonl.gz"

        src_path = in_dir / manifest_name
        if not src_path.is_file():
            logging.info(f"{src_path} does not exist - skipping.")
            continue

        dst_path = out_dir / manifest_name
        if dst_path.is_file():
            logging.info(f"{part} already exists - skipping.")
            continue

        # Gather the alignments of all "*.alignment.txt" files of this part.
        alignments = {}
        for ali_file in (ali_root / part).rglob("*.alignment.txt"):
            alignments.update(parse_alignments(ali_file))
        logging.info(f"{part} has {len(alignments)} cuts with alignments.")

        # Set the alignment attribute on each supervision and write out.
        cuts_in = load_manifest_lazy(src_path)
        with CutSet.open_writer(dst_path) as writer:
            for cut in cuts_in:
                for sup in cut.supervisions:
                    # The lookup key is the part of the id before the first "_".
                    utt_id = sup.id.split("_")[0]
                    if utt_id not in alignments:
                        logging.info(f"Warning: {utt_id} does not have alignment.")
                    sup.alignment = {"word": alignments.get(utt_id, [])}
                writer.write(cut, flush=True)
 def main():
    """Parse the command line, fetch the alignments and add them to the cuts."""
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    args = get_parser().parse_args()
    logging.info(vars(args))
    alignments_dir = args.alignments_dir
    download_alignments(alignments_dir)
    add_alignment(alignments_dir, args.cuts_in_dir, args.cuts_out_dir)
 if __name__ == "__main__":
    main()
--- a/egs/LJSpeech/ASR/local/compile_hlg.py
+++ b/egs/LJSpeech/ASR/local/compile_hlg.py
@ -0,0 +1,166 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input lang_dir and generates HLG from
    - H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt
    - L, the lexicon, built from lang_dir/L_disambig.pt
        Caution: We use a lexicon that contains disambiguation symbols
    - G, the LM, built from data/lm/G_3_gram.fst.txt
 The generated HLG is saved in $lang_dir/HLG.pt
 """
 import argparse
 import logging
 from pathlib import Path
 import k2
 import torch
 from icefall.lexicon import Lexicon
 def get_args():
    """Parse ``--lm`` and ``--lang-dir`` from the command line.

    Returns:
      The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    arg_parser = argparse.ArgumentParser()
    # Stem of the LM file name; defaults to "G_3_gram".
    arg_parser.add_argument(
        "--lm",
        default="G_3_gram",
        type=str,
        help="""Stem name for LM used in HLG compiling.
        """,
    )
    # No default: the attribute is None when the option is omitted.
    arg_parser.add_argument(
        "--lang-dir",
        type=str,
        help="""Input and output directory.
        """,
    )
    parsed = arg_parser.parse_args()
    return parsed
 def compile_HLG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
    """Build the HLG graph from the CTC topology H, the lexicon L and the LM G.

    G is loaded from ``data/lm/{lm}.pt`` when that file exists; otherwise it
    is parsed from ``data/lm/{lm}.fst.txt`` and the parsed result is saved to
    ``data/lm/{lm}.pt`` for later runs.

    Args:
      lang_dir:
        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
        ``L_disambig.pt`` is read from it.
      lm:
        The language stem base name.
    Return:
      An FSA representing HLG.
    """
    lexicon = Lexicon(lang_dir)
    max_token_id = max(lexicon.tokens)
    logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
    H = k2.ctc_topo(max_token_id)
    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
    if Path(f"data/lm/{lm}.pt").is_file():
        logging.info(f"Loading pre-compiled {lm}")
        d = torch.load(f"data/lm/{lm}.pt")
        G = k2.Fsa.from_dict(d)
    else:
        logging.info(f"Loading {lm}.fst.txt")
        with open(f"data/lm/{lm}.fst.txt") as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
            # Cache the parsed LM so that the next run takes the branch above.
            torch.save(G.as_dict(), f"data/lm/{lm}.pt")
    # IDs of the "#0" symbol in the token table and in the word table; labels
    # greater than or equal to them are zeroed out below.
    first_token_disambig_id = lexicon.token_table["#0"]
    first_word_disambig_id = lexicon.word_table["#0"]
    L = k2.arc_sort(L)
    G = k2.arc_sort(G)
    logging.info("Intersecting L and G")
    LG = k2.compose(L, G)
    logging.info(f"LG shape: {LG.shape}")
    logging.info("Connecting LG")
    LG = k2.connect(LG)
    logging.info(f"LG shape after k2.connect: {LG.shape}")
    logging.info(type(LG.aux_labels))
    logging.info("Determinizing LG")
    LG = k2.determinize(LG)
    logging.info(type(LG.aux_labels))
    logging.info("Connecting LG after k2.determinize")
    LG = k2.connect(LG)
    logging.info("Removing disambiguation symbols on LG")
    LG.labels[LG.labels >= first_token_disambig_id] = 0
    # See https://github.com/k2-fsa/k2/issues/874
    # for why we need to set LG.properties to None
    LG.__dict__["_properties"] = None
    assert isinstance(LG.aux_labels, k2.RaggedTensor)
    LG.aux_labels.values[LG.aux_labels.values >= first_word_disambig_id] = 0
    LG = k2.remove_epsilon(LG)
    logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}")
    LG = k2.connect(LG)
    # Drop the zeros written into aux_labels above.
    LG.aux_labels = LG.aux_labels.remove_values_eq(0)
    logging.info("Arc sorting LG")
    LG = k2.arc_sort(LG)
    logging.info("Composing H and LG")
    # CAUTION: The name of the inner_labels is fixed
    # to `tokens`. If you want to change it, please
    # also change other places in icefall that are using
    # it.
    HLG = k2.compose(H, LG, inner_labels="tokens")
    # The two messages below used to say "LG" although HLG is being processed.
    logging.info("Connecting HLG")
    HLG = k2.connect(HLG)
    logging.info("Arc sorting HLG")
    HLG = k2.arc_sort(HLG)
    logging.info(f"HLG.shape: {HLG.shape}")
    return HLG
 def main():
    """Compile HLG for ``--lang-dir`` and save it as ``HLG.pt`` in that directory.

    Nothing is done when ``HLG.pt`` already exists there.
    """
    args = get_args()
    lang_dir = Path(args.lang_dir)
    already_built = (lang_dir / "HLG.pt").is_file()
    if already_built:
        logging.info(f"{lang_dir}/HLG.pt already exists - skipping")
        return
    logging.info(f"Processing {lang_dir}")
    graph = compile_HLG(lang_dir, args.lm)
    logging.info(f"Saving HLG.pt to {lang_dir}")
    torch.save(graph.as_dict(), f"{lang_dir}/HLG.pt")
 if __name__ == "__main__":
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    main()
--- a/egs/LJSpeech/ASR/local/compile_lg.py
+++ b/egs/LJSpeech/ASR/local/compile_lg.py
@ -0,0 +1,139 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang, Wei Kang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input lang_dir and generates LG from
    - L, the lexicon, built from lang_dir/L_disambig.pt
        Caution: We use a lexicon that contains disambiguation symbols
    - G, the LM, built from data/lm/G_3_gram.fst.txt
 The generated LG is saved in $lang_dir/LG.pt
 """
 import argparse
 import logging
 from pathlib import Path
 import k2
 import torch
 from icefall.lexicon import Lexicon
 def get_args():
    """Parse ``--lang-dir`` from the command line.

    Returns:
      The ``argparse.Namespace`` produced by ``parse_args()``; ``lang_dir``
      is None when the option is omitted.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--lang-dir",
        help="""Input and output directory.
        """,
        type=str,
    )
    parsed = arg_parser.parse_args()
    return parsed
 def compile_LG(lang_dir: str) -> k2.Fsa:
    """Build the LG graph from the lexicon L and the 3-gram LM G.

    G is loaded from ``data/lm/G_3_gram.pt`` when that file exists; otherwise
    it is parsed from ``data/lm/G_3_gram.fst.txt`` and the parsed result is
    saved to ``data/lm/G_3_gram.pt``.

    Args:
      lang_dir:
        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
        ``L_disambig.pt`` is read from it.
    Return:
      An FSA representing LG.
    """
    lexicon = Lexicon(lang_dir)
    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
    if Path("data/lm/G_3_gram.pt").is_file():
        logging.info("Loading pre-compiled G_3_gram")
        d = torch.load("data/lm/G_3_gram.pt")
        G = k2.Fsa.from_dict(d)
    else:
        logging.info("Loading G_3_gram.fst.txt")
        with open("data/lm/G_3_gram.fst.txt") as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
            # Cache the parsed LM so that the next run takes the branch above.
            torch.save(G.as_dict(), "data/lm/G_3_gram.pt")
    # IDs of the "#0" symbol in the token table and in the word table; labels
    # greater than or equal to them are zeroed out below.
    first_token_disambig_id = lexicon.token_table["#0"]
    first_word_disambig_id = lexicon.word_table["#0"]
    L = k2.arc_sort(L)
    G = k2.arc_sort(G)
    logging.info("Intersecting L and G")
    LG = k2.compose(L, G)
    logging.info(f"LG shape: {LG.shape}")
    logging.info("Connecting LG")
    LG = k2.connect(LG)
    logging.info(f"LG shape after k2.connect: {LG.shape}")
    logging.info(type(LG.aux_labels))
    logging.info("Determinizing LG")
    LG = k2.determinize(LG, k2.DeterminizeWeightPushingType.kLogWeightPushing)
    logging.info(type(LG.aux_labels))
    logging.info("Connecting LG after k2.determinize")
    LG = k2.connect(LG)
    logging.info("Removing disambiguation symbols on LG")
    LG.labels[LG.labels >= first_token_disambig_id] = 0
    # See https://github.com/k2-fsa/k2/issues/874
    # for why we need to set LG.properties to None
    LG.__dict__["_properties"] = None
    assert isinstance(LG.aux_labels, k2.RaggedTensor)
    LG.aux_labels.values[LG.aux_labels.values >= first_word_disambig_id] = 0
    LG = k2.remove_epsilon(LG)
    logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}")
    LG = k2.connect(LG)
    # Drop the zeros written into aux_labels above.
    LG.aux_labels = LG.aux_labels.remove_values_eq(0)
    logging.info("Arc sorting LG")
    LG = k2.arc_sort(LG)
    return LG
 def main():
    """Compile LG for ``--lang-dir`` and save it as ``LG.pt`` in that directory.

    Nothing is done when ``LG.pt`` already exists there.
    """
    args = get_args()
    lang_dir = Path(args.lang_dir)
    already_built = (lang_dir / "LG.pt").is_file()
    if already_built:
        logging.info(f"{lang_dir}/LG.pt already exists - skipping")
        return
    logging.info(f"Processing {lang_dir}")
    graph = compile_LG(lang_dir)
    logging.info(f"Saving LG.pt to {lang_dir}")
    torch.save(graph.as_dict(), f"{lang_dir}/LG.pt")
 if __name__ == "__main__":
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    main()
--- a/egs/LJSpeech/ASR/local/compute_fbank_LJSpeech.py
+++ b/egs/LJSpeech/ASR/local/compute_fbank_LJSpeech.py
@ -0,0 +1,140 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file computes fbank features of the LJSpeech dataset.
 It looks for manifests in the directory data/manifests.
 The generated fbank features are saved in data/fbank.
 """
 import argparse
 import logging
 import os
 import sys
 from pathlib import Path
 from typing import Optional
 import sentencepiece as spm
 import torch
 from filter_cuts import filter_cuts
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 from icefall.utils import get_executor
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def get_args():
    """Parse ``--bpe-model`` and ``--data-dir`` from the command line.

    Returns:
      The ``argparse.Namespace`` produced by ``parse_args()``; both
      attributes are None when the options are omitted.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--bpe-model",
        help="""Path to the bpe.model. If not None, we will remove short and
        long utterances before extracting features""",
        type=str,
    )
    arg_parser.add_argument(
        "--data-dir",
        help="""Path to data""",
        default=None,
        type=str,
    )
    parsed = arg_parser.parse_args()
    return parsed
 def compute_fbank_LJSpeech(
    bpe_model: Optional[str] = None, data_dir: Optional[str] = None
 ):
    """Compute and store fbank features for the LJSpeech train/dev/test parts.

    Manifests with prefix ``LJSpeech`` are read from ``data/manifests``; for
    each partition the features and the resulting cuts manifest are written
    to ``data/fbank``. Partitions whose cuts manifest already exists are
    skipped. Partitions whose name contains "train" are additionally
    speed-perturbed with factors 0.9 and 1.1.

    Args:
      bpe_model:
        Path to the bpe.model. If given, the cuts are passed through
        ``filter_cuts`` before features are extracted.
      data_dir:
        Path to data. It is required, but currently only checked for
        presence. It used to be read from the module-level ``args`` that
        exists only when this file runs as a script.
    Raises:
      NotImplementedError: if ``data_dir`` is None.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    # os.cpu_count() returns None when the number of CPUs is undetermined.
    num_jobs = min(15, os.cpu_count() or 1)
    num_mel_bins = 80
    if bpe_model:
        logging.info(f"Loading {bpe_model}")
        sp = spm.SentencePieceProcessor()
        sp.load(bpe_model)
    if data_dir is None:
        raise NotImplementedError("need data directory")
    parts = ["train", "dev", "test"]
    prefix = "LJSpeech"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None
    # Every requested part must have been found.
    assert len(manifests) == len(parts), (
        len(manifests),
        len(parts),
        list(manifests.keys()),
        parts,
    )
    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
            if (output_dir / cuts_filename).is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if bpe_model:
                cut_set = filter_cuts(cut_set, sp)
            if "train" in partition:
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / cuts_filename)
 if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    args = get_args()
    logging.info(vars(args))
    compute_fbank_LJSpeech(bpe_model=args.bpe_model, data_dir=args.data_dir)
--- a/egs/LJSpeech/ASR/local/compute_fbank_LJSpeech_pseudo.py
+++ b/egs/LJSpeech/ASR/local/compute_fbank_LJSpeech_pseudo.py
@ -0,0 +1,140 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file computes fbank features of the LJSpeech dataset.
 It looks for manifests in the directory data/manifests.
 The generated fbank features are saved in data/fbank.
 """
 import argparse
 import logging
 import os
 import sys
 from pathlib import Path
 from typing import Optional
 import sentencepiece as spm
 import torch
 from filter_cuts import filter_cuts
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 from icefall.utils import get_executor
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def get_args():
    """Parse ``--bpe-model`` and ``--data-dir`` from the command line.

    Returns:
      The ``argparse.Namespace`` produced by ``parse_args()``; both
      attributes are None when the options are omitted.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--bpe-model",
        help="""Path to the bpe.model. If not None, we will remove short and
        long utterances before extracting features""",
        type=str,
    )
    arg_parser.add_argument(
        "--data-dir",
        help="""Path to data""",
        default=None,
        type=str,
    )
    parsed = arg_parser.parse_args()
    return parsed
 def compute_fbank_LJSpeech(
    bpe_model: Optional[str] = None, data_dir: Optional[str] = None
 ):
    """Compute and store fbank features for the LJSpeech_pseudo train/dev/test parts.

    Manifests with prefix ``LJSpeech_pseudo`` are read from
    ``data/manifests``; for each partition the features and the resulting
    cuts manifest are written to ``data/fbank``. Partitions whose cuts
    manifest already exists are skipped. Partitions whose name contains
    "train" are additionally speed-perturbed with factors 0.9 and 1.1.

    Args:
      bpe_model:
        Path to the bpe.model. If given, the cuts are passed through
        ``filter_cuts`` before features are extracted.
      data_dir:
        Path to data. It is required, but currently only checked for
        presence. It used to be read from the module-level ``args`` that
        exists only when this file runs as a script.
    Raises:
      NotImplementedError: if ``data_dir`` is None.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    # os.cpu_count() returns None when the number of CPUs is undetermined.
    num_jobs = min(15, os.cpu_count() or 1)
    num_mel_bins = 80
    if bpe_model:
        logging.info(f"Loading {bpe_model}")
        sp = spm.SentencePieceProcessor()
        sp.load(bpe_model)
    if data_dir is None:
        raise NotImplementedError("need data directory")
    parts = ["train", "dev", "test"]
    prefix = "LJSpeech_pseudo"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None
    # Every requested part must have been found.
    assert len(manifests) == len(parts), (
        len(manifests),
        len(parts),
        list(manifests.keys()),
        parts,
    )
    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
            if (output_dir / cuts_filename).is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if bpe_model:
                cut_set = filter_cuts(cut_set, sp)
            if "train" in partition:
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / cuts_filename)
 if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    args = get_args()
    logging.info(vars(args))
    compute_fbank_LJSpeech(bpe_model=args.bpe_model, data_dir=args.data_dir)
--- a/egs/LJSpeech/ASR/local/compute_fbank_gigaspeech_dev_test.py
+++ b/egs/LJSpeech/ASR/local/compute_fbank_gigaspeech_dev_test.py
@ -0,0 +1,90 @@
 #!/usr/bin/env python3
 # Copyright    2021  Johns Hopkins University (Piotr Żelasko)
 # Copyright    2021  Xiaomi Corp.             (Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
 from pathlib import Path
 import torch
 from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
 # Torch's multithreaded behavior needs to be disabled; otherwise
 # it wastes a lot of CPU and slows things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def compute_fbank_gigaspeech_dev_test():
    """Extract fbank features for the GigaSpeech DEV and TEST subsets.

    For each subset the raw cuts manifest
    ``data/fbank/gigaspeech_cuts_<subset>_raw.jsonl.gz`` is loaded, features
    are computed in batches with ``KaldifeatFbank`` (on ``cuda:0`` when CUDA
    is available, otherwise on the CPU), the cuts are trimmed to their
    supervisions, and the result is written to
    ``data/fbank/gigaspeech_cuts_<subset>.jsonl.gz``.  A subset whose output
    manifest already exists is skipped.
    """
    data_dir = Path("data/fbank")
    name_prefix = "gigaspeech"
    name_suffix = "jsonl.gz"
    # Prefer the first GPU and fall back to the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    else:
        device = torch.device("cpu")
    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
    logging.info(f"device: {device}")
    for subset in ("DEV", "TEST"):
        final_path = data_dir / f"{name_prefix}_cuts_{subset}.{name_suffix}"
        if final_path.is_file():
            logging.info(f"{final_path} exists - skipping")
            continue
        raw_path = data_dir / f"{name_prefix}_cuts_{subset}_raw.{name_suffix}"
        logging.info(f"Loading {raw_path}")
        raw_cuts = CutSet.from_file(raw_path)
        logging.info("Computing features")
        cuts_with_feats = raw_cuts.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=f"{data_dir}/{name_prefix}_feats_{subset}",
            num_workers=20,  # number of workers in dataloader
            batch_duration=600,  # number of seconds in a batch
            overwrite=True,
        )
        trimmed_cuts = cuts_with_feats.trim_to_supervisions(
            keep_overlapping=False, min_duration=None
        )
        logging.info(f"Saving to {final_path}")
        trimmed_cuts.to_file(final_path)
        logging.info(f"Saved to {final_path}")
 def main():
    """Configure INFO-level logging, then extract the DEV/TEST features."""
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    compute_fbank_gigaspeech_dev_test()
 # Run the extraction only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/egs/LJSpeech/ASR/local/compute_fbank_gigaspeech_splits.py
+++ b/egs/LJSpeech/ASR/local/compute_fbank_gigaspeech_splits.py
@ -0,0 +1,170 @@
 #!/usr/bin/env python3
 # Copyright    2021  Johns Hopkins University (Piotr Żelasko)
 # Copyright    2021  Xiaomi Corp.             (Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import logging
 import os
 from datetime import datetime
 from pathlib import Path
 import torch
 from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
 # Torch's multithreaded behavior needs to be disabled; otherwise
 # it wastes a lot of CPU and slows things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def get_parser():
    """Build the command-line parser for the GigaSpeech XL split extraction.

    Options:
      --num-workers     number of dataloading workers (default: 20)
      --batch-duration  maximum seconds of audio per batch (default: 600.0)
      --num-splits      number of splits of the XL subset (required)
      --start           first piece to process, inclusive (default: 0)
      --stop            piece to stop before, exclusive (default: -1)

    Returns:
      The configured ``argparse.ArgumentParser``; defaults are shown in the
      generated help text.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--num-workers",
        type=int,
        default=20,
        help="Number of dataloading workers used for reading the audio.",
    )
    parser.add_argument(
        "--batch-duration",
        type=float,
        default=600.0,
        # The trailing space keeps the two sentences apart once the adjacent
        # string literals are concatenated.
        help="The maximum number of audio seconds in a batch. "
        "Determines batch size dynamically.",
    )
    parser.add_argument(
        "--num-splits",
        type=int,
        required=True,
        help="The number of splits of the XL subset",
    )
    parser.add_argument(
        "--start",
        type=int,
        default=0,
        help="Process pieces starting from this number (inclusive).",
    )
    parser.add_argument(
        "--stop",
        type=int,
        default=-1,
        help="Process pieces up to this number (exclusive). "
        "A value smaller than --start means: process up to the last piece.",
    )
    return parser
 def compute_fbank_gigaspeech_splits(args):
    """Compute fbank features for a range of pieces of the GigaSpeech XL subset.

    Pieces live in ``data/fbank/gigaspeech_XL_split_<num_splits>``.  For each
    0-based piece number ``i`` in ``[start, stop)`` the raw manifest
    ``gigaspeech_cuts_XL_raw.<idx>.jsonl.gz`` (``idx`` is ``i + 1`` zero-padded
    to 8 digits) is loaded, features are extracted in batches, the cuts are
    trimmed to their supervisions and saved as
    ``gigaspeech_cuts_XL.<idx>.jsonl.gz``.  Pieces whose output already exists
    or whose raw manifest is missing are skipped.

    Args:
      args:
        Parsed arguments providing ``num_splits``, ``start``, ``stop``,
        ``num_workers`` and ``batch_duration``.  When ``stop`` is smaller
        than ``start``, processing runs up to the last piece.

    Raises:
      AssertionError: If the split directory does not exist.
    """
    num_splits = args.num_splits
    output_dir = Path(f"data/fbank/gigaspeech_XL_split_{num_splits}")
    assert output_dir.exists(), f"{output_dir} does not exist!"
    start = args.start
    stop = args.stop
    if stop < start:
        stop = num_splits
    stop = min(stop, num_splits)
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
    logging.info(f"device: {device}")
    prefix = "gigaspeech"
    num_digits = 8  # num_digits is fixed by lhotse split-lazy
    for i in range(start, stop):
        idx = f"{i + 1}".zfill(num_digits)
        logging.info(f"Processing {idx}/{num_splits}")
        cuts_path = output_dir / f"{prefix}_cuts_XL.{idx}.jsonl.gz"
        if cuts_path.is_file():
            logging.info(f"{cuts_path} exists - skipping")
            continue
        raw_cuts_path = output_dir / f"{prefix}_cuts_XL_raw.{idx}.jsonl.gz"
        if not raw_cuts_path.is_file():
            logging.info(f"{raw_cuts_path} does not exist - skipping it")
            continue
        logging.info(f"Loading {raw_cuts_path}")
        cut_set = CutSet.from_file(raw_cuts_path)
        logging.info("Computing features")
        # Remove a feature archive left over from an earlier run of this
        # piece before it is recomputed.
        feats_archive = output_dir / f"{prefix}_feats_XL_{idx}.lca"
        if feats_archive.exists():
            logging.info(f"Removing {feats_archive}")
            os.remove(feats_archive)
        cut_set = cut_set.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=f"{output_dir}/{prefix}_feats_XL_{idx}",
            num_workers=args.num_workers,
            batch_duration=args.batch_duration,
            overwrite=True,
        )
        logging.info("About to split cuts into smaller chunks.")
        cut_set = cut_set.trim_to_supervisions(
            keep_overlapping=False, min_duration=None
        )
        logging.info(f"Saving to {cuts_path}")
        cut_set.to_file(cuts_path)
        logging.info(f"Saved to {cuts_path}")
 def main():
    """Set up file and console logging, parse arguments and process the splits.

    Log records go to a file named
    ``log-compute_fbank_gigaspeech_splits-<YYYY-mm-dd-HH-MM-SS>`` (opened in
    write mode) and are mirrored on a console stream handler.
    """
    fmt = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    logging.basicConfig(
        filename=f"log-compute_fbank_gigaspeech_splits-{timestamp}",
        format=fmt,
        level=logging.INFO,
        filemode="w",
    )
    # Mirror the file log on the console with the same format and level.
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(logging.Formatter(fmt))
    logging.getLogger("").addHandler(console_handler)
    args = get_parser().parse_args()
    logging.info(vars(args))
    compute_fbank_gigaspeech_splits(args)
 # Run the split processing only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/egs/LJSpeech/ASR/local/compute_fbank_librispeech.py
+++ b/egs/LJSpeech/ASR/local/compute_fbank_librispeech.py
@ -0,0 +1,134 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file computes fbank features of the LibriSpeech dataset.
 It looks for manifests in the directory data/manifests.
 The generated fbank features are saved in data/fbank.
 """
 import argparse
 import logging
 import os
 from pathlib import Path
 from typing import Optional
 import sentencepiece as spm
 import torch
 from filter_cuts import filter_cuts
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 from icefall.utils import get_executor
 # Torch's multithreaded behavior needs to be disabled; otherwise
 # it wastes a lot of CPU and slows things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def get_args():
    """Parse the command-line options of this script.

    Returns:
      An ``argparse.Namespace`` whose ``bpe_model`` attribute holds the value
      given to ``--bpe-model`` (``None`` when the option is omitted).
    """
    arg_parser = argparse.ArgumentParser()
    bpe_help = """Path to the bpe.model. If not None, we will remove short and
        long utterances before extracting features"""
    arg_parser.add_argument("--bpe-model", type=str, help=bpe_help)
    parsed = arg_parser.parse_args()
    return parsed
 def compute_fbank_librispeech(bpe_model: Optional[str] = None):
    """Compute 80-bin fbank features for every LibriSpeech partition.

    Manifests are read from ``data/manifests``.  For each partition the
    features are stored under ``data/fbank/librispeech_feats_<partition>``
    and the cuts are saved to
    ``data/fbank/librispeech_cuts_<partition>.jsonl.gz``.  A partition whose
    cuts file already exists is skipped.  Partitions whose name contains
    "train" are extended with 0.9x and 1.1x speed-perturbed copies before
    feature extraction.

    Args:
      bpe_model:
        Optional path to a sentencepiece ``bpe.model``.  When given, the cuts
        are passed through ``filter_cuts`` before feature extraction.

    Raises:
      AssertionError: If the cached manifests are missing or incomplete.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to a single job instead of failing in min(15, None).
    num_jobs = min(15, os.cpu_count() or 1)
    num_mel_bins = 80
    # Only loaded (and only used) when a BPE model path is given.
    sp = None
    if bpe_model:
        logging.info(f"Loading {bpe_model}")
        sp = spm.SentencePieceProcessor()
        sp.load(bpe_model)
    dataset_parts = (
        "dev-clean",
        "dev-other",
        "test-clean",
        "test-other",
        "train-clean-100",
        "train-clean-360",
        "train-other-500",
    )
    prefix = "librispeech"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None
    assert len(manifests) == len(dataset_parts), (
        len(manifests),
        len(dataset_parts),
        list(manifests.keys()),
        dataset_parts,
    )
    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
            if (output_dir / cuts_filename).is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if bpe_model:
                cut_set = filter_cuts(cut_set, sp)
            if "train" in partition:
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / cuts_filename)
 if __name__ == "__main__":
    # Script entry point: configure INFO-level logging with a
    # "time level [file:line] message" prefix, parse and log the
    # command-line arguments, then compute the LibriSpeech features.
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    args = get_args()
    logging.info(vars(args))
    compute_fbank_librispeech(bpe_model=args.bpe_model)
--- a/egs/LJSpeech/ASR/local/compute_fbank_musan.py
+++ b/egs/LJSpeech/ASR/local/compute_fbank_musan.py
@ -0,0 +1,105 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file computes fbank features of the musan dataset.
 It looks for manifests in the directory data/manifests.
 The generated fbank features are saved in data/fbank.
 """
 import logging
 import os
 from pathlib import Path
 import torch
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
 from lhotse.recipes.utils import read_manifests_if_cached
 from icefall.utils import get_executor
 # Torch's multithreaded behavior needs to be disabled; otherwise
 # it wastes a lot of CPU and slows things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def compute_fbank_musan():
    """Compute 80-bin fbank features for the MUSAN music/speech/noise parts.

    Manifests are read from ``data/manifests``.  The recordings of all three
    parts are combined, cut into 10-second windows, windows of 5 seconds or
    less are dropped, features are stored under ``data/fbank/musan_feats``
    and the cuts are saved to ``data/fbank/musan_cuts.jsonl.gz``.  Nothing is
    computed when that cuts file already exists.

    Raises:
      AssertionError: If the cached manifests are missing or incomplete.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to a single job instead of failing in min(15, None).
    num_jobs = min(15, os.cpu_count() or 1)
    num_mel_bins = 80
    dataset_parts = (
        "music",
        "speech",
        "noise",
    )
    prefix = "musan"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None
    assert len(manifests) == len(dataset_parts), (
        len(manifests),
        len(dataset_parts),
        list(manifests.keys()),
        dataset_parts,
    )
    musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
    if musan_cuts_path.is_file():
        logging.info(f"{musan_cuts_path} already exists - skipping")
        return
    logging.info("Extracting features for Musan")
    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
    with get_executor() as ex:  # Initialize the executor only once.
        # create chunks of Musan with duration 5 - 10 seconds
        musan_cuts = (
            CutSet.from_manifests(
                recordings=combine(part["recordings"] for part in manifests.values())
            )
            .cut_into_windows(10.0)
            .filter(lambda c: c.duration > 5)
            .compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/musan_feats",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
        )
        musan_cuts.to_file(musan_cuts_path)
 if __name__ == "__main__":
    # Script entry point: configure INFO-level logging with a
    # "time level [file:line] message" prefix, then compute the features.
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    compute_fbank_musan()
--- a/egs/LJSpeech/ASR/local/compute_fbank_userlibri.py
+++ b/egs/LJSpeech/ASR/local/compute_fbank_userlibri.py
@ -0,0 +1,131 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file computes fbank features of the UserLibri dataset.
 It looks for manifests in the directory data/manifests.
 The generated fbank features are saved in data/fbank.
 """
 import argparse
 import logging
 import os
 from pathlib import Path
 from typing import Optional
 import sentencepiece as spm
 import torch
 from filter_cuts import filter_cuts
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 from icefall.utils import get_executor
 # Torch's multithreaded behavior needs to be disabled; otherwise
 # it wastes a lot of CPU and slows things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def get_args():
    """Read ``--bpe-model`` from the command line.

    Returns:
      The parsed ``argparse.Namespace``; ``bpe_model`` is ``None`` when the
      option is not given.
    """
    cli = argparse.ArgumentParser()
    option = dict(
        type=str,
        help="""Path to the bpe.model. If not None, we will remove short and
        long utterances before extracting features""",
    )
    cli.add_argument("--bpe-model", **option)
    return cli.parse_args()
 def compute_fbank_userlibri(bpe_model: Optional[str] = None):
    """Compute 80-bin fbank features for every UserLibri partition.

    The partition names are the entries found in
    ``/DB/UserLibri/audio_data/speaker-wise-test`` followed by those in
    ``/DB/UserLibri/audio_data/book-wise-test``.  Manifests are read from
    ``data/manifests``; features go to
    ``data/fbank/userlibri_feats_<partition>`` and cuts to
    ``data/fbank/userlibri_cuts_<partition>.jsonl.gz``.  A partition whose
    cuts file already exists is skipped.  Partitions whose name contains
    "train" are extended with 0.9x and 1.1x speed-perturbed copies.

    Args:
      bpe_model:
        Optional path to a sentencepiece ``bpe.model``.  When given, the cuts
        are passed through ``filter_cuts`` before feature extraction.

    Raises:
      AssertionError: If the cached manifests are missing or incomplete.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to a single job instead of failing in min(15, None).
    num_jobs = min(15, os.cpu_count() or 1)
    num_mel_bins = 80
    # Only loaded (and only used) when a BPE model path is given.
    sp = None
    if bpe_model:
        logging.info(f"Loading {bpe_model}")
        sp = spm.SentencePieceProcessor()
        sp.load(bpe_model)
    # NOTE(review): the dataset location is hard-coded; every directory entry
    # is treated as a partition name - confirm no stray files live there.
    directory = "/DB/UserLibri/audio_data/speaker-wise-test"
    spks_parts = os.listdir(directory)
    directory = "/DB/UserLibri/audio_data/book-wise-test"
    books_parts = os.listdir(directory)
    dataset_parts = spks_parts + books_parts
    prefix = "userlibri"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None
    assert len(manifests) == len(dataset_parts), (
        len(manifests),
        len(dataset_parts),
        list(manifests.keys()),
        dataset_parts,
    )
    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
            if (output_dir / cuts_filename).is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if bpe_model:
                cut_set = filter_cuts(cut_set, sp)
            if "train" in partition:
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / cuts_filename)
 if __name__ == "__main__":
    # Script entry point: configure INFO-level logging with a
    # "time level [file:line] message" prefix, parse and log the
    # command-line arguments, then compute the UserLibri features.
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    args = get_args()
    logging.info(vars(args))
    compute_fbank_userlibri(bpe_model=args.bpe_model)
--- a/egs/LJSpeech/ASR/local/convert_transcript_words_to_tokens.py
+++ b/egs/LJSpeech/ASR/local/convert_transcript_words_to_tokens.py
@ -0,0 +1,103 @@
 #!/usr/bin/env python3
 # Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
 """
 Convert a transcript file containing words to a corpus file containing tokens
 for LM training with the help of a lexicon.
 If the lexicon contains phones, the resulting LM will be a phone LM; If the
 lexicon contains word pieces, the resulting LM will be a word piece LM.
 If a word has multiple pronunciations, the one that appears first in the lexicon
 is kept; others are removed.
 If the input transcript is:
    hello zoo world hello
    world zoo
    foo zoo world hellO
 and if the lexicon is
    <UNK> SPN
    hello h e l l o 2
    hello h e l l o
    world w o r l d
    zoo z o o
 Then the output is
    h e l l o 2 z o o w o r l d h e l l o 2
    w o r l d z o o
    SPN z o o w o r l d SPN
 """
 import argparse
 from pathlib import Path
 from typing import Dict, List
 from generate_unique_lexicon import filter_multiple_pronunications
 from icefall.lexicon import read_lexicon
 def get_args():
    """Parse the command-line options of this script.

    Options:
      --transcript  input transcript file, one space-separated sentence per
                    line (required)
      --lexicon     input lexicon file (required)
      --oov         the OOV word (default: "<UNK>")

    Returns:
      The parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--transcript",
        type=str,
        # main() opens this file, so reject a missing option up front.
        required=True,
        # The trailing space keeps the sentences apart once the adjacent
        # string literals are concatenated.
        help="The input transcript file. "
        "We assume that the transcript file consists of "
        "lines. Each line consists of space separated words.",
    )
    parser.add_argument(
        "--lexicon",
        type=str,
        required=True,
        help="The input lexicon file.",
    )
    parser.add_argument("--oov", type=str, default="<UNK>", help="The OOV word.")
    return parser.parse_args()
 def process_line(lexicon: Dict[str, List[str]], line: str, oov_token: List[str]) -> None:
    """Print the token sequence of one transcript line to stdout.

    Args:
      lexicon:
        A dict containing pronunciations. Its keys are words and values
        are pronunciations (i.e., tokens).
      line:
        A line of transcript consisting of space(s) separated words.
      oov_token:
        The pronunciation (a list of tokens, like the values of `lexicon`)
        used for any word in `line` that is not present in the lexicon.
    Returns:
      Return None.
    """
    words = line.strip().split()
    # Join the per-word pronunciations in one pass instead of growing a
    # string with `+=` inside a loop.
    tokens = " ".join(" ".join(lexicon.get(w, oov_token)) for w in words)
    print(tokens.strip())
 def main():
    """Convert every transcript line to tokens and print the result.

    The lexicon is reduced to one pronunciation per word; the pronunciation
    of the ``--oov`` word, which must be in the lexicon, stands in for words
    the lexicon does not contain.
    """
    args = get_args()
    # Validate in the same order as the options are used: lexicon first.
    for input_file in (args.lexicon, args.transcript):
        assert Path(input_file).is_file()
    assert len(args.oov) > 0
    # Only the first pronunciation of a word is kept
    word2tokens = dict(filter_multiple_pronunications(read_lexicon(args.lexicon)))
    assert args.oov in word2tokens
    oov_pronunciation = word2tokens[args.oov]
    with open(args.transcript) as transcript:
        for sentence in transcript:
            process_line(
                lexicon=word2tokens, line=sentence, oov_token=oov_pronunciation
            )
 # Run the conversion only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/egs/LJSpeech/ASR/local/display_manifest_statistics.py
+++ b/egs/LJSpeech/ASR/local/display_manifest_statistics.py
@ -0,0 +1,215 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file displays duration statistics of utterances in a manifest.
 You can use the displayed value to choose minimum/maximum duration
 to remove short and long utterances during the training.
 See the function `remove_short_and_long_utt()` in transducer/train.py
 for usage.
 """
 from lhotse import load_manifest_lazy
 def main():
    """Load one cut manifest lazily and print its statistics.

    The manifest is selected by editing ``path`` below; the commented-out
    assignments list the other LibriSpeech manifests.
    """
    #  path = "./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
    #  path = "./data/fbank/librispeech_cuts_train-clean-360.jsonl.gz"
    #  path = "./data/fbank/librispeech_cuts_train-other-500.jsonl.gz"
    #  path = "./data/fbank/librispeech_cuts_dev-clean.jsonl.gz"
    #  path = "./data/fbank/librispeech_cuts_dev-other.jsonl.gz"
    #  path = "./data/fbank/librispeech_cuts_test-clean.jsonl.gz"
    path = "./data/fbank/librispeech_cuts_test-other.jsonl.gz"
    load_manifest_lazy(path).describe()
 # Print the statistics only when this file is executed as a script.
 if __name__ == "__main__":
    main()
 """
 ## train-clean-100
 Cuts count: 85617
 Total duration (hours): 303.8
 Speech duration (hours): 303.8 (100.0%)
 ***
 Duration statistics (seconds):
 mean    12.8
 std     3.8
 min     1.3
 0.1%    1.9
 0.5%    2.2
 1%      2.5
 5%      4.2
 10%     6.4
 25%     11.4
 50%     13.8
 75%     15.3
 90%     16.7
 95%     17.3
 99%     18.1
 99.5%   18.4
 99.9%   18.8
 max     27.2
 ## train-clean-360
 Cuts count: 312042
 Total duration (hours): 1098.2
 Speech duration (hours): 1098.2 (100.0%)
 ***
 Duration statistics (seconds):
 mean    12.7
 std     3.8
 min     1.0
 0.1%    1.8
 0.5%    2.2
 1%      2.5
 5%      4.2
 10%     6.2
 25%     11.2
 50%     13.7
 75%     15.3
 90%     16.6
 95%     17.3
 99%     18.1
 99.5%   18.4
 99.9%   18.8
 max     33.0
 ## train-other-500
 Cuts count: 446064
 Total duration (hours): 1500.6
 Speech duration (hours): 1500.6 (100.0%)
 ***
 Duration statistics (seconds):
 mean    12.1
 std     4.2
 min     0.8
 0.1%    1.7
 0.5%    2.1
 1%      2.3
 5%      3.5
 10%     5.0
 25%     9.8
 50%     13.4
 75%     15.1
 90%     16.5
 95%     17.2
 99%     18.1
 99.5%   18.4
 99.9%   18.9
 max     31.0
 ## dev-clean
 Cuts count: 2703
 Total duration (hours): 5.4
 Speech duration (hours): 5.4 (100.0%)
 ***
 Duration statistics (seconds):
 mean    7.2
 std     4.7
 min     1.4
 0.1%    1.6
 0.5%    1.8
 1%      1.9
 5%      2.4
 10%     2.7
 25%     3.8
 50%     5.9
 75%     9.3
 90%     13.3
 95%     16.4
 99%     23.8
 99.5%   28.5
 99.9%   32.3
 max     32.6
 ## dev-other
 Cuts count: 2864
 Total duration (hours): 5.1
 Speech duration (hours): 5.1 (100.0%)
 ***
 Duration statistics (seconds):
 mean    6.4
 std     4.3
 min     1.1
 0.1%    1.3
 0.5%    1.7
 1%      1.8
 5%      2.2
 10%     2.6
 25%     3.5
 50%     5.3
 75%     7.9
 90%     12.0
 95%     15.0
 99%     22.2
 99.5%   27.1
 99.9%   32.4
 max     35.2
 ## test-clean
 Cuts count: 2620
 Total duration (hours): 5.4
 Speech duration (hours): 5.4 (100.0%)
 ***
 Duration statistics (seconds):
 mean    7.4
 std     5.2
 min     1.3
 0.1%    1.6
 0.5%    1.8
 1%      2.0
 5%      2.3
 10%     2.7
 25%     3.7
 50%     5.8
 75%     9.6
 90%     14.6
 95%     17.8
 99%     25.5
 99.5%   28.4
 99.9%   32.8
 max     35.0
 ## test-other
 Cuts count: 2939
 Total duration (hours): 5.3
 Speech duration (hours): 5.3 (100.0%)
 ***
 Duration statistics (seconds):
 mean    6.5
 std     4.4
 min     1.2
 0.1%    1.5
 0.5%    1.8
 1%      1.9
 5%      2.3
 10%     2.6
 25%     3.4
 50%     5.2
 75%     8.2
 90%     12.6
 95%     15.8
 99%     21.4
 99.5%   23.8
 99.9%   33.5
 max     34.5
 """
--- a/egs/LJSpeech/ASR/local/download_lm.py
+++ b/egs/LJSpeech/ASR/local/download_lm.py
@ -0,0 +1,97 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file downloads the following LibriSpeech LM files:
    - 3-gram.pruned.1e-7.arpa.gz
    - 4-gram.arpa.gz
    - librispeech-vocab.txt
    - librispeech-lexicon.txt
    - librispeech-lm-norm.txt.gz
 from http://www.openslr.org/resources/11
 and saves them in the user-provided directory.
 Files are not re-downloaded if they already exist.
 Usage:
    ./local/download_lm.py --out-dir ./download/lm
 """
 import argparse
 import gzip
 import logging
 import os
 import shutil
 from pathlib import Path
 from lhotse.utils import urlretrieve_progress
 from tqdm.auto import tqdm
 def get_args():
    """Parse the command-line options of this script.

    Options:
      --out-dir  directory that receives the downloaded files (required)

    Returns:
      The parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--out-dir",
        type=str,
        # Without a value main() would receive None and fail in Path(None);
        # let argparse report the missing option instead.
        required=True,
        help="Output directory.",
    )
    args = parser.parse_args()
    return args
 def main(out_dir: str):
    """Download the LibriSpeech LM files and decompress the gzipped ones.

    Each file is fetched from http://www.openslr.org/resources/11 into
    `out_dir` unless it is already there.  Every ``*.gz`` file is then
    decompressed next to the archive (same name without ``.gz``) unless the
    decompressed file already exists.

    Args:
      out_dir:
        Directory that receives the files.  It is created if it is missing.
    """
    url = "http://www.openslr.org/resources/11"
    out_dir = Path(out_dir)
    # Create the output directory up front so the files below can be
    # written into it.
    out_dir.mkdir(parents=True, exist_ok=True)
    files_to_download = (
        "3-gram.pruned.1e-7.arpa.gz",
        "4-gram.arpa.gz",
        "librispeech-vocab.txt",
        "librispeech-lexicon.txt",
        "librispeech-lm-norm.txt.gz",
    )
    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
        filename = out_dir / f
        if not filename.is_file():
            urlretrieve_progress(
                f"{url}/{f}",
                filename=filename,
                desc=f"Downloading {filename}",
            )
        else:
            logging.info(f"{filename} already exists - skipping")
        # Test the file's own extension: a substring test on the whole path
        # would also match a directory name that contains ".gz".
        if filename.suffix == ".gz":
            unzipped = filename.with_suffix("")
            if not unzipped.is_file():
                with gzip.open(filename, "rb") as f_in:
                    with open(unzipped, "wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)
            else:
                logging.info(f"{unzipped} already exist - skipping")
 if __name__ == "__main__":
    # Script entry point: configure INFO-level logging with a
    # "time level [file:line] message" prefix, parse the arguments, log the
    # output directory, then download the files into it.
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    args = get_args()
    logging.info(f"out_dir: {args.out_dir}")
    main(out_dir=args.out_dir)
--- a/egs/LJSpeech/ASR/local/filter_cuts.py
+++ b/egs/LJSpeech/ASR/local/filter_cuts.py
@ -0,0 +1,160 @@
 #!/usr/bin/env python3
 # Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script removes short and long utterances from a cutset.
 Caution:
  You may need to tune the thresholds for your own dataset.
 Usage example:
  python3 ./local/filter_cuts.py \
    --bpe-model data/lang_bpe_500/bpe.model \
    --in-cuts data/fbank/librispeech_cuts_test-clean.jsonl.gz \
    --out-cuts data/fbank-filtered/librispeech_cuts_test-clean.jsonl.gz
 """
 import argparse
 import logging
 from pathlib import Path
 import sentencepiece as spm
 from lhotse import CutSet, load_manifest_lazy
 from lhotse.cut import Cut
 def get_args():
    """Build the command line parser of this script and parse ``sys.argv``.

    The three options ``--bpe-model``, ``--in-cuts`` and ``--out-cuts`` are
    all converted to :class:`pathlib.Path`.
    """
    parser = argparse.ArgumentParser()
    path_options = (
        ("--bpe-model", "Path to the bpe.model"),
        ("--in-cuts", "Path to the input cutset"),
        ("--out-cuts", "Path to the output cutset"),
    )
    for flag, description in path_options:
        parser.add_argument(flag, type=Path, help=description)
    return parser.parse_args()
 def filter_cuts(cut_set: CutSet, sp: spm.SentencePieceProcessor):
    """Drop cuts that are too short, too long, or have too many tokens.

    A cut is kept when its duration lies within [1.0, 20.0] seconds and the
    number of frames left after subsampling is at least the number of BPE
    tokens of its first supervision.

    Args:
      cut_set:
        The cuts to filter.
      sp:
        The BPE model used to tokenize the transcript of each cut.
    Returns:
      The eagerly evaluated cut set containing only the kept cuts.
    """
    total = 0  # number of total utterances before removal
    removed = 0  # number of removed utterances
    def remove_short_and_long_utterances(c: Cut):
        """Return False to exclude the input cut"""
        nonlocal removed, total
        # Keep only utterances with duration between 1 second and 20 seconds
        #
        # Caution: There is a reason to select 20.0 here. Please see
        # ./display_manifest_statistics.py
        #
        # You should use ./display_manifest_statistics.py to get
        # an utterance duration distribution for your dataset to select
        # the threshold
        total += 1
        if c.duration < 1.0 or c.duration > 20.0:
            logging.warning(
                f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
            )
            removed += 1
            return False
        # In pruned RNN-T, we require that T >= S
        # where T is the number of feature frames after subsampling
        # and S is the number of tokens in the utterance
        # In ./pruned_transducer_stateless2/conformer.py, the
        # conv module uses the following expression
        # for subsampling
        if c.num_frames is None:
            num_frames = c.duration * 100  # approximate
        else:
            num_frames = c.num_frames
        T = ((num_frames - 1) // 2 - 1) // 2
        # Note: for ./lstm_transducer_stateless/lstm.py, the formula is
        #  T = ((num_frames - 3) // 2 - 1) // 2
        # Note: for ./pruned_transducer_stateless7/zipformer.py, the formula is
        # T = ((num_frames - 7) // 2 + 1) // 2
        tokens = sp.encode(c.supervisions[0].text, out_type=str)
        if T < len(tokens):
            # Report the frame count that was actually used above; it differs
            # from c.num_frames when the latter is None.
            logging.warning(
                f"Exclude cut with ID {c.id} from training. "
                f"Number of frames (before subsampling): {num_frames}. "
                f"Number of frames (after subsampling): {T}. "
                f"Text: {c.supervisions[0].text}. "
                f"Tokens: {tokens}. "
                f"Number of tokens: {len(tokens)}"
            )
            removed += 1
            return False
        return True
    # We use to_eager() here so that we can print out the value of total
    # and removed below.
    ans = cut_set.filter(remove_short_and_long_utterances).to_eager()
    # An empty input leaves total at 0; avoid dividing by it.
    ratio = removed / total * 100 if total > 0 else 0.0
    logging.info(
        f"Removed {removed} cuts from {total} cuts. {ratio:.3f}% data is removed."
    )
    return ans
 def main():
    """Filter the cuts of ``--in-cuts`` and save the result to ``--out-cuts``.

    Nothing is done when the output file already exists.
    """
    args = get_args()
    logging.info(vars(args))
    already_done = args.out_cuts.is_file()
    if already_done:
        logging.info(f"{args.out_cuts} already exists - skipping")
        return
    assert args.in_cuts.is_file(), f"{args.in_cuts} does not exist"
    assert args.bpe_model.is_file(), f"{args.bpe_model} does not exist"
    bpe = spm.SentencePieceProcessor()
    bpe.load(str(args.bpe_model))
    lazy_cuts = load_manifest_lazy(args.in_cuts)
    assert isinstance(lazy_cuts, CutSet)
    kept_cuts = filter_cuts(lazy_cuts, bpe)
    logging.info(f"Saving to {args.out_cuts}")
    # The output directory may not exist yet.
    args.out_cuts.parent.mkdir(parents=True, exist_ok=True)
    kept_cuts.to_file(args.out_cuts)
 if __name__ == "__main__":
    # Timestamped INFO-level logging is configured before any work starts.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    main()
--- a/egs/LJSpeech/ASR/local/generate_unique_lexicon.py
+++ b/egs/LJSpeech/ASR/local/generate_unique_lexicon.py
@ -0,0 +1,98 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file takes as input a lexicon.txt and outputs a new lexicon,
 in which each word has a unique pronunciation.
 The way to do this is to keep only the first pronunciation of a word
 in lexicon.txt.
 """
 import argparse
 import logging
 from pathlib import Path
 from typing import List, Tuple
 from icefall.lexicon import read_lexicon, write_lexicon
 def get_args():
    """Parse the command line; the only option is ``--lang-dir`` (a string)."""
    arg_parser = argparse.ArgumentParser()
    lang_dir_help = """Input and output directory.
        It should contain a file lexicon.txt.
        This file will generate a new file uniq_lexicon.txt
        in it.
        """
    arg_parser.add_argument("--lang-dir", type=str, help=lang_dir_help)
    parsed = arg_parser.parse_args()
    return parsed
 def filter_multiple_pronunications(
    lexicon: List[Tuple[str, List[str]]]
 ) -> List[Tuple[str, List[str]]]:
    """Keep only the first pronunciation of every word.

    Later entries of a word that has already been seen are dropped; the
    relative order of the surviving entries is unchanged.

    Args:
      lexicon:
        The input lexicon, a list of (word, [p1, p2, ..., pn]) entries,
        where "p1, p2, ..., pn" is one pronunciation of the "word".
    Returns:
      Return a new lexicon where each word has a unique pronunciation.
    """
    # A dict remembers insertion order, so the first entry of each word both
    # wins and keeps its position.
    first_entry = {}
    for word, tokens in lexicon:
        first_entry.setdefault(word, tokens)
    return list(first_entry.items())
 def main():
    """Read ``lexicon.txt`` from ``--lang-dir`` and write ``uniq_lexicon.txt``.

    The number of entries before and after the filtering is logged.
    """
    lang_dir = Path(get_args().lang_dir)
    full_lexicon = read_lexicon(lang_dir / "lexicon.txt")
    unique_lexicon = filter_multiple_pronunications(full_lexicon)
    write_lexicon(lang_dir / "uniq_lexicon.txt", unique_lexicon)
    logging.info(f"Number of entries in lexicon.txt: {len(full_lexicon)}")
    logging.info(f"Number of entries in uniq_lexicon.txt: {len(unique_lexicon)}")
 if __name__ == "__main__":
    # Timestamped INFO-level logging is configured before any work starts.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    main()
--- a/egs/LJSpeech/ASR/local/prepare_LJSpeech.py
+++ b/egs/LJSpeech/ASR/local/prepare_LJSpeech.py
@ -0,0 +1,179 @@
 import logging
 import sys
 import os
 import re
 import shutil
 import tarfile
 import zipfile
 from concurrent.futures.thread import ThreadPoolExecutor
 from pathlib import Path
 from typing import Dict, List, Optional, Sequence, Tuple, Union
 from tqdm.auto import tqdm
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
 from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
 from lhotse.utils import (
    Pathlike,
    is_module_available,
    safe_extract,
    urlretrieve_progress,
 )
 # LIBRISPEECH_ALIGNMENTS_URL = (
 #     "https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
 # )
 def prepare_LJSpeech(
    corpus_dir: str,
    dataset_parts: str = "auto",
    output_dir: str = None,
    num_jobs: int = 1,
 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build the recording and supervision manifests of the LJSpeech splits.

    Audio is read from ``corpus_dir/wavs/<part>/*.wav`` and the transcript of
    each utterance from ``corpus_dir/texts/<utterance-id>.txt``.
    A split whose manifests already exist in ``output_dir`` is skipped and is
    not part of the returned dict.

    :param corpus_dir: the path of the data dir.
    :param dataset_parts: accepted for interface compatibility but currently
        ignored: the splits 'train', 'dev' and 'test' are always processed.
    :param output_dir: the path where to write the manifests; nothing is
        written when it is None.
    :param num_jobs: currently unused, utterances are processed sequentially.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    assert os.path.exists(corpus_dir), f"{corpus_dir} does not exist"
    manifests = {}
    dataset_parts = ["train", "dev", "test"]
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    txt_dir = Path(corpus_dir) / "texts"
    # No word alignments are loaded here; parse_utterance only looks IDs up.
    alignments = {}
    wav_suffix = ".wav"
    for part in tqdm(dataset_parts, desc="Dataset parts"):
        logging.info(f"Processing LJSpeech subset: {part}")
        if manifests_exist(part=part, output_dir=output_dir):
            logging.info(f"LJSpeech subset: {part} already prepared - skipping.")
            continue
        recordings = []
        supervisions = []
        part_path = Path(os.path.join(corpus_dir, "wavs", part))
        # Remove the ".wav" suffix itself. str.strip('.wav') would instead
        # strip any of the characters '.', 'w', 'a', 'v' from both ends of
        # the name. A set keeps the membership test below cheap.
        part_utt_ids = {
            name[: -len(wav_suffix)]
            for name in os.listdir(part_path)
            if name.endswith(wav_suffix)
        }
        for trans_path in tqdm(
            txt_dir.glob("*.txt"), desc="Processing", leave=False
        ):
            # Skip transcripts of other splits before opening the file.
            if trans_path.stem not in part_utt_ids:
                continue
            with open(trans_path) as f:
                for line in f:
                    # Only the file name is passed along: parse_utterance
                    # splits on whitespace, so a directory name containing
                    # spaces must not be part of the line.
                    result = parse_utterance(
                        part_path, f"{trans_path.name} {line}", alignments
                    )
                    if result is None:
                        continue
                    recording, segment = result
                    recordings.append(recording)
                    supervisions.append(segment)
        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)
        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"LJSpeech_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(
                output_dir / f"LJSpeech_recordings_{part}.jsonl.gz"
            )
        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }
    return manifests
 def parse_utterance(
    dataset_split_path: Path,
    line: str,
    alignments: Dict[str, List[AlignmentItem]],
 ) -> Optional[Tuple[Recording, SupervisionSegment]]:
    """Turn one ``"<transcript path> <text>"`` line into manifest entries.

    The utterance ID is the file name of the transcript path without its
    ``.txt`` part; the audio is expected at
    ``dataset_split_path/<utterance-id>.wav``.

    :param dataset_split_path: directory holding the wav files of the split.
    :param line: the transcript path and the transcript text, separated by
        whitespace.
    :param alignments: word alignments keyed by utterance ID; attached to the
        supervision when an entry for this utterance exists.
    :return: the (recording, supervision) pair, or None when the line has no
        transcript text or the audio file does not exist.
    """
    fields = line.strip().split(maxsplit=1)
    if len(fields) < 2:
        # An empty transcript leaves a single field (or none); unpacking it
        # into two names would raise ValueError.
        logging.warning(f"Skipping line without transcript text: {line!r}")
        return None
    recording_id, text = fields
    recording_id = recording_id.split('/')[-1].split('.txt')[0]
    # Create the Recording first
    audio_path = (
        dataset_split_path / f"{recording_id}.wav"
    )
    if not os.path.exists(audio_path):
        logging.warning(f"No such file: {audio_path}")
        return None
    recording = Recording.from_file(audio_path, recording_id=recording_id)
    # Then, create the corresponding supervisions
    segment = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language="English",
        # The speaker is the part of the ID before the first "-".
        speaker=re.sub(r"-.*", r"", recording.id),
        text=text.strip(),
        alignment={"word": alignments[recording_id]}
        if recording_id in alignments
        else None,
    )
    return recording, segment
 def parse_alignments(ali_path: Pathlike) -> Dict[str, List[AlignmentItem]]:
    """Read a word alignment file into a dict keyed by utterance ID.

    Each line holds three whitespace-separated fields: the utterance ID, the
    comma-separated words and the comma-separated timestamps. Double quotes
    in the last two fields are dropped. Every word starts at the previous
    timestamp (0.0 for the first word) and lasts until its own timestamp.
    """
    result = {}
    for row in Path(ali_path).read_text().splitlines():
        utt_id, raw_words, raw_times = row.split()
        symbols = raw_words.replace('"', "").split(",")
        ends = [float(t) for t in raw_times.replace('"', "").split(",")]
        starts = [0.0] + ends
        items = []
        for symbol, begin, end in zip(symbols, starts, ends):
            items.append(
                AlignmentItem(
                    symbol=symbol,
                    start=begin,
                    duration=round(end - begin, ndigits=8),
                )
            )
        result[utt_id] = items
    return result
 def main(corpus_dir):
    """Prepare the manifests of ``corpus_dir`` and store them in data/manifests."""
    prepare_LJSpeech(
        corpus_dir,
        "auto",
        "data/manifests",
        15,
    )
 if __name__ == "__main__":
    # Guarded so that importing this module does not start a preparation run.
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of a bare IndexError.
        sys.exit(f"Usage: {sys.argv[0]} <corpus-dir>")
    corpus_dir = sys.argv[1]
    main(corpus_dir)
--- a/egs/LJSpeech/ASR/local/prepare_LJSpeech_pseudo.py
+++ b/egs/LJSpeech/ASR/local/prepare_LJSpeech_pseudo.py
@ -0,0 +1,179 @@
 import logging
 import sys
 import os
 import re
 import shutil
 import tarfile
 import zipfile
 from concurrent.futures.thread import ThreadPoolExecutor
 from pathlib import Path
 from typing import Dict, List, Optional, Sequence, Tuple, Union
 from tqdm.auto import tqdm
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
 from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
 from lhotse.utils import (
    Pathlike,
    is_module_available,
    safe_extract,
    urlretrieve_progress,
 )
 # LIBRISPEECH_ALIGNMENTS_URL = (
 #     "https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
 # )
 def prepare_LJSpeech(
    corpus_dir: str,
    dataset_parts: str = "auto",
    output_dir: str = None,
    num_jobs: int = 1,
 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build the "pseudo" recording and supervision manifests of the LJSpeech
    splits.

    Audio is read from ``corpus_dir/wavs/<part>/*.wav`` and the transcript of
    each utterance from ``corpus_dir/texts/<utterance-id>.txt``.
    A split whose manifests already exist in ``output_dir`` is skipped and is
    not part of the returned dict.

    :param corpus_dir: the path of the data dir.
    :param dataset_parts: accepted for interface compatibility but currently
        ignored: the splits 'train', 'dev' and 'test' are always processed.
    :param output_dir: the path where to write the manifests; nothing is
        written when it is None.
    :param num_jobs: currently unused, utterances are processed sequentially.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    assert os.path.exists(corpus_dir), f"{corpus_dir} does not exist"
    manifests = {}
    dataset_parts = ["train", "dev", "test"]
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    txt_dir = Path(corpus_dir) / "texts"
    # No word alignments are loaded here; parse_utterance only looks IDs up.
    alignments = {}
    wav_suffix = ".wav"
    for part in tqdm(dataset_parts, desc="Dataset parts"):
        logging.info(f"Processing LJSpeech subset: {part}")
        if manifests_exist(part=part, output_dir=output_dir):
            logging.info(f"LJSpeech subset: {part} already prepared - skipping.")
            continue
        recordings = []
        supervisions = []
        part_path = Path(os.path.join(corpus_dir, "wavs", part))
        # Remove the ".wav" suffix itself. str.strip('.wav') would instead
        # strip any of the characters '.', 'w', 'a', 'v' from both ends of
        # the name. A set keeps the membership test below cheap.
        part_utt_ids = {
            name[: -len(wav_suffix)]
            for name in os.listdir(part_path)
            if name.endswith(wav_suffix)
        }
        for trans_path in tqdm(
            txt_dir.glob("*.txt"), desc="Processing", leave=False
        ):
            # Skip transcripts of other splits before opening the file.
            if trans_path.stem not in part_utt_ids:
                continue
            with open(trans_path) as f:
                for line in f:
                    # Only the file name is passed along: parse_utterance
                    # splits on whitespace, so a directory name containing
                    # spaces must not be part of the line.
                    result = parse_utterance(
                        part_path, f"{trans_path.name} {line}", alignments
                    )
                    if result is None:
                        continue
                    recording, segment = result
                    recordings.append(recording)
                    supervisions.append(segment)
        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)
        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"LJSpeech_pseudo_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(
                output_dir / f"LJSpeech_pseudo_recordings_{part}.jsonl.gz"
            )
        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }
    return manifests
 def parse_utterance(
    dataset_split_path: Path,
    line: str,
    alignments: Dict[str, List[AlignmentItem]],
 ) -> Optional[Tuple[Recording, SupervisionSegment]]:
    """Turn one ``"<transcript path> <text>"`` line into manifest entries.

    The utterance ID is the file name of the transcript path without its
    ``.txt`` part; the audio is expected at
    ``dataset_split_path/<utterance-id>.wav``.

    :param dataset_split_path: directory holding the wav files of the split.
    :param line: the transcript path and the transcript text, separated by
        whitespace.
    :param alignments: word alignments keyed by utterance ID; attached to the
        supervision when an entry for this utterance exists.
    :return: the (recording, supervision) pair, or None when the line has no
        transcript text or the audio file does not exist.
    """
    fields = line.strip().split(maxsplit=1)
    if len(fields) < 2:
        # An empty transcript leaves a single field (or none); unpacking it
        # into two names would raise ValueError.
        logging.warning(f"Skipping line without transcript text: {line!r}")
        return None
    recording_id, text = fields
    recording_id = recording_id.split('/')[-1].split('.txt')[0]
    # Create the Recording first
    audio_path = (
        dataset_split_path / f"{recording_id}.wav"
    )
    if not os.path.exists(audio_path):
        logging.warning(f"No such file: {audio_path}")
        return None
    recording = Recording.from_file(audio_path, recording_id=recording_id)
    # Then, create the corresponding supervisions
    segment = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language="English",
        # The speaker is the part of the ID before the first "-".
        speaker=re.sub(r"-.*", r"", recording.id),
        text=text.strip(),
        alignment={"word": alignments[recording_id]}
        if recording_id in alignments
        else None,
    )
    return recording, segment
 def parse_alignments(ali_path: Pathlike) -> Dict[str, List[AlignmentItem]]:
    """Read a word alignment file into a dict keyed by utterance ID.

    Each line holds three whitespace-separated fields: the utterance ID, the
    comma-separated words and the comma-separated timestamps. Double quotes
    in the last two fields are dropped. Every word starts at the previous
    timestamp (0.0 for the first word) and lasts until its own timestamp.
    """
    result = {}
    for row in Path(ali_path).read_text().splitlines():
        utt_id, raw_words, raw_times = row.split()
        symbols = raw_words.replace('"', "").split(",")
        ends = [float(t) for t in raw_times.replace('"', "").split(",")]
        starts = [0.0] + ends
        items = []
        for symbol, begin, end in zip(symbols, starts, ends):
            items.append(
                AlignmentItem(
                    symbol=symbol,
                    start=begin,
                    duration=round(end - begin, ndigits=8),
                )
            )
        result[utt_id] = items
    return result
 def main(corpus_dir):
    """Prepare the manifests of ``corpus_dir`` and store them in data/manifests."""
    prepare_LJSpeech(
        corpus_dir,
        "auto",
        "data/manifests",
        15,
    )
 if __name__ == "__main__":
    # Guarded so that importing this module does not start a preparation run.
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of a bare IndexError.
        sys.exit(f"Usage: {sys.argv[0]} <corpus-dir>")
    corpus_dir = sys.argv[1]
    main(corpus_dir)
--- a/egs/LJSpeech/ASR/local/prepare_LJSpeech_text.py
+++ b/egs/LJSpeech/ASR/local/prepare_LJSpeech_text.py
@ -0,0 +1,35 @@
 import os
 import sys
 import re
 # Split a "|"-separated metadata file (given as the first command line
 # argument) into one transcript file per utterance. The files are written to
 # a "texts" directory located next to the metadata file.
 metafile = sys.argv[1]
 outdir = "texts"
 # os.path.dirname keeps a leading "/" (e.g. for "/metadata.csv"), which a
 # manual split/join on "/" loses.
 save_dir = os.path.join(os.path.dirname(metafile), outdir)
 os.makedirs(save_dir, exist_ok=True)
 # Decode as UTF-8 explicitly so the result does not depend on the locale
 # (assumes the metadata file is UTF-8 encoded - TODO confirm).
 with open(metafile, 'r', encoding="utf-8") as f:
    strings = f.readlines()
 for string in strings:
    # Split the line into its fields: <file name>|<text>[|<normalized text>]
    parts = string.rstrip("\n").split("|")
    if len(parts) < 2:
        # Blank or malformed line: there is no transcript to write.
        continue
    filename = parts[0]
    # Prefer the third field and fall back to the second one when absent.
    text2 = parts[2] if len(parts) > 2 else parts[1]
    # Keep only upper-case letters, spaces and apostrophes.
    text2 = text2.upper()
    text2 = re.sub(r"[^A-Z ']", "", text2)
    # Create a new text file with the filename and write text2 to it
    filename = os.path.join(save_dir, filename)
    with open(f"{filename}.txt", "w") as file:
        file.write(text2)
--- a/egs/LJSpeech/ASR/local/prepare_lang.py
+++ b/egs/LJSpeech/ASR/local/prepare_lang.py
@ -0,0 +1,413 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input a lexicon file "data/lang_phone/lexicon.txt"
 consisting of words and tokens (i.e., phones) and does the following:
 1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt
 2. Generate tokens.txt, the token table mapping a token to a unique integer.
 3. Generate words.txt, the word table mapping a word to a unique integer.
 4. Generate L.pt, in k2 format. It can be loaded by
        d = torch.load("L.pt")
        lexicon = k2.Fsa.from_dict(d)
 5. Generate L_disambig.pt, in k2 format.
 """
 import argparse
 import math
 from collections import defaultdict
 from pathlib import Path
 from typing import Any, Dict, List, Tuple
 import k2
 import torch
 from icefall.lexicon import read_lexicon, write_lexicon
 from icefall.utils import str2bool
 # A lexicon is a list of (word, [token1, token2, ...]) entries.
 Lexicon = List[Tuple[str, List[str]]]
 def get_args():
    """Parse the command line options ``--lang-dir`` and ``--debug``."""
    arg_parser = argparse.ArgumentParser()
    lang_dir_help = """Input and output directory.
        It should contain a file lexicon.txt.
        Generated files by this script are saved into this directory.
        """
    debug_help = """True for debugging, which will generate
        a visualization of the lexicon FST.
        Caution: If your lexicon contains hundreds of thousands
        of lines, please set it to False!
        """
    arg_parser.add_argument("--lang-dir", type=str, help=lang_dir_help)
    arg_parser.add_argument(
        "--debug", type=str2bool, default=False, help=debug_help
    )
    parsed = arg_parser.parse_args()
    return parsed
 def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
    """Save a symbol table as UTF-8 text, one "<symbol> <id>" pair per line.

    Note:
      A matching reader is not needed because the file can be loaded
      with :func:`k2.SymbolTable.from_file`.
    Args:
      filename:
        Filename to save the mapping.
      sym2id:
        A dict mapping symbols to IDs.
    Returns:
      Return None.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(f"{sym} {i}\n" for sym, i in sym2id.items())
 def get_tokens(lexicon: Lexicon) -> List[str]:
    """Collect every token used by any pronunciation of the lexicon.

    Args:
      lexicon:
        It is the return value of :func:`read_lexicon`.
    Returns:
      Return a sorted list of unique tokens.
    """
    unique_tokens = {token for _, tokens in lexicon for token in tokens}
    return sorted(unique_tokens)
 def get_words(lexicon: Lexicon) -> List[str]:
    """Collect every word that has an entry in the lexicon.

    Args:
      lexicon:
        It is the return value of :func:`read_lexicon`.
    Returns:
      Return a sorted list of unique words.
    """
    unique_words = {word for word, _ in lexicon}
    return sorted(unique_words)
 def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]:
    """Append pseudo-tokens #1, #2, ... to pronunciations that would clash.

    A pronunciation receives a disambiguation symbol when several entries
    share it or when it is a proper prefix of another pronunciation. After
    this, all pronunciations differ and none is a prefix of another.
    See also add_lex_disambig.pl from kaldi.

    Args:
      lexicon:
        It is returned by :func:`read_lexicon`.
    Returns:
      Return a tuple with two elements:
        - The output lexicon with disambiguation symbols
        - The ID of the max disambiguation symbol that appears
          in the lexicon
    """
    # (1) How many entries use each token sequence.
    occurrences = {}
    for _, tokens in lexicon:
        key = " ".join(tokens)
        occurrences[key] = occurrences.get(key, 0) + 1
    # (2) Every proper, non-empty prefix of every token sequence.
    proper_prefixes = set()
    for _, tokens in lexicon:
        head = tokens.copy()
        head.pop()  # a full sequence is not a proper prefix of itself
        for length in range(len(head), 0, -1):
            proper_prefixes.add(" ".join(head[:length]))
    # (3) Entries that are unique and not a prefix of anything stay as they
    # are; every other entry gets the next unused symbol of its sequence.
    result = []
    # We start with #1 since #0 has its own purpose
    first_allowed_disambig = 1
    max_disambig = first_allowed_disambig - 1
    last_used = {}
    for word, tokens in lexicon:
        key = " ".join(tokens)
        assert key != ""
        if key not in proper_prefixes and occurrences[key] == 1:
            result.append((word, tokens))
            continue
        number = last_used.get(key, first_allowed_disambig - 1) + 1
        max_disambig = max(max_disambig, number)
        last_used[key] = number
        result.append((word, f"{key} #{number}".split()))
    return result, max_disambig
 def generate_id_map(symbols: List[str]) -> Dict[str, int]:
    """Number the given symbols consecutively, starting from 0.

    Args:
      symbols:
        A list of unique symbols.
    Returns:
      A dict mapping every symbol to its position in ``symbols``.
    """
    return dict(zip(symbols, range(len(symbols))))
 def add_self_loops(
    arcs: List[List[Any]], disambig_token: int, disambig_word: int
 ) -> List[List[Any]]:
    """Add a disambiguation self-loop to every state that emits a word.

    A state gets a self-loop when at least one arc leaving it has a
    non-epsilon (non-zero) output label. The loop has `disambig_token` as
    input label, `disambig_word` as output label and score 0, so that
    disambiguation symbols can be propagated through the FST.

    See also fstaddselfloops.pl from Kaldi. One difference is that
    Kaldi uses OpenFst style FSTs and it has multiple final states.
    This function uses k2 style FSTs and it does not need to add self-loops
    to the final state.

    Args:
      arcs:
        A list-of-list. The sublist contains
        `[src_state, dest_state, label, aux_label, score]`
      disambig_token:
        It is the token ID of the symbol `#0`.
      disambig_word:
        It is the word ID of the symbol `#0`.
    Return:
      Return new `arcs` containing self-loops.
    """
    emitting_states = {
        src for src, _dst, _ilabel, olabel, _score in arcs if olabel != 0
    }
    self_loops = [
        [state, state, disambig_token, disambig_word, 0]
        for state in emitting_states
    ]
    return arcs + self_loops
 def lexicon_to_fst(
    lexicon: Lexicon,
    token2id: Dict[str, int],
    word2id: Dict[str, int],
    sil_token: str = "SIL",
    sil_prob: float = 0.5,
    need_self_loops: bool = False,
 ) -> k2.Fsa:
    """Convert a lexicon to an FST (in k2 format) with optional silence at
    the beginning and end of each word.
    Args:
      lexicon:
        The input lexicon. See also :func:`read_lexicon`
      token2id:
        A dict mapping tokens to IDs. "<eps>" must map to 0.
      word2id:
        A dict mapping words to IDs. "<eps>" must map to 0.
      sil_token:
        The silence token.
      sil_prob:
        The probability for adding a silence at the beginning and end
        of the word. It must lie strictly between 0 and 1.
      need_self_loops:
        If True, add self-loop to states with non-epsilon output symbols
        on at least one arc out of the state. The input label for this
        self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
    Returns:
      Return an instance of `k2.Fsa` representing the given lexicon.
    """
    assert sil_prob > 0.0 and sil_prob < 1.0
    # CAUTION: we use score, i.e, negative cost.
    sil_score = math.log(sil_prob)
    no_sil_score = math.log(1.0 - sil_prob)
    start_state = 0
    loop_state = 1  # words enter and leave from here
    sil_state = 2  # words terminate here when followed by silence; this state
    # has a silence transition to loop_state.
    next_state = 3  # the next un-allocated state, will be incremented as we go.
    # Every arc is [src_state, dest_state, token_id, word_id, score].
    arcs = []
    assert token2id["<eps>"] == 0
    assert word2id["<eps>"] == 0
    eps = 0
    # From here on sil_token is the integer ID of the silence token.
    sil_token = token2id[sil_token]
    # Optional silence at the very beginning.
    arcs.append([start_state, loop_state, eps, eps, no_sil_score])
    arcs.append([start_state, sil_state, eps, eps, sil_score])
    arcs.append([sil_state, loop_state, sil_token, eps, 0])
    for word, tokens in lexicon:
        assert len(tokens) > 0, f"{word} has no pronunciations"
        cur_state = loop_state
        # Replace the symbols by their integer IDs.
        word = word2id[word]
        tokens = [token2id[i] for i in tokens]
        # All tokens but the last one form a chain of freshly allocated
        # states. The word ID is output on the first arc only; the other
        # arcs output epsilon.
        for i in range(len(tokens) - 1):
            w = word if i == 0 else eps
            arcs.append([cur_state, next_state, tokens[i], w, 0])
            cur_state = next_state
            next_state += 1
        # now for the last token of this word
        # It has two out-going arcs, one to the loop state,
        # the other one to the sil_state.
        i = len(tokens) - 1
        w = word if i == 0 else eps
        arcs.append([cur_state, loop_state, tokens[i], w, no_sil_score])
        arcs.append([cur_state, sil_state, tokens[i], w, sil_score])
    if need_self_loops:
        disambig_token = token2id["#0"]
        disambig_word = word2id["#0"]
        arcs = add_self_loops(
            arcs,
            disambig_token=disambig_token,
            disambig_word=disambig_word,
        )
    # The first state that was never allocated becomes the final state. It is
    # entered from loop_state by an arc whose labels are both -1 and is
    # listed on a line of its own.
    final_state = next_state
    arcs.append([loop_state, final_state, -1, -1, 0])
    arcs.append([final_state])
    # Order the arcs by source state. sorted() is stable, so arcs leaving the
    # same state keep their insertion order.
    arcs = sorted(arcs, key=lambda arc: arc[0])
    # Serialize to text: one arc per line, fields separated by spaces.
    arcs = [[str(i) for i in arc] for arc in arcs]
    arcs = [" ".join(arc) for arc in arcs]
    arcs = "\n".join(arcs)
    fsa = k2.Fsa.from_str(arcs, acceptor=False)
    return fsa
 def main():
    """Generate the token/word tables and the lexicon FSTs in ``--lang-dir``.

    Reads ``lexicon.txt`` and writes ``tokens.txt``, ``words.txt``,
    ``lexicon_disambig.txt``, ``L.pt`` and ``L_disambig.pt``. With
    ``--debug`` the two FSTs are also drawn as ``L.svg`` and
    ``L_disambig.svg``.
    """
    args = get_args()
    lang_dir = Path(args.lang_dir)
    sil_token = "SIL"
    sil_prob = 0.5

    lexicon = read_lexicon(lang_dir / "lexicon.txt")
    tokens = get_tokens(lexicon)
    words = get_words(lexicon)
    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)

    # The token table also holds #0 .. #max_disambig and starts with <eps>.
    for i in range(max_disambig + 1):
        disambig = f"#{i}"
        assert disambig not in tokens
        tokens.append(disambig)
    assert "<eps>" not in tokens
    tokens = ["<eps>"] + tokens

    # The word table starts with <eps> and ends with #0, <s> and </s>.
    for reserved in ("<eps>", "#0", "<s>", "</s>"):
        assert reserved not in words
    words = ["<eps>"] + words + ["#0", "<s>", "</s>"]

    token2id = generate_id_map(tokens)
    word2id = generate_id_map(words)
    write_mapping(lang_dir / "tokens.txt", token2id)
    write_mapping(lang_dir / "words.txt", word2id)
    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)

    shared = dict(
        token2id=token2id,
        word2id=word2id,
        sil_token=sil_token,
        sil_prob=sil_prob,
    )
    L = lexicon_to_fst(lexicon, **shared)
    L_disambig = lexicon_to_fst(lexicon_disambig, need_self_loops=True, **shared)
    torch.save(L.as_dict(), lang_dir / "L.pt")
    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")

    if not args.debug:
        return
    labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
    aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
    for fst, stem in ((L, "L"), (L_disambig, "L_disambig")):
        fst.labels_sym = labels_sym
        fst.aux_labels_sym = aux_labels_sym
        fst.draw(f"{lang_dir / (stem + '.svg')}", title=f"{stem}.pt")
 if __name__ == "__main__":
    # Run the lexicon preparation only when executed as a script.
    main()
--- a/egs/LJSpeech/ASR/local/prepare_lang_bpe.py
+++ b/egs/LJSpeech/ASR/local/prepare_lang_bpe.py
@ -0,0 +1,259 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)
 """
 This script takes as input `lang_dir`, which should contain::
    - lang_dir/bpe.model,
    - lang_dir/words.txt
 and generates the following files in the directory `lang_dir`:
    - lexicon.txt
    - lexicon_disambig.txt
    - L.pt
    - L_disambig.pt
    - tokens.txt
 """
 import argparse
 from pathlib import Path
 from typing import Dict, List, Tuple
 import k2
 import sentencepiece as spm
 import torch
 from prepare_lang import (
    Lexicon,
    add_disambig_symbols,
    add_self_loops,
    write_lexicon,
    write_mapping,
 )
 from icefall.utils import str2bool
 def lexicon_to_fst_no_sil(
    lexicon: Lexicon,
    token2id: Dict[str, int],
    word2id: Dict[str, int],
    need_self_loops: bool = False,
 ) -> k2.Fsa:
    """Build a lexicon FST (in k2 format) without any silence arcs.
    Each lexicon entry becomes one path that leaves a single shared state
    and returns to it.  The word ID is emitted on the first arc of the
    path and epsilon on every following arc.
    Args:
      lexicon:
        The input lexicon: an iterable of (word, pieces) pairs.
        See also :func:`read_lexicon`
      token2id:
        A dict mapping tokens to IDs. `<blk>` must map to 0.
      word2id:
        A dict mapping words to IDs. `<eps>` must map to 0.
      need_self_loops:
        If True, the arcs are passed through :func:`add_self_loops` with
        `token2id["#0"]` as the input label and `word2id["#0"]` as the
        output label of the self loops.
    Returns:
      Return an instance of `k2.Fsa` representing the given lexicon.
    """
    # The blank symbol <blk> is defined in local/train_bpe_model.py
    assert token2id["<blk>"] == 0
    assert word2id["<eps>"] == 0
    hub = 0  # every word path starts and ends at this state
    unused_state = 1  # smallest state number that has not been handed out
    epsilon = 0
    arcs = []
    for word, pieces in lexicon:
        assert len(pieces) > 0, f"{word} has no pronunciations"
        word_id = word2id[word]
        piece_ids = [token2id[p] for p in pieces]
        last = len(piece_ids) - 1
        src = hub
        for pos, piece_id in enumerate(piece_ids):
            # Only the first arc of a word carries the word label.
            out_label = word_id if pos == 0 else epsilon
            if pos < last:
                arcs.append([src, unused_state, piece_id, out_label, 0])
                src = unused_state
                unused_state += 1
            else:
                # The last piece closes the path back at the shared state.
                arcs.append([src, hub, piece_id, out_label, 0])
    if need_self_loops:
        arcs = add_self_loops(
            arcs,
            disambig_token=token2id["#0"],
            disambig_word=word2id["#0"],
        )
    final_state = unused_state
    arcs.append([hub, final_state, -1, -1, 0])
    arcs.append([final_state])
    # Order the arcs by source state before serializing them, one per line.
    ordered = sorted(arcs, key=lambda arc: arc[0])
    text = "\n".join(" ".join(str(field) for field in arc) for arc in ordered)
    return k2.Fsa.from_str(text, acceptor=False)
 def generate_lexicon(
    model_file: str, words: List[str]
 ) -> Tuple[Lexicon, Dict[str, int]]:
    """Derive a word-piece lexicon and a token table from a BPE model.
    Args:
      model_file:
        Path to a sentencepiece model.
      words:
        A list of strings representing words.
    Returns:
      Return a tuple with two elements:
        - The lexicon: a list of (word, word pieces) pairs, one pair per
          input word in input order, followed by an entry for "<UNK>"
          whose only piece is the model's unknown piece.
        - A dict representing the token symbol table, mapping every piece
          of the model to its ID.
    """
    processor = spm.SentencePieceProcessor()
    processor.load(str(model_file))
    # Encode the words to piece IDs first and map the IDs back to piece
    # strings afterwards; going through IDs avoids OOV tokens.
    encoded: List[List[int]] = processor.encode(words, out_type=int)
    decoded: List[List[str]] = [processor.id_to_piece(ids) for ids in encoded]
    lexicon = list(zip(words, decoded))
    # The OOV word is <UNK>
    unk_piece = processor.id_to_piece(processor.unk_id())
    lexicon.append(("<UNK>", [unk_piece]))
    token2id: Dict[str, int] = {
        processor.id_to_piece(idx): idx for idx in range(processor.vocab_size())
    }
    return lexicon, token2id
 def get_args():
    """Parse the command-line options of this script.
    Returns:
      The parsed namespace with the attributes `lang_dir` (str) and
      `debug` (bool, False unless given).
    """
    parser = argparse.ArgumentParser()
    # main() uses --lang-dir unconditionally, so a missing value is reported
    # as a usage error here instead of a TypeError later on.
    parser.add_argument(
        "--lang-dir",
        type=str,
        required=True,
        help="""Input and output directory.
        It should contain the bpe.model and words.txt
        """,
    )
    parser.add_argument(
        "--debug",
        type=str2bool,
        default=False,
        help="""True for debugging, which will generate
        a visualization of the lexicon FST.
        Caution: If your lexicon contains hundreds of thousands
        of lines, please set it to False!
        See "test/test_bpe_lexicon.py" for usage.
        """,
    )
    return parser.parse_args()
 def main():
    """Generate the BPE lexicon, the token table and the L FSTs.
    Reads `bpe.model` and `words.txt` from the directory given by
    `--lang-dir` and writes `tokens.txt`, `lexicon.txt`,
    `lexicon_disambig.txt`, `L.pt` and `L_disambig.pt` into the same
    directory.  With `--debug`, both FSTs are also drawn to SVG files.
    """
    args = get_args()
    lang_dir = Path(args.lang_dir)
    word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
    words = word_sym_table.symbols
    # These special symbols are dropped from the word list before the BPE
    # model is asked for pronunciations.
    for special in ("<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>"):
        if special in words:
            words.remove(special)
    lexicon, token_sym_table = generate_lexicon(lang_dir / "bpe.model", words)
    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
    # Disambiguation symbols #0..#max_disambig get the IDs that directly
    # follow the largest ID already present in the token table.
    first_disambig_id = max(token_sym_table.values()) + 1
    for offset in range(max_disambig + 1):
        symbol = f"#{offset}"
        assert symbol not in token_sym_table
        token_sym_table[symbol] = first_disambig_id + offset
    for extra_word in ("#0", "<s>", "</s>"):
        word_sym_table.add(extra_word)
    write_mapping(lang_dir / "tokens.txt", token_sym_table)
    write_lexicon(lang_dir / "lexicon.txt", lexicon)
    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
    tables = dict(token2id=token_sym_table, word2id=word_sym_table)
    L = lexicon_to_fst_no_sil(lexicon, **tables)
    L_disambig = lexicon_to_fst_no_sil(
        lexicon_disambig, need_self_loops=True, **tables
    )
    torch.save(L.as_dict(), lang_dir / "L.pt")
    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
    if not args.debug:
        return
    # Debug only: attach the symbol tables read back from disk and render
    # both FSTs.
    labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
    aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
    L.labels_sym = labels_sym
    L.aux_labels_sym = aux_labels_sym
    L.draw(f"{lang_dir / 'L.svg'}", title="L.pt")
    L_disambig.labels_sym = labels_sym
    L_disambig.aux_labels_sym = aux_labels_sym
    L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", title="L_disambig.pt")
 # Script entry point: main() runs only when this file is executed directly,
 # not when it is imported as a module.
 if __name__ == "__main__":
    main()
--- a/egs/LJSpeech/ASR/local/prepare_lm_training_data.py
+++ b/egs/LJSpeech/ASR/local/prepare_lm_training_data.py
@ -0,0 +1,167 @@
 #!/usr/bin/env python3
 # Copyright (c)  2021  Xiaomi Corporation (authors: Daniel Povey
 #                                                   Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes a `bpe.model` and a text file such as
 ./download/lm/librispeech-lm-norm.txt
 and outputs the LM training data to a supplied directory such
 as data/lm_training_bpe_500.  The format is as follows:
 It creates a PyTorch archive (.pt file), say data/lm_training.pt, which is a
 representation of a dict with the following format:
  'words' -> a k2.RaggedTensor of two axes [word][token] with dtype torch.int32
             containing the BPE representations of each word, indexed by
             integer word ID. (These integer word IDs are present in
             'sentences').  The sentencepiece object can be used to turn the
             words and BPE units into string form.
  'sentences' -> a k2.RaggedTensor of two axes [sentence][word] with dtype
            torch.int32 containing all the sentences, as word-ids (we don't
            output the string form of this directly but it can be worked out
            together with 'words' and the bpe.model).
  'sentence_lengths' -> a 1-D torch.Tensor of dtype torch.int32, containing
            number of BPE tokens of each sentence.
 """
 import argparse
 import logging
 from pathlib import Path
 import k2
 import sentencepiece as spm
 import torch
 def get_args():
    """Parse the command-line options of this script.
    Returns:
      The parsed namespace with the string attributes `bpe_model`,
      `lm_data` and `lm_archive`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bpe-model",
        type=str,
        help="Input BPE model, e.g. data/bpe_500/bpe.model",
    )
    parser.add_argument(
        "--lm-data",
        type=str,
        help="""Input LM training data as text, e.g.
        download/pb.train.txt""",
    )
    # main() checks this path before doing anything else, so it is the one
    # option every invocation needs; a missing value is reported as a usage
    # error instead of a TypeError from Path(None).
    parser.add_argument(
        "--lm-archive",
        type=str,
        required=True,
        help="""Path to output archive, e.g. data/bpe_500/lm_data.pt;
        look at the source of this script to see the format.""",
    )
    return parser.parse_args()
 def main():
    """Convert a text corpus into the LM training archive.
    Each distinct word of `--lm-data` is BPE-encoded once with the model
    from `--bpe-model`; sentences are stored as lists of word indexes.  The
    resulting dict (`words`, `sentences`, `sentence_lengths`) is saved to
    `--lm-archive`.  Nothing is done if the archive already exists.
    """
    args = get_args()
    if Path(args.lm_archive).exists():
        logging.warning(f"{args.lm_archive} exists - skipping")
        return
    sp = spm.SentencePieceProcessor()
    sp.load(args.bpe_model)
    # Progress is only reported for inputs whose path contains one of these
    # markers: (marker, total number of lines, logging interval).  The first
    # marker found in the path wins.
    known_inputs = (
        ("librispeech-lm-norm", 40418261.0, 5000000),
        ("valid", 5567.0, 3000),
        ("test", 5559.0, 3000),
    )
    num_lines_in_total = None
    step = None
    for marker, total, interval in known_inputs:
        if marker in args.lm_data:
            num_lines_in_total, step = total, interval
            break
    # word2index maps words to integer ids.  No need to reserve space for
    # epsilon, etc.; the words are just a convenient way to compress the
    # sequences of BPE pieces.
    word2index = dict()
    word2bpe = []  # list-of-list-of-int: the BPE pieces of every word.
    sentences = []  # list-of-list-of-int: the word ids of every sentence.
    with open(args.lm_data) as f:
        for processed, line in enumerate(f):
            if step and processed % step == 0:
                logging.info(
                    f"Processed number of lines: {processed} "
                    f"({processed/num_lines_in_total*100: .3f}%)"
                )
            word_ids = []
            for w in line.split():
                if w not in word2index:
                    pieces = sp.encode(w)
                    word2index[w] = len(word2bpe)
                    word2bpe.append(pieces)
                word_ids.append(word2index[w])
            sentences.append(word_ids)
    logging.info("Constructing ragged tensors")
    words = k2.ragged.RaggedTensor(word2bpe)
    sentences = k2.ragged.RaggedTensor(sentences)
    output = dict(words=words, sentences=sentences)
    num_sentences = sentences.dim0
    logging.info(f"Computing sentence lengths, num_sentences: {num_sentences}")
    sentence_lengths = []
    for i in range(num_sentences):
        if step and i % step == 0:
            logging.info(
                f"Processed number of lines: {i} ({i/num_sentences*100: .3f}%)"
            )
        # NOTE: If the sentence has only 1 word, indexing `words` yields a
        # torch.Tensor instead of a ragged tensor.
        token_ids = words[sentences[i]]
        if isinstance(token_ids, k2.RaggedTensor):
            token_ids = token_ids.values
        # token_ids is now a 1-D tensor with the BPE tokens of sentence i.
        sentence_lengths.append(token_ids.numel())
    output["sentence_lengths"] = torch.tensor(sentence_lengths, dtype=torch.int32)
    torch.save(output, args.lm_archive)
    logging.info(f"Saved to {args.lm_archive}")
 # Script entry point: set up INFO-level logging with a timestamp and the
 # source location of each message, then run main().
 if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
--- a/egs/LJSpeech/ASR/local/prepare_userlibri.py
+++ b/egs/LJSpeech/ASR/local/prepare_userlibri.py
@ -0,0 +1,255 @@
 import logging
 import os
 import re
 import shutil
 import tarfile
 import zipfile
 from concurrent.futures.thread import ThreadPoolExecutor
 from pathlib import Path
 from typing import Dict, List, Optional, Sequence, Tuple, Union
 from tqdm.auto import tqdm
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
 from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
 from lhotse.utils import (
    Pathlike,
    is_module_available,
    safe_extract,
    urlretrieve_progress,
 )
 # LIBRISPEECH_ALIGNMENTS_URL = (
 #     "https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
 # )
 # def download_librispeech(
 #     target_dir: Pathlike = ".",
 #     dataset_parts: Optional[Union[str, Sequence[str]]] = "mini_librispeech",
 #     force_download: bool = False,
 #     alignments: bool = False,
 #     base_url: str = "http://www.openslr.org/resources",
 #     alignments_url: str = LIBRISPEECH_ALIGNMENTS_URL,
 # ) -> Path:
 #     """
 #     Download and untar the dataset, supporting both LibriSpeech and MiniLibrispeech
 #     :param target_dir: Pathlike, the path of the dir to storage the dataset.
 #     :param dataset_parts: "librispeech", "mini_librispeech",
 #         or a list of splits (e.g. "dev-clean") to download.
 #     :param force_download: Bool, if True, download the tars no matter if the tars exist.
 #     :param alignments: should we download the alignments. The original source is:
 #         https://github.com/CorentinJ/librispeech-alignments
 #     :param base_url: str, the url of the OpenSLR resources.
 #     :param alignments_url: str, the url of LibriSpeech word alignments
 #     :return: the path to downloaded and extracted directory with data.
 #     """
 #     target_dir = Path(target_dir)
 #     corpus_dir = target_dir / "LibriSpeech"
 #     target_dir.mkdir(parents=True, exist_ok=True)
 #     if dataset_parts == "librispeech":
 #         dataset_parts = LIBRISPEECH
 #     elif dataset_parts == "mini_librispeech":
 #         dataset_parts = MINI_LIBRISPEECH
 #     elif isinstance(dataset_parts, str):
 #         dataset_parts = [dataset_parts]
 #     for part in tqdm(dataset_parts, desc="Downloading LibriSpeech parts"):
 #         logging.info(f"Processing split: {part}")
 #         # Determine the valid URL for a given split.
 #         if part in LIBRISPEECH:
 #             url = f"{base_url}/12"
 #         elif part in MINI_LIBRISPEECH:
 #             url = f"{base_url}/31"
 #         else:
 #             logging.warning(f"Invalid dataset part name: {part}")
 #             continue
 #         # Split directory exists and seem valid? Skip this split.
 #         part_dir = corpus_dir / part
 #         completed_detector = part_dir / ".completed"
 #         if completed_detector.is_file():
 #             logging.info(f"Skipping {part} because {completed_detector} exists.")
 #             continue
 #         # Maybe-download the archive.
 #         tar_name = f"{part}.tar.gz"
 #         tar_path = target_dir / tar_name
 #         if force_download or not tar_path.is_file():
 #             urlretrieve_progress(
 #                 f"{url}/{tar_name}", filename=tar_path, desc=f"Downloading {tar_name}"
 #             )
 #         # Remove partial unpacked files, if any, and unpack everything.
 #         shutil.rmtree(part_dir, ignore_errors=True)
 #         with tarfile.open(tar_path) as tar:
 #             safe_extract(tar, path=target_dir)
 #         completed_detector.touch()
 #     if alignments:
 #         completed_detector = target_dir / ".ali_completed"
 #         if completed_detector.is_file() and not force_download:
 #             return corpus_dir
 #         assert is_module_available(
 #             "gdown"
 #         ), 'To download LibriSpeech alignments, please install "pip install gdown"'
 #         import gdown
 #         ali_zip_path = str(target_dir / "LibriSpeech-Alignments.zip")
 #         gdown.download(alignments_url, output=ali_zip_path)
 #         with zipfile.ZipFile(ali_zip_path) as f:
 #             f.extractall(path=target_dir)
 #             completed_detector.touch()
 #     return corpus_dir
 def prepare_userlibri(
    corpus_dir: str,
    dataset_parts: str = "auto",
    output_dir: Optional[str] = None,
    num_jobs: int = 1,
 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    Every entry of ``<corpus_dir>/audio_data/speaker-wise-test`` and of
    ``<corpus_dir>/audio_data/book-wise-test`` is treated as one dataset part;
    the utterances of a part are read from its ``*.trans.txt`` files.
    :param corpus_dir: Pathlike, the path of the UserLibri root directory
        (the one that contains ``audio_data``).
    :param dataset_parts: accepted for interface compatibility only.
        NOTE(review): the value is not consulted; the parts are always
        discovered from the directory listing.
    :param output_dir: Pathlike, the path where to write the manifests.
        Nothing is written when it is None.
    :param num_jobs: number of worker threads used to parse utterances.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    audio_dir = Path(os.fspath(corpus_dir) + "/audio_data")
    assert audio_dir.is_dir(), f"No such directory: {audio_dir}"
    # List both groups up front so that a missing directory fails before any
    # work is done; sorting makes the processing order deterministic.
    groups = ("speaker-wise-test", "book-wise-test")
    parts_by_group = {
        group: sorted(os.listdir(audio_dir / group)) for group in groups
    }
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = {}
    with ThreadPoolExecutor(num_jobs) as ex:
        for group in groups:
            parts = parts_by_group[group]
            if output_dir is not None:
                # Maybe the manifests already exist: we can read them and save
                # a bit of preparation time.  The result is merged into
                # ``manifests`` (not assigned to it) so that the parts of the
                # previous group are kept.
                # NOTE(review): the files below are written with a
                # "userlibri_" prefix while the cache helpers are called with
                # their default prefix - confirm the lookups can find them.
                cached = read_manifests_if_cached(
                    dataset_parts=parts, output_dir=output_dir
                )
                if cached:
                    manifests.update(cached)
            for part in tqdm(parts, desc="Dataset parts"):
                logging.info(f"Processing UserLibri subset: {part}")
                if manifests_exist(part=part, output_dir=output_dir):
                    logging.info(f"UserLibri subset: {part} already prepared - skipping.")
                    continue
                recordings = []
                supervisions = []
                part_path = audio_dir / group / part
                futures = []
                for trans_path in tqdm(
                    part_path.rglob("*.trans.txt"), desc="Distributing tasks", leave=False
                ):
                    # No alignments are loaded here, so every utterance is
                    # parsed against an empty alignment table.
                    alignments = {}
                    with open(trans_path) as f:
                        for line in f:
                            futures.append(
                                ex.submit(parse_utterance, trans_path.parent, line, alignments)
                            )
                for future in tqdm(futures, desc="Processing", leave=False):
                    result = future.result()
                    if result is None:
                        continue
                    recording, segment = result
                    recordings.append(recording)
                    supervisions.append(segment)
                recording_set = RecordingSet.from_recordings(recordings)
                supervision_set = SupervisionSet.from_segments(supervisions)
                validate_recordings_and_supervisions(recording_set, supervision_set)
                if output_dir is not None:
                    supervision_set.to_file(
                        output_dir / f"userlibri_supervisions_{part}.jsonl.gz"
                    )
                    recording_set.to_file(
                        output_dir / f"userlibri_recordings_{part}.jsonl.gz"
                    )
                manifests[part] = {
                    "recordings": recording_set,
                    "supervisions": supervision_set,
                }
    return manifests
 def parse_utterance(
    dataset_split_path: Path,
    line: str,
    alignments: Dict[str, List[AlignmentItem]],
 ) -> Optional[Tuple[Recording, SupervisionSegment]]:
    """Turn one transcript line into a (Recording, SupervisionSegment) pair.
    :param dataset_split_path: directory that holds ``<recording_id>.flac``.
    :param line: a transcript line of the form ``<recording_id> <text>``.
    :param alignments: word alignments keyed by recording ID; the entry for
        this recording, if any, is attached to the supervision.
    :return: the recording and its supervision, or None when the line is
        blank or the audio file does not exist.
    """
    fields = line.strip().split(maxsplit=1)
    if not fields:
        # Blank line (e.g. a trailing empty line in the transcript file).
        return None
    recording_id = fields[0]
    if len(fields) > 1:
        text = fields[1]
    else:
        # An ID without a transcript is kept with an empty text instead of
        # failing on tuple unpacking.
        logging.warning(f"Empty transcript for: {recording_id}")
        text = ""
    # Create the Recording first
    audio_path = dataset_split_path / f"{recording_id}.flac"
    if not audio_path.is_file():
        logging.warning(f"No such file: {audio_path}")
        return None
    recording = Recording.from_file(audio_path, recording_id=recording_id)
    # Then, create the corresponding supervisions
    if recording_id in alignments:
        alignment = {"word": alignments[recording_id]}
    else:
        alignment = None
    segment = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language="English",
        # The speaker is the recording ID up to (not including) its first "-".
        speaker=re.sub(r"-.*", r"", recording.id),
        text=text.strip(),
        alignment=alignment,
    )
    return recording, segment
 def parse_alignments(ali_path: Pathlike) -> Dict[str, List[AlignmentItem]]:
    """Read a word-alignment file into a dict keyed by utterance ID.
    Each line consists of three whitespace-separated fields: the utterance
    ID, a comma-separated list of words and a comma-separated list of end
    times (double quotes in the last two fields are dropped).  Every word
    starts where the previous one ended; the first word starts at 0.0.
    :param ali_path: path to the alignment file.
    :return: a dict mapping each utterance ID to its list of AlignmentItems.
    """
    result = {}
    content = Path(ali_path).read_text()
    for row in content.splitlines():
        utt_id, quoted_words, quoted_times = row.split()
        symbols = quoted_words.replace('"', "").split(",")
        boundaries = [0.0]
        boundaries.extend(
            float(stamp) for stamp in quoted_times.replace('"', "").split(",")
        )
        items = []
        for symbol, begin, finish in zip(symbols, boundaries, boundaries[1:]):
            items.append(
                AlignmentItem(
                    symbol=symbol,
                    start=begin,
                    duration=round(finish - begin, ndigits=8),
                )
            )
        result[utt_id] = items
    return result
 def main():
    """Prepare the UserLibri manifests with this recipe's fixed settings.
    Reads the corpus from /DB/UserLibri and writes the manifests to
    data/manifests using 15 worker threads.
    """
    nj = 15
    output_dir = "data/manifests"
    corpus_dir = "/DB/UserLibri"
    prepare_userlibri(corpus_dir, "auto", output_dir, nj)
 # Guarded so that importing this module (e.g. to reuse prepare_userlibri)
 # does not start the data preparation as a side effect.
 if __name__ == "__main__":
    main()
--- a/egs/LJSpeech/ASR/local/preprocess_gigaspeech.py
+++ b/egs/LJSpeech/ASR/local/preprocess_gigaspeech.py
@ -0,0 +1,129 @@
 #!/usr/bin/env python3
 # Copyright    2021  Johns Hopkins University (Piotr Żelasko)
 # Copyright    2021  Xiaomi Corp.             (Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
 import re
 from pathlib import Path
 from lhotse import CutSet, SupervisionSegment
 from lhotse.recipes.utils import read_manifests_if_cached
 # Similar text filtering and normalization procedure as in:
 # https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh
 def normalize_text(
    utt: str,
    punct_pattern=re.compile(r"<(COMMA|PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
    whitespace_pattern=re.compile(r"\s\s+"),
 ) -> str:
    """Remove punctuation tags from a transcript and squeeze whitespace.
    The tags <COMMA>, <PERIOD>, <QUESTIONMARK> and <EXCLAMATIONPOINT> are
    deleted first; afterwards every run of two or more whitespace characters
    is replaced by a single space.  A single leading or trailing whitespace
    character is left in place.
    """
    without_punct = punct_pattern.sub("", utt)
    squeezed = whitespace_pattern.sub(" ", without_punct)
    return squeezed
 def has_no_oov(
    sup: SupervisionSegment,
    oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
 ) -> bool:
    """Return True if the supervision text has none of the OOV tags.
    The tags looked for are <SIL>, <MUSIC>, <NOISE> and <OTHER>.
    """
    found = oov_pattern.search(sup.text)
    if found is None:
        return True
    return False
 def preprocess_giga_speech():
    """Write raw cut manifests for every GigaSpeech partition.
    The recording and supervision manifests are read from data/manifests.
    For each partition, supervisions containing an OOV tag are dropped, the
    remaining texts are normalized, and the resulting cut set is saved to
    data/fbank.  Partitions whose output file already exists are skipped.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    output_dir.mkdir(exist_ok=True)
    dataset_parts = ("DEV", "TEST", "XS", "S", "M", "L", "XL")
    prefix = "gigaspeech"
    suffix = "jsonl.gz"
    logging.info("Loading manifest (may take 4 minutes)")
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None
    # Every requested partition must have been found.
    assert len(manifests) == len(dataset_parts), (
        len(manifests),
        len(dataset_parts),
        list(manifests.keys()),
        dataset_parts,
    )
    for partition, manifest in manifests.items():
        logging.info(f"Processing {partition}")
        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
        if raw_cuts_path.is_file():
            logging.info(f"{partition} already exists - skipping")
            continue
        # Note this step makes the recipe different than LibriSpeech:
        # We must filter out some utterances and remove punctuation
        # to be consistent with Kaldi.
        logging.info("Filtering OOV utterances from supervisions")
        manifest["supervisions"] = manifest["supervisions"].filter(has_no_oov)
        logging.info(f"Normalizing text in {partition}")
        for sup in manifest["supervisions"]:
            sup.text = normalize_text(sup.text)
            # Tag the supervision with the corpus it comes from.
            sup.custom = {"origin": "giga"}
        # Create long-recording cut manifests.
        logging.info(f"Processing {partition}")
        cut_set = CutSet.from_manifests(
            recordings=manifest["recordings"],
            supervisions=manifest["supervisions"],
        )
        # Run data augmentation that needs to be done in the
        # time domain.
        #  if partition not in ["DEV", "TEST"]:
        #      logging.info(
        #          f"Speed perturb for {partition} with factors 0.9 and 1.1 "
        #          "(Perturbing may take 8 minutes and saving may"
        #          " take 20 minutes)"
        #      )
        #      cut_set = (
        #          cut_set
        #          + cut_set.perturb_speed(0.9)
        #          + cut_set.perturb_speed(1.1)
        #      )
        #
        # Note: No need to perturb the training subset as not all of the
        # data is going to be used in the training.
        logging.info(f"Saving to {raw_cuts_path}")
        cut_set.to_file(raw_cuts_path)
 def main():
    """Set up INFO-level logging and run the GigaSpeech preprocessing."""
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    preprocess_giga_speech()
 # Script entry point: main() runs only when this file is executed directly,
 # not when it is imported as a module.
 if __name__ == "__main__":
    main()
--- a/egs/LJSpeech/ASR/local/sort_lm_training_data.py
+++ b/egs/LJSpeech/ASR/local/sort_lm_training_data.py
@ -0,0 +1,141 @@
 #!/usr/bin/env python3
 # Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file takes as input the filename of LM training data
 generated by ./local/prepare_lm_training_data.py and sorts
 it by sentence length.
 The length of a sentence is the number of BPE tokens it contains.
 """
 import argparse
 import logging
 from pathlib import Path
 import k2
 import numpy as np
 import torch
 def get_args():
    """Parse the command-line options of this script.
    Returns:
      The parsed namespace with the string attributes `in_lm_data`,
      `out_lm_data` and `out_statistics`.
    """
    parser = argparse.ArgumentParser()
    # main() reads both data paths before anything else, so they are
    # required; a missing value is reported as a usage error instead of a
    # TypeError from Path(None).
    parser.add_argument(
        "--in-lm-data",
        type=str,
        required=True,
        help="Input LM training data, e.g., data/bpe_500/lm_data.pt",
    )
    parser.add_argument(
        "--out-lm-data",
        type=str,
        required=True,
        help="Output sorted LM training data, e.g., data/bpe_500/sorted_lm_data.pt",
    )
    parser.add_argument(
        "--out-statistics",
        type=str,
        help="Path to write statistics about the LM training data, "
        "e.g., data/bpe_500/statistics.txt",
    )
    return parser.parse_args()
 def main():
    """Sort the LM training data by sentence length and write statistics.
    Loads the archive given by `--in-lm-data`, reorders its sentences from
    the longest to the shortest (length = number of BPE tokens), verifies
    the new order, saves the archive to `--out-lm-data` and writes summary
    statistics plus a length histogram to `--out-statistics`.  Nothing is
    done if the output archive already exists.
    """
    args = get_args()
    in_lm_data = Path(args.in_lm_data)
    out_lm_data = Path(args.out_lm_data)
    assert in_lm_data.is_file(), f"{in_lm_data}"
    if out_lm_data.is_file():
        logging.warning(f"{out_lm_data} exists - skipping")
        return
    data = torch.load(in_lm_data)
    words2bpe = data["words"]
    sentences = data["sentences"]
    sentence_lengths = data["sentence_lengths"]
    num_sentences = sentences.dim0
    assert num_sentences == sentence_lengths.numel(), (
        num_sentences,
        sentence_lengths.numel(),
    )
    # Longest sentence first.
    indices = torch.argsort(sentence_lengths, descending=True)
    sorted_sentences = sentences[indices.to(torch.int32)]
    sorted_sentence_lengths = sentence_lengths[indices]
    assert num_sentences == sorted_sentences.dim0, (
        num_sentences,
        sorted_sentences.dim0,
    )
    # Check that sentences are ordered by length: recompute every length
    # from the BPE pieces and compare it with its predecessor and with the
    # stored value.
    prev_len = None
    for i in range(num_sentences):
        token_ids = words2bpe[sorted_sentences[i]]
        if isinstance(token_ids, k2.RaggedTensor):
            token_ids = token_ids.values
        cur_len = token_ids.numel()
        if prev_len is not None:
            assert prev_len >= cur_len, (prev_len, cur_len)
        assert cur_len == sorted_sentence_lengths[i]
        prev_len = cur_len
    data["sentences"] = sorted_sentences
    data["sentence_lengths"] = sorted_sentence_lengths
    torch.save(data, args.out_lm_data)
    logging.info(f"Saved to {args.out_lm_data}")
    statistics = Path(args.out_statistics)
    # Write statistics
    num_words = sorted_sentences.numel()
    num_tokens = sentence_lengths.sum().item()
    max_sentence_length = sentence_lengths[indices[0]]
    min_sentence_length = sentence_lengths[indices[-1]]
    # Histogram bins are `step` tokens wide and start at length 1.
    step = 10
    hist, bins = np.histogram(
        sentence_lengths.numpy(),
        bins=np.arange(1, max_sentence_length + step, step),
    )
    summary = (
        f"num_sentences: {num_sentences}\n",
        f"num_words: {num_words}\n",
        f"num_tokens: {num_tokens}\n",
        f"max_sentence_length: {max_sentence_length}\n",
        f"min_sentence_length: {min_sentence_length}\n",
        "histogram:\n",
        "  bin  count  percent\n",
    )
    with open(statistics, "w") as f:
        for entry in summary:
            f.write(entry)
        # One row per bin: left bin edge, count and share of all sentences.
        for left_edge, count in zip(bins[:-1], hist):
            f.write(
                f"{int(left_edge):>5} {int(count):>5}   "
                f"{100.*count/num_sentences:.3f}%\n"
            )
 # Script entry point: set up INFO-level logging with a timestamp and the
 # source location of each message, then run main().
 if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
--- a/egs/LJSpeech/ASR/local/test_load_XL_split.py
+++ b/egs/LJSpeech/ASR/local/test_load_XL_split.py
@ -0,0 +1,51 @@
 #!/usr/bin/env python3
 # Copyright      2022  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file can be used to check if any split is corrupted.
 """
 import glob
 import re
 import lhotse
 def main():
    """Load a few cuts from every split to check that no split is corrupted.

    Looks for ``cuts_XL.<index>.jsonl.gz`` files in ``data/fbank/XL_split_2000``
    (relative to the current directory), visits them in increasing numeric
    index order and, for each split, loads the features of its first 7 cuts.
    Any exception raised while opening a split or loading features propagates
    to the caller.

    Files that match the glob but carry no numeric index (for example
    ``cuts_XL.tmp.jsonl.gz``) are reported and skipped instead of crashing.
    """
    d = "data/fbank/XL_split_2000"
    # The dots are escaped so that they match only a literal "." in the name.
    pattern = re.compile(r"cuts_XL\.([0-9]+)\.jsonl\.gz")

    idx_filenames = []
    for filename in glob.glob(f"{d}/cuts_XL.*.jsonl.gz"):
        match = pattern.search(filename)
        if match is None:
            # The glob is looser than the regex, so a stray file can get here.
            print(f"Skipping {filename}: no numeric split index in its name")
            continue
        idx_filenames.append((int(match.group(1)), filename))
    idx_filenames.sort(key=lambda x: x[0])

    print(f"Loading {len(idx_filenames)} splits")
    for split_idx, filename in idx_filenames:
        cuts = lhotse.load_manifest_lazy(filename)
        print(split_idx, "filename", filename)
        # A separate name for the cut counter keeps the split index intact.
        for cut_idx, c in enumerate(cuts):
            # Loading the features is the actual check; the data is discarded.
            c.features.load()
            if cut_idx > 5:
                break
 # Run the split check only when this file is executed as a script, not when
 # it is imported.
 if __name__ == "__main__":
    main()
--- a/egs/LJSpeech/ASR/local/test_prepare_lang.py
+++ b/egs/LJSpeech/ASR/local/test_prepare_lang.py
@ -0,0 +1,104 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)
 import os
 import tempfile
 import k2
 from prepare_lang import (
    add_disambig_symbols,
    generate_id_map,
    get_phones,
    get_words,
    lexicon_to_fst,
    read_lexicon,
    write_lexicon,
    write_mapping,
 )
 def generate_lexicon_file() -> str:
    """Write a small hand-made lexicon into a new temporary file.

    Each line of the file holds a word followed by its whitespace-separated
    phones. The file is not removed by this function.

    Returns:
      The path of the temporary file.
    """
    lexicon_text = """
    !SIL SIL
    <SPOKEN_NOISE> SPN
    <UNK> SPN
    f f
    a a
    foo f o o
    bar b a r
    bark b a r k
    food f o o d
    food2 f o o d
    fo  f o
    """.strip()
    handle, path = tempfile.mkstemp()
    # Write through the descriptor mkstemp() already opened instead of closing
    # it and reopening the file by name.
    with os.fdopen(handle, "w") as out:
        out.write(lexicon_text)
    return path
 def test_read_lexicon(filename: str):
    lexicon = read_lexicon(filename)
    phones = get_phones(lexicon)
    words = get_words(lexicon)
    print(lexicon)
    print(phones)
    print(words)
    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
    print(lexicon_disambig)
    print("max disambig:", f"#{max_disambig}")
    phones = ["<eps>", "SIL", "SPN"] + phones
    for i in range(max_disambig + 1):
        phones.append(f"#{i}")
    words = ["<eps>"] + words
    phone2id = generate_id_map(phones)
    word2id = generate_id_map(words)
    print(phone2id)
    print(word2id)
    write_mapping("phones.txt", phone2id)
    write_mapping("words.txt", word2id)
    write_lexicon("a.txt", lexicon)
    write_lexicon("a_disambig.txt", lexicon_disambig)
    fsa = lexicon_to_fst(lexicon, phone2id=phone2id, word2id=word2id)
    fsa.labels_sym = k2.SymbolTable.from_file("phones.txt")
    fsa.aux_labels_sym = k2.SymbolTable.from_file("words.txt")
    fsa.draw("L.pdf", title="L")
    fsa_disambig = lexicon_to_fst(lexicon_disambig, phone2id=phone2id, word2id=word2id)
    fsa_disambig.labels_sym = k2.SymbolTable.from_file("phones.txt")
    fsa_disambig.aux_labels_sym = k2.SymbolTable.from_file("words.txt")
    fsa_disambig.draw("L_disambig.pdf", title="L_disambig")
 def main():
    """Create a temporary lexicon, run `test_read_lexicon` on it, then delete it.

    The temporary file is removed in a ``finally`` clause, so it does not leak
    when `test_read_lexicon` raises; the exception still propagates.
    """
    filename = generate_lexicon_file()
    try:
        test_read_lexicon(filename)
    finally:
        os.remove(filename)
 # Executed as a script: build the temporary lexicon and run the check on it.
 if __name__ == "__main__":
    main()
--- a/egs/LJSpeech/ASR/local/train_bpe_model.py
+++ b/egs/LJSpeech/ASR/local/train_bpe_model.py
@ -0,0 +1,97 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # You can install sentencepiece via:
 #
 #  pip install sentencepiece
 #
 # Due to an issue reported in
 # https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
 #
 # Please install a version >=0.1.96
 import argparse
 import shutil
 from pathlib import Path
 import sentencepiece as spm
 def get_args():
    """Parse the command-line options of the BPE training script.

    Returns:
      An ``argparse.Namespace`` with ``lang_dir`` (str), ``transcript`` (str)
      and ``vocab_size`` (int). No option is marked as required, so an
      attribute is ``None`` when its option is not given.
    """
    lang_dir_help = """Input and output directory.
        The generated bpe.model is saved to this directory.
        """
    # One (flag, type, help) row per option.
    options = (
        ("--lang-dir", str, lang_dir_help),
        ("--transcript", str, "Training transcript."),
        ("--vocab-size", int, "Vocabulary size for BPE training"),
    )
    parser = argparse.ArgumentParser()
    for flag, value_type, description in options:
        parser.add_argument(flag, type=value_type, help=description)
    return parser.parse_args()
 def main():
    """Train a unigram SentencePiece model and publish it as ``bpe.model``.

    The model is written to ``<lang_dir>/unigram_<vocab_size>.model``.
    Training is skipped when that file already exists. In both cases the file
    is then copied to ``<lang_dir>/bpe.model``.
    """
    args = get_args()
    lang_dir = Path(args.lang_dir)
    model_type = "unigram"
    model_prefix = f"{lang_dir}/{model_type}_{args.vocab_size}"
    model_file = Path(model_prefix + ".model")

    if not model_file.is_file():
        user_defined_symbols = ["<blk>", "<sos/eos>"]
        trainer_options = dict(
            input=args.transcript,
            vocab_size=args.vocab_size,
            model_type=model_type,
            model_prefix=model_prefix,
            input_sentence_size=100000000,
            character_coverage=1.0,
            user_defined_symbols=user_defined_symbols,
            # Note: unk_id is fixed to 2 (the number of user-defined symbols).
            # If you change it, you should also change other
            # places that are using it.
            unk_id=len(user_defined_symbols),
            bos_id=-1,
            eos_id=-1,
        )
        spm.SentencePieceTrainer.train(**trainer_options)

    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
 # Train (or reuse) the BPE model only when this file is run as a script.
 if __name__ == "__main__":
    main()
--- a/egs/LJSpeech/ASR/local/validate_bpe_lexicon.py
+++ b/egs/LJSpeech/ASR/local/validate_bpe_lexicon.py
@ -0,0 +1,77 @@
 #!/usr/bin/env python3
 # Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script checks that there are no OOV tokens in the BPE-based lexicon.
 Usage example:
    python3 ./local/validate_bpe_lexicon.py \
            --lexicon /path/to/lexicon.txt \
            --bpe-model /path/to/bpe.model
 """
 import argparse
 from pathlib import Path
 from typing import List, Tuple
 import sentencepiece as spm
 from icefall.lexicon import read_lexicon
 # Map word to word pieces
 Lexicon = List[Tuple[str, List[str]]]
 def get_args():
    """Parse the command line of the lexicon validation script.

    Returns:
      An ``argparse.Namespace`` whose ``lexicon`` and ``bpe_model`` attributes
      are ``Path`` objects. Both options are required.
    """
    parser = argparse.ArgumentParser()
    # Both options share the same settings and differ only in name and help.
    for flag, description in (
        ("--lexicon", "Path to lexicon.txt"),
        ("--bpe-model", "Path to bpe.model"),
    ):
        parser.add_argument(flag, required=True, type=Path, help=description)
    return parser.parse_args()
 def main():
    """Check that every word piece used in the lexicon exists in the BPE model.

    Raises:
      AssertionError: If the lexicon or the BPE model file does not exist.
      ValueError: For the first lexicon entry that uses a piece which is not
        in the vocabulary of the BPE model.
    """
    args = get_args()
    for required_file in (args.lexicon, args.bpe_model):
        assert required_file.is_file(), required_file

    lexicon = read_lexicon(args.lexicon)

    sp = spm.SentencePieceProcessor()
    sp.load(str(args.bpe_model))
    # Every piece the model knows, looked up by id 0 .. vocab_size - 1.
    vocabulary = set(sp.id_to_piece(list(range(sp.vocab_size()))))

    # Lazily enumerate the offending (word, piece) pairs in lexicon order and
    # report only the first one.
    oov_entries = (
        (word, piece)
        for word, pieces in lexicon
        for piece in pieces
        if piece not in vocabulary
    )
    first_oov = next(oov_entries, None)
    if first_oov is not None:
        word, piece = first_oov
        raise ValueError(f"The word {word} contains an OOV token {piece}")
 # Validate the lexicon only when this file is invoked directly as a script.
 if __name__ == "__main__":
    main()
--- a/egs/LJSpeech/ASR/local/validate_manifest.py
+++ b/egs/LJSpeech/ASR/local/validate_manifest.py
@ -0,0 +1,93 @@
 #!/usr/bin/env python3
 # Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script checks the following assumptions of the generated manifest:
 - Single supervision per cut
 - Supervision time bounds are within cut time bounds
 We will add more checks later if needed.
 Usage example:
    python3 ./local/validate_manifest.py \
            ./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
 """
 import argparse
 import logging
 from pathlib import Path
 from lhotse import CutSet, load_manifest_lazy
 from lhotse.cut import Cut
 def get_args():
    """Parse the command line of the manifest validation script.

    Returns:
      An ``argparse.Namespace`` whose ``manifest`` attribute is the positional
      argument converted to a ``Path``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("manifest", type=Path, help="Path to the manifest file")
    return cli.parse_args()
 def validate_one_supervision_per_cut(c: Cut):
    """Require that the cut `c` carries exactly one supervision.

    Raises:
      ValueError: If ``c.supervisions`` holds zero or more than one item.
    """
    num_supervisions = len(c.supervisions)
    if num_supervisions == 1:
        return
    raise ValueError(f"{c.id} has {num_supervisions} supervisions")
 def validate_supervision_and_cut_time_bounds(c: Cut):
    """Require that the first supervision of `c` lies inside the cut.

    In lhotse, the time bounds of a supervision attached to a cut are relative
    to the beginning of the cut, whereas ``c.start`` and ``c.end`` are offsets
    into the recording. The supervision is therefore inside the cut when
    ``0 <= s.start`` and ``s.end <= c.duration``. Comparing against
    ``c.start`` / ``c.end`` directly would reject valid cuts that do not begin
    at the start of the recording and would miss supervisions that overrun
    such cuts.

    Args:
      c: The cut to check. Only its first supervision is inspected.

    Raises:
      ValueError: If the supervision begins before the cut or ends after it.
    """
    s = c.supervisions[0]
    if s.start < 0:
        raise ValueError(
            f"{c.id}: Supervision start time {s.start} (relative to the cut) "
            f"is less than 0, i.e. it is before cut start time {c.start}"
        )
    if s.end > c.duration:
        raise ValueError(
            f"{c.id}: Supervision end time {s.end} (relative to the cut) "
            f"is larger than cut duration {c.duration}"
        )
 def main():
    """Load the manifest named on the command line and validate each cut.

    Every cut is passed to `validate_one_supervision_per_cut` and then to
    `validate_supervision_and_cut_time_bounds`; the first failing check
    raises and stops the run.

    Raises:
      AssertionError: If the manifest file does not exist or does not load as
        a ``CutSet``.
    """
    manifest = get_args().manifest
    logging.info(f"Validating {manifest}")
    assert manifest.is_file(), f"{manifest} does not exist"

    cut_set = load_manifest_lazy(manifest)
    assert isinstance(cut_set, CutSet)

    # The checks run in this order on each cut before moving to the next cut.
    checks = (
        validate_one_supervision_per_cut,
        validate_supervision_and_cut_time_bounds,
    )
    for cut in cut_set:
        for check in checks:
            check(cut)
 if __name__ == "__main__":
    # Script entry point: enable INFO-level logging, prefixing each message
    # with a timestamp, the level name and the emitting file/line, then
    # validate the manifest.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
    )
    main()
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_0.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_0.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_1.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_1.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_2.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_2.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_3.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_3.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_4.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_4.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_5.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_5.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_6.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_6.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_7.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_7.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_8.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_8.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_9.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_org/plot_9.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_0.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_0.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_1.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_1.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_2.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_2.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_3.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_3.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_4.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_4.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_5.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_5.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_6.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_6.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_7.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_7.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_8.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_8.png
--- a/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_9.png
+++ b/egs/LJSpeech/ASR/outputs/bpe_histogram_pseudo/plot_9.png
--- a/egs/LJSpeech/ASR/outputs/density_plot.png
+++ b/egs/LJSpeech/ASR/outputs/density_plot.png
--- a/egs/LJSpeech/ASR/outputs/phone_histogram_org/plot.png
+++ b/egs/LJSpeech/ASR/outputs/phone_histogram_org/plot.png
--- a/egs/LJSpeech/ASR/outputs/phone_histogram_pseudo/plot.png
+++ b/egs/LJSpeech/ASR/outputs/phone_histogram_pseudo/plot.png
--- a/egs/LJSpeech/ASR/outputs/word_histogram_org/plot.png
+++ b/egs/LJSpeech/ASR/outputs/word_histogram_org/plot.png
--- a/egs/LJSpeech/ASR/outputs/word_histogram_pseudo/plot.png
+++ b/egs/LJSpeech/ASR/outputs/word_histogram_pseudo/plot.png
--- a/egs/LJSpeech/ASR/outputs_sim/ctc_output.1.bias/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/ctc_output.1.bias/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/ctc_output.1.bias/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/ctc_output.1.bias/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/ctc_output.1.bias/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/ctc_output.1.bias/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/ctc_output.1.weight/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/ctc_output.1.weight/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/ctc_output.1.weight/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/ctc_output.1.weight/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/ctc_output.1.weight/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/ctc_output.1.weight/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/decoder.conv.weight/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/decoder.conv.weight/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/decoder.conv.weight/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/decoder.conv.weight/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/decoder.conv.weight/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/decoder.conv.weight/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/decoder.embedding.weight/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/decoder.embedding.weight/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/decoder.embedding.weight/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/decoder.embedding.weight/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/decoder.embedding.weight/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/decoder.embedding.weight/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layer_norm.bias/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layer_norm.bias/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layer_norm.bias/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layer_norm.bias/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layer_norm.bias/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layer_norm.bias/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layer_norm.weight/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layer_norm.weight/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layer_norm.weight/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layer_norm.weight/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layer_norm.weight/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layer_norm.weight/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc1.bias/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc1.bias/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc1.bias/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc1.bias/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc1.bias/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc1.bias/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc1.weight/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc1.weight/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc1.weight/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc1.weight/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc1.weight/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc1.weight/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc2.bias/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc2.bias/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc2.bias/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc2.bias/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc2.bias/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc2.bias/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc2.weight/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc2.weight/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc2.weight/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc2.weight/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc2.weight/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.fc2.weight/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.final_layer_norm.bias/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.final_layer_norm.bias/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.final_layer_norm.bias/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.final_layer_norm.bias/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.final_layer_norm.bias/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.final_layer_norm.bias/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.final_layer_norm.weight/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.final_layer_norm.weight/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.final_layer_norm.weight/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.final_layer_norm.weight/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.final_layer_norm.weight/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.final_layer_norm.weight/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.self_attn.k_proj.bias/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.self_attn.k_proj.bias/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.self_attn.k_proj.bias/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.self_attn.k_proj.bias/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.self_attn.k_proj.bias/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.self_attn.k_proj.bias/rel_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.self_attn.k_proj.weight/abs_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.self_attn.k_proj.weight/abs_diff.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.self_attn.k_proj.weight/cos_sim.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.self_attn.k_proj.weight/cos_sim.png
--- a/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.self_attn.k_proj.weight/rel_diff.png
+++ b/egs/LJSpeech/ASR/outputs_sim/encoder.encoders.encoder.layers.0.self_attn.k_proj.weight/rel_diff.png
--- a/Show More
+++ b/Show More