diff --git a/egs/gigaspeech/ASR/local/compile_hlg.py b/egs/gigaspeech/ASR/local/compile_hlg.py
deleted file mode 120000
index 471aa7fb4..000000000
--- a/egs/gigaspeech/ASR/local/compile_hlg.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../librispeech/ASR/local/compile_hlg.py
\ No newline at end of file
diff --git a/egs/gigaspeech/ASR/local/compile_hlg.py b/egs/gigaspeech/ASR/local/compile_hlg.py
new file mode 100755
index 000000000..712708c50
--- /dev/null
+++ b/egs/gigaspeech/ASR/local/compile_hlg.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This script takes as input lang_dir and generates HLG from
+
+    - H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt
+
+    - L, the lexicon, built from lang_dir/L_disambig.pt
+
+      Caution: We use a lexicon that contains disambiguation symbols
+
+    - G, the LM, built from data/lm/G_4_gram.fst.txt
+
+The generated HLG is saved in $lang_dir/HLG.pt
+"""
+import argparse
+import logging
+from pathlib import Path
+
+import k2
+import torch
+
+from icefall.lexicon import Lexicon
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        """,
+    )
+
+    return parser.parse_args()
+
+
+def compile_HLG(lang_dir: str) -> k2.Fsa:
+    """
+    Args:
+      lang_dir:
+        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
+
+    Return:
+      An FSA representing HLG.
+    """
+    lexicon = Lexicon(lang_dir)
+    max_token_id = max(lexicon.tokens)
+    logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
max_token_id: {max_token_id}") + H = k2.ctc_topo(max_token_id) + L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt")) + + if Path("data/lm/G_4_gram.pt").is_file(): + logging.info("Loading pre-compiled G_4_gram") + d = torch.load("data/lm/G_4_gram.pt") + G = k2.Fsa.from_dict(d) + else: + logging.info("Loading G_4_gram.fst.txt") + with open("data/lm/G_4_gram.fst.txt") as f: + G = k2.Fsa.from_openfst(f.read(), acceptor=False) + torch.save(G.as_dict(), "data/lm/G_4_gram.pt") + + first_token_disambig_id = lexicon.token_table["#0"] + first_word_disambig_id = lexicon.word_table["#0"] + + L = k2.arc_sort(L) + G = k2.arc_sort(G) + + logging.info("Intersecting L and G") + LG = k2.compose(L, G) + logging.info(f"LG shape: {LG.shape}") + + logging.info("Connecting LG") + LG = k2.connect(LG) + logging.info(f"LG shape after k2.connect: {LG.shape}") + + logging.info(type(LG.aux_labels)) + logging.info("Determinizing LG") + + LG = k2.determinize(LG) + logging.info(type(LG.aux_labels)) + + logging.info("Connecting LG after k2.determinize") + LG = k2.connect(LG) + + logging.info("Removing disambiguation symbols on LG") + + LG.labels[LG.labels >= first_token_disambig_id] = 0 + # See https://github.com/k2-fsa/k2/issues/874 + # for why we need to set LG.properties to None + LG.__dict__["_properties"] = None + + assert isinstance(LG.aux_labels, k2.RaggedTensor) + LG.aux_labels.values[LG.aux_labels.values >= first_word_disambig_id] = 0 + + LG = k2.remove_epsilon(LG) + logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}") + + LG = k2.connect(LG) + LG.aux_labels = LG.aux_labels.remove_values_eq(0) + + logging.info("Arc sorting LG") + LG = k2.arc_sort(LG) + + logging.info("Composing H and LG") + # CAUTION: The name of the inner_labels is fixed + # to `tokens`. If you want to change it, please + # also change other places in icefall that are using + # it. 
+    HLG = k2.compose(H, LG, inner_labels="tokens")
+
+    logging.info("Connecting LG")
+    HLG = k2.connect(HLG)
+
+    logging.info("Arc sorting LG")
+    HLG = k2.arc_sort(HLG)
+    logging.info(f"HLG.shape: {HLG.shape}")
+
+    return HLG
+
+
+def main():
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
+
+    if (lang_dir / "HLG.pt").is_file():
+        logging.info(f"{lang_dir}/HLG.pt already exists - skipping")
+        return
+
+    logging.info(f"Processing {lang_dir}")
+
+    HLG = compile_HLG(lang_dir)
+    logging.info(f"Saving HLG.pt to {lang_dir}")
+    torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt")
+
+
+if __name__ == "__main__":
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    main()
diff --git a/egs/gigaspeech/ASR/prepare.sh b/egs/gigaspeech/ASR/prepare.sh
index dbb34ab76..e18806994 100755
--- a/egs/gigaspeech/ASR/prepare.sh
+++ b/egs/gigaspeech/ASR/prepare.sh
@@ -19,6 +19,13 @@ num_splits=2000
 # You can apply for the download credentials by following
 # https://github.com/SpeechColab/GigaSpeech#download
 #
+# - $dl_dir/lm
+#   This directory contains the language model downloaded from
+#   https://huggingface.co/wgb14/gigaspeech_lm
+#
+#   - 4gram.arpa.gz
+#   - lexicon.txt
+#
 # - $dl_dir/musan
 #   This directory contains the following directories downloaded from
 #   http://www.openslr.org/17/
@@ -34,7 +41,7 @@ dl_dir=$PWD/download
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  # 5000
+  5000
   500
 )
 
@@ -50,6 +57,15 @@ log() {
 
 log "dl_dir: $dl_dir"
 
+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
+  log "stage -1: Download LM"
+  # We assume that you have installed the git-lfs, if not, you could install it
+  # using: `sudo apt-get install git-lfs && git-lfs install`
+  [ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
+  git clone https://huggingface.co/wgb14/gigaspeech_lm $dl_dir/lm
+  gunzip -c $dl_dir/lm/4gram.arpa.gz > $dl_dir/lm/4gram.arpa
+fi
+
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Download data"
 
@@ -159,13 +175,14 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
   lang_dir=data/lang_phone
   mkdir -p $lang_dir
 
-  # (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
-  #   cat - $dl_dir/lm/librispeech-lexicon.txt |
-  #   sort | uniq > $lang_dir/lexicon.txt
+  (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
+    cat - $dl_dir/lm/lexicon.txt |
+    sort | uniq > $lang_dir/lexicon.txt
+
+  if [ ! -f $lang_dir/L_disambig.pt ]; then
+    ./local/prepare_lang.py --lang-dir $lang_dir
+  fi
 
-  # if [ ! -f $lang_dir/L_disambig.pt ]; then
-  #   ./local/prepare_lang.py --lang-dir $lang_dir
-  # fi
   if [ ! -f $lang_dir/transcript_words.txt ]; then
     gunzip -c "data/manifests/gigaspeech_supervisions_XL.jsonl.gz" \
       | jq '.text' \
@@ -225,14 +242,6 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
   # so that the two can share G.pt later.
   cp data/lang_phone/{words.txt,transcript_words.txt} $lang_dir
 
-  if [ ! -f $lang_dir/transcript_words.txt ]; then
-    log "Generate data for BPE training"
-    gunzip -c "data/manifests/gigaspeech_supervisions_XL.jsonl.gz" \
-      | jq '.text' \
-      | sed 's/"//g' \
-      > $lang_dir/transcript_words.txt
-  fi
-
   if [ ! -f $lang_dir/bpe.model ]; then
     ./local/train_bpe_model.py \
       --lang-dir $lang_dir \
@@ -283,42 +292,20 @@ if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
   # it using: pip install kaldilm
   mkdir -p data/lm
 
-  if [ ! -f data/lm/3-gram.arpa ]; then
-    ./shared/make_kn_lm.py \
-      -ngram-order 3 \
-      -text "data/lang_phone/transcript_words.txt" \
-      -lm data/lm/3-gram.arpa
-  fi
-
-  if [ ! -f data/lm/G_3_gram.fst.txt ]; then
+  if [ ! -f data/lm/G_4_gram.fst.txt ]; then
     # It is used in building HLG
-    python3 -m kaldilm \
-      --read-symbol-table="data/lang_phone/words.txt" \
-      --disambig-symbol='#0' \
-      --max-order=3 \
-      data/lm/3-gram.arpa > data/lm/G_3_gram.fst.txt
-  fi
-
-  if [ ! -f data/lm/4-gram.arpa ]; then
-    ./shared/make_kn_lm.py \
-      -ngram-order 4 \
-      -text "data/lang_phone/transcript_words.txt" \
-      -lm data/lm/4-gram.arpa
-  fi
-
-  if [ ! -f data/lm/G_4_gram.fst.txt ]; then
-    # It is used for LM rescoring
     python3 -m kaldilm \
       --read-symbol-table="data/lang_phone/words.txt" \
       --disambig-symbol='#0' \
      --max-order=4 \
-      data/lm/4-gram.arpa > data/lm/G_4_gram.fst.txt
+      $dl_dir/lm/4gram.arpa > data/lm/G_4_gram.fst.txt
   fi
 fi
 
 if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
   log "Stage 13: Compile HLG"
-  # ./local/compile_hlg.py --lang-dir data/lang_phone
+  ./local/compile_hlg.py --lang-dir data/lang_phone
 
   for vocab_size in ${vocab_sizes[@]}; do
     lang_dir=data/lang_bpe_${vocab_size}