mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-27 18:54:18 +00:00)
parent 1ea780f203
commit 469b665a5a
@@ -23,6 +23,7 @@ This file downloads the following LibriSpeech LM files:
 - 4-gram.arpa.gz
 - librispeech-vocab.txt
 - librispeech-lexicon.txt
+- librispeech-lm-norm.txt.gz
 
 from http://www.openslr.org/resources/11
 and save them in the user provided directory.
@@ -61,6 +62,7 @@ def main(out_dir: str):
         "4-gram.arpa.gz",
         "librispeech-vocab.txt",
         "librispeech-lexicon.txt",
+        "librispeech-lm-norm.txt.gz",
     )
 
     for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
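For orientation, a minimal sketch of how such a download loop could fetch these files from http://www.openslr.org/resources/11; the function name, the BASE_URL constant, and the skip-if-present check are illustrative, not part of the commit:

import os
import urllib.request

from tqdm import tqdm

# Assumption: the files are served under the URL named in the docstring above.
BASE_URL = "http://www.openslr.org/resources/11"


def download_lm_files(out_dir: str) -> None:
    """Fetch the LibriSpeech LM files into out_dir, skipping ones already present."""
    os.makedirs(out_dir, exist_ok=True)
    files_to_download = (
        "4-gram.arpa.gz",
        "librispeech-vocab.txt",
        "librispeech-lexicon.txt",
        "librispeech-lm-norm.txt.gz",
    )
    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
        dst = os.path.join(out_dir, f)
        if not os.path.exists(dst):
            urllib.request.urlretrieve(f"{BASE_URL}/{f}", dst)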
egs/librispeech/ASR/local/prepare_lm_training_data.py (new executable file)
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Xiaomi Corporation (authors: Daniel Povey)
+
+"""
+
+This script takes a `bpe.model` and a text file such as
+`download/lm/librispeech-lm-norm.txt`,
+and outputs the LM training data to a supplied directory such
+as data/lm_training_data_bpe_5000. The format is as follows:
+
+It creates a PyTorch archive (.pt file), say data/lm_training.pt, which is a
+representation of a dict with the following format:
+
+  'words' -> a k2.RaggedTensor of two axes [word][token] with dtype torch.int32
+             containing the BPE representations of each word, indexed by
+             integer word ID. (These integer word IDs are present in
+             'lm_data'.) The sentencepiece object can be used to turn the
+             words and BPE units into string form.
+  'data' -> a k2.RaggedTensor of two axes [sentence][word] with dtype
+            torch.int32 containing all the sentences, as word-ids (we don't
+            output the string form of this directly but it can be worked out
+            together with 'words' and the bpe.model).
+
+"""
+
+import argparse
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import k2
+import sentencepiece as spm
+import torch
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "bpe_model",
+        type=str,
+        help="""Input BPE model, e.g. data/lang_bpe/bpe.model""",
+    )
+    parser.add_argument(
+        "lm_data",
+        type=str,
+        help="""Input LM training data as text, e.g. data/downloads/lm/librispeech-lm-norm.txt""",
+    )
+    parser.add_argument(
+        "lm_archive",
+        type=str,
+        help="""Path to output archive, e.g. lm_data.pt; look at the source of this script to see the format.""",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(args.bpe_model)
+
+    # word2index is a dictionary from words to integer ids. No need to reserve
+    # space for epsilon, etc.; the words are just used as a convenient way to
+    # compress the sequences of BPE pieces.
+    word2index = dict()
+
+    words2bpe = []  # Will be a list-of-list-of-int, representing BPE pieces.
+
+    sentences = []  # Will be a list-of-list-of-int, representing word-ids.
+
+    with open(args.lm_data) as f:
+        while True:
+            line = f.readline()
+            if line == "":
+                break
+            line_words = line.split()
+            for w in line_words:
+                if w not in word2index:
+                    w_bpe = sp.Encode(w)
+                    word2index[w] = len(words2bpe)
+                    words2bpe.append(w_bpe)
+            sentences.append([word2index[w] for w in line_words])
+
+    output = dict()
+    output["words"] = k2.ragged.RaggedTensor(words2bpe)
+    output["data"] = k2.ragged.RaggedTensor(sentences)
+
+    torch.save(output, args.lm_archive)
+    print(f"Saved to {args.lm_archive}")
+
+
+if __name__ == "__main__":
+    main()
+
+
+# This was tested as follows.
+# cat > foo <<EOF
+# THING TWO
+# ZOOLOGY
+# EOF
+#
+# local/prepare_lm_training_data.py data/lang_bpe/bpe.model foo bar.pt
+#
+# python3
+# Python 3.8.0 (default, Oct 28 2019, 16:14:01)
+# [GCC 8.3.0] on linux
+# Type "help", "copyright", "credits" or "license" for more information.
+# >>> import k2
+# >>> import sentencepiece as spm
+# >>> sp = spm.SentencePieceProcessor()
+# >>> sp.load('data/lang_bpe/bpe.model')
+# True
+# >>> import torch
+# >>> d = torch.load('bar.pt')
+# >>> sp.Decode(k2.ragged.to_list(k2.index(d['words'], d['data'])))
+# ['THING TWO', 'ZOOLOGY']
+# >>>
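To make the archive layout concrete, here is a toy illustration of the two ragged tensors using plain Python lists in place of k2.RaggedTensor; the integer ids are made up, not real BPE pieces:

# Toy stand-ins for the two ragged tensors stored in the archive:
words = [[5, 9], [7], [3, 3, 8]]  # axes [word][token]: BPE pieces of word-ids 0, 1, 2
data = [[0, 1], [2]]              # axes [sentence][word]: word-ids of each sentence

# Recover each sentence as a flat sequence of BPE pieces; this is what
# k2.index(d['words'], d['data']) in the test transcript above computes in ragged form.
for sent in data:
    print([tok for w in sent for tok in words[w]])
# [5, 9, 7]
# [3, 3, 8]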
@@ -24,6 +24,7 @@ stop_stage=100
 # - 4-gram.arpa
 # - librispeech-vocab.txt
 # - librispeech-lexicon.txt
+# - librispeech-lm-norm.txt.gz
 #
 # - $dl_dir/musan
 # This directory contains the following directories downloaded from
@@ -227,3 +228,16 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
     ./local/compile_hlg.py --lang-dir $lang_dir
   done
 fi
+
+if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    lm_dir=data/lm_training_${vocab_size}
+    mkdir -p $lm_dir
+    log "Stage 10: Creating $lm_dir/lm_data.pt (It may take 8 minutes)"
+    ./local/prepare_lm_training_data.py \
+      $lang_dir/bpe.model \
+      $dl_dir/lm/librispeech-lm-norm.txt \
+      $lm_dir/lm_data.pt
+  done
+fi
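Once Stage 10 has finished, the resulting archive can be spot-checked from Python. A sketch only: the path assumes vocab_size=5000 from the loop above, and it assumes that importing k2 is enough for torch.load to unpickle the ragged tensors:

import k2  # assumed needed so torch.load can restore the k2 ragged tensors
import torch

d = torch.load("data/lm_training_5000/lm_data.pt")
print(list(d.keys()))  # expected: ['words', 'data']
print(d["words"])      # BPE pieces per word, axes [word][token]
print(d["data"])       # word-ids per sentence, axes [sentence][word]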