Add files from Dan.

See https://github.com/k2-fsa/icefall/pull/54
Fangjun Kuang 2021-11-17 12:21:05 +08:00
parent 1ea780f203
commit 469b665a5a
3 changed files with 134 additions and 0 deletions


@@ -23,6 +23,7 @@ This file downloads the following LibriSpeech LM files:
- 4-gram.arpa.gz
- librispeech-vocab.txt
- librispeech-lexicon.txt
- librispeech-lm-norm.txt.gz

from http://www.openslr.org/resources/11
and saves them in the user-provided directory.
@@ -61,6 +62,7 @@ def main(out_dir: str):
        "4-gram.arpa.gz",
        "librispeech-vocab.txt",
        "librispeech-lexicon.txt",
        "librispeech-lm-norm.txt.gz",
    )

    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
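Only the compressed librispeech-lm-norm.txt.gz is downloaded here, while stage 10 of prepare.sh (below) reads the uncompressed librispeech-lm-norm.txt. A minimal decompression sketch, not part of this commit and assuming the download directory is download/lm/, using only the Python standard library:

import gzip
import shutil
from pathlib import Path

lm_dir = Path("download/lm")  # assumed location of the downloaded LM files
gz_path = lm_dir / "librispeech-lm-norm.txt.gz"
txt_path = lm_dir / "librispeech-lm-norm.txt"

if not txt_path.exists():
    # Stream-decompress the large corpus without loading it all into memory.
    with gzip.open(gz_path, "rb") as src, open(txt_path, "wb") as dst:
        shutil.copyfileobj(src, dst)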


@@ -0,0 +1,118 @@
#!/usr/bin/env python3
# Copyright (c) 2021 Xiaomi Corporation (authors: Daniel Povey)
"""
This script takes a `bpe.model` and a text file such as
`download/lm/librispeech-lm-norm.txt`,
and outputs the LM training data to a supplied directory such
as data/lm_training_data_bpe_5000. The format is as follows:
It creates a PyTorch archive (.pt file), say data/lm_training.pt, which is a
representation of a dict with the following format:
'words' -> a k2.RaggedTensor of two axes [word][token] with dtype torch.int32
containing the BPE representations of each word, indexed by
integer word ID. (These integer word IDS are present in
'lm_data'). The sentencepiece object can be used to turn the
words and BPE units into string form.
'data' -> a k2.RaggedTensor of two axes [sentence][word] with dtype
torch.int32 containing all the sentences, as word-ids (we don't
output the string form of this directly but it can be worked out
together with 'words' and the bpe.model).
"""
import argparse
from pathlib import Path
from typing import Dict, List, Tuple
import k2
import sentencepiece as spm
import torch


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "bpe_model",
        type=str,
        help="""Input BPE model, e.g. data/lang_bpe/bpe.model""",
    )
    parser.add_argument(
        "lm_data",
        type=str,
        help="""Input LM training data as text, e.g. data/downloads/lm/librispeech-lm-norm.txt""",
    )
    parser.add_argument(
        "lm_archive",
        type=str,
        help="""Path to output archive, e.g. lm_data.pt; look at the source of this script to see the format.""",
    )
    return parser.parse_args()


def main():
    args = get_args()

    sp = spm.SentencePieceProcessor()
    sp.load(args.bpe_model)

    # word2index is a dictionary from words to integer ids.  No need to
    # reserve space for epsilon, etc.; the words are just used as a
    # convenient way to compress the sequences of BPE pieces.
    word2index = dict()

    words2bpe = []  # Will be a list-of-list-of-int, representing BPE pieces.
    sentences = []  # Will be a list-of-list-of-int, representing word-ids.

    with open(args.lm_data) as f:
        while True:
            line = f.readline()
            if line == "":
                break
            line_words = line.split()
            for w in line_words:
                if w not in word2index:
                    w_bpe = sp.Encode(w)
                    word2index[w] = len(words2bpe)
                    words2bpe.append(w_bpe)
            sentences.append([word2index[w] for w in line_words])

    output = dict()
    output["words"] = k2.ragged.RaggedTensor(words2bpe)
    output["data"] = k2.ragged.RaggedTensor(sentences)

    torch.save(output, args.lm_archive)
    print(f"Saved to {args.lm_archive}")


if __name__ == "__main__":
    main()

# This was tested as follows.
# cat > foo <<EOF
# THING TWO
# ZOOLOGY
# EOF
#
# local/prepare_lm_training_data.py data/lang_bpe/bpe.model foo bar.pt
#
# python3
# Python 3.8.0 (default, Oct 28 2019, 16:14:01)
# [GCC 8.3.0] on linux
# Type "help", "copyright", "credits" or "license" for more information.
# >>> import k2
# >>> import sentencepiece as spm
# >>> sp = spm.SentencePieceProcessor()
# >>> sp.load('data/lang_bpe/bpe.model')
# True
# >>> import torch
# >>> d = torch.load('bar.pt')
# >>> sp.Decode(k2.ragged.to_list(k2.index(d['words'], d['data'])))
# ['THING TWO', 'ZOOLOGY']
# >>>


@@ -24,6 +24,7 @@ stop_stage=100
# - 4-gram.arpa
# - librispeech-vocab.txt
# - librispeech-lexicon.txt
# - librispeech-lm-norm.txt.gz
#
# - $dl_dir/musan
# This directory contains the following directories downloaded from
@@ -227,3 +228,16 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
    ./local/compile_hlg.py --lang-dir $lang_dir
  done
fi

if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}
    lm_dir=data/lm_training_${vocab_size}
    mkdir -p $lm_dir
    log "Stage 10: Creating $lm_dir/lm_data.pt (it may take 8 minutes)"
    ./local/prepare_lm_training_data.py \
      $lang_dir/bpe.model \
      $dl_dir/lm/librispeech-lm-norm.txt \
      $lm_dir/lm_data.pt
  done
fi
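
Once stage 10 has finished, the resulting archive can be given a quick sanity check from Python; a minimal sketch, assuming a vocab_size of 5000 (k2 must be installed so the ragged tensors can be unpickled):

import torch

d = torch.load("data/lm_training_5000/lm_data.pt")
print(list(d.keys()))  # expected: ['words', 'data']
# d['words'] and d['data'] are k2 ragged tensors; the test transcript at the
# bottom of local/prepare_lm_training_data.py shows how to decode them back
# to text with sentencepiece.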