Generate LM training data for the LibriSpeech recipe.

Fangjun Kuang 2021-11-22 21:04:11 +08:00
parent 2213154c69
commit 3c65ee11f4
4 changed files with 273 additions and 0 deletions

local/download_lm.py View File

@@ -23,6 +23,7 @@ This file downloads the following LibriSpeech LM files:
- 4-gram.arpa.gz
- librispeech-vocab.txt
- librispeech-lexicon.txt
- librispeech-lm-norm.txt.gz
from http://www.openslr.org/resources/11
and saves them in the user-provided directory.
@@ -61,6 +62,7 @@ def main(out_dir: str):
        "4-gram.arpa.gz",
        "librispeech-vocab.txt",
        "librispeech-lexicon.txt",
        "librispeech-lm-norm.txt.gz",
    )

    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
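
The hunk above ends inside the download loop. For orientation, here is a minimal sketch of what such a loop can look like end to end; the output directory, the skip-if-present check, and the use of urllib.request.urlretrieve are assumptions for illustration, not the file's actual code:

#!/usr/bin/env python3
# Sketch (not the actual file): download the LibriSpeech LM files listed
# above from openslr.org into a target directory, skipping existing files.
from pathlib import Path
from urllib.request import urlretrieve

from tqdm import tqdm

url_base = "http://www.openslr.org/resources/11"
files_to_download = (
    "4-gram.arpa.gz",
    "librispeech-vocab.txt",
    "librispeech-lexicon.txt",
    "librispeech-lm-norm.txt.gz",
)

out_dir = Path("download/lm")  # assumed; matches $dl_dir/lm in prepare.sh
out_dir.mkdir(parents=True, exist_ok=True)
for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
    dst = out_dir / f
    if not dst.exists():  # do not re-download files that are already there
        urlretrieve(f"{url_base}/{f}", str(dst))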

local/prepare_lm_training_data.py View File

@@ -0,0 +1,172 @@
#!/usr/bin/env python3
# Copyright (c) 2021 Xiaomi Corporation (authors: Daniel Povey
#                                                 Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script takes a `bpe.model` and a text file such as
./download/lm/librispeech-lm-norm.txt
and outputs the LM training data to a supplied directory such
as data/lm_training_bpe_500. The format is as follows:
It creates a PyTorch archive (.pt file), say data/lm_training.pt, which is a
representation of a dict with the following format:
'words' -> a k2.RaggedTensor of two axes [word][token] with dtype torch.int32
containing the BPE representations of each word, indexed by
integer word ID. (These integer word IDS are present in
'lm_data'). The sentencepiece object can be used to turn the
words and BPE units into string form.
'sentences' -> a k2.RaggedTensor of two axes [sentence][word] with dtype
torch.int32 containing all the sentences, as word-ids (we don't
output the string form of this directly but it can be worked out
together with 'words' and the bpe.model).
'sentence_lengths' -> a 1-D torch.Tensor of dtype torch.int32, containing
number of BPE tokens of each sentence.
"""
import argparse
import logging
from pathlib import Path

import k2
import sentencepiece as spm
import torch


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bpe-model",
        type=str,
        help="Input BPE model, e.g. data/bpe_500/bpe.model",
    )
    parser.add_argument(
        "--lm-data",
        type=str,
        help="""Input LM training data as text, e.g.
        download/lm/librispeech-lm-norm.txt""",
    )
    parser.add_argument(
        "--lm-archive",
        type=str,
        help="""Path to output archive, e.g. data/bpe_500/lm_data.pt;
        look at the source of this script to see the format.""",
    )

    return parser.parse_args()


def main():
    args = get_args()
    if Path(args.lm_archive).exists():
        logging.warning(f"{args.lm_archive} exists - skipping")
        return

    sp = spm.SentencePieceProcessor()
    sp.load(args.bpe_model)

    # word2index is a dictionary from words to integer ids. No need to reserve
    # space for epsilon, etc.; the words are just used as a convenient way to
    # compress the sequences of BPE pieces.
    word2index = dict()

    word2bpe = []  # Will be a list-of-list-of-int, representing BPE pieces.
    sentences = []  # Will be a list-of-list-of-int, representing word-ids.
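
    # A hypothetical illustration of this compression: if "HELLO" is the
    # 7th distinct word encountered, then word2index["HELLO"] == 6 and
    # word2bpe[6] holds its BPE ids (sp.encode("HELLO") might give
    # something like [84, 1024]); sentences then store just the id 6
    # wherever "HELLO" occurs.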
if "librispeech-lm-norm" in args.lm_data:
num_lines_in_total = 40418261.0
step = 5000000
elif "valid" in args.lm_data:
num_lines_in_total = 5567.0
step = 3000
elif "test" in args.lm_data:
num_lines_in_total = 5559.0
step = 3000
else:
num_lines_in_total = None
step = None
processed = 0
with open(args.lm_data) as f:
while True:
line = f.readline()
if line == "":
break
if step and processed % step == 0:
logging.info(
f"Processed number of lines: {processed} "
f"({processed/num_lines_in_total*100: .3f}%)"
)
processed += 1
line_words = line.split()
for w in line_words:
if w not in word2index:
w_bpe = sp.encode(w)
word2index[w] = len(word2bpe)
word2bpe.append(w_bpe)
sentences.append([word2index[w] for w in line_words])
logging.info("Constructing ragged tensors")
words = k2.ragged.RaggedTensor(word2bpe)
sentences = k2.ragged.RaggedTensor(sentences)
output = dict(words=words, sentences=sentences)
num_sentences = sentences.dim0
logging.info(f"Computing sentence lengths, num_sentences: {num_sentences}")
sentence_lengths = [0] * num_sentences
for i in range(num_sentences):
if step and i % step == 0:
logging.info(
f"Processed number of lines: {i} "
f"({i/num_sentences*100: .3f}%)"
)
word_ids = sentences[i]
# NOTE: If word_ids is a tensor with only 1 entry,
# token_ids is a torch.Tensor
token_ids = words[word_ids]
if isinstance(token_ids, k2.RaggedTensor):
token_ids = token_ids.values
# token_ids is a 1-D tensor containing the BPE tokens
# of the current sentence
sentence_lengths[i] = token_ids.numel()
output["sentence_lengths"] = torch.tensor(
sentence_lengths, dtype=torch.int32
)
torch.save(output, args.lm_archive)
logging.info(f"Saved to {args.lm_archive}")


if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )

    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
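
For reference, here is a minimal sketch (not part of this commit) of how the archive documented in the docstring above can be consumed; the bpe.model and lm_data.pt paths are assumptions taken from the prepare.sh stages below:

#!/usr/bin/env python3
# Sketch: load the archive written by prepare_lm_training_data.py and
# reconstruct the text of one sentence from it.
import k2
import sentencepiece as spm
import torch

sp = spm.SentencePieceProcessor()
sp.load("data/lang_bpe_500/bpe.model")  # assumed path (see Stage 10 below)

d = torch.load("data/lm_training_bpe_500/lm_data.pt")  # assumed path
words = d["words"]  # k2.RaggedTensor of axes [word][token]
sentences = d["sentences"]  # k2.RaggedTensor of axes [sentence][word]

word_ids = sentences[0]  # word IDs of the first sentence, a 1-D tensor
token_ids = words[word_ids]  # BPE token IDs of those words
if isinstance(token_ids, k2.RaggedTensor):
    token_ids = token_ids.values

# The concatenated BPE ids decode back to the original sentence text.
print(sp.decode(token_ids.tolist()))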

local/sort_lm_training_data.py View File

@@ -0,0 +1 @@
../../../ptb/LM/local/sort_lm_training_data.py

prepare.sh View File

@@ -24,6 +24,7 @@ stop_stage=100
# - 4-gram.arpa
# - librispeech-vocab.txt
# - librispeech-lexicon.txt
# - librispeech-lm-norm.txt.gz
#
# - $dl_dir/musan
# This directory contains the following directories downloaded from
@@ -227,3 +228,100 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
    ./local/compile_hlg.py --lang-dir $lang_dir
  done
fi

if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
  log "Stage 10: Generate LM training data"

  for vocab_size in ${vocab_sizes[@]}; do
    log "Processing vocab_size == ${vocab_size}"
    lang_dir=data/lang_bpe_${vocab_size}
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    ./local/prepare_lm_training_data.py \
      --bpe-model $lang_dir/bpe.model \
      --lm-data $dl_dir/lm/librispeech-lm-norm.txt \
      --lm-archive $out_dir/lm_data.pt
  done
fi

if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
  log "Stage 11: Generate LM validation data"

  for vocab_size in ${vocab_sizes[@]}; do
    log "Processing vocab_size == ${vocab_size}"
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    if [ ! -f $out_dir/valid.txt ]; then
      files=$(
        find "$dl_dir/LibriSpeech/dev-clean" -name "*.trans.txt"
        find "$dl_dir/LibriSpeech/dev-other" -name "*.trans.txt"
      )
      for f in ${files[@]}; do
        cat $f | cut -d " " -f 2-
      done > $out_dir/valid.txt
    fi

    lang_dir=data/lang_bpe_${vocab_size}
    ./local/prepare_lm_training_data.py \
      --bpe-model $lang_dir/bpe.model \
      --lm-data $out_dir/valid.txt \
      --lm-archive $out_dir/lm_data-valid.pt
  done
fi

if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
  log "Stage 12: Generate LM test data"

  for vocab_size in ${vocab_sizes[@]}; do
    log "Processing vocab_size == ${vocab_size}"
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    if [ ! -f $out_dir/test.txt ]; then
      files=$(
        find "$dl_dir/LibriSpeech/test-clean" -name "*.trans.txt"
        find "$dl_dir/LibriSpeech/test-other" -name "*.trans.txt"
      )
      for f in ${files[@]}; do
        cat $f | cut -d " " -f 2-
      done > $out_dir/test.txt
    fi

    lang_dir=data/lang_bpe_${vocab_size}
    ./local/prepare_lm_training_data.py \
      --bpe-model $lang_dir/bpe.model \
      --lm-data $out_dir/test.txt \
      --lm-archive $out_dir/lm_data-test.pt
  done
fi

if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
  log "Stage 13: Sort LM training data"
  # Sort LM training data by sentence length in descending order
  # for ease of training.
  #
  # Sentence length equals the number of BPE tokens
  # in a sentence.

  for vocab_size in ${vocab_sizes[@]}; do
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    ./local/sort_lm_training_data.py \
      --in-lm-data $out_dir/lm_data.pt \
      --out-lm-data $out_dir/sorted_lm_data.pt \
      --out-statistics $out_dir/statistics.txt

    ./local/sort_lm_training_data.py \
      --in-lm-data $out_dir/lm_data-valid.pt \
      --out-lm-data $out_dir/sorted_lm_data-valid.pt \
      --out-statistics $out_dir/statistics-valid.txt

    ./local/sort_lm_training_data.py \
      --in-lm-data $out_dir/lm_data-test.pt \
      --out-lm-data $out_dir/sorted_lm_data-test.pt \
      --out-statistics $out_dir/statistics-test.txt
  done
fi
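
Each *.trans.txt file read in Stages 11 and 12 above holds one utterance per line, prefixed with an utterance ID (for example "1272-128104-0000 MISTER QUILTER ..."), which the cut -d " " -f 2- pipeline strips. A rough Python equivalent of the valid.txt construction, with $dl_dir and the output path assumed for illustration:

#!/usr/bin/env python3
# Sketch (not part of the commit): Python equivalent of the shell pipeline
# in Stage 11 that builds valid.txt from the dev-set transcripts.
from pathlib import Path

dl_dir = Path("download")  # assumed value of $dl_dir
lines = []
for part in ("dev-clean", "dev-other"):
    for f in sorted((dl_dir / "LibriSpeech" / part).rglob("*.trans.txt")):
        for line in f.read_text().splitlines():
            # Each line is "<utt-id> <TRANSCRIPT>"; keep only the transcript.
            lines.append(line.split(maxsplit=1)[1])

out = Path("data/lm_training_bpe_500/valid.txt")  # assumed output path
out.write_text("\n".join(lines) + "\n")

The sort script invoked by Stage 13 is the symlinked file shown earlier. Its core operation can be sketched as follows; this is an illustration of sorting the archive by sentence_lengths in descending order, not the symlinked script itself, and the paths are assumed as in Stage 13:

#!/usr/bin/env python3
# Sketch: sort the Stage 10 archive by sentence length, longest first,
# which is what Stage 13 asks sort_lm_training_data.py to do.
import k2  # k2 must be installed so torch.load can unpickle the RaggedTensors
import torch

d = torch.load("data/lm_training_bpe_500/lm_data.pt")  # assumed path
lengths = d["sentence_lengths"]  # 1-D torch.int32 tensor

indices = torch.argsort(lengths, descending=True)  # longest sentences first

sorted_output = dict(
    words=d["words"],  # the per-word BPE pieces are unchanged
    # k2 ragged indexing with a tensor expects int32 indexes
    sentences=d["sentences"][indices.to(torch.int32)],
    sentence_lengths=lengths[indices],
)
torch.save(sorted_output, "data/lm_training_bpe_500/sorted_lm_data.pt")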