diff --git a/egs/librispeech/ASR/local/download_lm.py b/egs/librispeech/ASR/local/download_lm.py
index 94d23afed..030122aa7 100755
--- a/egs/librispeech/ASR/local/download_lm.py
+++ b/egs/librispeech/ASR/local/download_lm.py
@@ -23,6 +23,7 @@ This file downloads the following LibriSpeech LM files:
     - 4-gram.arpa.gz
     - librispeech-vocab.txt
     - librispeech-lexicon.txt
+    - librispeech-lm-norm.txt.gz
 
 from http://www.openslr.org/resources/11
 and save them in the user provided directory.
@@ -61,6 +62,7 @@ def main(out_dir: str):
         "4-gram.arpa.gz",
         "librispeech-vocab.txt",
         "librispeech-lexicon.txt",
+        "librispeech-lm-norm.txt.gz",
     )
 
     for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
diff --git a/egs/librispeech/ASR/local/prepare_lm_training_data.py b/egs/librispeech/ASR/local/prepare_lm_training_data.py
new file mode 100755
index 000000000..333ec0f0d
--- /dev/null
+++ b/egs/librispeech/ASR/local/prepare_lm_training_data.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Xiaomi Corporation (authors: Daniel Povey)
+
+"""
+This script takes a `bpe.model` and a text file such as
+`download/lm/librispeech-lm-norm.txt`, and writes the LM training data
+to a supplied archive path such as data/lm_training_5000/lm_data.pt.
+
+The output is a PyTorch archive (.pt file) that represents a dict with
+the following format:
+
+  'words' -> a k2.RaggedTensor of two axes [word][token] with dtype
+             torch.int32, containing the BPE representation of each word,
+             indexed by integer word ID.  (These word IDs are the ones
+             used in 'data'.)  The sentencepiece object can be used to
+             turn the words and BPE units into string form.
+  'data'  -> a k2.RaggedTensor of two axes [sentence][word] with dtype
+             torch.int32, containing all the sentences as word IDs (we
+             don't output the string form of this directly, but it can
+             be worked out together with 'words' and the bpe.model).
+"""
+
+import argparse
+
+import k2
+import sentencepiece as spm
+import torch
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "bpe_model",
+        type=str,
+        help="""Input BPE model, e.g. data/lang_bpe/bpe.model""",
+    )
+    parser.add_argument(
+        "lm_data",
+        type=str,
+        help="""Input LM training data as text, e.g.
+        download/lm/librispeech-lm-norm.txt""",
+    )
+    parser.add_argument(
+        "lm_archive",
+        type=str,
+        help="""Path to output archive, e.g. lm_data.pt;
+        look at the source of this script to see the format.""",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(args.bpe_model)
+
+    # word2index is a dictionary from words to integer IDs.  No need to
+    # reserve space for epsilon, etc.; the words are just used as a
+    # convenient way to compress the sequences of BPE pieces.
+    word2index = dict()
+
+    words2bpe = []  # Will be a list-of-list-of-int, representing BPE pieces.
+    sentences = []  # Will be a list-of-list-of-int, representing word IDs.
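+    # For example (mirroring the test at the bottom of this file; the
+    # actual BPE piece IDs depend on the model): after reading the two
+    # lines "THING TWO" and "ZOOLOGY", we would have
+    #   word2index == {"THING": 0, "TWO": 1, "ZOOLOGY": 2}
+    #   words2bpe  == [sp.Encode("THING"), sp.Encode("TWO"), sp.Encode("ZOOLOGY")]
+    #   sentences  == [[0, 1], [2]]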
+
+    with open(args.lm_data) as f:
+        while True:
+            line = f.readline()
+            if line == "":
+                break
+            line_words = line.split()
+            for w in line_words:
+                if w not in word2index:
+                    w_bpe = sp.Encode(w)
+                    word2index[w] = len(words2bpe)
+                    words2bpe.append(w_bpe)
+            sentences.append([word2index[w] for w in line_words])
+
+    output = dict()
+    output["words"] = k2.ragged.RaggedTensor(words2bpe)
+    output["data"] = k2.ragged.RaggedTensor(sentences)
+
+    torch.save(output, args.lm_archive)
+    print(f"Saved to {args.lm_archive}")
+
+
+if __name__ == "__main__":
+    main()
+
+
+# This was tested as follows.
+#
+# cat > foo <<EOF
+# THING TWO
+# ZOOLOGY
+# EOF
+#
+# local/prepare_lm_training_data.py data/lang_bpe/bpe.model foo bar.pt
+#
+# python3
+# >>> import k2
+# >>> import sentencepiece as spm
+# >>> sp = spm.SentencePieceProcessor()
+# >>> sp.load('data/lang_bpe/bpe.model')
+# True
+# >>> import torch
+# >>> d = torch.load('bar.pt')
+# >>> sp.Decode(k2.ragged.to_list(k2.index(d['words'], d['data'])))
+# ['THING TWO', 'ZOOLOGY']
+# >>>
diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index 3b2678ec4..6a4c83ffd 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -24,6 +24,7 @@ stop_stage=100
 # - 4-gram.arpa
 # - librispeech-vocab.txt
 # - librispeech-lexicon.txt
+# - librispeech-lm-norm.txt.gz
 #
 # - $dl_dir/musan
 #   This directory contains the following directories downloaded from
@@ -227,3 +228,16 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
     ./local/compile_hlg.py --lang-dir $lang_dir
   done
 fi
+
+if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    lm_dir=data/lm_training_${vocab_size}
+    mkdir -p $lm_dir
+    log "Stage 10: Creating $lm_dir/lm_data.pt (it may take about 8 minutes)"
+    ./local/prepare_lm_training_data.py \
+      $lang_dir/bpe.model \
+      $dl_dir/lm/librispeech-lm-norm.txt \
+      $lm_dir/lm_data.pt
+  done
+fi
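
Below is a minimal sanity-check sketch (not part of the patch) showing how the stage-10 archive can be loaded and turned back into text.  The paths assume vocab_size=5000 as in prepare.sh, and the k2 calls mirror the test transcript in prepare_lm_training_data.py; the k2 indexing API has changed across versions, so they may need adapting.

#!/usr/bin/env python3
# Sanity-check sketch: decode the first few sentences of lm_data.pt back to
# strings.  Archive and model paths assume vocab_size=5000 from prepare.sh.
import k2
import sentencepiece as spm
import torch

sp = spm.SentencePieceProcessor()
sp.load("data/lang_bpe_5000/bpe.model")

d = torch.load("data/lm_training_5000/lm_data.pt")

# d["data"] holds each sentence as word IDs; d["words"] maps each word ID
# to its BPE pieces.  Indexing 'words' by 'data' yields each sentence as a
# sequence of BPE token IDs, which sentencepiece can decode back to text.
sentence_tokens = k2.index(d["words"], d["data"])
print(sp.Decode(k2.ragged.to_list(sentence_tokens)[:3]))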