From 894be068e78496c648e2b1314ad4dd5f65a665e3 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Mon, 23 Aug 2021 19:51:58 +0800
Subject: [PATCH] Update prepare.sh to create LM training data; add missing
 script local/prepare_lm_training_data.py

---
 .../ASR/local/prepare_lm_training_data.py | 118 ++++++++++++++++++
 egs/librispeech/ASR/prepare.sh            |   8 +-
 2 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100755 egs/librispeech/ASR/local/prepare_lm_training_data.py

diff --git a/egs/librispeech/ASR/local/prepare_lm_training_data.py b/egs/librispeech/ASR/local/prepare_lm_training_data.py
new file mode 100755
index 000000000..b6e0931f4
--- /dev/null
+++ b/egs/librispeech/ASR/local/prepare_lm_training_data.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+
+# Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang, Daniel Povey)
+
+"""
+This script takes a `bpe.model` and a text file such as
+`download/lm/librispeech-lm-norm.txt`, and writes the LM training data to a
+supplied output path such as data/lm_training_data_bpe_5000/lm_data.pt.
+
+The output is a PyTorch archive (.pt file) that is a representation of a dict
+with the following format:
+
+  'words' -> a k2._RaggedInt containing the BPE representation of each word,
+             indexed by integer word ID (these integer word IDs are what
+             appear in 'data').  The sentencepiece object can be used to turn
+             the words and BPE units into string form.
+  'data'  -> a k2._RaggedInt containing all the sentences, as word IDs (we
+             don't output the string form of this directly, but it can be
+             worked out together with 'words' and the bpe.model).
+"""
+
+import argparse
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import k2
+import sentencepiece as spm
+import torch
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "bpe_model",
+        type=str,
+        help="""Input BPE model, e.g. data/lang_bpe/bpe.model""",
+    )
+    parser.add_argument(
+        "lm_data",
+        type=str,
+        help="""Input LM training data as text, e.g.
+        data/downloads/lm/librispeech-lm-norm.txt""",
+    )
+    parser.add_argument(
+        "lm_archive",
+        type=str,
+        help="""Path to output archive, e.g. lm_data.pt;
+        look at the source of this script to see the format.""",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(args.bpe_model)
+
+    # word2index is a dictionary from words to integer ids.  No need to
+    # reserve space for epsilon, etc.; the words are just used as a
+    # convenient way to compress the sequences of BPE pieces.
+    word2index = dict()
+
+    words2bpe = []  # Will be a list-of-list-of-int, representing BPE pieces.
+
+    sentences = []  # Will be a list-of-list-of-int, representing word IDs.
+
+    with open(args.lm_data) as f:
+        while True:
+            line = f.readline()
+            if line == '':
+                break
+            line_words = line.split()
+            for w in line_words:
+                if w not in word2index:
+                    w_bpe = sp.Encode(w)
+                    word2index[w] = len(words2bpe)
+                    words2bpe.append(w_bpe)
+            sentences.append([word2index[w] for w in line_words])
+
+    output = dict()
+    output['words'] = k2.ragged.create_ragged2(words2bpe)
+    output['data'] = k2.ragged.create_ragged2(sentences)
+
+    torch.save(output, args.lm_archive)
+    print(f"Saved to {args.lm_archive}")
+
+
+if __name__ == "__main__":
+    main()
+
+
+# This was tested as follows.
+# cat > foo <<EOF
+# THING TWO
+# ZOOLOGY
+# EOF
+#
+# local/prepare_lm_training_data.py data/lang_bpe/bpe.model foo bar.pt
+#
+# python3
+#>>> import k2
+#>>> import sentencepiece as spm
+#>>> sp = spm.SentencePieceProcessor()
+#>>> sp.load('data/lang_bpe/bpe.model')
+#True
+#>>> import torch
+#>>> d = torch.load('bar.pt')
+#>>> sp.Decode(k2.ragged.to_list(k2.index(d['words'], d['data'])))
+#['THING TWO', 'ZOOLOGY']
+#>>>
diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index 798a30631..94c408c6e 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -193,7 +193,13 @@ fi
 
 if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
-
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    lm_dir=data/lm_training_${vocab_size}
+    mkdir -p $lm_dir
+    log "Stage 9: creating $lm_dir/lm_data.pt"
+    ./local/prepare_lm_training_data.py $lang_dir/bpe.model download/lm/librispeech-lm-norm.txt $lm_dir/lm_data.pt
+  done
 fi
 
 cd data && ln -sfv lang_bpe_5000 lang_bpe
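
A note for downstream consumers of lm_data.pt: the 'data' ragged tensor stores
sentences as word IDs, so they have to be expanded back into BPE token IDs
before LM training. The sketch below (not part of this patch) shows one way to
do that, assuming the same old-style k2.index / k2.ragged.to_list API used in
the test transcript above; the helper name load_lm_training_sentences and the
archive path data/lm_training_5000/lm_data.pt are illustrative.

#!/usr/bin/env python3
# Hypothetical helper, not part of this patch: expand the word-level IDs in
# lm_data.pt into per-sentence BPE token-ID lists, mirroring the calls in the
# test transcript above.

import k2
import sentencepiece as spm
import torch


def load_lm_training_sentences(lm_archive: str):
    """Return one list of BPE token IDs per sentence in the archive."""
    d = torch.load(lm_archive)
    # 'data' holds sentences as word IDs; 'words' maps each word ID to its
    # BPE pieces.  Indexing 'words' with 'data' yields the BPE pieces of
    # every sentence, and to_list() turns the result into plain Python lists.
    return k2.ragged.to_list(k2.index(d['words'], d['data']))


if __name__ == "__main__":
    sp = spm.SentencePieceProcessor()
    sp.load('data/lang_bpe/bpe.model')
    sentences = load_lm_training_sentences('data/lm_training_5000/lm_data.pt')
    # Round-trip check: decode a few sentences back to text, as in the
    # transcript above.
    for token_ids in sentences[:3]:
        print(sp.Decode(token_ids))

In practice an LM dataloader would batch these token-ID lists directly; the
decode loop here is only a sanity check mirroring the test transcript.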