mirror of https://github.com/k2-fsa/icefall.git
synced 2025-08-27 18:54:18 +00:00
parent 1ea780f203
commit 469b665a5a
@@ -23,6 +23,7 @@ This file downloads the following LibriSpeech LM files:
 - 4-gram.arpa.gz
 - librispeech-vocab.txt
 - librispeech-lexicon.txt
+- librispeech-lm-norm.txt.gz

 from http://www.openslr.org/resources/11
 and saves them in the user-provided directory.
@@ -61,6 +62,7 @@ def main(out_dir: str):
         "4-gram.arpa.gz",
         "librispeech-vocab.txt",
         "librispeech-lexicon.txt",
+        "librispeech-lm-norm.txt.gz",
     )

     for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
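For reference, here is a minimal sketch of how such a download loop can be written end to end. The real main() is only partially visible in this hunk, so the use of urllib.request.urlretrieve and the skip-if-already-present check are assumptions for illustration, not the committed implementation:

#!/usr/bin/env python3
# Hypothetical sketch of the download loop; only the tqdm line and the
# files_to_download tuple are visible in the diff above, so everything
# else here is an assumption.
import os
import urllib.request

from tqdm import tqdm

BASE_URL = "http://www.openslr.org/resources/11"

files_to_download = (
    "4-gram.arpa.gz",
    "librispeech-vocab.txt",
    "librispeech-lexicon.txt",
    "librispeech-lm-norm.txt.gz",
)


def main(out_dir: str):
    os.makedirs(out_dir, exist_ok=True)
    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
        dst = os.path.join(out_dir, f)
        if not os.path.exists(dst):  # skip files that are already present
            urllib.request.urlretrieve(f"{BASE_URL}/{f}", dst)


if __name__ == "__main__":
    main("download/lm")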
egs/librispeech/ASR/local/prepare_lm_training_data.py (new executable file, 118 lines)
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+
+# Copyright (c)  2021  Xiaomi Corporation (authors: Daniel Povey)
+
+"""
+This script takes a `bpe.model` and a text file such as
+`download/lm/librispeech-lm-norm.txt`,
+and outputs the LM training data to a supplied directory such
+as data/lm_training_data_bpe_5000.  The format is as follows:
+
+It creates a PyTorch archive (.pt file), say data/lm_training.pt, which is a
+representation of a dict with the following format:
+
+  'words' -> a k2.RaggedTensor of two axes [word][token] with dtype torch.int32
+             containing the BPE representations of each word, indexed by
+             integer word ID.  (These integer word IDs are present in
+             'data'.)  The sentencepiece object can be used to turn the
+             words and BPE units into string form.
+  'data' -> a k2.RaggedTensor of two axes [sentence][word] with dtype
+            torch.int32 containing all the sentences, as word-ids (we don't
+            output the string form of this directly, but it can be worked out
+            together with 'words' and the bpe.model).
+"""
+
+import argparse
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import k2
+import sentencepiece as spm
+import torch
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "bpe_model",
+        type=str,
+        help="""Input BPE model, e.g. data/lang_bpe/bpe.model""",
+    )
+    parser.add_argument(
+        "lm_data",
+        type=str,
+        help="""Input LM training data as text, e.g.
+        data/downloads/lm/librispeech-lm-norm.txt""",
+    )
+    parser.add_argument(
+        "lm_archive",
+        type=str,
+        help="""Path to output archive, e.g. lm_data.pt;
+        look at the source of this script to see the format.""",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(args.bpe_model)
+
+    # word2index is a dictionary from words to integer ids.  No need to reserve
+    # space for epsilon, etc.; the words are just used as a convenient way to
+    # compress the sequences of BPE pieces.
+    word2index = dict()
+
+    words2bpe = []  # Will be a list-of-list-of-int, representing BPE pieces.
+
+    sentences = []  # Will be a list-of-list-of-int, representing word-ids.
+
+    with open(args.lm_data) as f:
+        while True:
+            line = f.readline()
+            if line == "":
+                break
+            line_words = line.split()
+            for w in line_words:
+                if w not in word2index:
+                    w_bpe = sp.Encode(w)
+                    word2index[w] = len(words2bpe)
+                    words2bpe.append(w_bpe)
+            sentences.append([word2index[w] for w in line_words])
+
+    output = dict()
+    output["words"] = k2.ragged.RaggedTensor(words2bpe)
+    output["data"] = k2.ragged.RaggedTensor(sentences)
+
+    torch.save(output, args.lm_archive)
+    print(f"Saved to {args.lm_archive}")
+
+
+if __name__ == "__main__":
+    main()
+
+
+# This was tested as follows.
+# cat > foo <<EOF
+# THING TWO
+# ZOOLOGY
+# EOF
+#
+# local/prepare_lm_training_data.py data/lang_bpe/bpe.model foo bar.pt
+#
+# python3
+# Python 3.8.0 (default, Oct 28 2019, 16:14:01)
+# [GCC 8.3.0] on linux
+# Type "help", "copyright", "credits" or "license" for more information.
+# >>> import k2
+# >>> import sentencepiece as spm
+# >>> sp = spm.SentencePieceProcessor()
+# >>> sp.load('data/lang_bpe/bpe.model')
+# True
+# >>> import torch
+# >>> d = torch.load('bar.pt')
+# >>> sp.Decode(k2.ragged.to_list(k2.index(d['words'], d['data'])))
+# ['THING TWO', 'ZOOLOGY']
+# >>>
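To make the archive layout above concrete, here is a small hand-built version of the dict for the two test sentences from the transcript. This is an illustrative sketch only; the BPE piece ids are invented, and real ones come from bpe.model:

import k2
import torch

# Toy corpus: "THING TWO" and "ZOOLOGY".  Words get integer ids in order
# of first appearance: THING -> 0, TWO -> 1, ZOOLOGY -> 2.
words = k2.ragged.RaggedTensor(
    [[11, 57], [42], [7, 93, 24]]  # invented BPE pieces of THING, TWO, ZOOLOGY
)
data = k2.ragged.RaggedTensor(
    [[0, 1], [2]]  # the two sentences, as word-ids
)
torch.save({"words": words, "data": data}, "toy_lm_data.pt")

# Indexing 'words' by 'data' recovers each sentence as BPE pieces, which
# is what k2.index(d['words'], d['data']) computes in the transcript above.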
@@ -24,6 +24,7 @@ stop_stage=100
 # - 4-gram.arpa
 # - librispeech-vocab.txt
 # - librispeech-lexicon.txt
+# - librispeech-lm-norm.txt.gz
 #
 # - $dl_dir/musan
 #   This directory contains the following directories downloaded from
@@ -227,3 +228,16 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
     ./local/compile_hlg.py --lang-dir $lang_dir
   done
 fi
+
+if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    lm_dir=data/lm_training_${vocab_size}
+    mkdir -p $lm_dir
+    log "Stage 10: Creating $lm_dir/lm_data.pt (It may take 8 minutes)"
+    ./local/prepare_lm_training_data.py \
+      $lang_dir/bpe.model \
+      $dl_dir/lm/librispeech-lm-norm.txt \
+      $lm_dir/lm_data.pt
+  done
+fi
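Usage note: the new stage can be run on its own. The --stage/--stop-stage flags below assume prepare.sh parses options the usual Kaldi-style way (shared/parse_options.sh), which its stage guards suggest but this diff does not show; the direct invocation simply mirrors the loop body for a single vocab size:

# Run only the new LM-data stage (assumes prepare.sh parses
# --stage/--stop-stage via shared/parse_options.sh):
./prepare.sh --stage 10 --stop-stage 10

# Or call the new script directly for one vocab size:
./local/prepare_lm_training_data.py \
  data/lang_bpe_5000/bpe.model \
  $dl_dir/lm/librispeech-lm-norm.txt \
  data/lm_training_5000/lm_data.pt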