mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
Generate LM training data for the LibriSpeech recipe.
This commit is contained in:
parent
2213154c69
commit
3c65ee11f4
@ -23,6 +23,7 @@ This file downloads the following LibriSpeech LM files:
|
|||||||
- 4-gram.arpa.gz
|
- 4-gram.arpa.gz
|
||||||
- librispeech-vocab.txt
|
- librispeech-vocab.txt
|
||||||
- librispeech-lexicon.txt
|
- librispeech-lexicon.txt
|
||||||
|
- librispeech-lm-norm.txt.gz
|
||||||
|
|
||||||
from http://www.openslr.org/resources/11
|
from http://www.openslr.org/resources/11
|
||||||
and save them in the user provided directory.
|
and save them in the user provided directory.
|
||||||
@ -61,6 +62,7 @@ def main(out_dir: str):
|
|||||||
"4-gram.arpa.gz",
|
"4-gram.arpa.gz",
|
||||||
"librispeech-vocab.txt",
|
"librispeech-vocab.txt",
|
||||||
"librispeech-lexicon.txt",
|
"librispeech-lexicon.txt",
|
||||||
|
"librispeech-lm-norm.txt.gz",
|
||||||
)
|
)
|
||||||
|
|
||||||
for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
|
for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
|
||||||
|
172
egs/librispeech/ASR/local/prepare_lm_training_data.py
Executable file
172
egs/librispeech/ASR/local/prepare_lm_training_data.py
Executable file
@ -0,0 +1,172 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# Copyright (c) 2021 Xiaomi Corporation (authors: Daniel Povey
|
||||||
|
# Fangjun Kuang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
This script takes a `bpe.model` and a text file such as
|
||||||
|
./download/lm/librispeech-lm-norm.txt
|
||||||
|
and writes the LM training data to the archive given by --lm-archive,
|
||||||
|
e.g. data/lm_training_bpe_500/lm_data.pt. The format is as follows:
|
||||||
|
|
||||||
|
It creates a PyTorch archive (.pt file), say data/lm_training.pt, which is a
|
||||||
|
representation of a dict with the following format:
|
||||||
|
|
||||||
|
'words' -> a k2.RaggedTensor of two axes [word][token] with dtype torch.int32
|
||||||
|
containing the BPE representations of each word, indexed by
|
||||||
|
integer word ID. (These integer word IDS are present in
|
||||||
|
'sentences'). The sentencepiece object can be used to turn the
|
||||||
|
words and BPE units into string form.
|
||||||
|
'sentences' -> a k2.RaggedTensor of two axes [sentence][word] with dtype
|
||||||
|
torch.int32 containing all the sentences, as word-ids (we don't
|
||||||
|
output the string form of this directly but it can be worked out
|
||||||
|
together with 'words' and the bpe.model).
|
||||||
|
'sentence_lengths' -> a 1-D torch.Tensor of dtype torch.int32, containing
|
||||||
|
number of BPE tokens of each sentence.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import k2
|
||||||
|
import sentencepiece as spm
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
def get_args():
    """Parse the command-line arguments of this script.

    All three options are mandatory: a missing one is reported by argparse
    as a usage error instead of surfacing later as a ``None`` path.

    Returns:
      An argparse.Namespace with the string attributes ``bpe_model``,
      ``lm_data`` and ``lm_archive``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bpe-model",
        type=str,
        required=True,
        help="Input BPE model, e.g. data/bpe_500/bpe.model",
    )
    parser.add_argument(
        "--lm-data",
        type=str,
        required=True,
        help="""Input LM training data as text, e.g.
        download/pb.train.txt""",
    )
    parser.add_argument(
        "--lm-archive",
        type=str,
        required=True,
        help="""Path to output archive, e.g. data/bpe_500/lm_data.pt;
        look at the source of this script to see the format.""",
    )

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Convert a plain-text LM corpus into a word/sentence archive.

    Reads the options returned by get_args(), encodes every distinct
    whitespace-separated word of ``--lm-data`` with the BPE model given by
    ``--bpe-model`` and stores a dict with the keys ``words``, ``sentences``
    and ``sentence_lengths`` in ``--lm-archive`` via torch.save().

    If the archive already exists, a warning is logged and nothing is
    written.
    """
    args = get_args()

    if Path(args.lm_archive).exists():
        logging.warning(f"{args.lm_archive} exists - skipping")
        return

    sp = spm.SentencePieceProcessor()
    sp.load(args.bpe_model)

    # word2index is a dictionary from words to integer ids. No need to reserve
    # space for epsilon, etc.; the words are just used as a convenient way to
    # compress the sequences of BPE pieces.
    word2index = dict()

    word2bpe = []  # Will be a list-of-list-of-int, representing BPE pieces.
    sentences = []  # Will be a list-of-list-of-int, representing word-ids.
    sentence_lengths = []  # Number of BPE tokens of each sentence.

    # Hard-coded line counts, used only for the progress percentage.
    # Progress logging is disabled for inputs whose name matches none of them.
    if "librispeech-lm-norm" in args.lm_data:
        num_lines_in_total = 40418261.0
        step = 5000000
    elif "valid" in args.lm_data:
        num_lines_in_total = 5567.0
        step = 3000
    elif "test" in args.lm_data:
        num_lines_in_total = 5559.0
        step = 3000
    else:
        num_lines_in_total = None
        step = None

    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(args.lm_data, encoding="utf-8") as f:
        for processed, line in enumerate(f):
            if step and processed % step == 0:
                logging.info(
                    f"Processed number of lines: {processed} "
                    f"({processed/num_lines_in_total*100: .3f}%)"
                )

            word_ids = []
            num_tokens = 0
            for w in line.split():
                index = word2index.get(w)
                if index is None:
                    # First occurrence of this word: encode it once and
                    # remember its BPE pieces under a fresh id.
                    index = len(word2bpe)
                    word2index[w] = index
                    word2bpe.append(sp.encode(w))
                word_ids.append(index)
                # The sentence length is the total number of BPE tokens of
                # its words; counting here avoids a second pass that indexes
                # the ragged tensors once per sentence.
                num_tokens += len(word2bpe[index])

            sentences.append(word_ids)
            sentence_lengths.append(num_tokens)

    logging.info(f"Number of sentences: {len(sentence_lengths)}")

    logging.info("Constructing ragged tensors")
    words = k2.ragged.RaggedTensor(word2bpe)
    sentences = k2.ragged.RaggedTensor(sentences)

    output = dict(words=words, sentences=sentences)
    output["sentence_lengths"] = torch.tensor(
        sentence_lengths, dtype=torch.int32
    )

    torch.save(output, args.lm_archive)
    logging.info(f"Saved to {args.lm_archive}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Timestamped INFO-level logging with the emitting file and line number.
    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
        ),
        level=logging.INFO,
    )
    main()
|
1
egs/librispeech/ASR/local/sort_lm_training_data.py
Symbolic link
1
egs/librispeech/ASR/local/sort_lm_training_data.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../ptb/LM/local/sort_lm_training_data.py
|
@ -24,6 +24,7 @@ stop_stage=100
|
|||||||
# - 4-gram.arpa
|
# - 4-gram.arpa
|
||||||
# - librispeech-vocab.txt
|
# - librispeech-vocab.txt
|
||||||
# - librispeech-lexicon.txt
|
# - librispeech-lexicon.txt
|
||||||
|
# - librispeech-lm-norm.txt.gz
|
||||||
#
|
#
|
||||||
# - $dl_dir/musan
|
# - $dl_dir/musan
|
||||||
# This directory contains the following directories downloaded from
|
# This directory contains the following directories downloaded from
|
||||||
@ -227,3 +228,100 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
|
|||||||
./local/compile_hlg.py --lang-dir $lang_dir
|
./local/compile_hlg.py --lang-dir $lang_dir
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
  log "Stage 10: Generate LM training data"

  # Build one LM training archive per BPE vocabulary size from the
  # LibriSpeech LM text under $dl_dir/lm.
  for vocab_size in ${vocab_sizes[@]}; do
    log "Processing vocab_size == ${vocab_size}"
    out_dir=data/lm_training_bpe_${vocab_size}
    lang_dir=data/lang_bpe_${vocab_size}
    mkdir -p $out_dir

    ./local/prepare_lm_training_data.py \
      --bpe-model $lang_dir/bpe.model \
      --lm-data $dl_dir/lm/librispeech-lm-norm.txt \
      --lm-archive $out_dir/lm_data.pt
  done
fi
|
||||||
|
|
||||||
|
if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
  log "Stage 11: Generate LM validation data"

  for vocab_size in ${vocab_sizes[@]}; do
    log "Processing vocab_size == ${vocab_size}"
    lang_dir=data/lang_bpe_${vocab_size}
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    # Build valid.txt only once: concatenate the dev-clean and dev-other
    # transcripts, dropping the first space-separated field of every line
    # (presumably the utterance ID).
    if [ ! -f $out_dir/valid.txt ]; then
      files=$(
        for subset in dev-clean dev-other; do
          find "$dl_dir/LibriSpeech/$subset" -name "*.trans.txt"
        done
      )
      for f in ${files[@]}; do
        cut -d " " -f 2- $f
      done > $out_dir/valid.txt
    fi

    ./local/prepare_lm_training_data.py \
      --bpe-model $lang_dir/bpe.model \
      --lm-data $out_dir/valid.txt \
      --lm-archive $out_dir/lm_data-valid.pt
  done
fi
|
||||||
|
|
||||||
|
if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
  log "Stage 12: Generate LM test data"

  for vocab_size in ${vocab_sizes[@]}; do
    log "Processing vocab_size == ${vocab_size}"
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    # Build test.txt only once: concatenate the test-clean and test-other
    # transcripts, dropping the first space-separated field of every line
    # (presumably the utterance ID).
    if [ ! -f $out_dir/test.txt ]; then
      files=$(
        find "$dl_dir/LibriSpeech/test-clean" -name "*.trans.txt"
        find "$dl_dir/LibriSpeech/test-other" -name "*.trans.txt"
      )
      for f in ${files[@]}; do
        cat $f | cut -d " " -f 2-
      done > $out_dir/test.txt
    fi

    # NOTE: there must be no early exit here. The archive below has to be
    # produced for every vocab size, and the sorting stage reads
    # $out_dir/lm_data-test.pt.
    lang_dir=data/lang_bpe_${vocab_size}
    ./local/prepare_lm_training_data.py \
      --bpe-model $lang_dir/bpe.model \
      --lm-data $out_dir/test.txt \
      --lm-archive $out_dir/lm_data-test.pt
  done
fi
|
||||||
|
|
||||||
|
if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
  log "Stage 13: Sort LM training data"
  # Sort LM training data by sentence length in descending order
  # for ease of training.
  #
  # Sentence length equals to the number of BPE tokens
  # in a sentence.

  for vocab_size in ${vocab_sizes[@]}; do
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    # The empty suffix selects the training archive; -valid and -test
    # select the validation and test archives, in that order.
    for suffix in "" -valid -test; do
      ./local/sort_lm_training_data.py \
        --in-lm-data $out_dir/lm_data${suffix}.pt \
        --out-lm-data $out_dir/sorted_lm_data${suffix}.pt \
        --out-statistics $out_dir/statistics${suffix}.txt
    done
  done
fi
|
||||||
|
Loading…
x
Reference in New Issue
Block a user