Minor fixes. (#9)

Fangjun Kuang 2021-08-16 19:01:25 +08:00 committed by GitHub
parent 12a2fd023e
commit 1c3b13c7eb
4 changed files with 120 additions and 63 deletions

local/compile_hlg.py

@@ -1,18 +1,18 @@
 #!/usr/bin/env python3
 """
-This script compiles HLG from
+This script takes as input lang_dir and generates HLG from
 
-    - H, the ctc topology, built from tokens contained in lexicon.txt
-    - L, the lexicon, built from L_disambig.pt
+    - H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt
+    - L, the lexicon, built from lang_dir/L_disambig.pt
 
       Caution: We use a lexicon that contains disambiguation symbols
 
     - G, the LM, built from data/lm/G_3_gram.fst.txt
 
-The generated HLG is saved in data/lm/HLG.pt (phone based)
-or data/lm/HLG_bpe.pt (BPE based)
+The generated HLG is saved in $lang_dir/HLG.pt
 """
+import argparse
 import logging
 from pathlib import Path
@@ -22,11 +22,23 @@ import torch
 from icefall.lexicon import Lexicon
 
 
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        """,
+    )
+
+    return parser.parse_args()
+
+
 def compile_HLG(lang_dir: str) -> k2.Fsa:
     """
     Args:
       lang_dir:
-        The language directory, e.g., data/lang_phone or data/lang_bpe.
+        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
 
     Return:
       An FSA representing HLG.
@@ -104,17 +116,18 @@ def compile_HLG(lang_dir: str) -> k2.Fsa:
 def main():
-    for d in ["data/lang_phone", "data/lang_bpe"]:
-        d = Path(d)
-        logging.info(f"Processing {d}")
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
 
-        if (d / "HLG.pt").is_file():
-            logging.info(f"{d}/HLG.pt already exists - skipping")
-            continue
+    if (lang_dir / "HLG.pt").is_file():
+        logging.info(f"{lang_dir}/HLG.pt already exists - skipping")
+        return
 
-        HLG = compile_HLG(d)
-        logging.info(f"Saving HLG.pt to {d}")
-        torch.save(HLG.as_dict(), f"{d}/HLG.pt")
+    logging.info(f"Processing {lang_dir}")
+
+    HLG = compile_HLG(lang_dir)
+    logging.info(f"Saving HLG.pt to {lang_dir}")
+    torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt")
 
 
 if __name__ == "__main__":
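
For reference, the composition that compile_HLG() performs can be sketched with k2 as below. This is a simplified sketch, not the script itself: it assumes lang_dir already contains tokens.txt and L_disambig.pt and that data/lm/G_3_gram.fst.txt exists, and it omits the disambiguation-symbol and epsilon-self-loop handling that the real script also performs.

# Simplified sketch of H o L o G composition with k2 (not the exact
# script): disambiguation-symbol removal and self-loop handling are
# omitted here for brevity.
from pathlib import Path

import k2
import torch

from icefall.lexicon import Lexicon

lang_dir = Path("data/lang_bpe_5000")  # example value for --lang-dir

lexicon = Lexicon(lang_dir)
H = k2.ctc_topo(max(lexicon.tokens))  # H: the CTC topology
L = k2.Fsa.from_dict(torch.load(lang_dir / "L_disambig.pt"))  # L: lexicon
with open("data/lm/G_3_gram.fst.txt") as f:
    G = k2.Fsa.from_openfst(f.read(), acceptor=False)  # G: the 3-gram LM

LG = k2.connect(k2.compose(k2.arc_sort(L), k2.arc_sort(G)))
LG = k2.determinize(LG)
HLG = k2.connect(k2.compose(H, k2.arc_sort(LG)))
torch.save(k2.arc_sort(HLG).as_dict(), lang_dir / "HLG.pt")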

local/prepare_lang_bpe.py

@@ -3,12 +3,13 @@
 # Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
 
 """
-This script takes as inputs the following two files:
-    - data/lang_bpe/bpe.model,
-    - data/lang_bpe/words.txt
-and generates the following files in the directory data/lang_bpe:
+This script takes as input `lang_dir`, which should contain::
+
+    - lang_dir/bpe.model,
+    - lang_dir/words.txt
+
+and generates the following files in the directory `lang_dir`:
 
     - lexicon.txt
     - lexicon_disambig.txt
@@ -17,6 +18,7 @@ and generates the following files in the directory data/lang_bpe:
     - tokens.txt
 """
 
+import argparse
 from pathlib import Path
 from typing import Dict, List, Tuple
@@ -141,8 +143,22 @@ def generate_lexicon(
     return lexicon, token2id
 
 
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        It should contain the bpe.model and words.txt
+        """,
+    )
+
+    return parser.parse_args()
+
+
 def main():
-    lang_dir = Path("data/lang_bpe")
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
     model_file = lang_dir / "bpe.model"
 
     word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
@@ -189,15 +205,6 @@ def main():
     torch.save(L.as_dict(), lang_dir / "L.pt")
     torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
 
-    if False:
-        # Just for debugging, will remove it
-        L.labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
-        L.aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
-        L_disambig.labels_sym = L.labels_sym
-        L_disambig.aux_labels_sym = L.aux_labels_sym
-        L.draw(lang_dir / "L.svg", title="L")
-        L_disambig.draw(lang_dir / "L_disambig.svg", title="L_disambig")
-
 
 if __name__ == "__main__":
     main()
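
For reference, the core of what generate_lexicon() computes can be sketched with sentencepiece as below. This is a simplified sketch showing only the word-to-pieces mapping; the real function also returns token2id and feeds the disambiguated lexicon.

# Simplified sketch: map each word in words.txt to its BPE pieces.
# The real generate_lexicon() also returns token2id and handles the
# extra symbols needed for lexicon_disambig.txt.
import sentencepiece as spm

lang_dir = "data/lang_bpe_5000"  # example value for --lang-dir
sp = spm.SentencePieceProcessor(model_file=f"{lang_dir}/bpe.model")

words = ["HELLO", "WORLD"]  # in practice, read from lang_dir/words.txt
for word, pieces in zip(words, sp.encode(words, out_type=str)):
    print(word, " ".join(pieces))  # one lexicon.txt entry per line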

local/train_bpe_model.py

@@ -1,10 +1,5 @@
 #!/usr/bin/env python3
-"""
-This script takes as input "data/lang/bpe/train.txt"
-and generates "data/lang/bpe/bep.model".
-"""
+
 # You can install sentencepiece via:
 #
 #     pip install sentencepiece
@@ -14,17 +9,41 @@ and generates "data/lang/bpe/bep.model".
 #
 # Please install a version >=0.1.96
 
+import argparse
 import shutil
 from pathlib import Path
 
 import sentencepiece as spm
 
 
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        It should contain the training corpus: train.txt.
+        The generated bpe.model is saved to this directory.
+        """,
+    )
+
+    parser.add_argument(
+        "--vocab-size",
+        type=int,
+        help="Vocabulary size for BPE training",
+    )
+
+    return parser.parse_args()
+
+
 def main():
+    args = get_args()
+    vocab_size = args.vocab_size
+    lang_dir = Path(args.lang_dir)
+
     model_type = "unigram"
-    vocab_size = 5000
-    model_prefix = f"data/lang_bpe/{model_type}_{vocab_size}"
-    train_text = "data/lang_bpe/train.txt"
+
+    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
+    train_text = f"{lang_dir}/train.txt"
     character_coverage = 1.0
     input_sentence_size = 100000000
@@ -49,10 +68,7 @@ def main():
         eos_id=-1,
     )
 
-    sp = spm.SentencePieceProcessor(model_file=str(model_file))
-    vocab_size = sp.vocab_size()
-
-    shutil.copyfile(model_file, "data/lang_bpe/bpe.model")
+    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
 
 
 if __name__ == "__main__":
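
For reference, the training call this script wraps looks roughly like the following. Only options visible in the diff above are shown (unigram model type, character_coverage, input_sentence_size, eos_id=-1); any flags the full script passes beyond these are omitted here.

# Rough sketch of the sentencepiece training call; limited to the
# options visible in the diff above.
import shutil

import sentencepiece as spm

lang_dir = "data/lang_bpe_5000"  # example value for --lang-dir
vocab_size = 5000                # example value for --vocab-size

spm.SentencePieceTrainer.train(
    input=f"{lang_dir}/train.txt",
    model_prefix=f"{lang_dir}/unigram_{vocab_size}",
    model_type="unigram",
    vocab_size=vocab_size,
    character_coverage=1.0,
    input_sentence_size=100000000,
    eos_id=-1,
)
# The trained model is then copied to lang_dir/bpe.model:
shutil.copyfile(f"{lang_dir}/unigram_{vocab_size}.model", f"{lang_dir}/bpe.model")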

prepare.sh

@@ -25,7 +25,7 @@ stop_stage=100
 #  - librispeech-vocab.txt
 #  - librispeech-lexicon.txt
 #
-#  - $do_dir/musan
+#  - $dl_dir/musan
 #      This directory contains the following directories downloaded from
 #      http://www.openslr.org/17/
 #
@@ -36,8 +36,15 @@ dl_dir=$PWD/download
 . shared/parse_options.sh || exit 1
 
+# vocab size for sentence piece models.
+# It will generate data/lang_bpe_xxx,
+# data/lang_bpe_yyy if the array contains xxx, yyy
+vocab_sizes=(
+  5000
+)
+
-# All generated files by this script are saved in "data"
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
 mkdir -p data
 
 log() {
@@ -50,6 +57,7 @@ log "dl_dir: $dl_dir"
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
   log "stage -1: Download LM"
+  [ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
   ./local/download_lm.py --out-dir=$dl_dir/lm
 fi
@@ -118,28 +126,34 @@ fi
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
   log "Stage 6: Prepare BPE based lang"
-  mkdir -p data/lang_bpe
-  # We reuse words.txt from phone based lexicon
-  # so that the two can share G.pt later.
-  cp data/lang_phone/words.txt data/lang_bpe/
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    mkdir -p $lang_dir
+    # We reuse words.txt from phone based lexicon
+    # so that the two can share G.pt later.
+    cp data/lang_phone/words.txt $lang_dir
 
-  if [ ! -f data/lang_bpe/train.txt ]; then
-    log "Generate data for BPE training"
-    files=$(
-      find "data/LibriSpeech/train-clean-100" -name "*.trans.txt"
-      find "data/LibriSpeech/train-clean-360" -name "*.trans.txt"
-      find "data/LibriSpeech/train-other-500" -name "*.trans.txt"
-    )
-    for f in ${files[@]}; do
-      cat $f | cut -d " " -f 2-
-    done > data/lang_bpe/train.txt
-  fi
+    if [ ! -f $lang_dir/train.txt ]; then
+      log "Generate data for BPE training"
+      files=$(
+        find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt"
+        find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt"
+        find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt"
+      )
+      for f in ${files[@]}; do
+        cat $f | cut -d " " -f 2-
+      done > $lang_dir/train.txt
+    fi
 
-  python3 ./local/train_bpe_model.py
+    ./local/train_bpe_model.py \
+      --lang-dir $lang_dir \
+      --vocab-size $vocab_size
 
-  if [ ! -f data/lang_bpe/L_disambig.pt ]; then
-    ./local/prepare_lang_bpe.py
-  fi
+    if [ ! -f $lang_dir/L_disambig.pt ]; then
+      ./local/prepare_lang_bpe.py --lang-dir $lang_dir
+    fi
+  done
 fi
 
 if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
@@ -169,5 +183,12 @@ fi
 if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
   log "Stage 8: Compile HLG"
-  python3 ./local/compile_hlg.py
+  ./local/compile_hlg.py --lang-dir data/lang_phone
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    ./local/compile_hlg.py --lang-dir $lang_dir
+  done
 fi
+
+cd data && ln -sfv lang_bpe_5000 lang_bpe