diff --git a/egs/libriheavy/ASR/local/train_bpe_model.py b/egs/libriheavy/ASR/local/train_bpe_model.py
deleted file mode 120000
index 6fad36421..000000000
--- a/egs/libriheavy/ASR/local/train_bpe_model.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../librispeech/ASR/local/train_bpe_model.py
\ No newline at end of file
diff --git a/egs/libriheavy/ASR/local/train_bpe_model.py b/egs/libriheavy/ASR/local/train_bpe_model.py
new file mode 100755
index 000000000..55a7d26a6
--- /dev/null
+++ b/egs/libriheavy/ASR/local/train_bpe_model.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# You can install sentencepiece via:
+#
+#  pip install sentencepiece
+#
+# Due to an issue reported in
+# https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
+#
+# Please install a version >=0.1.96
+
+import argparse
+import shutil
+from pathlib import Path
+
+import sentencepiece as spm
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        The generated bpe.model is saved to this directory.
+        """,
+    )
+
+    parser.add_argument(
+        "--transcript",
+        type=str,
+        help="Training transcript.",
+    )
+
+    parser.add_argument(
+        "--vocab-size",
+        type=int,
+        help="Vocabulary size for BPE training",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    vocab_size = args.vocab_size
+    lang_dir = Path(args.lang_dir)
+
+    model_type = "unigram"
+
+    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
+    train_text = args.transcript
+    character_coverage = 1.0
+    input_sentence_size = 100000000
+
+    user_defined_symbols = ["<blk>", "<sos/eos>"]
+    unk_id = len(user_defined_symbols)
+    # Note: unk_id is fixed to 2.
+    # If you change it, you should also change other
+    # places that are using it.
+
+    model_file = Path(model_prefix + ".model")
+    if not model_file.is_file():
+        spm.SentencePieceTrainer.train(
+            input=train_text,
+            vocab_size=vocab_size,
+            model_type=model_type,
+            model_prefix=model_prefix,
+            input_sentence_size=input_sentence_size,
+            character_coverage=character_coverage,
+            user_defined_symbols=user_defined_symbols,
+            unk_id=unk_id,
+            bos_id=-1,
+            eos_id=-1,
+            train_extremely_large_corpus=False,
+        )
+    else:
+        print(f"{model_file} exists - skipping")
+        return
+
+    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/libriheavy/ASR/prepare.sh b/egs/libriheavy/ASR/prepare.sh
index cca0cbf67..0aa6c91ae 100755
--- a/egs/libriheavy/ASR/prepare.sh
+++ b/egs/libriheavy/ASR/prepare.sh
@@ -2,6 +2,7 @@
 
 # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+export PYTHONPATH=/star-data/xiaoyu/icefall_libriheavy:$PYTHONPATH
 
 set -eou pipefail
 
@@ -18,7 +19,7 @@ num_per_split=2000
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  500
+  1000
 )
 
 mkdir -p data
@@ -30,14 +31,19 @@ log() {
 }
 
 manifest_dir=data/manifests
-fbank_dir=data/fbank_new
+fbank_dir=data/fbank
 mkdir -p $manifest_dir
 
-subset="medium"
+subset="large"
 
-if [ $stage -le 1 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 1: Split libri-heavy medium"
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Split libri-heavy ${subset}"
+
+  if [ $subset == "large" ]; then
+    num_per_split=8000
+    log "Change num_per_split to ${num_per_split} for large"
+  fi
 
   split_dir=$fbank_dir/libriheavy_${subset}_split
   mkdir -p $split_dir
@@ -53,8 +59,8 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   num_splits=$(find $fbank_dir/libriheavy_${subset}_split -name "librilight_cuts_${subset}_raw.*.jsonl.gz" | wc -l)
   if [ ! -e $fbank_dir/.libriheavy.${subset}.done ]; then
     for i in $(seq 0 1 7); do
-      start=${i}00
-      end=$(( i+1 ))00
+      start=$(( i * 200 ))
+      end=$(( (i+1) * 200 ))
       ./local/compute_fbank_libriheavy.py \
         --dataset ${subset} \
         --fbank-dir $fbank_dir \
@@ -76,14 +82,18 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   fi
 fi
 
+
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   log "Stage 4: Prepare BPE model"
   tmp_dir=data/tmp
   mkdir -p $tmp_dir
 
   if [ ! -f $tmp_dir/transcript_words.txt ]; then
-    gunzip -c $manifest_dir/librilight_cuts_${subset}_raw.jsonl.gz |
-      jq '.supervisions[].custom.texts[]' | sed 's/" //' | sed 's/\(.*\)"/\1/' > $tmp_dir/transcript_words.txt
+    for part in "small" "medium" "large"; do
+      gunzip -c $manifest_dir/librilight_cuts_${part}_raw.jsonl.gz |
+        jq '.supervisions[].custom.texts[]' | sed 's/" //' | sed 's/\(.*\)"/\1/' > $tmp_dir/transcript_words_${part}.txt
+    done
+    cat $tmp_dir/transcript_words_small.txt $tmp_dir/transcript_words_medium.txt $tmp_dir/transcript_words_large.txt > $tmp_dir/transcript_words.txt
   fi
 
   if [ ! -f $tmp_dir/words.txt ]; then
@@ -115,15 +125,22 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   fi
 
   for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bpe_${vocab_size}_${subset}
+    lang_dir=data/lang_bpe_${vocab_size}
     mkdir -p $lang_dir
 
     cp $tmp_dir/words.txt $lang_dir/words.txt
-
+    pushd $lang_dir
+    ln -s ../../$tmp_dir/transcript_words.txt transcript_words.txt
+    popd
+
     if [ ! -f $lang_dir/bpe.model ]; then
       ./local/train_bpe_model.py \
         --lang-dir $lang_dir \
         --vocab-size $vocab_size \
-        --transcript $tmp_dir/transcript_words.txt
+        --transcript $tmp_dir/transcript_words_medium.txt
+    fi
+
+    if [ ! -f $lang_dir/tokens.txt ]; then
+      ./local/bpe2tokens.py ${lang_dir}/bpe.model > ${lang_dir}/tokens.txt
     fi
   done
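
Note: ./local/bpe2tokens.py is invoked in stage 4 above but is not included in this patch. Below is a minimal sketch of what such a helper presumably does, assuming it simply prints every piece of the trained SentencePiece model together with its id, one "token id" pair per line, which is the tokens.txt layout icefall recipes usually expect; the actual script in the PR may differ.

#!/usr/bin/env python3
# Hypothetical sketch of local/bpe2tokens.py (not part of this patch):
# dump every piece of a SentencePiece BPE model with its id so the
# output can be redirected to tokens.txt.
import sys

import sentencepiece as spm


def main(model_path: str) -> None:
    sp = spm.SentencePieceProcessor()
    sp.load(model_path)
    for i in range(sp.get_piece_size()):
        # id_to_piece() returns the surface form of the i-th token.
        print(sp.id_to_piece(i), i)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit("Usage: bpe2tokens.py <bpe-model>")
    main(sys.argv[1])

It would be used exactly as in the stage above, e.g. ./local/bpe2tokens.py data/lang_bpe_1000/bpe.model > data/lang_bpe_1000/tokens.txt.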