Only use the medium subset's text to train the BPE model, as the whole corpus is too large

2025-12-11 06:55:27 +00:00 · 2023-07-18 10:06:01 +08:00 · 2023-07-18 10:06:01 +08:00 · 189d424b25
commit 189d424b25
parent fef229e024
2 changed files with 130 additions and 13 deletions
--- a/egs/libriheavy/ASR/local/train_bpe_model.py
+++ b/egs/libriheavy/ASR/local/train_bpe_model.py
@ -1 +0,0 @@
 ../../../librispeech/ASR/local/train_bpe_model.py
--- a/egs/libriheavy/ASR/local/train_bpe_model.py
+++ b/egs/libriheavy/ASR/local/train_bpe_model.py
@ -0,0 +1,101 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # You can install sentencepiece via:
 #
 #  pip install sentencepiece
 #
 # Due to an issue reported in
 # https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
 #
 # Please install a version >=0.1.96
 import argparse
 import shutil
 from pathlib import Path
 import sentencepiece as spm
 def get_args():
     """Parse the command-line options of this training script.

     All three options are mandatory: ``main`` builds paths from
     ``--lang-dir`` and ``--vocab-size`` and reads ``--transcript``, so a
     missing option is reported by argparse right away instead of surfacing
     later as an unrelated error.

     Returns:
       An ``argparse.Namespace`` with the attributes ``lang_dir`` (str),
       ``transcript`` (str) and ``vocab_size`` (int).

     Raises:
       SystemExit: raised by argparse when an option is missing or invalid.
     """
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--lang-dir",
         type=str,
         required=True,
         help="""Input and output directory.
         The generated bpe.model is saved to this directory.
         """,
     )
     parser.add_argument(
         "--transcript",
         type=str,
         required=True,
         help="Training transcript.",
     )
     parser.add_argument(
         "--vocab-size",
         type=int,
         required=True,
         help="Vocabulary size for BPE training",
     )
     return parser.parse_args()
 def main():
     """Train a SentencePiece model and publish it as ``<lang-dir>/bpe.model``.

     The model is written to ``<lang-dir>/<model_type>_<vocab_size>.model``
     and then copied to ``<lang-dir>/bpe.model``. Training is skipped when
     the trained model file already exists; in that case ``bpe.model`` is
     still created if it is missing (e.g. a previous run was interrupted
     between training and copying), but an existing ``bpe.model`` is left
     untouched.
     """
     args = get_args()
     vocab_size = args.vocab_size
     lang_dir = Path(args.lang_dir)

     model_type = "unigram"
     model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
     train_text = args.transcript
     character_coverage = 1.0
     input_sentence_size = 100000000

     user_defined_symbols = ["<blk>", "<sos/eos>"]
     # Note: unk_id is fixed to 2.
     # If you change it, you should also change other
     # places that are using it.
     unk_id = len(user_defined_symbols)

     model_file = Path(model_prefix + ".model")
     bpe_model = lang_dir / "bpe.model"

     if model_file.is_file():
         print(f"{model_file} exists - skipping")
         if bpe_model.is_file():
             # Nothing left to do: both the trained model and its
             # published copy are already in place.
             return
     else:
         spm.SentencePieceTrainer.train(
             input=train_text,
             vocab_size=vocab_size,
             model_type=model_type,
             model_prefix=model_prefix,
             input_sentence_size=input_sentence_size,
             character_coverage=character_coverage,
             user_defined_symbols=user_defined_symbols,
             unk_id=unk_id,
             bos_id=-1,
             eos_id=-1,
             train_extremely_large_corpus=False,
         )

     # Reached after a fresh training run, or when the trained model exists
     # but bpe.model has not been created yet.
     shutil.copyfile(model_file, bpe_model)
 # Run the training only when this file is executed as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
    main()
--- a/egs/libriheavy/ASR/prepare.sh
+++ b/egs/libriheavy/ASR/prepare.sh
@ -2,6 +2,7 @@
 # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 export PYTHONPATH=/star-data/xiaoyu/icefall_libriheavy:$PYTHONPATH
 set -eou pipefail
@ -18,7 +19,7 @@ num_per_split=2000
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  500
+  1000
 )
 mkdir -p data
@ -30,14 +31,19 @@ log() {
 }
 manifest_dir=data/manifests
-fbank_dir=data/fbank_new
+fbank_dir=data/fbank
 mkdir -p $manifest_dir
-subset="medium"
+subset="large"
-if [ $stage -le 1 ] && [ $stop_stage -ge 2 ]; then
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Split libri-heavy medium"
+  log "Stage 1: Split libri-heavy ${subset}"
  if [ $subset == "large" ]; then
    num_per_split=8000
    log "Change num_per_split to ${num_per_split} 8000 for large"
  fi
  split_dir=$fbank_dir/libriheavy_${subset}_split
  mkdir -p $split_dir
@ -53,8 +59,8 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  num_splits=$(find $fbank_dir/libriheavy_${subset}_split -name "librilight_cuts_${subset}_raw.*.jsonl.gz" | wc -l)
  if [ ! -e $fbank_dir/.libriheavy.${subset}.done ]; then
    for i in $(seq 0 1 7); do
-      start=${i}00
+      start=$(( i * 200 ))
-      end=$(( i+1 ))00
+      end=$(( (i+1) * 200 ))
      ./local/compute_fbank_libriheavy.py \
        --dataset ${subset} \
        --fbank-dir $fbank_dir \
@ -76,14 +82,18 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  fi
 fi
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Prepare BPE model"
  tmp_dir=data/tmp
  mkdir -p $tmp_dir
  if [ ! -f $tmp_dir/transcript_words.txt ]; then
-    gunzip -c $manifest_dir/librilight_cuts_${subset}_raw.jsonl.gz |
+    for part in "small" "medium" "large"; do
-      jq '.supervisions[].custom.texts[]' | sed 's/" //' | sed 's/\(.*\)"/\1/' > $tmp_dir/transcript_words.txt
+      gunzip -c $manifest_dir/librilight_cuts_${part}_raw.jsonl.gz |
        jq '.supervisions[].custom.texts[]' | sed 's/" //' | sed 's/\(.*\)"/\1/' > $tmp_dir/transcript_words_${part}.txt
    done
    cat $tmp_dir/transcript_words_small.txt $tmp_dir/transcript_words_medium.txt $tmp_dir/transcript_words_large.txt > $tmp_dir/transcript_words.txt
  fi
  if [ ! -f $tmp_dir/words.txt ]; then
@ -115,15 +125,22 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  fi
  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bpe_${vocab_size}_${subset}
+    lang_dir=data/lang_bpe_${vocab_size}
    mkdir -p $lang_dir
    cp $tmp_dir/words.txt $lang_dir/words.txt
-
+    pushd $lang_dir
    ln -s ../$tmp_dir/transcript_words.txt transcript_words.txt
    popd
    if [ ! -f $lang_dir/bpe.model ]; then
      ./local/train_bpe_model.py \
        --lang-dir $lang_dir \
        --vocab-size $vocab_size \
-        --transcript $tmp_dir/transcript_words.txt
+        --transcript $tmp_dir/transcript_words_medium.txt
    fi
    if [ ! -f $lang_dir/tokens.txt ]; then
      ./local/bpe2tokens.py ${lang_dir}/bpe.model > ${lang_dir}/tokens.txt
    fi
    done
		`@ -1 +0,0 @@`
			`../../../librispeech/ASR/local/train_bpe_model.py`