diff --git a/egs/libriheavy/ASR/local/train_bpe_model.py b/egs/libriheavy/ASR/local/train_bpe_model.py
deleted file mode 120000
index 6fad36421..000000000
--- a/egs/libriheavy/ASR/local/train_bpe_model.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../librispeech/ASR/local/train_bpe_model.py
\ No newline at end of file
diff --git a/egs/libriheavy/ASR/local/train_bpe_model.py b/egs/libriheavy/ASR/local/train_bpe_model.py
new file mode 100755
index 000000000..55a7d26a6
--- /dev/null
+++ b/egs/libriheavy/ASR/local/train_bpe_model.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# You can install sentencepiece via:
+#
+#  pip install sentencepiece
+#
+# Due to an issue reported in
+# https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
+#
+# Please install a version >=0.1.96
+
+import argparse
+import shutil
+from pathlib import Path
+
+import sentencepiece as spm
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        The generated bpe.model is saved to this directory.
+        """,
+    )
+
+    parser.add_argument(
+        "--transcript",
+        type=str,
+        help="Training transcript.",
+    )
+
+    parser.add_argument(
+        "--vocab-size",
+        type=int,
+        help="Vocabulary size for BPE training",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    vocab_size = args.vocab_size
+    lang_dir = Path(args.lang_dir)
+
+    model_type = "unigram"
+
+    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
+    train_text = args.transcript
+    character_coverage = 1.0
+    input_sentence_size = 100000000
+
+    user_defined_symbols = ["<blk>", "<sos/eos>"]
+    unk_id = len(user_defined_symbols)
+    # Note: unk_id is fixed to 2.
+    # If you change it, you should also change other
+    # places that are using it.
+
+    model_file = Path(model_prefix + ".model")
+    if not model_file.is_file():
+        spm.SentencePieceTrainer.train(
+            input=train_text,
+            vocab_size=vocab_size,
+            model_type=model_type,
+            model_prefix=model_prefix,
+            input_sentence_size=input_sentence_size,
+            character_coverage=character_coverage,
+            user_defined_symbols=user_defined_symbols,
+            unk_id=unk_id,
+            bos_id=-1,
+            eos_id=-1,
+            train_extremely_large_corpus=False,
+        )
+    else:
+        print(f"{model_file} exists - skipping")
+        return
+
+    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/libriheavy/ASR/prepare.sh b/egs/libriheavy/ASR/prepare.sh
index cca0cbf67..0aa6c91ae 100755
--- a/egs/libriheavy/ASR/prepare.sh
+++ b/egs/libriheavy/ASR/prepare.sh
@@ -2,6 +2,7 @@
 
 # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+export PYTHONPATH=/star-data/xiaoyu/icefall_libriheavy:$PYTHONPATH
 
 set -eou pipefail
 
@@ -18,7 +19,7 @@ num_per_split=2000
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  500
+  1000
 )
 
 mkdir -p data
@@ -30,14 +31,19 @@ log() {
 }
 
 manifest_dir=data/manifests
-fbank_dir=data/fbank_new
+fbank_dir=data/fbank
 mkdir -p $manifest_dir
 
-subset="medium"
+subset="large"
 
-if [ $stage -le 1 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 1: Split libri-heavy medium"
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Split libri-heavy ${subset}"
+
+  if [ $subset == "large" ]; then
+    num_per_split=8000
+    log "Change num_per_split to ${num_per_split} for large"
+  fi
 
   split_dir=$fbank_dir/libriheavy_${subset}_split
   mkdir -p $split_dir
@@ -53,8 +59,8 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   num_splits=$(find $fbank_dir/libriheavy_${subset}_split -name "librilight_cuts_${subset}_raw.*.jsonl.gz" | wc -l)
   if [ ! -e $fbank_dir/.libriheavy.${subset}.done ]; then
     for i in $(seq 0 1 7); do
-      start=${i}00
-      end=$(( i+1 ))00
+      start=$(( i * 200 ))
+      end=$(( (i+1) * 200 ))
       ./local/compute_fbank_libriheavy.py \
         --dataset ${subset} \
         --fbank-dir $fbank_dir \
@@ -76,14 +82,18 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   fi
 fi
 
+
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   log "Stage 4: Prepare BPE model"
   tmp_dir=data/tmp
   mkdir -p $tmp_dir
 
   if [ ! -f $tmp_dir/transcript_words.txt ]; then
-    gunzip -c $manifest_dir/librilight_cuts_${subset}_raw.jsonl.gz |
-      jq '.supervisions[].custom.texts[]' | sed 's/" //' | sed 's/\(.*\)"/\1/' > $tmp_dir/transcript_words.txt
+    for part in "small" "medium" "large"; do
+      gunzip -c $manifest_dir/librilight_cuts_${part}_raw.jsonl.gz |
+        jq '.supervisions[].custom.texts[]' | sed 's/" //' | sed 's/\(.*\)"/\1/' > $tmp_dir/transcript_words_${part}.txt
+    done
+    cat $tmp_dir/transcript_words_small.txt $tmp_dir/transcript_words_medium.txt $tmp_dir/transcript_words_large.txt > $tmp_dir/transcript_words.txt
   fi
 
   if [ ! -f $tmp_dir/words.txt ]; then
@@ -115,15 +125,22 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   fi
 
   for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bpe_${vocab_size}_${subset}
+    lang_dir=data/lang_bpe_${vocab_size}
     mkdir -p $lang_dir
 
     cp $tmp_dir/words.txt $lang_dir/words.txt
-
+    pushd $lang_dir
+    ln -s ../../$tmp_dir/transcript_words.txt transcript_words.txt
+    popd
+
     if [ ! -f $lang_dir/bpe.model ]; then
       ./local/train_bpe_model.py \
         --lang-dir $lang_dir \
         --vocab-size $vocab_size \
-        --transcript $tmp_dir/transcript_words.txt
+        --transcript $tmp_dir/transcript_words_medium.txt
+    fi
+
+    if [ ! -f $lang_dir/tokens.txt ]; then
+      ./local/bpe2tokens.py ${lang_dir}/bpe.model > ${lang_dir}/tokens.txt
     fi
   done
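
Note: ./local/bpe2tokens.py is invoked in stage 4 above but is not included in this patch. Below is a minimal sketch of what such a helper presumably does, assuming it simply prints every piece of the trained SentencePiece model together with its id, one "token id" pair per line, which is the tokens.txt layout icefall recipes usually expect; the actual script in the PR may differ.

#!/usr/bin/env python3
# Hypothetical sketch of local/bpe2tokens.py (not part of this patch):
# dump every piece of a SentencePiece BPE model with its id so the
# output can be redirected to tokens.txt.
import sys

import sentencepiece as spm


def main(model_path: str) -> None:
    sp = spm.SentencePieceProcessor()
    sp.load(model_path)
    for i in range(sp.get_piece_size()):
        # id_to_piece() returns the surface form of the i-th token.
        print(sp.id_to_piece(i), i)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit("Usage: bpe2tokens.py <bpe-model>")
    main(sys.argv[1])

It would be used exactly as in the stage above, e.g. ./local/bpe2tokens.py data/lang_bpe_1000/bpe.model > data/lang_bpe_1000/tokens.txt.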