diff --git a/egs/librispeech/ASR/codebook_index_extraction.sh b/egs/librispeech/ASR/codebook_index_extraction.sh
index 383573552..b962d0081 100644
--- a/egs/librispeech/ASR/codebook_index_extraction.sh
+++ b/egs/librispeech/ASR/codebook_index_extraction.sh
@@ -1,4 +1,4 @@
-stage=4
+stage=3
 
 # Parameters about model.
 exp_dir=./vq_pruned_transducer_stateless2/exp/
@@ -7,11 +7,15 @@ hubert_model_dir=${exp_dir}/hubert_models
 hubert_model=${hubert_model_dir}/${model_id}.pt
 
 # Parameters about quantizer.
+memory_layer=36 # 1-based
+
+
+# Make sure the following parameters are identical to those in hubert_utils.vq_config.
 num_utts=1000
-mem_layer=36
 bytes_per_frame=8
 enable_refine=True
+
 
 if [ $stage -eq -1 ]; then
   # Preparation stage.
@@ -27,6 +31,10 @@ if [ $stage -eq -1 ]; then
   wget -c https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt -P ${hubert_model_dir}
 fi
 
+if [ ! -d ./data/fbank ]; then
+  echo "This script assumes ./data/fbank has already been generated by prepare.sh"
+  exit 1
+fi
 
 if [ $stage -eq 0 ]; then
   # This stage is not directly used by codebook extraction.
@@ -36,17 +44,17 @@ if [ $stage -eq 0 ]; then
   # [test-clean-ctc_greedy_search] %WER 2.04% [1075 / 52576, 92 ins, 104 del, 879 sub ]
   # [test-other-ctc_greedy_search] %WER 3.71% [1942 / 52343, 152 ins, 126 del, 1664 sub ]
   export CUDA_VISIBLE_DEVICES=7
-  ./vq_pruned_transducer_stateless2/hubert_decode.py \
-    --max-duration 10
+  ./vq_pruned_transducer_stateless2/hubert_decode.py
 fi
 
 if [ $stage -eq 1 ]; then
   ./vq_pruned_transducer_stateless2/hubert_memory_embeddings.py \
-    --max-duration 10
+    --memory-layer=${memory_layer}
 fi
 
 if [ $stage -eq 2 ]; then
-  ./vq_pruned_transducer_stateless2/quantizer_train.py
+  ./vq_pruned_transducer_stateless2/quantizer_train.py \
+    --memory-layer=${memory_layer}
 fi
 
 # CAUTION: set quantizer_id MANUALLY when a new quantizer is used.
@@ -75,12 +83,22 @@ if [ $stage -eq 4 ]; then
   refine_iter=5
 
   extract_codebook_index(){
+    # Analysis of disk usage:
+    # With bytes_per_frame=8, each embedding is compressed into eight 8-bit integers, i.e. 8 bytes per frame.
+    # The training dataset, clean-100h with speed perturbation at 0.9 and 1.1, totals 300 hours.
+    # The output frame rate of HuBERT is 50 frames per second.
+    # Theoretically, 412M = 300 * 3600 * 50 * 8 / 1024 / 1024 is needed.
+    # The actual size of all "*.h5" files storing codebook indexes is 450M;
+    # I think the extra 38M is metadata.
+    #
+    # About CUDA_VISIBLE_DEVICES:
     # When I tested this code, GPUs 6 and 7 were available,
     # so CUDA_VISIBLE_DEVICES is (1 + 5) for job 0
     # and (2 + 5) for job 1.
    # Note: the order of split manifests is 1-based, while GPUs are 0-based.
     export CUDA_VISIBLE_DEVICES=`(expr $1 + 5)`
     ./vq_pruned_transducer_stateless2/hubert_code_indices.py \
+      --memory-layer=${memory_layer} \
       --num-splits $num_jobs \
       --subset=$2 \
       --manifest-idx $1 \
@@ -98,11 +116,36 @@
   done
   wait
 fi
+
+cdidx_manifests_dir=`pwd`/data/globalrandom-scaledquantizer-refine_iter-5-${num_utts}-$model_id-${memory_layer}layer-${quantizer_id}-bytes_per_frame-${bytes_per_frame}-enable-refine-True
 if [ $stage -eq 5 ]; then
-  for subset in ${train_subset}; do
-    cdidx_manifests_dir=`pwd`/data/$model_id-${mem_layer}layer-${quantizer_id}-bytes_per_frame-${bytes_per_frame}
+  for subset in ${train_subsets}; do
     combined_list=`find $cdidx_manifests_dir/splits$num_jobs/ -name "cuts_train-${subset}*"`
     echo $combined_list
     lhotse combine $combined_list $cdidx_manifests_dir/cuts_train-${subset}.json.gz
   done
+
+  reusable_subsets="dev-clean dev-other test-clean test-other musan"
+  for subset in ${reusable_subsets}; do
+    ori_manifest=./data/fbank/cuts_${subset}.json.gz
+    ln -sf `realpath ${ori_manifest}` ${cdidx_manifests_dir}
+  done
+fi
+
+if [ $stage -eq 6 ]; then
+  # Example training script.
+  # Note: it's better to set spec-aug-time-warp-factor=-1.
+  export CUDA_VISIBLE_DEVICES="4,5,6"
+  WORLD_SIZE=3
+  python3 ./vq_pruned_transducer_stateless2/train.py \
+    --codebook-loss-scale 0.1 \
+    --num-codebooks=${bytes_per_frame} \
+    --start-epoch 0 \
+    --master-port 12358 \
+    --manifest-dir ${cdidx_manifests_dir} \
+    --full-libri 0 \
+    --spec-aug-time-warp-factor -1 \
+    --max-duration 300 \
+    --world-size ${WORLD_SIZE} \
+    --num-epochs 30
 fi
diff --git a/egs/librispeech/ASR/vq_pruned_transducer_stateless2/hubert_utils.py b/egs/librispeech/ASR/vq_pruned_transducer_stateless2/hubert_utils.py
index f8597a66f..31cf28219 100644
--- a/egs/librispeech/ASR/vq_pruned_transducer_stateless2/hubert_utils.py
+++ b/egs/librispeech/ASR/vq_pruned_transducer_stateless2/hubert_utils.py
@@ -31,17 +31,16 @@ from fairseq.models.hubert.hubert import HubertModel
 from omegaconf import OmegaConf
 
 vq_config = {
+    # TODO: it may be better to convert this dict to a yaml-driven config.
     # parameters about hubert model inference.
     "model_dir": "./vq_pruned_transducer_stateless2/exp/hubert_models/",
-    "model_id": "hubert_xtralarge_ll60k_finetune_ls960",
     "input_strategy": "AudioSamples",
     "enable_spec_aug": False,
     "enable_musan": False,
     "total_layers": 48,
     "memory_embedding_dim": 1280,
     # parameters about quantizer.
-    "num_utts": 100,
-    "memory_layer": 36,
+    "num_utts": 1000,
     "memory_dir": "./vq_pruned_transducer_stateless2/exp/mem/",
     "bytes_per_frame": 8,
     "refine_iter": 5,
@@ -62,9 +61,19 @@ def get_parser():
     )
 
     parser.add_argument(
-        "--manifest-idx",
+        "--model-id",
+        type=str,
+        default="hubert_xtralarge_ll60k_finetune_ls960",
+    )
+
+    parser.add_argument(
+        "--manifest-idx", type=int, help="Index of the split manifest, 1-based."
+    )
+
+    parser.add_argument(
+        "--memory-layer",
         type=int,
-        help="Split manifest is 1-based."
+        help="Layer to extract teacher embeddings from, 1-based.",
     )
 
     parser.add_argument(
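
A quick sanity check of the 412M figure quoted in the stage-4 disk-usage comment. This is a minimal standalone sketch, not part of the patch; every constant is copied from that comment (300 hours of speed-perturbed clean-100h, 50 HuBERT output frames per second, 8 bytes of codebook indexes per frame):

    # Back-of-the-envelope check of the codebook-index disk footprint.
    hours = 300             # clean-100h with 0.9/1.1 speed perturbation
    frames_per_second = 50  # HuBERT output frame rate
    bytes_per_frame = 8     # eight 8-bit codebook indexes per frame

    total_bytes = hours * 3600 * frames_per_second * bytes_per_frame
    print(f"{total_bytes / 1024 / 1024:.0f} MiB")  # -> 412 MiB

The roughly 450M observed on disk then leaves about 38M for HDF5 metadata, which accounts for the "extra" usage mentioned in the comment.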