diff --git a/egs/librispeech/ASR/codebook_index_extraction.sh b/egs/librispeech/ASR/codebook_index_extraction.sh
index 383573552..b962d0081 100644
--- a/egs/librispeech/ASR/codebook_index_extraction.sh
+++ b/egs/librispeech/ASR/codebook_index_extraction.sh
@@ -1,4 +1,4 @@
-stage=4
+stage=3
 
 # Parameters about model.
 exp_dir=./vq_pruned_transducer_stateless2/exp/
@@ -7,11 +7,15 @@ hubert_model_dir=${exp_dir}/hubert_models
 hubert_model=${hubert_model_dir}/${model_id}.pt
 
 # Parameters about quantizer.
+memory_layer=36 # 1-based
+
+
+# Make sure the following parameters are identical to those in hubert_utils.vq_config.
 num_utts=1000
-mem_layer=36
 bytes_per_frame=8
 enable_refine=True
+
 
 if [ $stage -eq -1 ]; then
   # Preparation stage.
@@ -27,6 +31,10 @@ if [ $stage -eq -1 ]; then
   wget -c https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt -P ${hubert_model_dir}
 fi
 
+if [ ! -d ./data/fbank ]; then
+  echo "This script assumes ./data/fbank has already been generated by prepare.sh"
+  exit 1
+fi
 
 if [ $stage -eq 0 ]; then
   # This stage is not directly used by codebook extraction.
@@ -36,17 +44,17 @@ if [ $stage -eq 0 ]; then
   # [test-clean-ctc_greedy_search] %WER 2.04% [1075 / 52576, 92 ins, 104 del, 879 sub ]
   # [test-other-ctc_greedy_search] %WER 3.71% [1942 / 52343, 152 ins, 126 del, 1664 sub ]
   export CUDA_VISIBLE_DEVICES=7
-  ./vq_pruned_transducer_stateless2/hubert_decode.py \
-    --max-duration 10
+  ./vq_pruned_transducer_stateless2/hubert_decode.py
 fi
 
 if [ $stage -eq 1 ]; then
   ./vq_pruned_transducer_stateless2/hubert_memory_embeddings.py \
-    --max-duration 10
+    --memory-layer=${memory_layer}
 fi
 
 if [ $stage -eq 2 ]; then
-  ./vq_pruned_transducer_stateless2/quantizer_train.py
+  ./vq_pruned_transducer_stateless2/quantizer_train.py \
+    --memory-layer=${memory_layer}
 fi
 
 # CAUTION: set quantizer_id MANUALLY when a new quantizer is used.
@@ -75,12 +83,22 @@ if [ $stage -eq 4 ]; then
   refine_iter=5
 
   extract_codebook_index(){
+    # Analysis of disk usage:
+    # With bytes_per_frame=8, each embedding is compressed into eight 8-bit integers, i.e. 8 bytes per frame.
+    # The training dataset, clean-100h with speed perturbation at 0.9 and 1.1, totals 300 hours.
+    # The output frame rate of HuBERT is 50 frames per second.
+    # Theoretically, 412M = 300 * 3600 * 50 * 8 / 1024 / 1024 is needed.
+    # The actual size of all "*.h5" files storing codebook indexes is 450M;
+    # I think the extra 38M is metadata.
+    #
+    # About CUDA_VISIBLE_DEVICES:
     # When I tested this code, GPUs 6 and 7 were available,
     # so CUDA_VISIBLE_DEVICES is (1 + 5) for job 0
     # and (2 + 5) for job 1.
    # Note: the order of split manifests is 1-based, while GPUs are 0-based.
     export CUDA_VISIBLE_DEVICES=`(expr $1 + 5)`
     ./vq_pruned_transducer_stateless2/hubert_code_indices.py \
+      --memory-layer=${memory_layer} \
       --num-splits $num_jobs \
       --subset=$2 \
       --manifest-idx $1 \
@@ -98,11 +116,36 @@
   done
   wait
 fi
+
+cdidx_manifests_dir=`pwd`/data/globalrandom-scaledquantizer-refine_iter-5-${num_utts}-$model_id-${memory_layer}layer-${quantizer_id}-bytes_per_frame-${bytes_per_frame}-enable-refine-True
 if [ $stage -eq 5 ]; then
-  for subset in ${train_subset}; do
-    cdidx_manifests_dir=`pwd`/data/$model_id-${mem_layer}layer-${quantizer_id}-bytes_per_frame-${bytes_per_frame}
+  for subset in ${train_subsets}; do
     combined_list=`find $cdidx_manifests_dir/splits$num_jobs/ -name "cuts_train-${subset}*"`
     echo $combined_list
     lhotse combine $combined_list $cdidx_manifests_dir/cuts_train-${subset}.json.gz
   done
+
+  reusable_subsets="dev-clean dev-other test-clean test-other musan"
+  for subset in ${reusable_subsets}; do
+    ori_manifest=./data/fbank/cuts_${subset}.json.gz
+    ln -sf `realpath ${ori_manifest}` ${cdidx_manifests_dir}
+  done
+fi
+
+if [ $stage -eq 6 ]; then
+  # Example training script.
+  # Note: it's better to set spec-aug-time-warp-factor=-1.
+  export CUDA_VISIBLE_DEVICES="4,5,6"
+  WORLD_SIZE=3
+  python3 ./vq_pruned_transducer_stateless2/train.py \
+    --codebook-loss-scale 0.1 \
+    --num-codebooks=${bytes_per_frame} \
+    --start-epoch 0 \
+    --master-port 12358 \
+    --manifest-dir ${cdidx_manifests_dir} \
+    --full-libri 0 \
+    --spec-aug-time-warp-factor -1 \
+    --max-duration 300 \
+    --world-size ${WORLD_SIZE} \
+    --num-epochs 30
 fi
diff --git a/egs/librispeech/ASR/vq_pruned_transducer_stateless2/hubert_utils.py b/egs/librispeech/ASR/vq_pruned_transducer_stateless2/hubert_utils.py
index f8597a66f..31cf28219 100644
--- a/egs/librispeech/ASR/vq_pruned_transducer_stateless2/hubert_utils.py
+++ b/egs/librispeech/ASR/vq_pruned_transducer_stateless2/hubert_utils.py
@@ -31,17 +31,16 @@ from fairseq.models.hubert.hubert import HubertModel
 from omegaconf import OmegaConf
 
 vq_config = {
+    # TODO: it may be better to convert this dict to a yaml-driven config.
     # parameters about hubert model inference.
     "model_dir": "./vq_pruned_transducer_stateless2/exp/hubert_models/",
-    "model_id": "hubert_xtralarge_ll60k_finetune_ls960",
     "input_strategy": "AudioSamples",
     "enable_spec_aug": False,
     "enable_musan": False,
     "total_layers": 48,
     "memory_embedding_dim": 1280,
     # parameters about quantizer.
-    "num_utts": 100,
-    "memory_layer": 36,
+    "num_utts": 1000,
     "memory_dir": "./vq_pruned_transducer_stateless2/exp/mem/",
     "bytes_per_frame": 8,
     "refine_iter": 5,
@@ -62,9 +61,19 @@ def get_parser():
     )
 
     parser.add_argument(
-        "--manifest-idx",
+        "--model-id",
+        type=str,
+        default="hubert_xtralarge_ll60k_finetune_ls960",
+    )
+
+    parser.add_argument(
+        "--manifest-idx", type=int, help="Index of the split manifest, 1-based."
+    )
+
+    parser.add_argument(
+        "--memory-layer",
         type=int,
-        help="Split manifest is 1-based."
+        help="Layer to extract teacher embeddings from, 1-based.",
     )
 
     parser.add_argument(
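
A quick sanity check of the 412M figure quoted in the stage-4 disk-usage comment. This is a minimal standalone sketch, not part of the patch; every constant is copied from that comment (300 hours of speed-perturbed clean-100h, 50 HuBERT output frames per second, 8 bytes of codebook indexes per frame):

    # Back-of-the-envelope check of the codebook-index disk footprint.
    hours = 300             # clean-100h with 0.9/1.1 speed perturbation
    frames_per_second = 50  # HuBERT output frame rate
    bytes_per_frame = 8     # eight 8-bit codebook indexes per frame

    total_bytes = hours * 3600 * frames_per_second * bytes_per_frame
    print(f"{total_bytes / 1024 / 1024:.0f} MiB")  # -> 412 MiB

The roughly 450M observed on disk then leaves about 38M for HDF5 metadata, which accounts for the "extra" usage mentioned in the comment.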