diff --git a/egs/librispeech/ASR/local/compute_fbank_librispeech.py b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
index 25d6050bb..5b703d9ca 100755
--- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py
+++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
@@ -32,7 +32,14 @@ from typing import Optional
 import sentencepiece as spm
 import torch
 from filter_cuts import filter_cuts
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor, str2bool
@@ -61,6 +68,13 @@ def get_args():
         help="""Dataset parts to compute fbank. If None, we will use all""",
     )
 
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="data/fbank",
+        help="Where to store the train/dev/test manifests and fbank features",
+    )
+
     parser.add_argument(
         "--perturb-speed",
         type=str2bool,
@@ -68,18 +82,34 @@ def get_args():
         help="""Perturb speed with factor 0.9 and 1.1 on train subset.""",
     )
 
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="If use Whisper configuration for fbank computation",
+    )
+
+    parser.add_argument(
+        "--num-mel-bins",
+        type=int,
+        default=80,
+        help="Number of mel filter banks used by the feature extractor",
+    )
+
     return parser.parse_args()
 
 
 def compute_fbank_librispeech(
     bpe_model: Optional[str] = None,
     dataset: Optional[str] = None,
+    output_dir: Optional[str] = None,
     perturb_speed: Optional[bool] = True,
+    whisper_fbank: Optional[bool] = False,
+    num_mel_bins: Optional[int] = 80,
 ):
     src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
+    output_dir = Path("data/fbank" if output_dir is None else output_dir)
     num_jobs = min(15, os.cpu_count())
-    num_mel_bins = 80
 
     if bpe_model:
         logging.info(f"Loading {bpe_model}")
@@ -116,7 +146,14 @@ def compute_fbank_librispeech(
         dataset_parts,
     )
 
-    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+    if whisper_fbank:
+        # Fall back to CPU so the script still runs on machines without CUDA.
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device=device)
+        )
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
 
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
@@ -134,7 +171,7 @@ def compute_fbank_librispeech(
                     if bpe_model:
                         cut_set = filter_cuts(cut_set, sp)
                     if perturb_speed:
-                        logging.info(f"Doing speed perturb")
+                        logging.info("Doing speed perturb")
                         cut_set = (
                             cut_set
                             + cut_set.perturb_speed(0.9)
@@ -160,5 +197,8 @@ if __name__ == "__main__":
     compute_fbank_librispeech(
         bpe_model=args.bpe_model,
         dataset=args.dataset,
+        output_dir=args.output_dir,
         perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
+        num_mel_bins=args.num_mel_bins,
     )
diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index 40dc3260d..9f9048a6d 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -243,3 +243,24 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
       $lang_dir/L_disambig.fst
   fi
 fi
+
+
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+  log "Stage 7: Prepare whisper fbank feature"
+  perturb_speed=1
+  whisper_mel_bins=80
+  output_dir=data/fbank_whisper_${whisper_mel_bins}D
+  if [ ! -f $output_dir/.librispeech.whisper.done ]; then
+    mkdir -p $output_dir
+    ./local/compute_fbank_librispeech.py \
+      --num-mel-bins ${whisper_mel_bins} \
+      --perturb-speed ${perturb_speed} \
+      --whisper-fbank true \
+      --output-dir $output_dir
+    ./local/compute_fbank_musan.py \
+      --num-mel-bins ${whisper_mel_bins} \
+      --whisper-fbank true \
+      --output-dir $output_dir
+    touch $output_dir/.librispeech.whisper.done
+  fi
+fi