diff --git a/egs/aishell2/ASR/local/compute_fbank_aishell2.py b/egs/aishell2/ASR/local/compute_fbank_aishell2.py index 1fb1621ff..f431966c2 100755 --- a/egs/aishell2/ASR/local/compute_fbank_aishell2.py +++ b/egs/aishell2/ASR/local/compute_fbank_aishell2.py @@ -29,7 +29,7 @@ import os from pathlib import Path import torch -from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter +from lhotse import CutSet, WhisperFbank, WhisperFbankConfig, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor, str2bool @@ -42,7 +42,7 @@ torch.set_num_threads(1) torch.set_num_interop_threads(1) -def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False): +def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False): src_dir = Path("data/manifests") output_dir = Path("data/fbank") num_jobs = min(15, os.cpu_count()) @@ -68,8 +68,10 @@ def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False): list(manifests.keys()), dataset_parts, ) - - extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins)) + if whisper_fbank: + extractor = WhisperFbank(WhisperFbankConfig(num_filters=num_mel_bins, device='cuda')) + else: + extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins)) with get_executor() as ex: # Initialize the executor only once. for partition, m in manifests.items(): @@ -111,7 +113,12 @@ def get_args(): default=False, help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.", ) - + parser.add_argument( + "--whisper-fbank", + type=str2bool, + default=False, + help="Use WhisperFbank instead of Fbank. Default: False.", + ) return parser.parse_args() @@ -122,5 +129,5 @@ if __name__ == "__main__": args = get_args() compute_fbank_aishell2( - num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed + num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed, whisper_fbank=args.whisper_fbank ) diff --git a/egs/aishell2/ASR/prepare.sh b/egs/aishell2/ASR/prepare.sh index a5eb9bd13..418762be1 100755 --- a/egs/aishell2/ASR/prepare.sh +++ b/egs/aishell2/ASR/prepare.sh @@ -6,8 +6,8 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python set -eou pipefail nj=30 -stage=0 -stop_stage=7 +stage=1 +stop_stage=1 perturb_speed=true @@ -108,6 +108,16 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then fi fi +whisper_mel_bins=80 +if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then + log "Stage 30: Compute whisper fbank for aishell2" + if [ ! -f data/fbank/.aishell2.whisper.done ]; then + mkdir -p data/fbank + ./local/compute_fbank_aishell2.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true + touch data/fbank/.aishell2.whisper.done + fi +fi + if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then log "Stage 4: Compute fbank for musan" if [ ! -f data/fbank/.msuan.done ]; then diff --git a/egs/wenetspeech/ASR/prepare.sh b/egs/wenetspeech/ASR/prepare.sh index bf91f0c6b..7d022e7a8 100755 --- a/egs/wenetspeech/ASR/prepare.sh +++ b/egs/wenetspeech/ASR/prepare.sh @@ -6,8 +6,8 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python set -eou pipefail nj=15 -stage=130 -stop_stage=130 +stage=131 +stop_stage=131 # Split L subset to this number of pieces # This is to avoid OOM during feature extraction. @@ -198,7 +198,7 @@ if [ $stage -le 130 ] && [ $stop_stage -ge 130 ]; then python3 ./local/compute_fbank_wenetspeech_splits.py \ --training-subset L \ - --num-workers 40 \ + --num-workers 8 \ --batch-duration 1600 \ --start 0 \ --num-mel-bins ${whisper_mel_bins} --whisper-fbank true \ @@ -215,9 +215,9 @@ if [ $stage -le 131 ] && [ $stop_stage -ge 131 ]; then python3 ./local/compute_fbank_wenetspeech_splits.py \ --training-subset L \ - --num-workers 40 \ + --num-workers 8 \ --batch-duration 1600 \ - --start 99 \ + --start 98 \ --num-mel-bins ${whisper_mel_bins} --whisper-fbank false \ --num-splits $num_splits