Add --output-dir option to fbank scripts; store whisper features in data/fbank_whisper

2025-12-11 06:55:27 +00:00 · 2024-01-25 08:31:08 +00:00 · 2024-01-25 08:31:08 +00:00 · fd4ebf3bfe
commit fd4ebf3bfe
parent 46605eaef2
5 changed files with 36 additions and 10 deletions
--- a/egs/aishell/ASR/local/compute_fbank_aishell.py
+++ b/egs/aishell/ASR/local/compute_fbank_aishell.py
@ -50,10 +50,13 @@ torch.set_num_interop_threads(1)


 def compute_fbank_aishell(
-    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+    num_mel_bins: int = 80,
+    perturb_speed: bool = False,
+    whisper_fbank: bool = False,
+    output_dir: str = "data/fbank",
 ):
    src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
+    output_dir = Path(output_dir)
    num_jobs = min(15, os.cpu_count())

    dataset_parts = (
@ -130,6 +133,12 @@ def get_args():
        default=False,
        help="Use WhisperFbank instead of Fbank. Default: False.",
    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="data/fbank",
+        help="Output directory. Default: data/fbank.",
+    )
    return parser.parse_args()


@ -143,4 +152,5 @@ if __name__ == "__main__":
        num_mel_bins=args.num_mel_bins,
        perturb_speed=args.perturb_speed,
        whisper_fbank=args.whisper_fbank,
+        output_dir=args.output_dir,
    )
--- a/egs/aishell/ASR/prepare.sh
+++ b/egs/aishell/ASR/prepare.sh
@ -379,12 +379,13 @@ fi

 # whisper large-v3 using 128 mel bins, others using 80 mel bins
 whisper_mel_bins=80
+output_dir=data/fbank_whisper
 if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
  log "Stage 30: Compute ${whisper_mel_bins} dim fbank for whisper model fine-tuning"
-  if [ ! -f data/fbank/.aishell.whisper.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_fbank_aishell.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
-    ./local/compute_fbank_musan.py --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
-    touch data/fbank/.aishell.whisper.done
+  if [ ! -f $output_dir/.aishell.whisper.done ]; then
+    mkdir -p $output_dir
+    ./local/compute_fbank_aishell.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true --output-dir $output_dir
+    ./local/compute_fbank_musan.py --num-mel-bins ${whisper_mel_bins} --whisper-fbank true --output-dir $output_dir
+    touch $output_dir/.aishell.whisper.done
  fi
 fi
--- a/egs/aishell/ASR/whisper/decode.py
+++ b/egs/aishell/ASR/whisper/decode.py
@ -28,6 +28,7 @@ python3 ./whisper/decode.py \
  --exp-dir whisper/exp_large_v2 \
  --model-name large-v2 \
  --epoch 999 --avg 1 \
+  --manifest-dir data/fbank_whisper \
  --beam-size 10 --max-duration 50

 # Command for decoding using pretrained models (before fine-tuning):
@ -36,6 +37,7 @@ python3 ./whisper/decode.py \
  --exp-dir whisper/exp_large_v2 \
  --model-name large-v2 \
  --epoch -1 --avg 1 \
+  --manifest-dir data/fbank_whisper \
  --remove-whisper-encoder-input-length-restriction False \
  --beam-size 10 --max-duration 50

--- a/egs/aishell/ASR/whisper/train.py
+++ b/egs/aishell/ASR/whisper/train.py
@ -23,6 +23,7 @@ torchrun --nproc-per-node 8 ./whisper/train.py \
  --max-duration 200 \
  --exp-dir whisper/exp_large_v2 \
  --model-name large-v2 \
+  --manifest-dir data/fbank_whisper \
  --deepspeed \
  --deepspeed_config ./whisper/ds_config_zero1.json

@ -30,6 +31,7 @@ torchrun --nproc-per-node 8 ./whisper/train.py \
 torchrun --nproc-per-node 8 ./whisper/train.py \
  --max-duration 200 \
  --exp-dir whisper/exp_medium \
+  --manifest-dir data/fbank_whisper \
  --base-lr 1e-5 \
  --model-name medium
 """
@ -253,6 +255,7 @@ def get_params() -> AttributeDict:
    params = AttributeDict(
        {
            "frame_shift_ms": 10.0,
+            "subsampling_factor": 2,
            "allowed_excess_duration_ratio": 0.1,
            "best_train_loss": float("inf"),
            "best_valid_loss": float("inf"),
--- a/egs/librispeech/ASR/local/compute_fbank_musan.py
+++ b/egs/librispeech/ASR/local/compute_fbank_musan.py
@ -54,9 +54,11 @@ def is_cut_long(c: MonoCut) -> bool:
    return c.duration > 5


-def compute_fbank_musan(num_mel_bins: int = 80, whisper_fbank: bool = False):
+def compute_fbank_musan(
+    num_mel_bins: int = 80, whisper_fbank: bool = False, output_dir: str = "data/fbank"
+):
    src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
+    output_dir = Path(output_dir)
    num_jobs = min(15, os.cpu_count())

    dataset_parts = (
@ -129,6 +131,12 @@ def get_args():
        default=False,
        help="Use WhisperFbank instead of Fbank. Default: False.",
    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="data/fbank",
+        help="Output directory. Default: data/fbank.",
+    )
    return parser.parse_args()


@ -138,5 +146,7 @@ if __name__ == "__main__":
    logging.basicConfig(format=formatter, level=logging.INFO)
    args = get_args()
    compute_fbank_musan(
-        num_mel_bins=args.num_mel_bins, whisper_fbank=args.whisper_fbank
+        num_mel_bins=args.num_mel_bins,
+        whisper_fbank=args.whisper_fbank,
+        output_dir=args.output_dir,
    )