From c2f8c6d232018f01a5950dee315eb7af638717ca Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Thu, 28 Mar 2024 12:33:23 +0800
Subject: [PATCH 01/13] add files

---
 egs/librispeech/ASR/whisper/asr_datamodule.py |   1 +
 .../ASR/whisper/label_smoothing.py            |   1 +
 egs/librispeech/ASR/whisper/optim.py          |   1 +
 egs/librispeech/ASR/whisper/train.py          | 927 ++++++++++++++++++
 4 files changed, 930 insertions(+)
 create mode 120000 egs/librispeech/ASR/whisper/asr_datamodule.py
 create mode 120000 egs/librispeech/ASR/whisper/label_smoothing.py
 create mode 120000 egs/librispeech/ASR/whisper/optim.py
 create mode 100755 egs/librispeech/ASR/whisper/train.py

diff --git a/egs/librispeech/ASR/whisper/asr_datamodule.py b/egs/librispeech/ASR/whisper/asr_datamodule.py
new file mode 120000
index 000000000..fa1b8cca3
--- /dev/null
+++ b/egs/librispeech/ASR/whisper/asr_datamodule.py
@@ -0,0 +1 @@
+../tdnn_lstm_ctc/asr_datamodule.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/whisper/label_smoothing.py b/egs/librispeech/ASR/whisper/label_smoothing.py
new file mode 120000
index 000000000..08734abd7
--- /dev/null
+++ b/egs/librispeech/ASR/whisper/label_smoothing.py
@@ -0,0 +1 @@
+../conformer_ctc/label_smoothing.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/whisper/optim.py b/egs/librispeech/ASR/whisper/optim.py
new file mode 120000
index 000000000..207eecfcd
--- /dev/null
+++ b/egs/librispeech/ASR/whisper/optim.py
@@ -0,0 +1 @@
+../zipformer/optim.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/whisper/train.py b/egs/librispeech/ASR/whisper/train.py
new file mode 100755
index 000000000..6ccb8d363
--- /dev/null
+++ b/egs/librispeech/ASR/whisper/train.py
@@ -0,0 +1,927 @@
+#!/usr/bin/env python3
+# Copyright    2023  Xiaomi Corp.        (authors: Xiaoyu Yang)
+#              2024  Yuekai Zhang
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+
+# fine-tuning with deepspeed zero stage 1
+torchrun --nproc_per_node 8 ./whisper/train.py \
+  --max-duration 200 \
+  --exp-dir whisper/exp_large_v2 \
+  --model-name large-v2 \
+  --manifest-dir data/fbank_whisper \
+  --deepspeed \
+  --deepspeed_config ./whisper/ds_config_zero1.json
+
+# fine-tuning with ddp
+torchrun --nproc_per_node 8 ./whisper/train.py \
+  --max-duration 200 \
+  --exp-dir whisper/exp_medium \
+  --manifest-dir data/fbank_whisper \
+  --base-lr 1e-5 \
+  --model-name medium
+"""
+
+
+import argparse
+import copy
+import logging
+import random
+import warnings
+from pathlib import Path
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import deepspeed
+import k2
+import optim
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+import whisper
+from asr_datamodule import AishellAsrDataModule
+from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
+from label_smoothing import LabelSmoothingLoss
+from lhotse import CutSet, load_manifest
+from lhotse.cut import Cut
+from lhotse.dataset.sampling.base import CutSampler
+from lhotse.utils import fix_random_seed
+from optim import Eden, ScaledAdam
+from torch import Tensor
+from torch.cuda.amp import GradScaler
+from torch.nn.functional import pad as pad_tensor
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+from whisper_encoder_forward_monkey_patch import replace_whisper_encoder_forward
+
+from icefall import diagnostics
+from icefall.checkpoint import load_checkpoint, remove_checkpoints
+from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
+from icefall.checkpoint import update_averaged_model
+from icefall.dist import cleanup_dist, get_rank, get_world_size, setup_dist
+from icefall.env import get_env_info
+from icefall.hooks import register_inf_check_hooks
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    filter_uneven_sized_batch,
+    setup_logger,
+    str2bool,
+)
+
+LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
+
+
+def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
+    if isinstance(model, DDP):
+        # get underlying nn.Module
+        model = model.module
+    for module in model.modules():
+        if hasattr(module, "batch_count"):
+            module.batch_count = batch_count
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--tensorboard",
+        type=str2bool,
+        default=True,
+        help="Should various information be logged in tensorboard.",
+    )
+
+    parser.add_argument(
+        "--num-epochs",
+        type=int,
+        default=10,
+        help="Number of epochs to train.",
+    )
+
+    parser.add_argument(
+        "--start-epoch",
+        type=int,
+        default=1,
+        help="""Resume training from this epoch. It should be positive.
+        If larger than 1, it will load checkpoint from
+        exp-dir/epoch-{start_epoch-1}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--start-batch",
+        type=int,
+        default=0,
+        help="""If positive, --start-epoch is ignored and
+        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="whisper/exp",
+        help="""The experiment dir.
+        It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default="large-v2",
+        choices=["large-v2", "large-v3", "medium", "small", "tiny"],
+        help="""The model name to use.
+        """,
+    )
+
+    parser.add_argument(
+        "--base-lr", type=float, default=1e-5, help="The base learning rate."
+    )
+
+    parser.add_argument(
+        "--lr-batches",
+        type=float,
+        default=5000,
+        help="""Number of steps that affects how rapidly the learning rate
+        decreases. We suggest not to change this.""",
+    )
+
+    parser.add_argument(
+        "--lr-epochs",
+        type=float,
+        default=6,
+        help="""Number of epochs that affects how rapidly the learning rate decreases.
+        """,
+    )
+
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="The seed for random generators intended for reproducibility",
+    )
+
+    parser.add_argument(
+        "--print-diagnostics",
+        type=str2bool,
+        default=False,
+        help="Accumulate stats on activations, print them and exit.",
+    )
+
+    parser.add_argument(
+        "--inf-check",
+        type=str2bool,
+        default=False,
+        help="Add hooks to check for infinite module outputs and gradients.",
+    )
+
+    parser.add_argument(
+        "--keep-last-k",
+        type=int,
+        default=30,
+        help="""Only keep this number of checkpoints on disk.
+        For instance, if it is 3, there are only 3 checkpoints
+        in the exp-dir with filenames `checkpoint-xxx.pt`.
+        It does not affect checkpoints with name `epoch-xxx.pt`.
+        """,
+    )
+
+    parser.add_argument(
+        "--average-period",
+        type=int,
+        default=200,
+        help="""Update the averaged model, namely `model_avg`, after processing
+        this number of batches. `model_avg` is a separate version of model,
+        in which each floating-point parameter is the average of all the
+        parameters from the start of training. Each time we take the average,
+        we do: `model_avg = model * (average_period / batch_idx_train) +
+            model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
+        """,
+    )
+
+    parser.add_argument(
+        "--use-fp16",
+        type=str2bool,
+        default=True,
+        help="Whether to use half precision training.",
+    )
+
+    parser = deepspeed.add_config_arguments(parser)
+
+    return parser
+
+
+def get_params() -> AttributeDict:
+    """Return a dict containing training parameters.
+
+    All training related parameters that are not passed from the commandline
+    are saved in the variable `params`.
+
+    Commandline options are merged into `params` after they are parsed, so
+    you can also access them via `params`.
+
+    Explanation of options saved in `params`:
+
+        - frame_shift_ms: The frame shift in milliseconds.
+        - allowed_excess_duration_ratio: The allowed excess duration ratio.
+        - best_train_loss: The best training loss so far.
+        - best_valid_loss: The best validation loss so far.
+        - best_train_epoch: The epoch where the best training loss is achieved.
+        - best_valid_epoch: The epoch where the best validation loss is achieved.
+        - batch_idx_train: The batch index of the current batch.
+        - log_interval: Log training stats every `log_interval` batches.
+        - reset_interval: Reset the stats every `reset_interval` batches.
+        - valid_interval: Run validation every `valid_interval` batches.
+        - env_info: The environment information.
+    """
+    params = AttributeDict(
+        {
+            "frame_shift_ms": 10.0,
+            "subsampling_factor": 2,
+            "allowed_excess_duration_ratio": 0.1,
+            "best_train_loss": float("inf"),
+            "best_valid_loss": float("inf"),
+            "best_train_epoch": -1,
+            "best_valid_epoch": -1,
+            "batch_idx_train": 0,
+            "log_interval": 50,
+            "reset_interval": 200,
+            "valid_interval": 5000,
+            "env_info": get_env_info(),
+        }
+    )
+
+    return params
+
+
+def load_checkpoint_if_available(
+    params: AttributeDict,
+    model: nn.Module,
+    model_avg: nn.Module = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+) -> Optional[Dict[str, Any]]:
+    """Load checkpoint from file.
+
+    If params.start_batch is positive, it will load the checkpoint from
+    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
+    params.start_epoch is larger than 1, it will load the checkpoint from
+    `params.exp_dir/epoch-{params.start_epoch-1}.pt`.
+
+    Apart from loading state dict for `model` and `optimizer` it also updates
+    `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
+    `best_valid_loss` and `batch_idx_train` in `params`.
+
+    Args:
+      params:
+        The return value of :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer that we are using.
+      scheduler:
+        The scheduler that we are using.
+    Returns:
+      Return a dict of previously saved training info, or None if no checkpoint is loaded.
+    """
+    if params.start_batch > 0:
+        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
+    elif params.start_epoch > 1:
+        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
+    else:
+        return None
+
+    assert filename.is_file(), f"{filename} does not exist!"
+
+    saved_params = load_checkpoint(
+        filename,
+        model=model,
+        model_avg=model_avg,
+        optimizer=optimizer,
+        scheduler=scheduler,
+    )
+
+    keys = [
+        "best_train_epoch",
+        "best_valid_epoch",
+        "batch_idx_train",
+        "best_train_loss",
+        "best_valid_loss",
+    ]
+    for k in keys:
+        params[k] = saved_params[k]
+
+    if params.start_batch > 0:
+        if "cur_epoch" in saved_params:
+            params["start_epoch"] = saved_params["cur_epoch"]
+
+    return saved_params
+
+
+def save_checkpoint(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    model_avg: Optional[nn.Module] = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+    sampler: Optional[CutSampler] = None,
+    scaler: Optional[GradScaler] = None,
+    rank: int = 0,
+) -> None:
+    """Save model, optimizer, scheduler and training stats to file.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer used in the training.
+      sampler:
+        The sampler for the training dataset.
+      scaler:
+        The scaler used for mix precision training.
+    """
+    if rank != 0:
+        return
+    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
+    save_checkpoint_impl(
+        filename=filename,
+        model=model,
+        model_avg=model_avg,
+        params=params,
+        optimizer=optimizer,
+        scheduler=scheduler,
+        sampler=sampler,
+        scaler=scaler,
+        rank=rank,
+    )
+
+    if params.best_train_epoch == params.cur_epoch:
+        best_train_filename = params.exp_dir / "best-train-loss.pt"
+        copyfile(src=filename, dst=best_train_filename)
+
+    if params.best_valid_epoch == params.cur_epoch:
+        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
+        copyfile(src=filename, dst=best_valid_filename)
+
+
+def compute_loss(
+    params: AttributeDict,
+    tokenizer: whisper.tokenizer.Tokenizer,
+    model: Union[nn.Module, DDP],
+    batch: dict,
+    is_training: bool,
+) -> Tuple[Tensor, MetricsTracker]:
+    """
+    Compute the loss for the given batch.
+    Args:
+        params:
+            It is returned by :func:`get_params`.
+        tokenizer:
+            The tokenizer used to encode the text.
+        model:
+            The model for training.
+        batch:
+            A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+            for the content in it.
+        is_training:
+            Whether it is training.
+    Returns:
+        Return a tuple (loss, info): the loss tensor and a MetricsTracker with `frames` and `loss`.
+    """
+    # For the uneven-sized batch, the total duration after padding would possibly
+    # cause OOM. Hence, for each batch, which is sorted descendingly by length,
+    # we simply drop the last few shortest samples, so that the retained total frames
+    # (after padding) would not exceed `allowed_max_frames`:
+    # `allowed_max_frames = int(max_frames * (1.0 + allowed_excess_duration_ratio))`,
+    # where `max_frames = max_duration * 1000 // frame_shift_ms`.
+    # We set allowed_excess_duration_ratio=0.1.
+    if isinstance(model, DDP):
+        # get underlying nn.Module
+        model = model.module
+
+    def _batch_tensors(tensors: List[Tensor], pad_value: Any) -> Tensor:
+        padding_size = max(tensor.shape[0] for tensor in tensors)
+        dims = len(tensors[0].shape)
+        padded_tensors = []
+        for tensor in tensors:
+            padding = [0] * 2 * dims
+            padding[-1] = padding_size - tensor.shape[0]
+            padded_tensors.append(pad_tensor(tensor, padding, "constant", pad_value))
+        return torch.stack([tensor for tensor in padded_tensors], dim=0)
+
+    max_frames = params.max_duration * 1000 // params.frame_shift_ms
+    allowed_max_frames = int(max_frames * (1.0 + params.allowed_excess_duration_ratio))
+    batch = filter_uneven_sized_batch(batch, allowed_max_frames)
+
+    device = model.device if isinstance(model, DDP) else next(model.parameters()).device
+    feature = batch["inputs"]
+
+    assert feature.ndim == 3
+    feature = feature.to(device)
+    feature = feature.transpose(1, 2)  # (N, C, T)
+
+    supervisions = batch["supervisions"]
+    feature_lens = supervisions["num_frames"].to(device)
+
+    batch_idx_train = params.batch_idx_train
+
+    texts = batch["supervisions"]["text"]
+    # remove spaces in texts
+    texts = [text.replace(" ", "") for text in texts]
+
+    text_tokens_list = [
+        list(tokenizer.sot_sequence_including_notimestamps)
+        + tokenizer.encode(text)
+        + [tokenizer.eot]
+        for text in texts
+    ]
+    # convert it to torch tensor
+    text_tokens_list = [
+        torch.LongTensor(text_tokens) for text_tokens in text_tokens_list
+    ]
+
+    # 50256 is the index of <pad> for all whisper models
+    prev_outputs_tokens = _batch_tensors(
+        [tokens[:-1] for tokens in text_tokens_list], pad_value=50256
+    )
+    target_tokens = _batch_tensors(
+        [tokens[1:] for tokens in text_tokens_list], pad_value=50256
+    )
+    target_lengths = torch.LongTensor(
+        [tokens.shape[0] - 1 for tokens in text_tokens_list]
+    )
+
+    decoder_criterion = LabelSmoothingLoss(
+        ignore_index=50256, label_smoothing=0.1, reduction="sum"
+    )
+
+    # ignore the first 3 tokens, which are always <|lang_id|>, <|transcribe|>, <|notimestamps|>
+    ignore_prefix_size = 3
+    with torch.set_grad_enabled(is_training):
+        encoder_out = model.encoder(feature)
+        text_logits = model.decoder(prev_outputs_tokens.to(device), encoder_out)
+        text_logits = text_logits[:, ignore_prefix_size:, :]
+        target_tokens = target_tokens[:, ignore_prefix_size:]
+        loss = decoder_criterion(text_logits, target_tokens.to(device))
+
+    assert loss.requires_grad == is_training
+
+    info = MetricsTracker()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+
+    # Note: We use reduction=sum while computing the loss.
+    info["loss"] = loss.detach().cpu().item()
+
+    return loss, info
+
+
+def compute_validation_loss(
+    params: AttributeDict,
+    tokenizer: whisper.tokenizer.Tokenizer,
+    model: Union[nn.Module, DDP],
+    valid_dl: torch.utils.data.DataLoader,
+    world_size: int = 1,
+) -> MetricsTracker:
+    """Run the validation process."""
+    model.eval()
+
+    tot_loss = MetricsTracker()
+
+    for batch_idx, batch in enumerate(valid_dl):
+        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            loss, loss_info = compute_loss(
+                params=params,
+                tokenizer=tokenizer,
+                model=model,
+                batch=batch,
+                is_training=False,
+            )
+        assert loss.requires_grad is False
+        tot_loss = tot_loss + loss_info
+
+    if world_size > 1:
+        tot_loss.reduce(loss.device)
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    if loss_value < params.best_valid_loss:
+        params.best_valid_epoch = params.cur_epoch
+        params.best_valid_loss = loss_value
+
+    return tot_loss
+
+
+def train_one_epoch(
+    params: AttributeDict,
+    tokenizer: whisper.tokenizer.Tokenizer,
+    model: Union[nn.Module, DDP],
+    optimizer: torch.optim.Optimizer,
+    scheduler: LRSchedulerType,
+    train_dl: torch.utils.data.DataLoader,
+    valid_dl: torch.utils.data.DataLoader,
+    scaler: GradScaler,
+    model_avg: Optional[nn.Module] = None,
+    tb_writer: Optional[SummaryWriter] = None,
+    world_size: int = 1,
+    rank: int = 0,
+) -> None:
+    """Train the model for one epoch.
+
+    The training loss from the mean of all frames is saved in
+    `params.train_loss`. It runs the validation process every
+    `params.valid_interval` batches.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The model for training.
+      optimizer:
+        The optimizer we are using.
+      scheduler:
+        The learning rate scheduler, we call step() every step.
+      train_dl:
+        Dataloader for the training dataset.
+      valid_dl:
+        Dataloader for the validation dataset.
+      scaler:
+        The scaler used for mix precision training.
+      model_avg:
+        The stored model averaged from the start of training.
+      tb_writer:
+        Writer to write log messages to tensorboard.
+      world_size:
+        Number of nodes in DDP training. If it is 1, DDP is disabled.
+      rank:
+        The rank of the node in DDP training. If no DDP is used, it should
+        be set to 0.
+    """
+    model.train()
+
+    tot_loss = MetricsTracker()
+
+    for batch_idx, batch in enumerate(train_dl):
+        params.batch_idx_train += 1
+        batch_size = len(batch["supervisions"]["text"])
+        if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
+            logging.info("Computing validation loss")
+            valid_info = compute_validation_loss(
+                params=params,
+                tokenizer=tokenizer,
+                model=model,
+                valid_dl=valid_dl,
+                world_size=world_size,
+            )
+            model.train()
+            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
+            logging.info(
+                f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
+            )
+            if tb_writer is not None:
+                valid_info.write_summary(
+                    tb_writer, "train/valid_", params.batch_idx_train
+                )
+
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, loss_info = compute_loss(
+                    params=params,
+                    tokenizer=tokenizer,
+                    model=model,
+                    batch=batch,
+                    is_training=True,
+                )
+            # summary stats
+            tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
+
+            # NOTE: We use reduction==sum and loss is computed over utterances
+            # in the batch and there is no normalization to it so far.
+            if params.deepspeed:
+                # deepspeed's engine drives both the backward pass and the
+                # optimizer step: call model.backward(loss) instead of
+                # loss.backward(), and model.step() instead of optimizer.step().
+                model.backward(loss)
+                model.step()
+            else:
+                scaler.scale(loss).backward()
+                set_batch_count(model, params.batch_idx_train)
+                scheduler.step_batch(params.batch_idx_train)
+
+                scaler.step(optimizer)
+                scaler.update()
+                optimizer.zero_grad()
+        except:  # noqa
+            display_and_save_batch(batch, params=params)
+            raise
+
+        if params.print_diagnostics and batch_idx == 5:
+            return
+
+        if (
+            rank == 0
+            and params.batch_idx_train > 0
+            and params.batch_idx_train % params.average_period == 0
+            and not params.deepspeed
+        ):
+            update_averaged_model(
+                params=params,
+                model_cur=model,
+                model_avg=model_avg,
+            )
+
+        if batch_idx % 100 == 0 and params.use_fp16 and not params.deepspeed:
+            # If the grad scale was less than 1, try increasing it.    The _growth_interval
+            # of the grad scaler is configurable, but we can't configure it to have different
+            # behavior depending on the current grad scale.
+            cur_grad_scale = scaler._scale.item()
+            if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
+                scaler.update(cur_grad_scale * 2.0)
+            if cur_grad_scale < 0.01:
+                logging.warning(f"Grad scale is small: {cur_grad_scale}")
+            if cur_grad_scale < 1.0e-05:
+                raise RuntimeError(
+                    f"grad_scale is too small, exiting: {cur_grad_scale}"
+                )
+        if batch_idx % params.log_interval == 0:
+            try:
+                cur_lr = scheduler.get_last_lr()[0]
+            except:  # noqa
+                cur_lr = 0.0
+            cur_grad_scale = (
+                scaler._scale.item()
+                if (params.use_fp16 and not params.deepspeed)
+                else 1.0
+            )
+
+            logging.info(
+                f"Epoch {params.cur_epoch}, "
+                f"batch {batch_idx}, loss[{loss_info}], "
+                f"tot_loss[{tot_loss}], batch size: {batch_size}, "
+                f"lr: {cur_lr:.2e}, "
+                + (
+                    f"grad_scale: {scaler._scale.item()}"
+                    if (params.use_fp16 and not params.deepspeed)
+                    else ""
+                )
+            )
+
+            if tb_writer is not None:
+                tb_writer.add_scalar(
+                    "train/learning_rate", cur_lr, params.batch_idx_train
+                )
+
+                loss_info.write_summary(
+                    tb_writer, "train/current_", params.batch_idx_train
+                )
+                tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
+                if params.use_fp16:
+                    tb_writer.add_scalar(
+                        "train/grad_scale",
+                        cur_grad_scale,
+                        params.batch_idx_train,
+                    )
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    params.train_loss = loss_value
+    if params.train_loss < params.best_train_loss:
+        params.best_train_epoch = params.cur_epoch
+        params.best_train_loss = params.train_loss
+
+
+def run(rank, world_size, args):
+    """
+    Args:
+      rank:
+        It is a value between 0 and `world_size-1`, which is
+        obtained from `get_rank()` in :func:`main`.
+        The node with rank 0 is responsible for saving checkpoint.
+      world_size:
+        Number of GPUs for DDP training.
+      args:
+        The return value of get_parser().parse_args()
+    """
+    params = get_params()
+    params.update(vars(args))
+
+    fix_random_seed(params.seed)
+
+    setup_logger(f"{params.exp_dir}/log/log-train")
+    logging.info(params)
+
+    logging.info("About to create model")
+
+    replace_whisper_encoder_forward()
+    model = whisper.load_model(params.model_name, "cpu")
+    del model.alignment_heads
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    tokenizer = whisper.tokenizer.get_tokenizer(
+        model.is_multilingual,
+        num_languages=model.num_languages,
+        language="zh",
+        task="transcribe",
+    )
+
+    model_avg: Optional[nn.Module] = None
+    if rank == 0:
+        # model_avg is only used with rank 0
+        model_avg = copy.deepcopy(model).to(torch.float64)
+
+    assert params.start_epoch > 0, params.start_epoch
+    checkpoints = load_checkpoint_if_available(
+        params=params, model=model, model_avg=model_avg
+    )
+
+    if torch.cuda.is_available():
+        device = torch.device("cuda", rank)
+    else:
+        device = torch.device("cpu")
+    logging.info(f"Device: {device}")
+    model.to(device)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=params.base_lr)
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
+
+    if checkpoints and "optimizer" in checkpoints:
+        logging.info("Loading optimizer state dict")
+        optimizer.load_state_dict(checkpoints["optimizer"])
+
+    if (
+        checkpoints
+        and "scheduler" in checkpoints
+        and checkpoints["scheduler"] is not None
+    ):
+        logging.info("Loading scheduler state dict")
+        scheduler.load_state_dict(checkpoints["scheduler"])
+
+    if world_size > 1:
+        if params.deepspeed:
+            logging.info("Using DeepSpeed")
+            model, optimizer, _, scheduler = deepspeed.initialize(
+                args=params, model=model, model_parameters=model.parameters()
+            )
+        else:
+            logging.info("Using DDP")
+            setup_dist(use_ddp_launch=True)
+            model = DDP(model, device_ids=[rank], find_unused_parameters=True)
+
+    if params.print_diagnostics:
+        opts = diagnostics.TensorDiagnosticOptions(
+            512
+        )  # allow 4 megabytes per sub-module
+        diagnostic = diagnostics.attach_diagnostics(model, opts)
+
+    if params.inf_check:
+        register_inf_check_hooks(model)
+
+    aishell = AishellAsrDataModule(args)
+
+    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
+        # We only load the sampler's state dict when it loads a checkpoint
+        # saved in the middle of an epoch
+        sampler_state_dict = checkpoints["sampler"]
+    else:
+        sampler_state_dict = None
+
+    train_dl = aishell.train_dataloaders(aishell.train_cuts())
+    valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
+
+    scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
+    if checkpoints and "grad_scaler" in checkpoints:
+        logging.info("Loading grad scaler state dict")
+        scaler.load_state_dict(checkpoints["grad_scaler"])
+
+    if args.tensorboard and rank == 0:
+        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
+    else:
+        tb_writer = None
+
+    logging.info(f"start training from epoch {params.start_epoch}")
+    for epoch in range(params.start_epoch, params.num_epochs + 1):
+        if not params.deepspeed:
+            scheduler.step_epoch(epoch - 1)
+        fix_random_seed(params.seed + epoch - 1)
+        train_dl.sampler.set_epoch(epoch - 1)
+
+        if tb_writer is not None:
+            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
+
+        params.cur_epoch = epoch
+
+        train_one_epoch(
+            params=params,
+            tokenizer=tokenizer,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            train_dl=train_dl,
+            valid_dl=valid_dl,
+            scaler=scaler,
+            tb_writer=tb_writer,
+            world_size=world_size,
+            rank=rank,
+        )
+
+        if params.print_diagnostics:
+            diagnostic.print_diagnostics()
+            break
+
+        if params.deepspeed:
+            model.save_checkpoint(
+                save_dir=params.exp_dir,
+                tag=f"epoch-{params.cur_epoch}",
+                client_state={},
+            )
+            if rank == 0:
+                convert_zero_checkpoint_to_fp32_state_dict(
+                    params.exp_dir,
+                    f"{params.exp_dir}/epoch-{params.cur_epoch}.pt",
+                    tag=f"epoch-{params.cur_epoch}",
+                )
+        else:
+            save_checkpoint(
+                params=params,
+                model=model,
+                model_avg=model_avg,
+                optimizer=optimizer,
+                scheduler=scheduler,
+                sampler=train_dl.sampler,
+                scaler=scaler,
+                rank=rank,
+            )
+
+    logging.info("Done!")
+
+    if world_size > 1 and not params.deepspeed:
+        torch.distributed.barrier()
+        cleanup_dist()
+
+
+def display_and_save_batch(
+    batch: dict,
+    params: AttributeDict,
+) -> None:
+    """Display the batch statistics and save the batch into disk.
+
+    Args:
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      params:
+        Parameters for training. See :func:`get_params`.
+    """
+    from lhotse.utils import uuid4
+
+    filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
+    logging.info(f"Saving batch to {filename}")
+    torch.save(batch, filename)
+
+    supervisions = batch["supervisions"]
+    features = batch["inputs"]
+
+    logging.info(f"features shape: {features.shape}")
+
+
+def main():
+    parser = get_parser()
+    AishellAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    world_size = get_world_size()
+    rank = get_rank()
+
+    torch.set_num_threads(1)
+    torch.set_num_interop_threads(1)
+    run(rank=rank, world_size=world_size, args=args)
+
+
+if __name__ == "__main__":
+    main()

From 1cf78fd6755726668888031007f0270a1fb2c260 Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Thu, 28 Mar 2024 12:37:44 +0800
Subject: [PATCH 02/13] fbank for whisper

---
 .../ASR/local/compute_fbank_librispeech.py    | 47 +++++++++++++++++--
 egs/librispeech/ASR/prepare.sh                | 20 ++++++++
 2 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/egs/librispeech/ASR/local/compute_fbank_librispeech.py b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
index 25d6050bb..5b703d9ca 100755
--- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py
+++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
@@ -32,7 +32,14 @@ from typing import Optional
 import sentencepiece as spm
 import torch
 from filter_cuts import filter_cuts
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor, str2bool
@@ -61,6 +68,13 @@ def get_args():
         help="""Dataset parts to compute fbank. If None, we will use all""",
     )
 
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="data/fbank",
+        help="Where to store the train/dev/test manifests and fbank features",
+    )
+
     parser.add_argument(
         "--perturb-speed",
         type=str2bool,
@@ -68,18 +82,33 @@ def get_args():
         help="""Perturb speed with factor 0.9 and 1.1 on train subset.""",
     )
 
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="If use Whisper configuration for fbank computation",
+    )
+
+    parser.add_argument(
+        "--num-mel-bins",
+        type=int,
+        default=80,
+    )
+
     return parser.parse_args()
 
 
 def compute_fbank_librispeech(
     bpe_model: Optional[str] = None,
     dataset: Optional[str] = None,
+    output_dir: Optional[str] = "data/fbank",
     perturb_speed: Optional[bool] = True,
+    whisper_fbank: Optional[bool] = False,
+    num_mel_bins: Optional[int] = 80,
 ):
     src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
+    output_dir = Path(output_dir)
     num_jobs = min(15, os.cpu_count())
-    num_mel_bins = 80
 
     if bpe_model:
         logging.info(f"Loading {bpe_model}")
@@ -116,7 +145,12 @@ def compute_fbank_librispeech(
         dataset_parts,
     )
 
-    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+    if whisper_fbank:
+        device = "cuda" if torch.cuda.is_available() else "cpu"  # CPU-only hosts
+        config = WhisperFbankConfig(num_filters=num_mel_bins, device=device)
+        extractor = WhisperFbank(config)
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
 
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
@@ -134,7 +168,7 @@ def compute_fbank_librispeech(
                 if bpe_model:
                     cut_set = filter_cuts(cut_set, sp)
                 if perturb_speed:
-                    logging.info(f"Doing speed perturb")
+                    logging.info("Doing speed perturb")
                     cut_set = (
                         cut_set
                         + cut_set.perturb_speed(0.9)
@@ -160,5 +194,8 @@ if __name__ == "__main__":
     compute_fbank_librispeech(
         bpe_model=args.bpe_model,
         dataset=args.dataset,
+        output_dir=args.output_dir,
         perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
+        num_mel_bins=args.num_mel_bins,
     )
diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index 40dc3260d..9f9048a6d 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -243,3 +243,23 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
       $lang_dir/L_disambig.fst
   fi
 fi
+
+
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+  log "Stage 7: Prepare whisper fbank feature"
+  perturb_speed=1
+  whisper_mel_bins=80
+  output_dir=data/fbank_whisper_${whisper_mel_bins}D
+  if [ ! -f $output_dir/.librispeech.whisper.done ]; then
+    mkdir -p $output_dir
+    ./local/compute_fbank_librispeech.py \
+      --num-mel-bins ${whisper_mel_bins} \
+      --whisper-fbank true \
+      --output-dir $output_dir
+    ./local/compute_fbank_musan.py \
+      --num-mel-bins ${whisper_mel_bins} \
+      --whisper-fbank true \
+      --output-dir $output_dir
+    touch $output_dir/.librispeech.whisper.done
+  fi
+fi

From 76e0d59267b45d8c48d0729d0dbc067b6853c8fe Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Thu, 28 Mar 2024 15:23:19 +0800
Subject: [PATCH 03/13] support decoding

---
 egs/librispeech/ASR/whisper/decode.py         | 513 ++++++++++++++++++
 .../whisper_encoder_forward_monkey_patch.py   |   1 +
 2 files changed, 514 insertions(+)
 create mode 100755 egs/librispeech/ASR/whisper/decode.py
 create mode 120000 egs/librispeech/ASR/whisper/whisper_encoder_forward_monkey_patch.py

diff --git a/egs/librispeech/ASR/whisper/decode.py b/egs/librispeech/ASR/whisper/decode.py
new file mode 100755
index 000000000..24f61f17f
--- /dev/null
+++ b/egs/librispeech/ASR/whisper/decode.py
@@ -0,0 +1,513 @@
+#!/usr/bin/env python3
+# Copyright 2021 Xiaomi Corporation (Author: Liyong Guo,
+#                                            Fangjun Kuang,
+#                                            Wei Kang)
+#           2024 Yuekai Zhang
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+# Command for decoding using fine-tuned models:
+git lfs install
+git clone https://huggingface.co/yuekai/icefall_asr_aishell_whisper
+ln -s icefall_asr_aishell_whisper/exp_large_v2/epoch-10-avg6.pt whisper/exp_large_v2/epoch-999.pt
+
+python3 ./whisper/decode.py \
+  --exp-dir whisper/exp_large_v2 \
+  --model-name large-v2 \
+  --epoch 999 --avg 1 \
+  --manifest-dir data/fbank_whisper \
+  --beam-size 10 --max-duration 50
+
+# Command for decoding using pretrained models (before fine-tuning):
+
+python3 ./whisper/decode.py \
+  --exp-dir whisper/exp_large_v2 \
+  --model-name large-v2 \
+  --epoch -1 --avg 1 \
+  --manifest-dir data/fbank_whisper \
+  --remove-whisper-encoder-input-length-restriction False \
+  --beam-size 10 --max-duration 50
+
+"""
+
+import argparse
+import logging
+import re
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import k2
+import torch
+import torch.nn as nn
+import whisper
+from asr_datamodule import LibriSpeechAsrDataModule
+from tn.chinese.normalizer import Normalizer
+from whisper.normalizers import BasicTextNormalizer
+from whisper_encoder_forward_monkey_patch import replace_whisper_encoder_forward
+from zhconv import convert
+
+from icefall.checkpoint import average_checkpoints_with_averaged_model, load_checkpoint
+from icefall.env import get_env_info
+from icefall.utils import (
+    AttributeDict,
+    setup_logger,
+    store_transcripts,
+    str2bool,
+    write_error_stats,
+)
+
+
+def average_checkpoints(
+    filenames: List[Path], device: torch.device = torch.device("cpu")
+) -> dict:
+    """Average a list of checkpoints.
+    The function is mainly used for deepspeed converted checkpoint averaging, which only include model state_dict.
+
+    Args:
+      filenames:
+        Filenames of the checkpoints to be averaged. We assume all
+        checkpoints are saved by :func:`save_checkpoint`.
+      device:
+        Move checkpoints to this device before averaging.
+    Returns:
+      Return a dict (i.e., state_dict) which is the average of all
+      model state dicts contained in the checkpoints.
+    """
+    n = len(filenames)
+
+    # Deserialize each file only once. A full checkpoint keeps the weights
+    # under "model"; a deepspeed converted one is already the bare state_dict.
+    ckpt = torch.load(filenames[0], map_location=device)
+    avg = ckpt["model"] if "model" in ckpt else ckpt
+
+    # Identify shared parameters. Two parameters are said to be shared
+    # if they have the same data_ptr
+    uniqued: Dict[int, str] = dict()
+
+    for k, v in avg.items():
+        v_data_ptr = v.data_ptr()
+        if v_data_ptr in uniqued:
+            continue
+        uniqued[v_data_ptr] = k
+
+    uniqued_names = list(uniqued.values())
+
+    for i in range(1, n):
+        # Same unwrapping as for the first checkpoint, again with a single
+        # torch.load per file instead of up to two.
+        ckpt = torch.load(filenames[i], map_location=device)
+        state_dict = ckpt["model"] if "model" in ckpt else ckpt
+        for k in uniqued_names:
+            avg[k] += state_dict[k]
+
+    for k in uniqued_names:
+        if avg[k].is_floating_point():
+            avg[k] /= n
+        else:
+            avg[k] //= n
+
+    return avg
+
+
+def remove_punctuation(text: str or List[str]):
+    """Modified from https://github.com/yeyupiaoling/Whisper-Finetune/blob/master/utils/data_utils.py
+
+    Args:
+        text: It can be a string or a list of strings.
+    Returns:
+        Return a string or a list of strings without any punctuation.
+    """
+    punctuation = "!,.;:?、！，。；：？《》"
+    if isinstance(text, str):
+        text = re.sub(r"[{}]+".format(punctuation), "", text).strip()
+        return text
+    elif isinstance(text, list):
+        result_text = []
+        for t in text:
+            t = re.sub(r"[{}]+".format(punctuation), "", t).strip()
+            result_text.append(t)
+        return result_text
+    else:
+        raise Exception(f"Not support type {type(text)}")
+
+
+def to_simple(text: str or List[str]):
+    """Convert traditional Chinese to simplified Chinese.
+    Args:
+        text: It can be a string or a list of strings.
+    Returns:
+        Return a string or a list of strings converted to simplified Chinese.
+    """
+    if isinstance(text, str):
+        text = convert(text, "zh-cn")
+        return text
+    elif isinstance(text, list):
+        result_text = []
+        for t in text:
+            t = convert(t, "zh-cn")
+            result_text.append(t)
+        return result_text
+    else:
+        raise Exception(f"Not support type{type(text)}")
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=-1,
+        help="It specifies the checkpoint to use for decoding."
+        "Note: Epoch counts from 0.",
+    )
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=1,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch'. ",
+    )
+
+    parser.add_argument(
+        "--method",
+        type=str,
+        default="beam-search",
+        help="""Decoding method.
+        Supported values are:
+          - beam-search
+        """,
+    )
+
+    parser.add_argument(
+        "--beam-size",
+        type=int,
+        default=1,
+        help="beam size for beam search decoding",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="whisper/exp",
+        help="The experiment dir",
+    )
+
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default="large-v2",
+        choices=["large-v2", "large-v3", "medium", "medium.en", "small", "small.en", "tiny", "tiny.en"],
+        help="""The model name to use.
+        """,
+    )
+
+    parser.add_argument(
+        "--remove-whisper-encoder-input-length-restriction",
+        type=str2bool,
+        default=True,
+        help="replace whisper encoder forward method to remove input length restriction",
+    )
+
+    return parser
+
+
+def get_params() -> AttributeDict:
+    params = AttributeDict(
+        {
+            "env_info": get_env_info(),
+        }
+    )
+    return params
+
+
+def decode_one_batch(
+    params: AttributeDict,
+    model: nn.Module,
+    batch: dict,
+) -> Dict[str, List[List[str]]]:
+    """Decode one batch and return the result in a dict. The dict has the
+    following format:
+
+        - key: "beam-search"
+        - value: A list of lists. Each sublist is the word list of one hyp.
+    Args:
+        params:
+            It is returned by :func:`get_params`.
+        model:
+            The neural model.
+        batch:
+            It is returned by :meth:`torch.utils.data.DataLoader.__iter__`.
+    Returns:
+        Return a dict, whose key may be "beam-search".
+    """
+    dtype = torch.float16
+    device = next(model.parameters()).device  # same device main() put the model on
+
+    feature = batch["inputs"]
+    assert feature.ndim == 3
+    feature = feature.to(device, dtype=dtype).transpose(1, 2)
+    if not params.remove_whisper_encoder_input_length_restriction:
+        T = 3000
+        if feature.shape[2] < T:
+            feature = torch.cat(
+                [
+                    feature,
+                    torch.zeros(
+                        feature.shape[0], feature.shape[1], T - feature.shape[2]
+                    ).to(device, dtype=dtype),
+                ],
+                2,
+            )
+
+    # batch["supervisions"]["num_frames"] is not read here: model.decode() is
+    # only given the (optionally zero-padded) features and the decoding
+    # options, so no per-utterance lengths need to be moved to the device.
+    results = model.decode(feature, params.decoding_options)
+    hyps = [result.text.upper() for result in results]
+
+    hyps = remove_punctuation(hyps)
+    hyps = [params.normalizer.normalize(hyp) for hyp in hyps]
+    hyps = [hyp.split() for hyp in hyps]
+
+    return {"beam-search": hyps}
+
+
+def decode_dataset(
+    dl: torch.utils.data.DataLoader,
+    params: AttributeDict,
+    model: nn.Module,
+) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
+    """Decode dataset.
+
+    Args:
+        dl:
+            The dataloader.
+        params:
+            It is returned by :func:`get_params`.
+        model:
+            The neural model.
+    Returns:
+        Return a dict, whose key may be "beam-search".
+    """
+    results = []
+
+    num_cuts = 0
+
+    try:
+        num_batches = len(dl)
+    except TypeError:
+        num_batches = "?"
+
+    results = defaultdict(list)
+    for batch_idx, batch in enumerate(dl):
+        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
+
+        hyps_dict = decode_one_batch(
+            params=params,
+            model=model,
+            batch=batch,
+        )
+
+        for name, hyps in hyps_dict.items():
+            this_batch = []
+            assert len(hyps) == len(texts)
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
+                ref_words = ref_text.split()
+                this_batch.append((cut_id, ref_words, hyp_words))
+
+            results[name].extend(this_batch)
+
+        num_cuts += len(batch["supervisions"]["text"])
+
+        if batch_idx % 100 == 0:
+            batch_str = f"{batch_idx}/{num_batches}"
+
+            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
+    return results
+
+
+def save_results(
+    params: AttributeDict,
+    test_set_name: str,
+    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
+):
+
+    enable_log = True
+    test_set_wers = dict()
+    for key, results in results_dict.items():
+        recog_path = (
+            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        results = sorted(results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
+        if enable_log:
+            logging.info(f"The transcripts are stored in {recog_path}")
+
+        # The following prints out WERs, per-word error statistics and aligned
+        # ref/hyp pairs.
+        errs_filename = (
+            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        # we compute CER for aishell dataset.
+        results_char = []
+        for res in results:
+            results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
+        with open(errs_filename, "w") as f:
+            wer = write_error_stats(
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=enable_log,
+                compute_CER=True,
+            )
+            test_set_wers[key] = wer
+
+        if enable_log:
+            logging.info("Wrote detailed error stats to {}".format(errs_filename))
+
+    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
+    errs_info = params.res_dir / f"cer-summary-{test_set_name}-{params.suffix}.txt"
+    with open(errs_info, "w") as f:
+        print("settings\tCER", file=f)
+        for key, val in test_set_wers:
+            print("{}\t{}".format(key, val), file=f)
+
+    s = "\nFor {}, CER of different settings are:\n".format(test_set_name)
+    note = "\tbest for {}".format(test_set_name)
+    for key, val in test_set_wers:
+        s += "{}\t{}{}\n".format(key, val, note)
+        note = ""
+    logging.info(s)
+
+
+@torch.no_grad()
+def main():
+    parser = get_parser()
+    LibriSpeechAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    params = get_params()
+    params.update(vars(args))
+    params.res_dir = params.exp_dir / params.method
+    params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+    setup_logger(
+        f"{params.res_dir}/log-{params.method}-beam{params.beam_size}/log-decode-{params.suffix}"
+    )
+
+    options = whisper.DecodingOptions(
+        task="transcribe",
+        language="en",
+        # without_timestamps=True,
+        # beam_size=params.beam_size,
+    )
+    params.decoding_options = options
+    params.cleaner = BasicTextNormalizer()
+    params.normalizer = Normalizer()
+
+    logging.info("Decoding started")
+    logging.info(params)
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+
+    logging.info(f"device: {device}")
+
+    if params.remove_whisper_encoder_input_length_restriction:
+        replace_whisper_encoder_forward()
+    model = whisper.load_model(params.model_name, "cpu")
+    if params.epoch > 0:
+        if params.avg > 1:
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            checkpoint = torch.load(
+                f"{params.exp_dir}/epoch-{params.epoch}.pt", map_location="cpu"
+            )
+            if "model" not in checkpoint:
+                # bare deepspeed state_dicts: average epochs start+1..epoch (avg files)
+                filenames = [
+                    f"{params.exp_dir}/epoch-{epoch}.pt"
+                    for epoch in range(start + 1, params.epoch + 1)
+                ]
+                model.load_state_dict(average_checkpoints(filenames))
+            else:
+                filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+                filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+                logging.info(
+                    f"Calculating the averaged model over epoch range from "
+                    f"{start} (excluded) to {params.epoch}"
+                )
+                model.to(device)
+                model.load_state_dict(
+                    average_checkpoints_with_averaged_model(
+                        filename_start=filename_start,
+                        filename_end=filename_end,
+                        device=device,
+                    )
+                )
+            # save checkpoints
+            filename = f"{params.exp_dir}/epoch-{params.epoch}-avg-{params.avg}.pt"
+            torch.save(model.state_dict(), filename)
+        else:
+            checkpoint = torch.load(
+                f"{params.exp_dir}/epoch-{params.epoch}.pt", map_location="cpu"
+            )
+            if "model" not in checkpoint:
+                model.load_state_dict(checkpoint, strict=True)
+            else:
+                load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+    model.to(device)
+    model.eval()
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    # we need cut ids to display recognition results.
+    args.return_cuts = True
+    librispeech = LibriSpeechAsrDataModule(args)
+
+    test_clean_cuts = librispeech.test_clean_cuts().subset(first=200)
+    test_other_cuts = librispeech.test_other_cuts().subset(first=200)
+
+    test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
+    test_other_dl = librispeech.test_dataloaders(test_other_cuts)
+
+    test_sets = ["test-clean", "test-other"]
+    test_dls = [test_clean_dl, test_other_dl]
+
+    for test_set, test_dl in zip(test_sets, test_dls):
+        results_dict = decode_dataset(
+            dl=test_dl,
+            params=params,
+            model=model,
+        )
+
+        save_results(params=params, test_set_name=test_set, results_dict=results_dict)
+
+    logging.info("Done!")
+
+
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/ASR/whisper/whisper_encoder_forward_monkey_patch.py b/egs/librispeech/ASR/whisper/whisper_encoder_forward_monkey_patch.py
new file mode 120000
index 000000000..2a7808921
--- /dev/null
+++ b/egs/librispeech/ASR/whisper/whisper_encoder_forward_monkey_patch.py
@@ -0,0 +1 @@
+../../../aishell/ASR/whisper/whisper_encoder_forward_monkey_patch.py
\ No newline at end of file

From eb685364df8395e7ffbdae44941cea21bc86573c Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Thu, 28 Mar 2024 15:56:04 +0800
Subject: [PATCH 04/13] generate train-all-shuf for whisper fbank

---
 egs/librispeech/ASR/prepare.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index 9f9048a6d..81fe43d84 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -249,7 +249,7 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
   log "Stage 7: Prepare whisper fbank feature"
   perturb_speed=1
   whisper_mel_bins=80
-  output_dir=data/fbank_whisper_${whisper_mel_bins}D
+  output_dir=data/fbank_whisper_${whisper_mel_bins}D_hdf5
   if [ ! -f $output_dir/.librispeech.whisper.done ]; then
     mkdir -p $output_dir
     ./local/compute_fbank_librispeech.py \
@@ -262,4 +262,10 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
       --output-dir $output_dir
     touch $output_dir/.librispeech.whisper.done
   fi
+  if [ ! -f ${output_dir}/librispeech_cuts_train-all-shuf.jsonl.gz ]; then
+    cat <(gunzip -c ${output_dir}/librispeech_cuts_train-clean-100.jsonl.gz) \
+      <(gunzip -c ${output_dir}/librispeech_cuts_train-clean-360.jsonl.gz) \
+      <(gunzip -c ${output_dir}/librispeech_cuts_train-other-500.jsonl.gz) | \
+      shuf | gzip -c > ${output_dir}/librispeech_cuts_train-all-shuf.jsonl.gz
+  fi
 fi

From 711859c21fda7d0bd66e5e7706b92a064f911d67 Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Thu, 28 Mar 2024 16:14:44 +0800
Subject: [PATCH 05/13] fix typo

---
 egs/librispeech/ASR/zipformer/decode.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/egs/librispeech/ASR/zipformer/decode.py b/egs/librispeech/ASR/zipformer/decode.py
index 339e253e6..5f18de9e8 100755
--- a/egs/librispeech/ASR/zipformer/decode.py
+++ b/egs/librispeech/ASR/zipformer/decode.py
@@ -1023,9 +1023,9 @@ def main():
     test_other_dl = librispeech.test_dataloaders(test_other_cuts)
 
     test_sets = ["test-clean", "test-other"]
-    test_dl = [test_clean_dl, test_other_dl]
+    test_dls = [test_clean_dl, test_other_dl]
 
-    for test_set, test_dl in zip(test_sets, test_dl):
+    for test_set, test_dl in zip(test_sets, test_dls):
         results_dict = decode_dataset(
             dl=test_dl,
             params=params,

From ebc0f3b052982355087ca71255c417bf64b36c88 Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Thu, 28 Mar 2024 16:16:18 +0800
Subject: [PATCH 06/13] update train.py

---
 egs/librispeech/ASR/whisper/train.py | 43 +++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/egs/librispeech/ASR/whisper/train.py b/egs/librispeech/ASR/whisper/train.py
index 6ccb8d363..bd6b27d99 100755
--- a/egs/librispeech/ASR/whisper/train.py
+++ b/egs/librispeech/ASR/whisper/train.py
@@ -23,7 +23,8 @@ torchrun --nproc_per_node 8 ./whisper/train.py \
   --max-duration 200 \
   --exp-dir whisper/exp_large_v2 \
   --model-name large-v2 \
-  --manifest-dir data/fbank_whisper \
+  --full-libri True \
+  --manifest-dir data/fbank_whisper_80D \
   --deepspeed \
   --deepspeed_config ./whisper/ds_config_zero1.json
 
@@ -31,7 +32,8 @@ torchrun --nproc_per_node 8 ./whisper/train.py \
 torchrun --nproc_per_node 8 ./whisper/train.py \
   --max-duration 200 \
   --exp-dir whisper/exp_medium \
-  --manifest-dir data/fbank_whisper \
+  --full-libri True \
+  --manifest-dir data/fbank_whisper_80D \
   --base-lr 1e-5 \
   --model-name medium
 """
@@ -53,7 +55,7 @@ import torch
 import torch.multiprocessing as mp
 import torch.nn as nn
 import whisper
-from asr_datamodule import AishellAsrDataModule
+from asr_datamodule import LibriSpeechAsrDataModule
 from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
 from label_smoothing import LabelSmoothingLoss
 from lhotse import CutSet, load_manifest
@@ -147,7 +149,7 @@ def get_parser():
         "--model-name",
         type=str,
         default="large-v2",
-        choices=["large-v2", "large-v3", "medium", "small", "tiny"],
+        choices=["large-v2", "large-v3", "medium", "medium.en", "small", "small.en", "tiny", "tiny.en"],
         help="""The model name to use.
         """,
     )
@@ -450,8 +452,7 @@ def compute_loss(
     batch_idx_train = params.batch_idx_train
 
     texts = batch["supervisions"]["text"]
-    # remove spaces in texts
-    texts = [text.replace(" ", "") for text in texts]
+    texts = [t[0] + t[1:].lower() for t in texts]
 
     text_tokens_list = [
         list(tokenizer.sot_sequence_including_notimestamps)
@@ -744,7 +745,7 @@ def run(rank, world_size, args):
     tokenizer = whisper.tokenizer.get_tokenizer(
         model.is_multilingual,
         num_languages=model.num_languages,
-        language="zh",
+        language="en",
         task="transcribe",
     )
 
@@ -800,7 +801,19 @@ def run(rank, world_size, args):
     if params.inf_check:
         register_inf_check_hooks(model)
 
-    aishell = AishellAsrDataModule(args)
+    librispeech = LibriSpeechAsrDataModule(args)
+
+    if params.full_libri:
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()
+
+    def remove_short_and_long_utt(c: Cut):
+        if c.duration < 1.0 or c.duration > 20.0:
+            return False
+        return True
+
+    train_cuts = train_cuts.filter(remove_short_and_long_utt)
 
     if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
         # We only load the sampler's state dict when it loads a checkpoint
@@ -809,8 +822,16 @@ def run(rank, world_size, args):
     else:
         sampler_state_dict = None
 
-    train_dl = aishell.train_dataloaders(aishell.train_cuts())
-    valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
+    train_dl = librispeech.train_dataloaders(
+        train_cuts, sampler_state_dict=sampler_state_dict
+    )
+
+    valid_cuts = librispeech.dev_clean_cuts()
+    valid_cuts += librispeech.dev_other_cuts()
+
+    # do this to prevent Whisper throwing the length mismatch error
+    valid_cuts = valid_cuts.filter(remove_short_and_long_utt)
+    valid_dl = librispeech.valid_dataloaders(valid_cuts)
 
     scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
     if checkpoints and "grad_scaler" in checkpoints:
@@ -911,7 +932,7 @@ def display_and_save_batch(
 
 def main():
     parser = get_parser()
-    AishellAsrDataModule.add_arguments(parser)
+    LibriSpeechAsrDataModule.add_arguments(parser)
     args = parser.parse_args()
     args.exp_dir = Path(args.exp_dir)
 

From 360f20803731a60e409400784dbf5931af50959e Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Thu, 28 Mar 2024 16:17:05 +0800
Subject: [PATCH 07/13] deactivate beam search temporarily for speed

---
 egs/librispeech/ASR/whisper/decode.py | 34 ++++++++-------------------
 1 file changed, 10 insertions(+), 24 deletions(-)

diff --git a/egs/librispeech/ASR/whisper/decode.py b/egs/librispeech/ASR/whisper/decode.py
index 24f61f17f..83d33418d 100755
--- a/egs/librispeech/ASR/whisper/decode.py
+++ b/egs/librispeech/ASR/whisper/decode.py
@@ -3,6 +3,7 @@
 #                                            Fangjun Kuang,
 #                                            Wei Kang)
 #           2024 Yuekai Zhang
+#           2024 Xiaomi Corporation          Xiaoyu Yang
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -145,26 +146,6 @@ def remove_punctuation(text: str or List[str]):
         raise Exception(f"Not support type {type(text)}")
 
 
-def to_simple(text: str or List[str]):
-    """Convert traditional Chinese to simplified Chinese.
-    Args:
-        text: It can be a string or a list of strings.
-    Returns:
-        Return a string or a list of strings converted to simplified Chinese.
-    """
-    if isinstance(text, str):
-        text = convert(text, "zh-cn")
-        return text
-    elif isinstance(text, list):
-        result_text = []
-        for t in text:
-            t = convert(t, "zh-cn")
-            result_text.append(t)
-        return result_text
-    else:
-        raise Exception(f"Not support type{type(text)}")
-
-
 def get_parser():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -417,8 +398,8 @@ def main():
     options = whisper.DecodingOptions(
         task="transcribe",
         language="en",
-        # without_timestamps=True,
-        # beam_size=params.beam_size,
+        without_timestamps=True,
+        #beam_size=params.beam_size,
     )
     params.decoding_options = options
     params.cleaner = BasicTextNormalizer()
@@ -481,12 +462,17 @@ def main():
     num_param = sum([p.numel() for p in model.parameters()])
     logging.info(f"Number of model parameters: {num_param}")
 
+    def remove_short_and_long_utt(c):
+        if c.duration < 1.0 or c.duration > 30.0:
+            return False
+        return True
+
     # we need cut ids to display recognition results.
     args.return_cuts = True
     librispeech = LibriSpeechAsrDataModule(args)
 
-    test_clean_cuts = librispeech.test_clean_cuts().subset(first=200)
-    test_other_cuts = librispeech.test_other_cuts().subset(first=200)
+    test_clean_cuts = librispeech.test_clean_cuts().filter(remove_short_and_long_utt)
+    test_other_cuts = librispeech.test_other_cuts().filter(remove_short_and_long_utt)
 
     test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
     test_other_dl = librispeech.test_dataloaders(test_other_cuts)

From cfbc829df3b817f2baf400d24647eb5c2aa69aec Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Thu, 28 Mar 2024 18:16:33 +0800
Subject: [PATCH 08/13] support freezing modules

---
 egs/librispeech/ASR/whisper/train.py | 33 ++++++++++++++++++----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/egs/librispeech/ASR/whisper/train.py b/egs/librispeech/ASR/whisper/train.py
index bd6b27d99..db6f2e182 100755
--- a/egs/librispeech/ASR/whisper/train.py
+++ b/egs/librispeech/ASR/whisper/train.py
@@ -88,15 +88,6 @@ from icefall.utils import (
 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
 
 
-def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
-    if isinstance(model, DDP):
-        # get underlying nn.Module
-        model = model.module
-    for module in model.modules():
-        if hasattr(module, "batch_count"):
-            module.batch_count = batch_count
-
-
 def get_parser():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -226,6 +217,13 @@ def get_parser():
         help="Whether to use half precision training.",
     )
 
+    parser.add_argument(
+        "--freeze-modules",
+        type=str,
+        default=None,
+        help="Which modules to freeze during finetune"
+    )
+
     parser = deepspeed.add_config_arguments(parser)
 
     return parser
@@ -583,6 +581,9 @@ def train_one_epoch(
         be set to 0.
     """
     model.train()
+    for name, module in model.named_modules():
+        if name.startswith(params.freeze_modules):
+            module.eval()
 
     tot_loss = MetricsTracker()
 
@@ -630,7 +631,6 @@ def train_one_epoch(
                 model.step()
             else:
                 scaler.scale(loss).backward()
-                set_batch_count(model, params.batch_idx_train)
                 scheduler.step_batch(params.batch_idx_train)
 
                 scaler.step(optimizer)
@@ -739,8 +739,19 @@ def run(rank, world_size, args):
     replace_whisper_encoder_forward()
     model = whisper.load_model(params.model_name, "cpu")
     del model.alignment_heads
+
+    if params.freeze_modules is not None:        
+        for name, p in model.named_parameters():
+            if name.startswith(params.freeze_modules):
+                p.requires_grad = False
+                logging.info(f"Do not update {name}")
+        for name, module in model.named_modules():
+            if name.startswith(params.freeze_modules):
+                module.eval()
+
     num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"Number of model parameters: {num_param}")
+    num_trainable = sum([p.numel() if p.requires_grad else 0 for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}. Total trainable parameters: {num_trainable}")
 
     tokenizer = whisper.tokenizer.get_tokenizer(
         model.is_multilingual,

From 5d41deca71198ad3a15104bf52bcd3258e130581 Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Thu, 28 Mar 2024 18:16:52 +0800
Subject: [PATCH 09/13] update the decoding script

---
 egs/librispeech/ASR/whisper/decode.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/egs/librispeech/ASR/whisper/decode.py b/egs/librispeech/ASR/whisper/decode.py
index 83d33418d..c5f8a9406 100755
--- a/egs/librispeech/ASR/whisper/decode.py
+++ b/egs/librispeech/ASR/whisper/decode.py
@@ -348,17 +348,12 @@ def save_results(
         errs_filename = (
             params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
         )
-        # we compute CER for aishell dataset.
-        results_char = []
-        for res in results:
-            results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
         with open(errs_filename, "w") as f:
             wer = write_error_stats(
                 f,
                 f"{test_set_name}-{key}",
-                results_char,
+                results,
                 enable_log=enable_log,
-                compute_CER=True,
             )
             test_set_wers[key] = wer
 
@@ -366,13 +361,13 @@ def save_results(
             logging.info("Wrote detailed error stats to {}".format(errs_filename))
 
     test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
-    errs_info = params.res_dir / f"cer-summary-{test_set_name}-{params.suffix}.txt"
+    errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
     with open(errs_info, "w") as f:
-        print("settings\tCER", file=f)
+        print("settings\tWER", file=f)
         for key, val in test_set_wers:
             print("{}\t{}".format(key, val), file=f)
 
-    s = "\nFor {}, CER of different settings are:\n".format(test_set_name)
+    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
     note = "\tbest for {}".format(test_set_name)
     for key, val in test_set_wers:
         s += "{}\t{}{}\n".format(key, val, note)
@@ -391,16 +386,21 @@ def main():
     params.update(vars(args))
     params.res_dir = params.exp_dir / params.method
     params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+    if params.method == "beam_search":
+        params.suffix += f"-beam-search-beam-size-{params.beam_size}"
+
+    params.suffix += f"-whisper-{params.model_name}"
     setup_logger(
-        f"{params.res_dir}/log-{params.method}-beam{params.beam_size}/log-decode-{params.suffix}"
+        f"{params.res_dir}/log-{params.method}/log-decode-{params.suffix}"
     )
 
     options = whisper.DecodingOptions(
         task="transcribe",
         language="en",
         without_timestamps=True,
-        #beam_size=params.beam_size,
+        beam_size=params.beam_size if params.method == "beam_search" else None,
     )
+
     params.decoding_options = options
     params.cleaner = BasicTextNormalizer()
     params.normalizer = Normalizer()

From 55a6857df6c4608c5487a2322fe8ee3c13ec8876 Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Fri, 29 Mar 2024 11:02:48 +0800
Subject: [PATCH 10/13] add an option to use hdf5 for whisper fbank extraction

---
 .../ASR/local/compute_fbank_librispeech.py        | 12 +++++++++++-
 egs/librispeech/ASR/local/compute_fbank_musan.py  | 15 +++++++++++++--
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/egs/librispeech/ASR/local/compute_fbank_librispeech.py b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
index 5b703d9ca..9802008c7 100755
--- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py
+++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
@@ -36,6 +36,7 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
+    NumpyHdf5Writer,
     LilcomChunkyWriter,
     WhisperFbank,
     WhisperFbankConfig,
@@ -95,6 +96,13 @@ def get_args():
         default=80,
     )
 
+    parser.add_argument(
+        "--use-hdf5",
+        type=str2bool,
+        default=False,
+        help="If use hdf5 to store un-compressed features. Otherwise, use Lilcom"
+    )
+
     return parser.parse_args()
 
 
@@ -105,6 +113,7 @@ def compute_fbank_librispeech(
     perturb_speed: Optional[bool] = True,
     whisper_fbank: Optional[bool] = False,
     num_mel_bins: Optional[int] = 80,
+    use_hdf5: Optional[bool] = False,
 ):
     src_dir = Path("data/manifests")
     output_dir = Path(output_dir)
@@ -180,7 +189,7 @@ def compute_fbank_librispeech(
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=LilcomChunkyWriter,
+                storage_type=LilcomChunkyWriter if not use_hdf5 else NumpyHdf5Writer,
             )
             cut_set.to_file(output_dir / cuts_filename)
 
@@ -198,4 +207,5 @@ if __name__ == "__main__":
         perturb_speed=args.perturb_speed,
         whisper_fbank=args.whisper_fbank,
         num_mel_bins=args.num_mel_bins,
+        use_hdf5=args.use_hdf5,
     )
diff --git a/egs/librispeech/ASR/local/compute_fbank_musan.py b/egs/librispeech/ASR/local/compute_fbank_musan.py
index d7781687f..1a4542dc0 100755
--- a/egs/librispeech/ASR/local/compute_fbank_musan.py
+++ b/egs/librispeech/ASR/local/compute_fbank_musan.py
@@ -34,6 +34,7 @@ from lhotse import (
     FbankConfig,
     LilcomChunkyWriter,
     MonoCut,
+    NumpyHdf5Writer,
     WhisperFbank,
     WhisperFbankConfig,
     combine,
@@ -55,7 +56,10 @@ def is_cut_long(c: MonoCut) -> bool:
 
 
 def compute_fbank_musan(
-    num_mel_bins: int = 80, whisper_fbank: bool = False, output_dir: str = "data/fbank"
+    num_mel_bins: int = 80,
+    whisper_fbank: bool = False,
+    output_dir: str = "data/fbank",
+    use_hdf5: bool = False,
 ):
     src_dir = Path("data/manifests")
     output_dir = Path(output_dir)
@@ -111,7 +115,7 @@ def compute_fbank_musan(
                 storage_path=f"{output_dir}/musan_feats",
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=LilcomChunkyWriter,
+                storage_type=LilcomChunkyWriter if not use_hdf5 else NumpyHdf5Writer,
             )
         )
         musan_cuts.to_file(musan_cuts_path)
@@ -137,6 +141,12 @@ def get_args():
         default="data/fbank",
         help="Output directory. Default: data/fbank.",
     )
+    parser.add_argument(
+        "--use-hdf5",
+        type=str2bool,
+        default=False,
+        help="If use hdf5 to store un-compressed features. Otherwise, use Lilcom"
+    )
     return parser.parse_args()
 
 
@@ -149,4 +159,5 @@ if __name__ == "__main__":
         num_mel_bins=args.num_mel_bins,
         whisper_fbank=args.whisper_fbank,
         output_dir=args.output_dir,
+        use_hdf5=args.use_hdf5,
     )

From 4d9f2120b3cda7448bd276f204d4cb11493ee3f4 Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Fri, 29 Mar 2024 11:03:37 +0800
Subject: [PATCH 11/13] update comments; generate train-all-shuf after feature
 extraction

---
 egs/librispeech/ASR/prepare.sh | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index 81fe43d84..1cf61125a 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -244,21 +244,26 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
   fi
 fi
 
-
+# NOTE: This stage is optional and should only be run if you want to
+# do Whisper-related experiments
 if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
   log "Stage 7: Prepare whisper fbank feature"
-  perturb_speed=1
+  perturb_speed=0
   whisper_mel_bins=80
-  output_dir=data/fbank_whisper_${whisper_mel_bins}D_hdf5
+  use_hdf5=False
+  output_dir=data/fbank_whisper_${whisper_mel_bins}D_test
   if [ ! -f $output_dir/.librispeech.whisper.done ]; then
     mkdir -p $output_dir
     ./local/compute_fbank_librispeech.py \
       --num-mel-bins ${whisper_mel_bins} \
+      --perturb-speed ${perturb_speed} \
       --whisper-fbank true \
+      --use-hdf5 ${use_hdf5} \
       --output-dir $output_dir
     ./local/compute_fbank_musan.py \
       --num-mel-bins ${whisper_mel_bins} \
       --whisper-fbank true \
+      --use-hdf5 ${use_hdf5} \
       --output-dir $output_dir
     touch $output_dir/.librispeech.whisper.done
   fi

From f208431f5cdcb52dddf986200e96b4377971c7ea Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Fri, 29 Mar 2024 11:03:58 +0800
Subject: [PATCH 12/13] support on-the-fly whisper fbank extraction

---
 .../ASR/tdnn_lstm_ctc/asr_datamodule.py       | 49 +++++++++++++++++--
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
index 814390ad6..b83a61ccf 100644
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -24,7 +24,15 @@ from pathlib import Path
 from typing import Any, Dict, Optional
 
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    load_manifest,
+    load_manifest_lazy,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
     CutConcatenate,
     CutMix,
@@ -215,6 +223,20 @@ class LibriSpeechAsrDataModule:
             help="AudioSamples or PrecomputedFeatures",
         )
 
+        group.add_argument(
+            "--use-whisper-fbank",
+            type=str2bool,
+            default=False,
+            help="Use whisper fbank feature as input",
+        )
+
+        group.add_argument(
+            "--whisper-fbank-n-mels",
+            type=int,
+            default=80,
+            help="Number of mels for whisper fbank, large-v3 uses 128-mel fbank",
+        )
+
     def train_dataloaders(
         self,
         cuts_train: CutSet,
@@ -297,9 +319,15 @@ class LibriSpeechAsrDataModule:
             # to be strict (e.g. could be randomized)
             # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms   # noqa
             # Drop feats to be on the safe side.
+            if self.args.use_whisper_fbank:
+                extractor = WhisperFbank(
+                    WhisperFbankConfig(num_filters=self.args.whisper_fbank_n_mels),
+                )
+            else:
+                extractor = Fbank(FbankConfig(num_mel_bins=80))
             train = K2SpeechRecognitionDataset(
                 cut_transforms=transforms,
-                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
+                input_strategy=OnTheFlyFeatures(extractor),
                 input_transforms=input_transforms,
                 return_cuts=self.args.return_cuts,
             )
@@ -355,9 +383,15 @@ class LibriSpeechAsrDataModule:
 
         logging.info("About to create dev dataset")
         if self.args.on_the_fly_feats:
+            if self.args.use_whisper_fbank:
+                extractor = WhisperFbank(
+                    WhisperFbankConfig(num_filters=self.args.whisper_fbank_n_mels),
+                )
+            else:
+                extractor = Fbank(FbankConfig(num_mel_bins=80))
             validate = K2SpeechRecognitionDataset(
                 cut_transforms=transforms,
-                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
+                input_strategy=OnTheFlyFeatures(extractor),
                 return_cuts=self.args.return_cuts,
             )
         else:
@@ -383,8 +417,15 @@ class LibriSpeechAsrDataModule:
 
     def test_dataloaders(self, cuts: CutSet) -> DataLoader:
         logging.debug("About to create test dataset")
+        if self.args.use_whisper_fbank:
+            extractor = WhisperFbank(
+                WhisperFbankConfig(num_filters=self.args.whisper_fbank_n_mels),
+            )
+        else:
+            extractor = Fbank(FbankConfig(num_mel_bins=80))
+
         test = K2SpeechRecognitionDataset(
-            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            input_strategy=OnTheFlyFeatures(extractor)
             if self.args.on_the_fly_feats
             else eval(self.args.input_strategy)(),
             return_cuts=self.args.return_cuts,

From 6b2bd0fb5234d57edd949359e1326cbe3fda4973 Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Fri, 29 Mar 2024 15:29:50 +0800
Subject: [PATCH 13/13] support fine-tuning mono-lingual whisper model; add
 ScaledAdam as an option

---
 egs/librispeech/ASR/whisper/train.py | 48 ++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/egs/librispeech/ASR/whisper/train.py b/egs/librispeech/ASR/whisper/train.py
index db6f2e182..40fa921a0 100755
--- a/egs/librispeech/ASR/whisper/train.py
+++ b/egs/librispeech/ASR/whisper/train.py
@@ -80,6 +80,7 @@ from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
     AttributeDict,
     MetricsTracker,
+    get_parameter_groups_with_lrs,
     filter_uneven_sized_batch,
     setup_logger,
     str2bool,
@@ -145,6 +146,14 @@ def get_parser():
         """,
     )
 
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default="adam",
+        choices=["scaledadam", "adam"],
+        help="Which optimizer to use."
+    )
+
     parser.add_argument(
         "--base-lr", type=float, default=1e-5, help="The base learning rate."
     )
@@ -463,23 +472,33 @@ def compute_loss(
         torch.LongTensor(text_tokens) for text_tokens in text_tokens_list
     ]
 
-    # 50256 is the index of <pad> for all whisper models
+    if params.is_multilingual:
+        # 50256 is the index of <pad> for multi-lingual whisper models
+        pad_idx = 50256
+    else:
+        # choose a symbol that is not used in en-whisper model as padding symbol
+        pad_idx = 50363
+    
+    assert tokenizer.eot != pad_idx, "EOT symbol should be different from pad symbol"
+
     prev_outputs_tokens = _batch_tensors(
-        [tokens[:-1] for tokens in text_tokens_list], pad_value=50256
+        [tokens[:-1] for tokens in text_tokens_list], pad_value=pad_idx
     )
     target_tokens = _batch_tensors(
-        [tokens[1:] for tokens in text_tokens_list], pad_value=50256
+        [tokens[1:] for tokens in text_tokens_list], pad_value=pad_idx
     )
     target_lengths = torch.LongTensor(
         [tokens.shape[0] - 1 for tokens in text_tokens_list]
     )
 
     decoder_criterion = LabelSmoothingLoss(
-        ignore_index=50256, label_smoothing=0.1, reduction="sum"
+        ignore_index=pad_idx, label_smoothing=0.1, reduction="sum"
     )
 
-    # ignore the first 3 tokens, which are always <|lang_id|>, <|transcibe|>, <|notimestampes|>
-    ignore_prefix_size = 3
+    # ignore the prefix tokens (the -1 is because the targets are tokens[1:]), which are:
+    # 1. Multi-lingual model: <|startoftranscript|>, <|lang_id|>, <|transcribe|>, <|notimestamps|>
+    # 2. Mono-lingual model: <|startoftranscript|>, <|notimestamps|>
+    ignore_prefix_size = len(tokenizer.sot_sequence_including_notimestamps) - 1
     with torch.set_grad_enabled(is_training):
         encoder_out = model.encoder(feature)
         text_logits = model.decoder(prev_outputs_tokens.to(device), encoder_out)
@@ -581,9 +600,10 @@ def train_one_epoch(
         be set to 0.
     """
     model.train()
-    for name, module in model.named_modules():
-        if name.startswith(params.freeze_modules):
-            module.eval()
+    if params.freeze_modules is not None:
+        for name, module in model.named_modules():
+            if name.startswith(params.freeze_modules):
+                module.eval()
 
     tot_loss = MetricsTracker()
 
@@ -753,6 +773,7 @@ def run(rank, world_size, args):
     num_trainable = sum([p.numel() if p.requires_grad else 0 for p in model.parameters()])
     logging.info(f"Number of model parameters: {num_param}. Total trainable parameters: {num_trainable}")
 
+    params.is_multilingual = model.is_multilingual
     tokenizer = whisper.tokenizer.get_tokenizer(
         model.is_multilingual,
         num_languages=model.num_languages,
@@ -777,7 +798,14 @@ def run(rank, world_size, args):
     logging.info(f"Device: {device}")
     model.to(device)
 
-    optimizer = torch.optim.AdamW(model.parameters(), lr=params.base_lr)
+    if params.optimizer == "adam":
+        optimizer = torch.optim.AdamW(model.parameters(), lr=params.base_lr)
+    else:
+        optimizer = ScaledAdam(
+            get_parameter_groups_with_lrs(model, lr=params.base_lr, include_names=True),
+            lr=params.base_lr,  # should have no effect
+            clipping_scale=2.0,
+        )
     scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
 
     if checkpoints and "optimizer" in checkpoints: