Use Emformer model as RNN-T encoder.

Fangjun Kuang 2022-03-30 17:09:15 +08:00
parent e867a62d32
commit 5728a4456e
2 changed files with 126 additions and 12 deletions

transducer_emformer/noam.py (new file; filename inferred from "from noam import Noam" in train.py below)

@@ -0,0 +1,104 @@
# Copyright 2021 University of Chinese Academy of Sciences (author: Han Zhu)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch


class Noam(object):
    """
    Implements the Noam optimizer.

    Proposed in
    "Attention Is All You Need", https://arxiv.org/pdf/1706.03762.pdf

    Modified from
    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/optimizer.py  # noqa

    Args:
      params:
        iterable of parameters to optimize or dicts defining parameter groups
      model_size:
        attention dimension of the transformer model
      factor:
        learning rate factor
      warm_step:
        number of warmup steps
    """

    def __init__(
        self,
        params,
        model_size: int = 256,
        factor: float = 10.0,
        warm_step: int = 25000,
        weight_decay=0,
    ) -> None:
        """Construct a Noam object."""
        self.optimizer = torch.optim.Adam(
            params, lr=0, betas=(0.9, 0.98), eps=1e-9, weight_decay=weight_decay
        )
        self._step = 0
        self.warmup = warm_step
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    @property
    def param_groups(self):
        """Return param_groups."""
        return self.optimizer.param_groups

    def step(self):
        """Update parameters and rate."""
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p["lr"] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        """Compute the learning rate for the given step (or the current step if None)."""
        if step is None:
            step = self._step
        return (
            self.factor
            * self.model_size ** (-0.5)
            * min(step ** (-0.5), step * self.warmup ** (-1.5))
        )

    def zero_grad(self):
        """Reset the gradients of all optimized parameters."""
        self.optimizer.zero_grad()

    def state_dict(self):
        """Return the state of the optimizer as a dict."""
        return {
            "_step": self._step,
            "warmup": self.warmup,
            "factor": self.factor,
            "model_size": self.model_size,
            "_rate": self._rate,
            "optimizer": self.optimizer.state_dict(),
        }

    def load_state_dict(self, state_dict):
        """Load the optimizer state."""
        for key, value in state_dict.items():
            if key == "optimizer":
                self.optimizer.load_state_dict(state_dict["optimizer"])
            else:
                setattr(self, key, value)
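
For context, a minimal sketch of how this optimizer is meant to be driven in a training loop. The toy model and loss below are stand-ins invented for illustration; only the Noam class itself comes from this commit:

import torch
import torch.nn as nn

from noam import Noam

model = nn.Linear(80, 500)  # toy stand-in for the real transducer model

optimizer = Noam(
    model.parameters(),
    model_size=512,  # matches "attention_dim" in train.py below
    factor=10.0,
    warm_step=80000,
)

for _ in range(3):
    loss = model(torch.randn(8, 80)).sum()
    optimizer.zero_grad()
    loss.backward()
    # step() recomputes the LR from the Noam schedule, writes it into every
    # param group, and then steps the wrapped Adam optimizer.
    optimizer.step()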

transducer_emformer/train.py (filename inferred from the usage string and the default --exp-dir below)

@@ -21,11 +21,11 @@ Usage:
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 
-./pruned_transducer_stateless/train.py \
+./transducer_emformer/train.py \
   --world-size 4 \
   --num-epochs 30 \
   --start-epoch 0 \
-  --exp-dir pruned_transducer_stateless/exp \
+  --exp-dir transducer_emformer/exp \
   --full-libri 1 \
   --max-duration 300
 """
@@ -33,6 +33,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3"
 import argparse
 import logging
+import warnings
 from pathlib import Path
 from shutil import copyfile
 from typing import Any, Dict, Optional, Tuple
@@ -43,18 +44,18 @@ import torch
 import torch.multiprocessing as mp
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
-from conformer import Conformer
 from decoder import Decoder
+from emformer import Emformer
 from joiner import Joiner
 from lhotse.cut import Cut
 from lhotse.dataset.sampling.base import CutSampler
 from lhotse.utils import fix_random_seed
 from model import Transducer
+from noam import Noam
 from torch import Tensor
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.nn.utils import clip_grad_norm_
 from torch.utils.tensorboard import SummaryWriter
-from transformer import Noam
 
 from icefall.checkpoint import load_checkpoint, remove_checkpoints
 from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
@@ -111,7 +112,7 @@ def get_parser():
         default=0,
         help="""Resume training from this epoch.
         If it is positive, it will load checkpoint from
-        transducer_stateless/exp/epoch-{start_epoch-1}.pt
+        transducer_emformer/exp/epoch-{start_epoch-1}.pt
         """,
     )
@ -127,7 +128,7 @@ def get_parser():
parser.add_argument( parser.add_argument(
"--exp-dir", "--exp-dir",
type=str, type=str,
default="pruned_transducer_stateless/exp", default="transducer_emformer/exp",
help="""The experiment dir. help="""The experiment dir.
It specifies the directory where all training related It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved files, e.g., checkpoints, log, etc, are saved
@@ -279,7 +280,7 @@ def get_params() -> AttributeDict:
             "reset_interval": 200,
             "valid_interval": 3000,  # For the 100h subset, use 800
             "log_diagnostics": False,
-            # parameters for conformer
+            # parameters for Emformer
             "feature_dim": 80,
             "subsampling_factor": 4,
             "attention_dim": 512,
@@ -287,10 +288,13 @@ def get_params() -> AttributeDict:
             "dim_feedforward": 2048,
             "num_encoder_layers": 12,
             "vgg_frontend": False,
+            "left_context_length": 120,  # 120 frames
+            "segment_length": 16,
+            "right_context_length": 4,
             # parameters for decoder
             "embedding_dim": 512,
             # parameters for Noam
-            "warm_step": 80000,  # For the 100h subset, use 30000
+            "warm_step": 80000,  # For the 100h subset, use 20000
             "env_info": get_env_info(),
         }
     )
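
A quick sanity check on the three new streaming parameters. Assuming they are counted in feature frames before subsampling (the "# 120 frames" comment suggests frames, and 10 ms is the usual fbank frame shift, though both are assumptions here), the chunk geometry works out as follows:

FRAME_SHIFT_MS = 10  # assumed frame shift; not stated in this diff

for name, frames in [
    ("left context", 120),  # history each chunk can attend to
    ("segment", 16),        # frames emitted per streaming step
    ("right context", 4),   # look-ahead, i.e., added latency
]:
    print(f"{name}: {frames} frames = {frames * FRAME_SHIFT_MS} ms")

Under these assumptions each streaming step consumes about 160 ms of audio with roughly 40 ms of look-ahead, while attending to 1.2 s of history.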
@ -299,8 +303,7 @@ def get_params() -> AttributeDict:
def get_encoder_model(params: AttributeDict) -> nn.Module: def get_encoder_model(params: AttributeDict) -> nn.Module:
# TODO: We can add an option to switch between Conformer and Transformer encoder = Emformer(
encoder = Conformer(
num_features=params.feature_dim, num_features=params.feature_dim,
output_dim=params.vocab_size, output_dim=params.vocab_size,
subsampling_factor=params.subsampling_factor, subsampling_factor=params.subsampling_factor,
@@ -309,6 +312,9 @@ def get_encoder_model(params: AttributeDict) -> nn.Module:
         dim_feedforward=params.dim_feedforward,
         num_encoder_layers=params.num_encoder_layers,
         vgg_frontend=params.vgg_frontend,
+        left_context_length=params.left_context_length,
+        segment_length=params.segment_length,
+        right_context_length=params.right_context_length,
     )
     return encoder
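
The emformer.py wrapper itself is not part of this diff. Given the commit date it plausibly builds on torchaudio.models.Emformer, which shipped around the same time; if so, a bare-bones construction might look like the sketch below, where every value is an assumption (the context lengths are divided by the subsampling factor of 4, on the assumption that the wrapper converts them to post-subsampling frames):

import torch
from torchaudio.models import Emformer

emformer = Emformer(
    input_dim=512,           # attention_dim
    num_heads=8,             # assumed; not in this diff
    ffn_dim=2048,            # dim_feedforward
    num_layers=12,           # num_encoder_layers
    segment_length=4,        # 16 input frames / subsampling_factor 4
    left_context_length=30,  # 120 / 4
    right_context_length=1,  # 4 / 4
)

# forward() expects the utterance right-padded with the right-context frames.
x = torch.randn(2, 8 + 1, 512)  # (batch, T + right_context_length, dim)
lengths = torch.tensor([8, 8])
output, output_lengths = emformer(x, lengths)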
@@ -496,7 +502,11 @@ def compute_loss(
     assert loss.requires_grad == is_training
 
     info = MetricsTracker()
-    info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        info["frames"] = (
+            (feature_lens // params.subsampling_factor).sum().item()
+        )
 
     # Note: We use reduction=sum while computing the loss.
     info["loss"] = loss.detach().cpu().item()
@@ -725,7 +735,7 @@ def run(rank, world_size, args):
     params.update(vars(args))
     if params.full_libri is False:
         params.valid_interval = 800
-        params.warm_step = 30000
+        params.warm_step = 20000
 
     fix_random_seed(params.seed)
 
     if world_size > 1:
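
The effect of lowering warm_step for the 100h subset falls straight out of Noam.rate() in the new noam.py: the peak learning rate, reached at step == warm_step, equals factor / sqrt(model_size) / sqrt(warm_step), so a smaller warm_step yields a higher peak that is reached sooner, which suits the shorter 100h runs. A quick check with the values from this commit:

def noam_rate(step, model_size=512, factor=10.0, warm_step=80000):
    # Same formula as Noam.rate() in noam.py above.
    return (
        factor
        * model_size ** (-0.5)
        * min(step ** (-0.5), step * warm_step ** (-1.5))
    )

for warm_step in (80000, 30000, 20000):
    peak = noam_rate(warm_step, warm_step=warm_step)
    print(f"warm_step={warm_step}: peak lr = {peak:.5f}")
# prints roughly 0.00156, 0.00255, and 0.00313 respectively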