From fcc22d3e9140e944adb27841b1f6517ffbe86a0b Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 17 Dec 2021 11:58:30 +0800 Subject: [PATCH] Use LSTM layers for the encoder. Need more tunings. --- egs/librispeech/ASR/transducer/README.md | 21 + egs/librispeech/ASR/transducer/beam_search.py | 2 +- egs/librispeech/ASR/transducer/rnn.py | 18 + egs/librispeech/ASR/transducer/test_rnn.py | 26 +- egs/librispeech/ASR/transducer/train.py | 16 +- egs/librispeech/ASR/transducer_lstm/README.md | 19 + .../ASR/transducer_lstm/asr_datamodule.py | 1 + .../ASR/transducer_lstm/beam_search.py | 212 +++++ egs/librispeech/ASR/transducer_lstm/decode.py | 457 +++++++++++ .../ASR/transducer_lstm/decoder.py | 101 +++ .../ASR/transducer_lstm/encoder.py | 115 +++ .../ASR/transducer_lstm/encoder_interface.py | 43 + egs/librispeech/ASR/transducer_lstm/joiner.py | 55 ++ egs/librispeech/ASR/transducer_lstm/model.py | 127 +++ egs/librispeech/ASR/transducer_lstm/noam.py | 104 +++ .../ASR/transducer_lstm/subsampling.py | 1 + .../ASR/transducer_lstm/test_encoder.py | 48 ++ egs/librispeech/ASR/transducer_lstm/train.py | 738 ++++++++++++++++++ 18 files changed, 2101 insertions(+), 3 deletions(-) create mode 100644 egs/librispeech/ASR/transducer/README.md create mode 100644 egs/librispeech/ASR/transducer_lstm/README.md create mode 120000 egs/librispeech/ASR/transducer_lstm/asr_datamodule.py create mode 100644 egs/librispeech/ASR/transducer_lstm/beam_search.py create mode 100755 egs/librispeech/ASR/transducer_lstm/decode.py create mode 100644 egs/librispeech/ASR/transducer_lstm/decoder.py create mode 100644 egs/librispeech/ASR/transducer_lstm/encoder.py create mode 100644 egs/librispeech/ASR/transducer_lstm/encoder_interface.py create mode 100644 egs/librispeech/ASR/transducer_lstm/joiner.py create mode 100644 egs/librispeech/ASR/transducer_lstm/model.py create mode 100644 egs/librispeech/ASR/transducer_lstm/noam.py create mode 120000 egs/librispeech/ASR/transducer_lstm/subsampling.py create mode 
100755 egs/librispeech/ASR/transducer_lstm/test_encoder.py create mode 100755 egs/librispeech/ASR/transducer_lstm/train.py diff --git a/egs/librispeech/ASR/transducer/README.md b/egs/librispeech/ASR/transducer/README.md new file mode 100644 index 000000000..8f84671a3 --- /dev/null +++ b/egs/librispeech/ASR/transducer/README.md @@ -0,0 +1,21 @@ +## Introduction + +The encoder consists of Conformer layers in this folder. You can use the +following command to start the training: + +```bash +cd egs/librispeech/ASR + +export CUDA_VISIBLE_DEVICES="0,1,2" + +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +./transducer/train.py \ + --world-size 4 \ + --num-epochs 30 \ + --start-epoch 0 \ + --exp-dir transducer/exp \ + --full-libri 1 \ + --max-duration 250 \ + --lr-factor 2.5 +``` diff --git a/egs/librispeech/ASR/transducer/beam_search.py b/egs/librispeech/ASR/transducer/beam_search.py index 62ad14257..efca20ac3 100644 --- a/egs/librispeech/ASR/transducer/beam_search.py +++ b/egs/librispeech/ASR/transducer/beam_search.py @@ -70,7 +70,7 @@ def greedy_search(model: Transducer, encoder_out: torch.Tensor) -> List[int]: @dataclass class Hypothesis: - ys: List[int] # the predicated sequences so far + ys: List[int] # the predicted sequences so far log_prob: float # The log prob of ys # Optional decoder state. We assume it is LSTM for now, diff --git a/egs/librispeech/ASR/transducer/rnn.py b/egs/librispeech/ASR/transducer/rnn.py index 8e695db50..2a165b0c1 100644 --- a/egs/librispeech/ASR/transducer/rnn.py +++ b/egs/librispeech/ASR/transducer/rnn.py @@ -212,6 +212,24 @@ class LayerNormLSTMCell(nn.Module): if "layernorm" not in name: nn.init.uniform_(weight, -stdv, stdv) + if "bias_ih" in name or "bias_hh" in name: + # See the paper + # An Empirical Exploration of Recurrent Network Architectures + # https://proceedings.mlr.press/v37/jozefowicz15.pdf + # + # It recommends initializing the bias of the forget gate to + # a large value, such as 1 or 2. 
In PyTorch, there are two + # biases for the forget gate, we set both of them to 1 here. + # + # See also https://arxiv.org/pdf/1804.04849.pdf + assert weight.ndim == 1 + # Layout of the bias: + # | in_gate | forget_gate | cell_gate | output_gate | + start = weight.numel() // 4 + end = weight.numel() // 2 + with torch.no_grad(): + weight[start:end].fill_(1.0) + class LayerNormLSTMLayer(nn.Module): """ diff --git a/egs/librispeech/ASR/transducer/test_rnn.py b/egs/librispeech/ASR/transducer/test_rnn.py index d5adac482..a9393004f 100755 --- a/egs/librispeech/ASR/transducer/test_rnn.py +++ b/egs/librispeech/ASR/transducer/test_rnn.py @@ -23,7 +23,7 @@ To run this file, do: """ import torch import torch.nn as nn -from transducer.rnn import ( +from rnn import ( LayerNormGRU, LayerNormGRUCell, LayerNormGRULayer, @@ -505,6 +505,28 @@ def test_layernorm_lstm_with_projection_forward(device="cpu"): assert_allclose(x.grad, x_clone.grad) +def test_lstm_forget_gate_bias(device): + input_size = 2 + hidden_size = 3 + num_layers = 4 + bias = True + + lstm = LayerNormLSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + bias=bias, + ln=nn.Identity, + device=device, + ) + for name, weight in lstm.named_parameters(): + if "bias_hh" in name or "bias_ih" in name: + start = weight.numel() // 4 + end = weight.numel() // 2 + expected = torch.ones(hidden_size).to(weight) + assert torch.all(torch.eq(weight[start:end], expected)) + + def test_layernorm_gru_cell_jit(device="cpu"): input_size = 10 hidden_size = 20 @@ -741,6 +763,8 @@ def _test_lstm(device): test_layernorm_lstm_with_projection_jit(device) test_layernorm_lstm_forward(device) test_layernorm_lstm_with_projection_forward(device) + # + test_lstm_forget_gate_bias(device) def _test_gru(device): diff --git a/egs/librispeech/ASR/transducer/train.py b/egs/librispeech/ASR/transducer/train.py index c80ea4bbc..30510f24a 100755 --- a/egs/librispeech/ASR/transducer/train.py +++ 
b/egs/librispeech/ASR/transducer/train.py @@ -16,6 +16,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +Usage: + +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +./transducer/train.py \ + --world-size 4 \ + --num-epochs 30 \ + --start-epoch 0 \ + --exp-dir transducer/exp \ + --full-libri 1 \ + --max-duration 250 \ + --lr-factor 2.5 +""" import argparse @@ -88,7 +102,7 @@ def get_parser(): default=0, help="""Resume training from from this epoch. If it is positive, it will load checkpoint from - conformer_ctc/exp/epoch-{start_epoch-1}.pt + transducer/exp/epoch-{start_epoch-1}.pt """, ) diff --git a/egs/librispeech/ASR/transducer_lstm/README.md b/egs/librispeech/ASR/transducer_lstm/README.md new file mode 100644 index 000000000..38c3d2bfd --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/README.md @@ -0,0 +1,19 @@ +## Introduction + +The encoder consists of LSTM layers in this folder. You can use the +following command to start the training: + +```bash +cd egs/librispeech/ASR + +export CUDA_VISIBLE_DEVICES="0,1,2" + +./transducer_lstm/train.py \ + --world-size 3 \ + --num-epochs 30 \ + --start-epoch 0 \ + --exp-dir transducer_lstm/exp \ + --full-libri 1 \ + --max-duration 300 \ + --lr-factor 3 +``` diff --git a/egs/librispeech/ASR/transducer_lstm/asr_datamodule.py b/egs/librispeech/ASR/transducer_lstm/asr_datamodule.py new file mode 120000 index 000000000..07f39b451 --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/asr_datamodule.py @@ -0,0 +1 @@ +../transducer/asr_datamodule.py \ No newline at end of file diff --git a/egs/librispeech/ASR/transducer_lstm/beam_search.py b/egs/librispeech/ASR/transducer_lstm/beam_search.py new file mode 100644 index 000000000..013e065be --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/beam_search.py @@ -0,0 +1,212 @@ +# Copyright 2021 Xiaomi Corp. 
(authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +from model import Transducer + + +def greedy_search(model: Transducer, encoder_out: torch.Tensor) -> List[int]: + """ + Args: + model: + An instance of `Transducer`. + encoder_out: + A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. + Returns: + Return the decoded result. 
+ """ + assert encoder_out.ndim == 3 + + # support only batch_size == 1 for now + assert encoder_out.size(0) == 1, encoder_out.size(0) + blank_id = model.decoder.blank_id + device = model.device + + sos = torch.tensor([blank_id], device=device).reshape(1, 1) + decoder_out, (h, c) = model.decoder(sos) + T = encoder_out.size(1) + t = 0 + hyp = [] + max_u = 1000 # terminate after this number of steps + u = 0 + + while t < T and u < max_u: + # fmt: off + current_encoder_out = encoder_out[:, t:t+1, :] + # fmt: on + logits = model.joiner(current_encoder_out, decoder_out) + # logits is (1, 1, 1, vocab_size) + + log_prob = logits.log_softmax(dim=-1) + # log_prob is (1, 1, 1, vocab_size) + # TODO: Use logits.argmax() + y = log_prob.argmax() + if y != blank_id: + hyp.append(y.item()) + y = y.reshape(1, 1) + decoder_out, (h, c) = model.decoder(y, (h, c)) + u += 1 + else: + t += 1 + + return hyp + + +@dataclass +class Hypothesis: + ys: List[int] # the predicted sequences so far + log_prob: float # The log prob of ys + + # Optional decoder state. We assume it is LSTM for now, + # so the state is a tuple (h, c) + decoder_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + + +def beam_search( + model: Transducer, + encoder_out: torch.Tensor, + beam: int = 5, +) -> List[int]: + """ + It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf + + espnet/nets/beam_search_transducer.py#L247 is used as a reference. + + Args: + model: + An instance of `Transducer`. + encoder_out: + A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. + beam: + Beam size. + Returns: + Return the decoded result. 
+ """ + assert encoder_out.ndim == 3 + + # support only batch_size == 1 for now + assert encoder_out.size(0) == 1, encoder_out.size(0) + blank_id = model.decoder.blank_id + sos_id = model.decoder.sos_id + device = model.device + + sos = torch.tensor([blank_id], device=device).reshape(1, 1) + decoder_out, (h, c) = model.decoder(sos) + T = encoder_out.size(1) + t = 0 + B = [Hypothesis(ys=[blank_id], log_prob=0.0, decoder_state=None)] + max_u = 20000 # terminate after this number of steps + u = 0 + + cache: Dict[ + str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] + ] = {} + + while t < T and u < max_u: + # fmt: off + current_encoder_out = encoder_out[:, t:t+1, :] + # fmt: on + A = B + B = [] + # for hyp in A: + # for h in A: + # if h.ys == hyp.ys[:-1]: + # # update the score of hyp + # decoder_input = torch.tensor( + # [h.ys[-1]], device=device + # ).reshape(1, 1) + # decoder_out, _ = model.decoder( + # decoder_input, h.decoder_state + # ) + # logits = model.joiner(current_encoder_out, decoder_out) + # log_prob = logits.log_softmax(dim=-1) + # log_prob = log_prob.squeeze() + # hyp.log_prob += h.log_prob + log_prob[hyp.ys[-1]].item() + + while u < max_u: + y_star = max(A, key=lambda hyp: hyp.log_prob) + A.remove(y_star) + + # Note: y_star.ys is unhashable, i.e., cannot be used + # as a key into a dict + cached_key = "_".join(map(str, y_star.ys)) + + if cached_key not in cache: + decoder_input = torch.tensor( + [y_star.ys[-1]], device=device + ).reshape(1, 1) + + decoder_out, decoder_state = model.decoder( + decoder_input, + y_star.decoder_state, + ) + cache[cached_key] = (decoder_out, decoder_state) + else: + decoder_out, decoder_state = cache[cached_key] + + logits = model.joiner(current_encoder_out, decoder_out) + log_prob = logits.log_softmax(dim=-1) + # log_prob is (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() + # Now log_prob is (vocab_size,) + + # If we choose blank here, add the new hypothesis to B. 
+ # Otherwise, add the new hypothesis to A + + # First, choose blank + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob.item() + + # ys[:] returns a copy of ys + new_y_star = Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + # Caution: Use y_star.decoder_state here + decoder_state=y_star.decoder_state, + ) + B.append(new_y_star) + + # Second, choose other labels + for i, v in enumerate(log_prob.tolist()): + if i in (blank_id, sos_id): + continue + new_ys = y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_hyp = Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + decoder_state=decoder_state, + ) + A.append(new_hyp) + u += 1 + # check whether B contains more than "beam" elements more probable + # than the most probable in A + A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + B = sorted( + [hyp for hyp in B if hyp.log_prob > A_most_probable.log_prob], + key=lambda hyp: hyp.log_prob, + reverse=True, + ) + if len(B) >= beam: + B = B[:beam] + break + t += 1 + best_hyp = max(B, key=lambda hyp: hyp.log_prob / len(hyp.ys[1:])) + ys = best_hyp.ys[1:] # [1:] to remove the blank + return ys diff --git a/egs/librispeech/ASR/transducer_lstm/decode.py b/egs/librispeech/ASR/transducer_lstm/decode.py new file mode 100755 index 000000000..18ae5234c --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/decode.py @@ -0,0 +1,457 @@ +#!/usr/bin/env python3 +# +# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Usage: +(1) greedy search +./transducer_lstm/decode.py \ + --epoch 14 \ + --avg 7 \ + --exp-dir ./transducer_lstm/exp \ + --max-duration 100 \ + --decoding-method greedy_search +(2) beam search + +./transducer_lstm/decode.py \ + --epoch 14 \ + --avg 7 \ + --exp-dir ./transducer_lstm/exp \ + --max-duration 100 \ + --decoding-method beam_search \ + --beam-size 8 +""" + + +import argparse +import logging +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Tuple + +import sentencepiece as spm +import torch +import torch.nn as nn +from asr_datamodule import LibriSpeechAsrDataModule +from beam_search import beam_search, greedy_search +from decoder import Decoder +from encoder import LstmEncoder +from joiner import Joiner +from model import Transducer + +from icefall.checkpoint import average_checkpoints, load_checkpoint +from icefall.env import get_env_info +from icefall.utils import ( + AttributeDict, + setup_logger, + store_transcripts, + write_error_stats, +) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--epoch", + type=int, + default=77, + help="It specifies the checkpoint to use for decoding." + "Note: Epoch counts from 0.", + ) + parser.add_argument( + "--avg", + type=int, + default=55, + help="Number of checkpoints to average. Automatically select " + "consecutive checkpoints before the checkpoint specified by " + "'--epoch'. 
", + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="transducer_lstm/exp", + help="The experiment dir", + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="""Possible values are: + - greedy_search + - beam_search + """, + ) + + parser.add_argument( + "--beam-size", + type=int, + default=5, + help="Used only when --decoding-method is beam_search", + ) + + return parser + + +def get_params() -> AttributeDict: + params = AttributeDict( + { + # parameters for conformer + "feature_dim": 80, + "encoder_out_dim": 512, + "subsampling_factor": 4, + "encoder_hidden_size": 1024, + "num_encoder_layers": 4, + "proj_size": 512, + "vgg_frontend": False, + # decoder params + "decoder_embedding_dim": 1024, + "num_decoder_layers": 4, + "decoder_hidden_dim": 512, + "env_info": get_env_info(), + } + ) + return params + + +def get_encoder_model(params: AttributeDict): + encoder = LstmEncoder( + num_features=params.feature_dim, + hidden_size=params.encoder_hidden_size, + output_dim=params.encoder_out_dim, + subsampling_factor=params.subsampling_factor, + num_encoder_layers=params.num_encoder_layers, + vgg_frontend=params.vgg_frontend, + ) + return encoder + + +def get_decoder_model(params: AttributeDict): + decoder = Decoder( + vocab_size=params.vocab_size, + embedding_dim=params.decoder_embedding_dim, + blank_id=params.blank_id, + sos_id=params.sos_id, + num_layers=params.num_decoder_layers, + hidden_dim=params.decoder_hidden_dim, + output_dim=params.encoder_out_dim, + ) + return decoder + + +def get_joiner_model(params: AttributeDict): + joiner = Joiner( + input_dim=params.encoder_out_dim, + output_dim=params.vocab_size, + ) + return joiner + + +def get_transducer_model(params: AttributeDict): + encoder = get_encoder_model(params) + decoder = get_decoder_model(params) + joiner = 
get_joiner_model(params) + + model = Transducer( + encoder=encoder, + decoder=decoder, + joiner=joiner, + ) + return model + + +def decode_one_batch( + params: AttributeDict, + model: nn.Module, + sp: spm.SentencePieceProcessor, + batch: dict, +) -> Dict[str, List[List[str]]]: + """Decode one batch and return the result in a dict. The dict has the + following format: + + - key: It indicates the setting used for decoding. For example, + if greedy_search is used, it would be "greedy_search" + If beam search with a beam size of 7 is used, it would be + "beam_7" + - value: It contains the decoding result. `len(value)` equals to + batch size. `value[i]` is the decoding result for the i-th + utterance in the given batch. + Args: + params: + It's the return value of :func:`get_params`. + model: + The neural model. + sp: + The BPE model. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + Returns: + Return the decoding result. See above description for the format of + the returned dict. 
+ """ + device = model.device + feature = batch["inputs"] + assert feature.ndim == 3 + + feature = feature.to(device) + # at entry, feature is (N, T, C) + + supervisions = batch["supervisions"] + feature_lens = supervisions["num_frames"].to(device) + + encoder_out, encoder_out_lens = model.encoder( + x=feature, x_lens=feature_lens + ) + hyps = [] + batch_size = encoder_out.size(0) + + for i in range(batch_size): + # fmt: off + encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]] + # fmt: on + if params.decoding_method == "greedy_search": + hyp = greedy_search(model=model, encoder_out=encoder_out_i) + elif params.decoding_method == "beam_search": + hyp = beam_search( + model=model, encoder_out=encoder_out_i, beam=params.beam_size + ) + else: + raise ValueError( + f"Unsupported decoding method: {params.decoding_method}" + ) + hyps.append(sp.decode(hyp).split()) + + if params.decoding_method == "greedy_search": + return {"greedy_search": hyps} + else: + return {f"beam_{params.beam_size}": hyps} + + +def decode_dataset( + dl: torch.utils.data.DataLoader, + params: AttributeDict, + model: nn.Module, + sp: spm.SentencePieceProcessor, +) -> Dict[str, List[Tuple[List[str], List[str]]]]: + """Decode dataset. + + Args: + dl: + PyTorch's dataloader containing the dataset to decode. + params: + It is returned by :func:`get_params`. + model: + The neural model. + sp: + The BPE model. + Returns: + Return a dict, whose key may be "greedy_search" if greedy search + is used, or it may be "beam_7" if beam size of 7 is used. + Its value is a list of tuples. Each tuple contains two elements: + The first is the reference transcript, and the second is the + predicted result. + """ + num_cuts = 0 + + try: + num_batches = len(dl) + except TypeError: + num_batches = "?" 
+ + if params.decoding_method == "greedy_search": + log_interval = 100 + else: + log_interval = 2 + + results = defaultdict(list) + for batch_idx, batch in enumerate(dl): + texts = batch["supervisions"]["text"] + + hyps_dict = decode_one_batch( + params=params, + model=model, + sp=sp, + batch=batch, + ) + + for name, hyps in hyps_dict.items(): + this_batch = [] + assert len(hyps) == len(texts) + for hyp_words, ref_text in zip(hyps, texts): + ref_words = ref_text.split() + this_batch.append((ref_words, hyp_words)) + + results[name].extend(this_batch) + + num_cuts += len(texts) + + if batch_idx % log_interval == 0: + batch_str = f"{batch_idx}/{num_batches}" + + logging.info( + f"batch {batch_str}, cuts processed until now is {num_cuts}" + ) + return results + + +def save_results( + params: AttributeDict, + test_set_name: str, + results_dict: Dict[str, List[Tuple[List[int], List[int]]]], +): + test_set_wers = dict() + for key, results in results_dict.items(): + recog_path = ( + params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt" + ) + store_transcripts(filename=recog_path, texts=results) + logging.info(f"The transcripts are stored in {recog_path}") + + # The following prints out WERs, per-word error statistics and aligned + # ref/hyp pairs. 
+ errs_filename = ( + params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt" + ) + with open(errs_filename, "w") as f: + wer = write_error_stats( + f, f"{test_set_name}-{key}", results, enable_log=True + ) + test_set_wers[key] = wer + + logging.info("Wrote detailed error stats to {}".format(errs_filename)) + + test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1]) + errs_info = ( + params.res_dir + / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt" + ) + with open(errs_info, "w") as f: + print("settings\tWER", file=f) + for key, val in test_set_wers: + print("{}\t{}".format(key, val), file=f) + + s = "\nFor {}, WER of different settings are:\n".format(test_set_name) + note = "\tbest for {}".format(test_set_name) + for key, val in test_set_wers: + s += "{}\t{}{}\n".format(key, val, note) + note = "" + logging.info(s) + + +@torch.no_grad() +def main(): + parser = get_parser() + LibriSpeechAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + params = get_params() + params.update(vars(args)) + + assert params.decoding_method in ("greedy_search", "beam_search") + params.res_dir = params.exp_dir / params.decoding_method + + params.suffix = f"epoch-{params.epoch}-avg-{params.avg}" + if params.decoding_method == "beam_search": + params.suffix += f"-beam-{params.beam_size}" + + setup_logger(f"{params.res_dir}/log-decode-{params.suffix}") + logging.info("Decoding started") + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", 0) + + logging.info(f"Device: {device}") + + sp = spm.SentencePieceProcessor() + sp.load(params.bpe_model) + + # <blk> and <sos/eos> are defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("<blk>") + params.sos_id = sp.piece_to_id("<sos/eos>") + params.vocab_size = sp.get_piece_size() + + logging.info(params) + + logging.info("About to create model") + model = get_transducer_model(params) + + if params.avg == 1: + 
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model) + else: + start = params.epoch - params.avg + 1 + filenames = [] + for i in range(start, params.epoch + 1): + if start >= 0: + filenames.append(f"{params.exp_dir}/epoch-{i}.pt") + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + + model.to(device) + model.eval() + model.device = device + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + librispeech = LibriSpeechAsrDataModule(args) + + test_clean_cuts = librispeech.test_clean_cuts() + test_other_cuts = librispeech.test_other_cuts() + + test_clean_dl = librispeech.test_dataloaders(test_clean_cuts) + test_other_dl = librispeech.test_dataloaders(test_other_cuts) + + test_sets = ["test-clean", "test-other"] + test_dl = [test_clean_dl, test_other_dl] + + for test_set, test_dl in zip(test_sets, test_dl): + results_dict = decode_dataset( + dl=test_dl, + params=params, + model=model, + sp=sp, + ) + + save_results( + params=params, + test_set_name=test_set, + results_dict=results_dict, + ) + + logging.info("Done!") + + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/ASR/transducer_lstm/decoder.py b/egs/librispeech/ASR/transducer_lstm/decoder.py new file mode 100644 index 000000000..2f6bf4c07 --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/decoder.py @@ -0,0 +1,101 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple + +import torch +import torch.nn as nn + + +# TODO(fangjun): Support switching between LSTM and GRU +class Decoder(nn.Module): + def __init__( + self, + vocab_size: int, + embedding_dim: int, + blank_id: int, + sos_id: int, + num_layers: int, + hidden_dim: int, + output_dim: int, + embedding_dropout: float = 0.0, + rnn_dropout: float = 0.0, + ): + """ + Args: + vocab_size: + Number of tokens of the modeling unit including blank. + embedding_dim: + Dimension of the input embedding. + blank_id: + The ID of the blank symbol. + sos_id: + The ID of the SOS symbol. + num_layers: + Number of LSTM layers. + hidden_dim: + Hidden dimension of LSTM layers. + output_dim: + Output dimension of the decoder. + embedding_dropout: + Dropout rate for the embedding layer. + rnn_dropout: + Dropout for LSTM layers. 
+ """ + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embedding_dim, + padding_idx=blank_id, + ) + self.embedding_dropout = nn.Dropout(embedding_dropout) + # TODO(fangjun): Use layer normalized LSTM + self.rnn = nn.LSTM( + input_size=embedding_dim, + hidden_size=hidden_dim, + num_layers=num_layers, + batch_first=True, + dropout=rnn_dropout, + ) + self.blank_id = blank_id + self.sos_id = sos_id + self.output_linear = nn.Linear(hidden_dim, output_dim) + + def forward( + self, + y: torch.Tensor, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + Args: + y: + A 2-D tensor of shape (N, U) with BOS prepended. + states: + A tuple of two tensors containing the states information of + LSTM layers in this decoder. + Returns: + Return a tuple containing: + + - rnn_output, a tensor of shape (N, U, C) + - (h, c), containing the state information for LSTM layers. + Both are of shape (num_layers, N, C) + """ + embeding_out = self.embedding(y) + embeding_out = self.embedding_dropout(embeding_out) + rnn_out, (h, c) = self.rnn(embeding_out, states) + out = self.output_linear(rnn_out) + + return out, (h, c) diff --git a/egs/librispeech/ASR/transducer_lstm/encoder.py b/egs/librispeech/ASR/transducer_lstm/encoder.py new file mode 100644 index 000000000..860a84bb1 --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/encoder.py @@ -0,0 +1,115 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Tuple + +import torch +import torch.nn as nn +from encoder_interface import EncoderInterface +from subsampling import Conv2dSubsampling, VggSubsampling +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence + + +class LstmEncoder(EncoderInterface): + def __init__( + self, + num_features: int, + hidden_size: int, + output_dim: int, + subsampling_factor: int = 4, + num_encoder_layers: int = 12, + dropout: float = 0.1, + vgg_frontend: bool = False, + proj_size: int = 0, + ): + super().__init__() + real_hidden_size = proj_size if proj_size > 0 else hidden_size + assert ( + subsampling_factor == 4 + ), "Only subsampling_factor==4 is supported at present" + + # self.encoder_embed converts the input of shape (N, T, num_features) + # to the shape (N, T//subsampling_factor, d_model). 
+ # That is, it does two things simultaneously: + # (1) subsampling: T -> T//subsampling_factor + # (2) embedding: num_features -> d_model + if vgg_frontend: + self.encoder_embed = VggSubsampling(num_features, real_hidden_size) + else: + self.encoder_embed = Conv2dSubsampling( + num_features, real_hidden_size + ) + + self.rnn = nn.LSTM( + input_size=hidden_size, + hidden_size=hidden_size, + num_layers=num_encoder_layers, + bias=True, + proj_size=proj_size, + batch_first=True, + dropout=dropout, + bidirectional=False, + ) + + self.encoder_output_layer = nn.Sequential( + nn.Dropout(p=dropout), + nn.Linear(real_hidden_size, output_dim), + ) + + def forward( + self, x: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: + The input tensor. Its shape is (batch_size, seq_len, feature_dim). + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + `x` before padding. + Returns: + Return a tuple containing 2 tensors: + - logits, its shape is (batch_size, output_seq_len, output_dim) + - logit_lens, a tensor of shape (batch_size,) containing the number + of frames in `logits` before padding. + """ + x = self.encoder_embed(x) + + # Caution: We assume the subsampling factor is 4! + lengths = ((x_lens - 1) // 2 - 1) // 2 + assert x.size(1) == lengths.max().item(), ( + x.size(1), + lengths.max(), + ) + + if False: + # It is commented out as DDP complains that not all parameters are + # used. Need more checks later for the reason. 
+ # + # Caution: We assume the dataloader returns utterances with + # duration being sorted in decreasing order + packed_x = pack_padded_sequence( + input=x, + lengths=lengths.cpu(), + batch_first=True, + enforce_sorted=True, + ) + + packed_rnn_out, _ = self.rnn(packed_x) + rnn_out, _ = pad_packed_sequence(packed_x, batch_first=True) + else: + rnn_out, _ = self.rnn(x) + + logits = self.encoder_output_layer(rnn_out) + return logits, lengths diff --git a/egs/librispeech/ASR/transducer_lstm/encoder_interface.py b/egs/librispeech/ASR/transducer_lstm/encoder_interface.py new file mode 100644 index 000000000..257facce4 --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/encoder_interface.py @@ -0,0 +1,43 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple + +import torch +import torch.nn as nn + + +class EncoderInterface(nn.Module): + def forward( + self, x: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: + A tensor of shape (batch_size, input_seq_len, num_features) + containing the input features. + x_lens: + A tensor of shape (batch_size,) containing the number of frames + in `x` before padding. 
+ Returns: + Return a tuple containing two tensors: + - encoder_out, a tensor of (batch_size, out_seq_len, output_dim) + containing unnormalized probabilities, i.e., the output of a + linear layer. + - encoder_out_lens, a tensor of shape (batch_size,) containing + the number of frames in `encoder_out` before padding. + """ + raise NotImplementedError("Please implement it in a subclass") diff --git a/egs/librispeech/ASR/transducer_lstm/joiner.py b/egs/librispeech/ASR/transducer_lstm/joiner.py new file mode 100644 index 000000000..0422f8a6f --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/joiner.py @@ -0,0 +1,55 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Joiner(nn.Module): + def __init__(self, input_dim: int, output_dim: int): + super().__init__() + + self.output_linear = nn.Linear(input_dim, output_dim) + + def forward( + self, encoder_out: torch.Tensor, decoder_out: torch.Tensor + ) -> torch.Tensor: + """ + Args: + encoder_out: + Output from the encoder. Its shape is (N, T, C). + decoder_out: + Output from the decoder. Its shape is (N, U, C). + Returns: + Return a tensor of shape (N, T, U, C). 
+ """ + assert encoder_out.ndim == decoder_out.ndim == 3 + assert encoder_out.size(0) == decoder_out.size(0) + assert encoder_out.size(2) == decoder_out.size(2) + + encoder_out = encoder_out.unsqueeze(2) + # Now encoder_out is (N, T, 1, C) + + decoder_out = decoder_out.unsqueeze(1) + # Now decoder_out is (N, 1, U, C) + + logit = encoder_out + decoder_out + logit = F.relu(logit) + + output = self.output_linear(logit) + + return output diff --git a/egs/librispeech/ASR/transducer_lstm/model.py b/egs/librispeech/ASR/transducer_lstm/model.py new file mode 100644 index 000000000..8a4d3ca69 --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/model.py @@ -0,0 +1,127 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Note we use `rnnt_loss` from torchaudio, which exists only in +torchaudio >= v0.10.0. 
It also means you have to use torch >= v1.10.0 +""" +import k2 +import torch +import torch.nn as nn +import torchaudio +import torchaudio.functional +from encoder_interface import EncoderInterface + +from icefall.utils import add_sos + +assert hasattr(torchaudio.functional, "rnnt_loss"), ( + f"Current torchaudio version: {torchaudio.__version__}\n" + "Please install a version >= 0.10.0" +) + + +class Transducer(nn.Module): + """It implements https://arxiv.org/pdf/1211.3711.pdf + "Sequence Transduction with Recurrent Neural Networks" + """ + + def __init__( + self, + encoder: EncoderInterface, + decoder: nn.Module, + joiner: nn.Module, + ): + """ + Args: + encoder: + It is the transcription network in the paper. Its accepts + two inputs: `x` of (N, T, C) and `x_lens` of shape (N,). + It returns two tensors: `logits` of shape (N, T, C) and + `logit_lens` of shape (N,). + decoder: + It is the prediction network in the paper. Its input shape + is (N, U) and its output shape is (N, U, C). It should contain + two attributes: `blank_id` and `sos_id`. + joiner: + It has two inputs with shapes: (N, T, C) and (N, U, C). Its + output shape is (N, T, U, C). Note that its output contains + unnormalized probs, i.e., not processed by log-softmax. + """ + super().__init__() + assert isinstance(encoder, EncoderInterface) + assert hasattr(decoder, "blank_id") + assert hasattr(decoder, "sos_id") + + self.encoder = encoder + self.decoder = decoder + self.joiner = joiner + + def forward( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + y: k2.RaggedTensor, + ) -> torch.Tensor: + """ + Args: + x: + A 3-D tensor of shape (N, T, C). + x_lens: + A 1-D tensor of shape (N,). It contains the number of frames in `x` + before padding. + y: + A ragged tensor with 2 axes [utt][label]. It contains labels of each + utterance. + Returns: + Return the transducer loss. 
+ """ + assert x.ndim == 3, x.shape + assert x_lens.ndim == 1, x_lens.shape + assert y.num_axes == 2, y.num_axes + + assert x.size(0) == x_lens.size(0) == y.dim0 + + encoder_out, x_lens = self.encoder(x, x_lens) + assert torch.all(x_lens > 0) + + # Now for the decoder, i.e., the prediction network + row_splits = y.shape.row_splits(1) + y_lens = row_splits[1:] - row_splits[:-1] + + blank_id = self.decoder.blank_id + sos_id = self.decoder.sos_id + sos_y = add_sos(y, sos_id=sos_id) + + sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id) + + decoder_out, _ = self.decoder(sos_y_padded) + + logits = self.joiner(encoder_out, decoder_out) + + # rnnt_loss requires 0 padded targets + # Note: y does not start with SOS + y_padded = y.pad(mode="constant", padding_value=0) + + loss = torchaudio.functional.rnnt_loss( + logits=logits, + targets=y_padded, + logit_lengths=x_lens, + target_lengths=y_lens, + blank=blank_id, + reduction="sum", + ) + + return loss diff --git a/egs/librispeech/ASR/transducer_lstm/noam.py b/egs/librispeech/ASR/transducer_lstm/noam.py new file mode 100644 index 000000000..e46bf35fb --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/noam.py @@ -0,0 +1,104 @@ +# Copyright 2021 University of Chinese Academy of Sciences (author: Han Zhu) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + + +class Noam(object): + """ + Implements Noam optimizer. 
+ + Proposed in + "Attention Is All You Need", https://arxiv.org/pdf/1706.03762.pdf + + Modified from + https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/optimizer.py # noqa + + Args: + params: + iterable of parameters to optimize or dicts defining parameter groups + model_size: + attention dimension of the transformer model + factor: + learning rate factor + warm_step: + warmup steps + """ + + def __init__( + self, + params, + model_size: int = 256, + factor: float = 10.0, + warm_step: int = 25000, + weight_decay=0, + ) -> None: + """Construct an Noam object.""" + self.optimizer = torch.optim.Adam( + params, lr=0, betas=(0.9, 0.98), eps=1e-9, weight_decay=weight_decay + ) + self._step = 0 + self.warmup = warm_step + self.factor = factor + self.model_size = model_size + self._rate = 0 + + @property + def param_groups(self): + """Return param_groups.""" + return self.optimizer.param_groups + + def step(self): + """Update parameters and rate.""" + self._step += 1 + rate = self.rate() + for p in self.optimizer.param_groups: + p["lr"] = rate + self._rate = rate + self.optimizer.step() + + def rate(self, step=None): + """Implement `lrate` above.""" + if step is None: + step = self._step + return ( + self.factor + * self.model_size ** (-0.5) + * min(step ** (-0.5), step * self.warmup ** (-1.5)) + ) + + def zero_grad(self): + """Reset gradient.""" + self.optimizer.zero_grad() + + def state_dict(self): + """Return state_dict.""" + return { + "_step": self._step, + "warmup": self.warmup, + "factor": self.factor, + "model_size": self.model_size, + "_rate": self._rate, + "optimizer": self.optimizer.state_dict(), + } + + def load_state_dict(self, state_dict): + """Load state_dict.""" + for key, value in state_dict.items(): + if key == "optimizer": + self.optimizer.load_state_dict(state_dict["optimizer"]) + else: + setattr(self, key, value) diff --git a/egs/librispeech/ASR/transducer_lstm/subsampling.py 
b/egs/librispeech/ASR/transducer_lstm/subsampling.py new file mode 120000 index 000000000..73068da26 --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/subsampling.py @@ -0,0 +1 @@ +../transducer/subsampling.py \ No newline at end of file diff --git a/egs/librispeech/ASR/transducer_lstm/test_encoder.py b/egs/librispeech/ASR/transducer_lstm/test_encoder.py new file mode 100755 index 000000000..cad5f1148 --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/test_encoder.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +To run this file, do: + + cd icefall/egs/librispeech/ASR + python ./transducer_lstm/test_encoder.py +""" + +from encoder import LstmEncoder + + +def test_encoder(): + encoder = LstmEncoder( + num_features=80, + hidden_size=1024, + proj_size=512, + output_dim=512, + subsampling_factor=4, + num_encoder_layers=12, + ) + num_params = sum(p.numel() for p in encoder.parameters() if p.requires_grad) + print(num_params) + # 93979284 + # 66427392 + + +def main(): + test_encoder() + + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/ASR/transducer_lstm/train.py b/egs/librispeech/ASR/transducer_lstm/train.py new file mode 100755 index 000000000..941d24a56 --- /dev/null +++ b/egs/librispeech/ASR/transducer_lstm/train.py @@ -0,0 +1,738 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, +# Wei Kang +# Mingshuang Luo) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Usage: + +export CUDA_VISIBLE_DEVICES="0,1,2" + +./transducer_lstm/train.py \ + --world-size 3 \ + --num-epochs 30 \ + --start-epoch 0 \ + --exp-dir transducer_lstm/exp \ + --full-libri 1 \ + --max-duration 400 \ + --lr-factor 3 +""" + + +import argparse +import logging +from pathlib import Path +from shutil import copyfile +from typing import Optional, Tuple + +import k2 +import sentencepiece as spm +import torch +import torch.multiprocessing as mp +import torch.nn as nn +from asr_datamodule import LibriSpeechAsrDataModule +from decoder import Decoder +from encoder import LstmEncoder +from joiner import Joiner +from lhotse.cut import Cut +from lhotse.utils import fix_random_seed +from model import Transducer +from noam import Noam +from torch import Tensor +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.nn.utils import clip_grad_norm_ +from torch.utils.tensorboard import SummaryWriter + +from icefall.checkpoint import load_checkpoint +from icefall.checkpoint import save_checkpoint as save_checkpoint_impl +from icefall.dist import cleanup_dist, setup_dist +from icefall.env import get_env_info +from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--world-size", + type=int, + default=1, + help="Number of GPUs for DDP training.", + ) + + parser.add_argument( + "--master-port", + type=int, + default=12354, + help="Master port to use for DDP training.", + ) + + parser.add_argument( + "--tensorboard", + type=str2bool, + default=True, + help="Should various information be logged in tensorboard.", + ) + + parser.add_argument( + "--num-epochs", + type=int, + default=78, + help="Number of epochs to train.", + ) + + parser.add_argument( + "--start-epoch", + type=int, + default=0, + help="""Resume training from from this epoch. 
+ If it is positive, it will load checkpoint from + transducer_lstm/exp/epoch-{start_epoch-1}.pt + """, + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="transducer_lstm/exp", + help="""The experiment dir. + It specifies the directory where all training related + files, e.g., checkpoints, log, etc, are saved + """, + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--lr-factor", + type=float, + default=5.0, + help="The lr_factor for Noam optimizer", + ) + + return parser + + +def get_params() -> AttributeDict: + """Return a dict containing training parameters. + + All training related parameters that are not passed from the commandline + are saved in the variable `params`. + + Commandline options are merged into `params` after they are parsed, so + you can also access them via `params`. + + Explanation of options saved in `params`: + + - best_train_loss: Best training loss so far. It is used to select + the model that has the lowest training loss. It is + updated during the training. + + - best_valid_loss: Best validation loss so far. It is used to select + the model that has the lowest validation loss. It is + updated during the training. + + - best_train_epoch: It is the epoch that has the best training loss. + + - best_valid_epoch: It is the epoch that has the best validation loss. + + - batch_idx_train: Used to writing statistics to tensorboard. It + contains number of batches trained so far across + epochs. + + - log_interval: Print training loss if batch_idx % log_interval` is 0 + + - reset_interval: Reset statistics if batch_idx % reset_interval is 0 + + - valid_interval: Run validation if batch_idx % valid_interval is 0 + + - feature_dim: The model input dim. It has to match the one used + in computing features. + + - subsampling_factor: The subsampling factor for the model. 
+ + - encoder_hidden_size: Passed to LstmEncoder as `hidden_size`; also used + as `model_size` of the Noam optimizer. + + - encoder_out_dim: `output_dim` of encoder/decoder, `input_dim` of joiner. + + - num_decoder_layers: Passed to Decoder as `num_layers`. + + - weight_decay: The weight_decay for the optimizer. + + - warm_step: The warm_step for Noam optimizer. + """ + params = AttributeDict( + { + "best_train_loss": float("inf"), + "best_valid_loss": float("inf"), + "best_train_epoch": -1, + "best_valid_epoch": -1, + "batch_idx_train": 0, + "log_interval": 50, + "reset_interval": 200, + "valid_interval": 3000, # For the 100h subset, use 800 + # parameters for the LSTM encoder + "feature_dim": 80, + "encoder_out_dim": 512, + "subsampling_factor": 4, + "encoder_hidden_size": 1024, + "num_encoder_layers": 4, + "proj_size": 512, # NOTE(review): not passed by get_encoder_model() + "vgg_frontend": False, + # decoder params + "decoder_embedding_dim": 1024, + "num_decoder_layers": 4, + "decoder_hidden_dim": 512, + # parameters for Noam + "weight_decay": 1e-6, + "warm_step": 80000, # For the 100h subset, use 8k + "env_info": get_env_info(), + } + ) + + return params + + +def get_encoder_model(params: AttributeDict): + encoder = LstmEncoder( + num_features=params.feature_dim, + hidden_size=params.encoder_hidden_size, + output_dim=params.encoder_out_dim, + subsampling_factor=params.subsampling_factor, + num_encoder_layers=params.num_encoder_layers, + vgg_frontend=params.vgg_frontend, + ) + return encoder + + +def get_decoder_model(params: AttributeDict): + decoder = Decoder( + vocab_size=params.vocab_size, + embedding_dim=params.decoder_embedding_dim, + blank_id=params.blank_id, + sos_id=params.sos_id, + num_layers=params.num_decoder_layers, + hidden_dim=params.decoder_hidden_dim, + output_dim=params.encoder_out_dim, + ) + return decoder + + +def get_joiner_model(params: AttributeDict): + joiner = Joiner( + input_dim=params.encoder_out_dim, + output_dim=params.vocab_size, + ) + return joiner + + +def get_transducer_model(params: AttributeDict): + encoder = 
get_encoder_model(params) + decoder = get_decoder_model(params) + joiner = get_joiner_model(params) + + model = Transducer( + encoder=encoder, + decoder=decoder, + joiner=joiner, + ) + return model + + +def load_checkpoint_if_available( + params: AttributeDict, + model: nn.Module, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, +) -> Optional[dict]: + """Load checkpoint from file. + + If params.start_epoch is positive, it will load the checkpoint from + `params.start_epoch - 1`. Otherwise, this function does nothing. + + Apart from loading state dict for `model`, `optimizer` and `scheduler`, + it also updates `best_train_epoch`, `best_train_loss`, `best_valid_epoch`, + `best_valid_loss` and `batch_idx_train` in `params`. + + Args: + params: + The return value of :func:`get_params`. + model: + The training model. + optimizer: + The optimizer that we are using. + scheduler: + The learning rate scheduler we are using. + Returns: + Return the loaded checkpoint dict, or None if nothing is loaded. + """ + if params.start_epoch <= 0: + return None + + filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt" + saved_params = load_checkpoint( + filename, + model=model, + optimizer=optimizer, + scheduler=scheduler, + ) + + keys = [ + "best_train_epoch", + "best_valid_epoch", + "batch_idx_train", + "best_train_loss", + "best_valid_loss", + ] + for k in keys: + params[k] = saved_params[k] + + return saved_params + + +def save_checkpoint( + params: AttributeDict, + model: nn.Module, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, + rank: int = 0, +) -> None: + """Save model, optimizer, scheduler and training stats to file. + + Args: + params: + It is returned by :func:`get_params`. + model: + The training model.
+ """ + if rank != 0: + return + filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt" + save_checkpoint_impl( + filename=filename, + model=model, + params=params, + optimizer=optimizer, + scheduler=scheduler, + rank=rank, + ) + + if params.best_train_epoch == params.cur_epoch: + best_train_filename = params.exp_dir / "best-train-loss.pt" + copyfile(src=filename, dst=best_train_filename) + + if params.best_valid_epoch == params.cur_epoch: + best_valid_filename = params.exp_dir / "best-valid-loss.pt" + copyfile(src=filename, dst=best_valid_filename) + + +def compute_loss( + params: AttributeDict, + model: nn.Module, + sp: spm.SentencePieceProcessor, + batch: dict, + is_training: bool, +) -> Tuple[Tensor, MetricsTracker]: + """ + Compute the transducer loss given the model and its inputs. + + Args: + params: + Parameters for training. See :func:`get_params`. + model: + The model for training. It is a Transducer (maybe wrapped in DDP). + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + is_training: + True for training. False for validation. When it is True, this + function enables autograd during computation; when it is False, it + disables autograd. + """ + device = model.device + feature = batch["inputs"] + # at entry, feature is (N, T, C) + assert feature.ndim == 3 + feature = feature.to(device) + + supervisions = batch["supervisions"] + feature_lens = supervisions["num_frames"].to(device) + + texts = batch["supervisions"]["text"] + y = sp.encode(texts, out_type=int) + y = k2.RaggedTensor(y).to(device) + + with torch.set_grad_enabled(is_training): + loss = model(x=feature, x_lens=feature_lens, y=y) + + assert loss.requires_grad == is_training + + info = MetricsTracker() + info["frames"] = (feature_lens // params.subsampling_factor).sum().item() + + # Note: We use reduction=sum while computing the loss.
+ info["loss"] = loss.detach().cpu().item() + + return loss, info + + +def compute_validation_loss( + params: AttributeDict, + model: nn.Module, + sp: spm.SentencePieceProcessor, + valid_dl: torch.utils.data.DataLoader, + world_size: int = 1, +) -> MetricsTracker: + """Run the validation process.""" + model.eval() + + tot_loss = MetricsTracker() + + for batch_idx, batch in enumerate(valid_dl): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=False, + ) + assert loss.requires_grad is False + tot_loss = tot_loss + loss_info + + if world_size > 1: + tot_loss.reduce(loss.device) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + if loss_value < params.best_valid_loss: + params.best_valid_epoch = params.cur_epoch + params.best_valid_loss = loss_value + + return tot_loss + + +def train_one_epoch( + params: AttributeDict, + model: nn.Module, + optimizer: torch.optim.Optimizer, + sp: spm.SentencePieceProcessor, + train_dl: torch.utils.data.DataLoader, + valid_dl: torch.utils.data.DataLoader, + tb_writer: Optional[SummaryWriter] = None, + world_size: int = 1, +) -> None: + """Train the model for one epoch. + + The training loss from the mean of all frames is saved in + `params.train_loss`. It runs the validation process every + `params.valid_interval` batches. + + Args: + params: + It is returned by :func:`get_params`. + model: + The model for training. + optimizer: + The optimizer we are using. + train_dl: + Dataloader for the training dataset. + valid_dl: + Dataloader for the validation dataset. + tb_writer: + Writer to write log messages to tensorboard. + world_size: + Number of nodes in DDP training. If it is 1, DDP is disabled. 
+ """ + model.train() + + tot_loss = MetricsTracker() + + for batch_idx, batch in enumerate(train_dl): + params.batch_idx_train += 1 + batch_size = len(batch["supervisions"]["text"]) + + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=True, + ) + # summary stats + tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info + + # NOTE: We use reduction==sum and loss is computed over utterances + # in the batch and there is no normalization to it so far. + + optimizer.zero_grad() + loss.backward() + clip_grad_norm_(model.parameters(), 5.0, 2.0) + optimizer.step() + + if batch_idx % params.log_interval == 0: + logging.info( + f"Epoch {params.cur_epoch}, " + f"batch {batch_idx}, loss[{loss_info}], " + f"tot_loss[{tot_loss}], batch size: {batch_size}" + ) + + if batch_idx % params.log_interval == 0: + + if tb_writer is not None: + loss_info.write_summary( + tb_writer, "train/current_", params.batch_idx_train + ) + tot_loss.write_summary( + tb_writer, "train/tot_", params.batch_idx_train + ) + + if batch_idx > 0 and batch_idx % params.valid_interval == 0: + logging.info("Computing validation loss") + valid_info = compute_validation_loss( + params=params, + model=model, + sp=sp, + valid_dl=valid_dl, + world_size=world_size, + ) + model.train() + logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}") + if tb_writer is not None: + valid_info.write_summary( + tb_writer, "train/valid_", params.batch_idx_train + ) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + params.train_loss = loss_value + if params.train_loss < params.best_train_loss: + params.best_train_epoch = params.cur_epoch + params.best_train_loss = params.train_loss + + +def run(rank, world_size, args): + """ + Args: + rank: + It is a value between 0 and `world_size-1`, which is + passed automatically by `mp.spawn()` in :func:`main`. + The node with rank 0 is responsible for saving checkpoint. 
+ world_size: + Number of GPUs for DDP training. + args: + The return value of get_parser().parse_args() + """ + params = get_params() + params.update(vars(args)) + if params.full_libri is False: + params.valid_interval = 800 + params.warm_step = 8000 + + fix_random_seed(42) + if world_size > 1: + setup_dist(rank, world_size, params.master_port) + + setup_logger(f"{params.exp_dir}/log/log-train") + logging.info("Training started") + + if args.tensorboard and rank == 0: + tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard") + else: + tb_writer = None + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", rank) + logging.info(f"Device: {device}") + + sp = spm.SentencePieceProcessor() + sp.load(params.bpe_model) + + # <blk> and <sos/eos> are defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("<blk>") + params.sos_id = sp.piece_to_id("<sos/eos>") + params.vocab_size = sp.get_piece_size() + + logging.info(params) + + logging.info("About to create model") + model = get_transducer_model(params) + + checkpoints = load_checkpoint_if_available(params=params, model=model) + + num_param = sum([p.numel() for p in model.parameters() if p.requires_grad]) + logging.info(f"Number of model parameters: {num_param}") + + model.to(device) + if world_size > 1: + logging.info("Using DDP") + model = DDP(model, device_ids=[rank]) + model.device = device + + optimizer = Noam( + model.parameters(), + model_size=params.encoder_hidden_size, + factor=params.lr_factor, + warm_step=params.warm_step, + weight_decay=params.weight_decay, + ) + + if checkpoints and "optimizer" in checkpoints: + logging.info("Loading optimizer state dict") + optimizer.load_state_dict(checkpoints["optimizer"]) + + librispeech = LibriSpeechAsrDataModule(args) + + train_cuts = librispeech.train_clean_100_cuts() + if params.full_libri: + train_cuts += librispeech.train_clean_360_cuts() + train_cuts += librispeech.train_other_500_cuts() + + def remove_short_and_long_utt(c: 
Cut): + # Keep only utterances with duration between 1 second and 20 seconds + return 1.0 <= c.duration <= 20.0 + + num_in_total = len(train_cuts) + + train_cuts = train_cuts.filter(remove_short_and_long_utt) + + num_left = len(train_cuts) + num_removed = num_in_total - num_left + removed_percent = num_removed / num_in_total * 100 + + logging.info(f"Before removing short and long utterances: {num_in_total}") + logging.info(f"After removing short and long utterances: {num_left}") + logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)") + + train_dl = librispeech.train_dataloaders(train_cuts) + + valid_cuts = librispeech.dev_clean_cuts() + valid_cuts += librispeech.dev_other_cuts() + valid_dl = librispeech.valid_dataloaders(valid_cuts) + + scan_pessimistic_batches_for_oom( + model=model, + train_dl=train_dl, + optimizer=optimizer, + sp=sp, + params=params, + ) + + for epoch in range(params.start_epoch, params.num_epochs): + train_dl.sampler.set_epoch(epoch) + + cur_lr = optimizer._rate + if tb_writer is not None: + tb_writer.add_scalar( + "train/learning_rate", cur_lr, params.batch_idx_train + ) + tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train) + + if rank == 0: + logging.info("epoch {}, learning rate {}".format(epoch, cur_lr)) + + params.cur_epoch = epoch + + train_one_epoch( + params=params, + model=model, + optimizer=optimizer, + sp=sp, + train_dl=train_dl, + valid_dl=valid_dl, + tb_writer=tb_writer, + world_size=world_size, + ) + + save_checkpoint( + params=params, + model=model, + optimizer=optimizer, + rank=rank, + ) + + logging.info("Done!") + + if world_size > 1: + torch.distributed.barrier() + cleanup_dist() + + +def scan_pessimistic_batches_for_oom( + model: nn.Module, + train_dl: torch.utils.data.DataLoader, + optimizer: torch.optim.Optimizer, + sp: spm.SentencePieceProcessor, + params: AttributeDict, +): + from lhotse.dataset import find_pessimistic_batches + + logging.info( + "Sanity check -- see if any of the 
batches in epoch 0 would cause OOM." + ) + batches, crit_values = find_pessimistic_batches(train_dl.sampler) + for criterion, cuts in batches.items(): + batch = train_dl.dataset[cuts] + try: + optimizer.zero_grad() + loss, _ = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=True, + ) + loss.backward() + clip_grad_norm_(model.parameters(), 5.0, 2.0) + optimizer.step() + except RuntimeError as e: + if "CUDA out of memory" in str(e): + logging.error( + "Your GPU ran out of memory with the current " + "max_duration setting. We recommend decreasing " + "max_duration and trying again.\n" + f"Failing criterion: {criterion} " + f"(={crit_values[criterion]}) ..." + ) + raise + + +def main(): + parser = get_parser() + LibriSpeechAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + world_size = args.world_size + assert world_size >= 1 + if world_size > 1: + mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True) + else: + run(rank=0, world_size=1, args=args) + + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) + +if __name__ == "__main__": + main()