Use pre-computed alignments in LF-MMI training.

2025-08-26 18:24:18 +00:00 · 2021-09-28 15:37:47 +08:00 · 2021-09-28 15:37:47 +08:00 · 94daaee6ba
commit 94daaee6ba
parent 9e6bd0f07c
4 changed files with 446 additions and 2 deletions
--- a/egs/librispeech/ASR/conformer_mmi/asr_datamodule.py
+++ b/egs/librispeech/ASR/conformer_mmi/asr_datamodule.py
@ -162,7 +162,9 @@ class LibriSpeechAsrDataModule(DataModule):
        cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz")
        logging.info("About to create train dataset")
-        transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
+        transforms = [
            CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
        ]
        if self.args.concatenate_cuts:
            logging.info(
                f"Using cut concatenation with duration factor "
--- a/egs/librispeech/ASR/conformer_mmi/train.py
+++ b/egs/librispeech/ASR/conformer_mmi/train.py
@ -21,7 +21,7 @@ import argparse
 import logging
 from pathlib import Path
 from shutil import copyfile
-from typing import Optional
+from typing import Dict, Optional
 import k2
 import torch
@ -36,6 +36,11 @@ from torch.nn.utils import clip_grad_norm_
 from torch.utils.tensorboard import SummaryWriter
 from transformer import Noam
 from icefall.ali import (
    convert_alignments_to_tensor,
    load_alignments,
    lookup_alignments,
 )
 from icefall.checkpoint import load_checkpoint
 from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
 from icefall.dist import cleanup_dist, setup_dist
@ -93,6 +98,17 @@ def get_parser():
        """,
    )
    parser.add_argument(
        "--ali-dir",
        type=str,
        default="data/ali_500",
        help="""This folder is expected to contain
        two files, train-960.pt and valid.pt, which
        contain framewise alignment information for
        the training set and validation set.
        """,
    )
    return parser
@ -284,6 +300,7 @@ def compute_loss(
    batch: dict,
    graph_compiler: MmiTrainingGraphCompiler,
    is_training: bool,
    ali: Optional[Dict[str, torch.Tensor]],
 ):
    """
    Compute LF-MMI loss given the model and its inputs.
@ -304,6 +321,8 @@ def compute_loss(
        True for training. False for validation. When it is True, this
        function enables autograd during computation; when it is False, it
        disables autograd.
      ali:
        Precomputed alignments.
    """
    device = graph_compiler.device
    feature = batch["inputs"]
@ -323,6 +342,30 @@ def compute_loss(
            supervisions, subsampling_factor=params.subsampling_factor
        )
        if ali is not None and params.batch_idx_train < 4000:
            cut_ids = [cut.id for cut in supervisions["cut"]]
            # As encode_supervisions reorders cuts, we need
            # also to reorder cut IDs here
            new2old = supervision_segments[:, 0].tolist()
            cut_ids = [cut_ids[i] for i in new2old]
            # Check that new2old is just a permutation,
            # i.e., each cut contains only one utterance
            new2old.sort()
            assert new2old == torch.arange(len(new2old)).tolist()
            mask = lookup_alignments(
                cut_ids=cut_ids,
                alignments=ali,
                num_classes=nnet_output.shape[2],
            ).to(nnet_output)
            min_len = min(nnet_output.shape[1], mask.shape[1])
            ali_scale = 500.0 / (params.batch_idx_train + 500)
            nnet_output = nnet_output.clone()
            nnet_output[:, :min_len, :] += ali_scale * mask[:, :min_len, :]
        loss_fn = LFMMILoss(
            graph_compiler=graph_compiler,
            use_pruned_intersect=params.use_pruned_intersect,
@ -377,6 +420,7 @@ def compute_validation_loss(
    graph_compiler: MmiTrainingGraphCompiler,
    valid_dl: torch.utils.data.DataLoader,
    world_size: int = 1,
    ali: Optional[Dict[str, torch.Tensor]] = None,
 ) -> None:
    """Run the validation process. The validation loss
    is saved in `params.valid_loss`.
@ -394,6 +438,7 @@ def compute_validation_loss(
            batch=batch,
            graph_compiler=graph_compiler,
            is_training=False,
            ali=ali,
        )
        assert loss.requires_grad is False
        assert mmi_loss.requires_grad is False
@ -435,6 +480,8 @@ def train_one_epoch(
    graph_compiler: MmiTrainingGraphCompiler,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    train_ali: Optional[Dict[str, torch.Tensor]],
    valid_ali: Optional[Dict[str, torch.Tensor]],
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
 ) -> None:
@ -457,6 +504,10 @@ def train_one_epoch(
        Dataloader for the training dataset.
      valid_dl:
        Dataloader for the validation dataset.
      train_ali:
        Precomputed alignments for the training set.
      valid_ali:
        Precomputed alignments for the validation set.
      tb_writer:
        Writer to write log messages to tensorboard.
      world_size:
@ -481,6 +532,7 @@ def train_one_epoch(
            batch=batch,
            graph_compiler=graph_compiler,
            is_training=True,
            ali=train_ali,
        )
        # NOTE: We use reduction==sum and loss is computed over utterances
@ -565,6 +617,7 @@ def train_one_epoch(
                graph_compiler=graph_compiler,
                valid_dl=valid_dl,
                world_size=world_size,
                ali=valid_ali,
            )
            model.train()
            logging.info(
@ -673,12 +726,34 @@ def run(rank, world_size, args):
    if checkpoints:
        optimizer.load_state_dict(checkpoints["optimizer"])
    train_960_ali_filename = Path(params.ali_dir) / "train-960.pt"
    if params.batch_idx_train < 4000 and train_960_ali_filename.is_file():
        logging.info("Use pre-computed alignments")
        subsampling_factor, train_ali = load_alignments(train_960_ali_filename)
        assert subsampling_factor == params.subsampling_factor
        assert len(train_ali) == 843723, f"{len(train_ali)} vs 843723"
        valid_ali_filename = Path(params.ali_dir) / "valid.pt"
        subsampling_factor, valid_ali = load_alignments(valid_ali_filename)
        assert subsampling_factor == params.subsampling_factor
        train_ali = convert_alignments_to_tensor(train_ali, device=device)
        valid_ali = convert_alignments_to_tensor(valid_ali, device=device)
    else:
        logging.info("Not using alignments")
        train_ali = None
        valid_ali = None
    librispeech = LibriSpeechAsrDataModule(args)
    train_dl = librispeech.train_dataloaders()
    valid_dl = librispeech.valid_dataloaders()
    for epoch in range(params.start_epoch, params.num_epochs):
        train_dl.sampler.set_epoch(epoch)
        if params.batch_idx_train > 4000 and train_ali is not None:
            # Delete the alignments to save memory
            train_ali = None
            valid_ali = None
        cur_lr = optimizer._rate
        if tb_writer is not None:
@ -699,6 +774,8 @@ def run(rank, world_size, args):
            graph_compiler=graph_compiler,
            train_dl=train_dl,
            valid_dl=valid_dl,
            train_ali=train_ali,
            valid_ali=valid_ali,
            tb_writer=tb_writer,
            world_size=world_size,
        )
--- a/icefall/ali.py
+++ b/icefall/ali.py
@ -0,0 +1,142 @@
 # Copyright      2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Dict, List, Tuple
 import torch
 from torch.nn.utils.rnn import pad_sequence
 def save_alignments(
    alignments: Dict[str, List[int]],
    subsampling_factor: int,
    filename: str,
 ) -> None:
    """Serialize framewise alignments together with their subsampling factor.
    The two values are bundled into a dict with the keys
    ``"subsampling_factor"`` and ``"alignments"`` and written with
    :func:`torch.save`. Use :func:`load_alignments` to read the file back.
    Args:
      alignments:
        Maps each utterance to its framewise alignment (after subsampling).
      subsampling_factor:
        The subsampling factor of the model that produced the alignments.
      filename:
        Destination path of the saved file.
    Returns:
      Return None.
    """
    torch.save(
        {
            "subsampling_factor": subsampling_factor,
            "alignments": alignments,
        },
        filename,
    )
 def load_alignments(filename: str) -> Tuple[int, Dict[str, List[int]]]:
    """Read back alignments written by :func:`save_alignments`.
    Args:
      filename:
        Path to a file produced by :func:`save_alignments`, i.e., a
        saved dict with the keys ``"subsampling_factor"`` and
        ``"alignments"``.
    Returns:
      Return a tuple ``(subsampling_factor, alignments)``:
        - subsampling_factor: The subsampling factor that was stored
          along with the alignments.
        - alignments: A dict mapping utterances to their framewise
          alignments, after subsampling.
    """
    saved = torch.load(filename)
    return saved["subsampling_factor"], saved["alignments"]
 def convert_alignments_to_tensor(
    alignments: Dict[str, List[int]], device: torch.device
 ) -> Dict[str, torch.Tensor]:
    """Turn every alignment (a list of ints) into a 1-D tensor on `device`.
    Args:
      alignments:
        A dict whose keys are utterance IDs and whose values are the
        corresponding frame-wise alignments.
      device:
        The device on which the resulting tensors are created.
    Returns:
      Return a new dict with the same keys, in the same order, whose
      values are 1-D tensors of dtype `torch.int64`. `torch.int64` is
      used because `torch.nn.functional.one_hot` requires it.
    """
    return {
        utt_id: torch.tensor(frames, dtype=torch.int64, device=device)
        for utt_id, frames in alignments.items()
    }
 def lookup_alignments(
    cut_ids: List[str],
    alignments: Dict[str, torch.Tensor],
    num_classes: int,
    log_score: float = -10,
 ) -> torch.Tensor:
    """Build a penalty mask of shape (N, T, C) from the alignments of `cut_ids`.
    The alignments of the requested cuts are padded with 0 to a common
    length T and one-hot encoded over `num_classes` classes. In every
    frame (row) of the result, the class given by the alignment holds 0
    and all other classes hold `log_score`.
    Example: with the alignments
        [ [1, 3, 2], [1, 0, 4, 2] ]
    num_classes == 5 and log_score == -10, the returned mask is
        [
          [[-10, 0, -10, -10, -10],
           [-10, -10, -10, 0, -10],
           [-10, -10, 0, -10, -10],
           [0, -10, -10, -10, -10]],
          [[-10, 0, -10, -10, -10],
           [0, -10, -10, -10, -10],
           [-10, -10, -10, -10, 0],
           [-10, -10, 0, -10, -10]]
        ]
    Note: The first (shorter) alignment is padded with 0, which is why
    its last row selects class 0.
    Args:
      cut_ids:
        A list of utterance IDs. Row i of the mask belongs to cut_ids[i].
      alignments:
        A dict whose keys are utterance IDs and whose values are framewise
        alignments stored as 1-D tensors.
      num_classes:
        The max token ID + 1 that appears in the alignments.
      log_score:
        The value written to every position that does not correspond
        to the alignment.
    Returns:
      Return a 3-D torch.float32 tensor of shape (N, T, C).
    Raises:
      KeyError: If some ID in `cut_ids` is not a key of `alignments`.
    """
    # Every requested cut is assumed to have an alignment; a missing
    # ID surfaces as a KeyError from the dict lookup.
    selected = [alignments[cut_id] for cut_id in cut_ids]
    padded = pad_sequence(selected, batch_first=True, padding_value=0)
    one_hot = torch.nn.functional.one_hot(padded, num_classes=num_classes)
    # one_hot is 1 at the aligned class, so (1 - one_hot) marks all
    # the other classes, which then receive log_score.
    return (1 - one_hot) * float(log_score)
--- a/test/test_ali.py
+++ b/test/test_ali.py
@ -0,0 +1,223 @@
 #!/usr/bin/env python3
 # Copyright      2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Run this file using one of the following two ways:
 #  (1) python3 ./test/test_ali.py
 #  (2) pytest ./test/test_ali.py
 # The purpose of this file is to show that if we build a mask
 # from alignments and add it to a randomly generated nnet_output,
 # we can decode the correct transcript.
 from pathlib import Path
 import k2
 import torch
 from lhotse import load_manifest
 from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader
 from icefall.ali import (
    convert_alignments_to_tensor,
    load_alignments,
    lookup_alignments,
 )
 from icefall.decode import get_lattice, one_best_decoding
 from icefall.lexicon import Lexicon
 from icefall.utils import get_texts
 # Two directory levels above this file; with this file at test/test_ali.py
 # that is the repository root.
 ICEFALL_DIR = Path(__file__).resolve().parent.parent
 # Root of the LibriSpeech ASR recipe; the paths below are relative to it.
 egs_dir = ICEFALL_DIR / "egs/librispeech/ASR"
 # Language directory; test() reads the lexicon and HLG.pt from here.
 lang_dir = egs_dir / "data/lang_bpe_500"
 # Cut manifest that get_dataloader() iterates. The commented-out lines are
 # alternative manifests that can be swapped in manually.
 #  cut_json = egs_dir / "data/fbank/cuts_train-clean-100.json.gz"
 cut_json = egs_dir / "data/fbank/cuts_train-clean-360.json.gz"
 #  cut_json = egs_dir / "data/fbank/cuts_train-other-500.json.gz"
 # Pre-computed alignments looked up by cut ID in test().
 ali_filename = ICEFALL_DIR / "egs/librispeech/ASR/data/ali_500/train-960.pt"
 # Alternative manifest/alignment pair (test-clean), kept for manual use.
 #  cut_json = egs_dir / "data/fbank/cuts_test-clean.json.gz"
 #  ali_filename = ICEFALL_DIR / "egs/librispeech/ASR/data/ali_500/test_clean.pt"
 def data_exists():
    """Return True only if the alignment file, the cut manifest and the
    lang directory all exist on disk."""
    required_paths = (ali_filename, cut_json, lang_dir)
    return all(path.exists() for path in required_paths)
 def get_dataloader():
    """Create a DataLoader over the cuts listed in `cut_json`.
    Feature paths in the manifest are prefixed with `egs_dir`. Cuts are
    drawn unshuffled by a SingleCutSampler with max_duration=200, and the
    dataset is created with return_cuts=True.
    """
    cuts = load_manifest(cut_json).with_features_path_prefix(egs_dir)
    sampler = SingleCutSampler(cuts, max_duration=200, shuffle=False)
    dataset = K2SpeechRecognitionDataset(return_cuts=True)
    # batch_size=None disables the DataLoader's automatic batching; the
    # sampler is passed as-is.
    return DataLoader(
        dataset,
        sampler=sampler,
        batch_size=None,
        num_workers=1,
        persistent_workers=False,
    )
 def test_one_hot():
    """Check the pad -> one-hot -> mask recipe on a tiny hand-made example.
    Two alignments of lengths 3 and 4 are padded with 0, one-hot encoded
    with 5 classes and converted to a mask that is 0 at the aligned
    class and -10 everywhere else.
    """
    alignments = [torch.tensor([1, 3, 2]), torch.tensor([1, 0, 4, 2])]
    padded = pad_sequence(alignments, batch_first=True, padding_value=0)
    one_hot = torch.nn.functional.one_hot(padded, num_classes=5)
    mask = (1 - one_hot) * -10.0
    # The last row of the first utterance comes from the padding value 0.
    expected_rows = [
        [
            [-10, 0, -10, -10, -10],
            [-10, -10, -10, 0, -10],
            [-10, -10, 0, -10, -10],
            [0, -10, -10, -10, -10],
        ],
        [
            [-10, 0, -10, -10, -10],
            [0, -10, -10, -10, -10],
            [-10, -10, -10, -10, 0],
            [-10, -10, 0, -10, -10],
        ],
    ]
    expected = torch.tensor(expected_rows).to(mask.dtype)
    assert torch.all(torch.eq(mask, expected))
 def test():
    """
    The purpose of this test is to show that we can use pre-computed
    alignments to construct a mask, adding it to a randomly generated
    nnet_output, to decode the correct transcript from the resulting
    nnet_output.

    The test returns immediately when the required data files are missing.
    Only the first batch of the dataloader is processed. The decoded
    hypotheses and the reference texts are printed, not asserted.
    """
    # Nothing to do if the alignments, manifest or lang dir are missing.
    if not data_exists():
        return
    # Use the first GPU when CUDA is available; otherwise stay on the CPU.
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    dl = get_dataloader()
    subsampling_factor, ali = load_alignments(ali_filename)
    ali = convert_alignments_to_tensor(ali, device=device)
    lexicon = Lexicon(lang_dir)
    max_token_id = max(lexicon.tokens)
    num_classes = max_token_id + 1  # +1 for the blank
    word_table = lexicon.word_table
    HLG = k2.Fsa.from_dict(
        torch.load(f"{lang_dir}/HLG.pt", map_location=device)
    )
    for batch in dl:
        features = batch["inputs"]
        supervisions = batch["supervisions"]
        # N and T are taken from the first two dims of the features; T is
        # divided by the subsampling factor stored with the alignments.
        N = features.shape[0]
        T = features.shape[1] // subsampling_factor
        # Random values turned into log-probabilities over the classes.
        nnet_output = (
            torch.rand(N, T, num_classes, dtype=torch.float32, device=device)
            .softmax(dim=-1)
            .log()
        )
        cut_ids = [cut.id for cut in supervisions["cut"]]
        mask = lookup_alignments(
            cut_ids=cut_ids, alignments=ali, num_classes=num_classes
        )
        # The padded alignment length and T may differ, so only the
        # common prefix of frames receives the mask.
        min_len = min(nnet_output.shape[1], mask.shape[1])
        ali_model_scale = 0.8
        nnet_output[:, :min_len, :] += ali_model_scale * mask[:, :min_len, :]
        # NOTE(review): `supervisions` was already read from the batch
        # above; this re-read looks redundant.
        supervisions = batch["supervisions"]
        # One row per supervision: (sequence_idx, start_frame, num_frames),
        # with the frame fields divided by the subsampling factor.
        # Presumably each field is a 1-D tensor -- TODO confirm.
        supervision_segments = torch.stack(
            (
                supervisions["sequence_idx"],
                supervisions["start_frame"] // subsampling_factor,
                supervisions["num_frames"] // subsampling_factor,
            ),
            1,
        ).to(torch.int32)
        lattice = get_lattice(
            nnet_output=nnet_output,
            HLG=HLG,
            supervision_segments=supervision_segments,
            search_beam=20,
            output_beam=8,
            min_active_states=30,
            max_active_states=10000,
            subsampling_factor=subsampling_factor,
        )
        best_path = one_best_decoding(lattice=lattice, use_double_scores=True)
        # Map the decoded IDs to words through the word table and join
        # them into one string per hypothesis.
        hyps = get_texts(best_path)
        hyps = [[word_table[i] for i in ids] for ids in hyps]
        hyps = [" ".join(s) for s in hyps]
        # Printed for manual comparison of hypotheses and references.
        print(hyps)
        print(supervisions["text"])
        # Only the first batch is processed.
        break
 def show_cut_ids():
    """Dump alignment keys and training cut IDs to text files.
    The purpose of this function is to check that for each utterance in
    the training set, there is a corresponding alignment. It writes the
    keys of the alignments to a1.txt and the cut IDs of the three
    training manifests to b1.txt, one entry per line.

    After generating a1.txt and b1.txt, you can use

      wc -l a1.txt b1.txt

    which should show the same number of lines.

      cat a1.txt | sort | uniq > a11.txt
      cat b1.txt | sort | uniq > b11.txt

      md5sum a11.txt b11.txt
        which should show the identical hash

      diff a11.txt b11.txt
        should print nothing
    """
    _, ali = load_alignments(ali_filename)
    with open("a1.txt", "w") as ali_file:
        ali_file.writelines(f"{key}\n" for key in ali)
    cuts_train = (
        load_manifest(egs_dir / "data/fbank/cuts_train-clean-100.json.gz")
        + load_manifest(egs_dir / "data/fbank/cuts_train-clean-360.json.gz")
        + load_manifest(egs_dir / "data/fbank/cuts_train-other-500.json.gz")
    )
    # Collect all IDs before opening b1.txt, so the file is only created
    # once the manifests have been read successfully.
    cut_ids = [cut.id for cut in cuts_train]
    with open("b1.txt", "w") as cuts_file:
        cuts_file.writelines(f"{line}\n" for line in cut_ids)
 # When executed as a script, only test() runs; test_one_hot() and
 # show_cut_ids() are not called from here.
 if __name__ == "__main__":
    test()