mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 10:02:22 +00:00
Copy the files related to multi-round n-best rescoring from k2 & snowfall
This commit is contained in:
parent cf8d76293d
commit cabe8b625b
264 icefall/nbest.py Normal file
@@ -0,0 +1,264 @@
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
#
# This file implements the ideas proposed by Daniel Povey.
#
# See https://github.com/k2-fsa/snowfall/issues/232 for more details
#

import logging
from typing import List

import torch
import _k2
import k2

# Note: We use `utterance` and `sequence` interchangeably in the comments.


class Nbest(object):
    '''
    An Nbest object contains two fields:

        (1) fsa, whose type is k2.Fsa
        (2) shape, whose type is k2.RaggedShape (an alias of _k2.RaggedShape)

    The field `fsa` is an FsaVec containing a vector of **linear** FSAs.

    The field `shape` has two axes [utt][path]. `shape.dim0()` is the
    number of utterances, which is also the number of rows in the
    supervision_segments. `shape.tot_size(1)` is the number of paths,
    which is also the number of FSAs in `fsa`.
    '''

    def __init__(self, fsa: k2.Fsa, shape: _k2.RaggedShape) -> None:
        assert len(fsa.shape) == 3, f'fsa.shape: {fsa.shape}'
        assert shape.num_axes() == 2, f'num_axes: {shape.num_axes()}'

        assert fsa.shape[0] == shape.tot_size(1), \
            f'{fsa.shape[0]} vs {shape.tot_size(1)}'

        self.fsa = fsa
        self.shape = shape

    def __str__(self):
        s = 'Nbest('
        s += f'num_seqs:{self.shape.dim0()}, '
        s += f'num_fsas:{self.fsa.shape[0]})'
        return s
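
    # Illustrative note (not part of the original commit): with
    # shape = k2.RaggedShape('[[x x] [x]]') there are two utterances,
    # the first with two paths and the second with one, so `fsa` must be
    # an FsaVec holding exactly 3 linear FSAs; here shape.dim0() == 2 and
    # shape.tot_size(1) == 3. The constructor test in test/test_nbest.py
    # below builds exactly this configuration.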

    def intersect(self, lats: k2.Fsa) -> 'Nbest':
        '''Intersect this Nbest object with a lattice and get the 1-best
        path from the resulting FsaVec.

        Caution:
          We assume FSAs in `self.fsa` don't have epsilon self-loops.
          We also assume `self.fsa.labels` and `lats.labels` are token IDs.

        Args:
          lats:
            An FsaVec. It can be the return value of
            :func:`whole_lattice_rescoring`.
        Returns:
          Return a new Nbest. This new Nbest shares the same shape with
          `self`, while its `fsa` is the 1-best path from intersecting
          `self.fsa` and `lats`.
        '''
        assert self.fsa.device == lats.device, \
            f'{self.fsa.device} vs {lats.device}'
        assert len(lats.shape) == 3, f'{lats.shape}'
        assert lats.arcs.dim0() == self.shape.dim0(), \
            f'{lats.arcs.dim0()} vs {self.shape.dim0()}'

        lats = k2.arc_sort(lats)  # no-op if lats is already arc sorted

        fsas_with_epsilon_loops = k2.add_epsilon_self_loops(self.fsa)

        path_to_seq_map = self.shape.row_ids(1)

        ans_lats = k2.intersect_device(a_fsas=lats,
                                       b_fsas=fsas_with_epsilon_loops,
                                       b_to_a_map=path_to_seq_map,
                                       sorted_match_a=True)

        one_best = k2.shortest_path(ans_lats, use_double_scores=True)

        one_best = k2.remove_epsilon(one_best)

        return Nbest(fsa=one_best, shape=self.shape)
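
    # Why the steps above look the way they do (an illustrative recap, not
    # in the original commit): k2.intersect_device matches labels
    # literally, treating epsilon as an ordinary symbol, so epsilon
    # self-loops are added to the linear FSAs to let them pass over
    # epsilon arcs in `lats`; `path_to_seq_map` (shape.row_ids(1)) tells
    # the intersection which utterance's lattice each path belongs to; and
    # the epsilons introduced by the self-loops are removed again at the
    # end, so the returned FSAs stay epsilon-free.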

    def total_scores(self) -> _k2.RaggedFloat:
        '''Get total scores of the FSAs in this Nbest.

        Note:
          Since the FSAs in this Nbest are just linear FSAs, the log
          semiring and the tropical semiring produce the same total scores.

        Returns:
          Return a ragged tensor with two axes [utt][path_scores].
        '''
        scores = self.fsa.get_tot_scores(use_double_scores=True,
                                         log_semiring=False)
        # We use single precision here since we only wrap k2.RaggedFloat.
        # If k2.RaggedDouble is wrapped, we can use double precision here.
        return _k2.RaggedFloat(self.shape, scores.float())
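
    # Why the semiring choice does not matter here (added reasoning): a
    # linear FSA has exactly one path p, so the log-semiring total
    # log(sum_p exp(score(p))) and the tropical total max_p score(p) both
    # collapse to the score of that single path.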

    def top_k(self, k: int) -> 'Nbest':
        '''Get a subset of paths in the Nbest. The resulting Nbest is
        regular in that each sequence (i.e., utterance) has the same
        number of paths (k).

        We select the top-k paths according to the total_scores of each
        path. If an utterance has fewer than k paths, then its last path,
        after sorting by tot_scores in descending order, is repeated so
        that each utterance has exactly k paths.

        Args:
          k:
            Number of paths in each utterance.
        Returns:
          Return a new Nbest with a regular shape.
        '''
        ragged_scores = self.total_scores()

        # indexes contains idx01's for self.shape
        # ragged_scores.values()[indexes] is sorted
        indexes = k2.ragged.sort_sublist(ragged_scores,
                                         descending=True,
                                         need_new2old_indexes=True)

        ragged_indexes = k2.RaggedInt(self.shape, indexes)

        padded_indexes = k2.ragged.pad(ragged_indexes,
                                       mode='replicate',
                                       value=-1)
        assert torch.ge(padded_indexes, 0).all(), \
            'Some utterances contain empty ' \
            f'n-best: {self.shape.row_splits(1)}'

        # Select the idx01's of the top-k paths of each utterance
        top_k_indexes = padded_indexes[:, :k].flatten().contiguous()

        top_k_fsas = k2.index_fsa(self.fsa, top_k_indexes)

        top_k_shape = k2.ragged.regular_ragged_shape(dim0=self.shape.dim0(),
                                                     dim1=k)
        return Nbest(top_k_fsas, top_k_shape)
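
    # A worked example of the replicate-padding trick (taken from the data
    # in test_top_k below): an utterance whose paths occupy idx01's
    # [2, 3, 4] with total scores [1, 5, 4] sorts to indexes [3, 4, 2].
    # k2.ragged.pad(..., mode='replicate') widens each row to the longest
    # utterance by repeating its last entry, e.g. [3, 4, 2, 2, 2] for a
    # padded width of 5, and padded_indexes[:, :4] then yields
    # [3, 4, 2, 2]: the worst path is duplicated until k paths exist.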


def whole_lattice_rescoring(lats: k2.Fsa,
                            G_with_epsilon_loops: k2.Fsa) -> k2.Fsa:
    '''Rescore the 1st pass lattice with an LM.

    In general, the G in the HLG used to obtain `lats` is a 3-gram LM.
    This function replaces the 3-gram LM in `lats` with a 4-gram LM.

    Args:
      lats:
        The decoding lattice from the 1st pass. We assume it is the result
        of intersecting HLG with the network output.
      G_with_epsilon_loops:
        An LM. It is usually a 4-gram LM with epsilon self-loops.
        It should be arc sorted.
    Returns:
      Return a new lattice rescored with the given G.
    '''
    assert len(lats.shape) == 3, f'{lats.shape}'
    assert hasattr(lats, 'lm_scores')
    assert G_with_epsilon_loops.shape == (1, None, None), \
        f'{G_with_epsilon_loops.shape}'

    device = lats.device
    lats.scores = lats.scores - lats.lm_scores
    # Now lats contains only acoustic scores

    # We will use lm_scores from the given G, so remove lats.lm_scores here
    del lats.lm_scores
    assert hasattr(lats, 'lm_scores') is False

    # inverted_lats has word IDs as labels.
    # Its aux_labels are token IDs, which is a ragged tensor k2.RaggedInt
    # if lats.aux_labels is a ragged tensor.
    inverted_lats = k2.invert(lats)
    num_seqs = lats.shape[0]

    b_to_a_map = torch.zeros(num_seqs, device=device, dtype=torch.int32)

    while True:
        try:
            rescoring_lats = k2.intersect_device(G_with_epsilon_loops,
                                                 inverted_lats,
                                                 b_to_a_map,
                                                 sorted_match_a=True)
            break
        except RuntimeError as e:
            logging.info(f'Caught exception:\n{e}\n')
            # Usually, this is an OOM exception. We reduce the size of
            # the lattice and redo k2.intersect_device()

            # NOTE(fangjun): The choice of the threshold 1e-5 here is
            # arbitrary, just to avoid OOM. We may need to fine-tune it.
            logging.info(f'num_arcs before: {inverted_lats.num_arcs}')
            inverted_lats = k2.prune_on_arc_post(inverted_lats, 1e-5, True)
            logging.info(f'num_arcs after: {inverted_lats.num_arcs}')

    rescoring_lats = k2.top_sort(k2.connect(rescoring_lats))

    # inv_rescoring_lats has token IDs as labels
    # and word IDs as aux_labels.
    inv_rescoring_lats = k2.invert(rescoring_lats)
    return inv_rescoring_lats
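
# How the LM swap works (added reasoning, not in the original commit):
# after `lats.scores = lats.scores - lats.lm_scores` every arc carries
# only its acoustic score, and intersecting with G adds G's score to each
# matched arc, so in the returned lattice
#
#     score(arc) = am_score(arc) + lm_score_G(arc)
#
# i.e., the 3-gram scores baked into HLG are replaced by the scores of
# the given (typically 4-gram) G.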


def generate_nbest_list(lats: k2.Fsa, num_paths: int) -> Nbest:
    '''Generate an n-best list from a lattice.

    Args:
      lats:
        The decoding lattice from the first pass after LM rescoring.
        lats is an FsaVec. It can be the return value of
        :func:`whole_lattice_rescoring`.
      num_paths:
        Size of n for the n-best list. CAUTION: After removing paths
        that represent the same token sequences, the number of paths
        in different sequences may not be equal.
    Return:
      Return an Nbest object. Note the returned FSAs don't have epsilon
      self-loops.
    '''
    assert len(lats.shape) == 3

    # CAUTION: We use `phones` instead of `tokens` here because
    # :func:`compile_HLG` uses `phones`
    #
    # Note: compile_HLG is from k2-fsa/snowfall
    assert hasattr(lats, 'phones')

    assert not hasattr(lats, 'tokens')
    lats.tokens = lats.phones
    # we use tokens instead of phones in the following code

    # First, extract `num_paths` paths for each sequence.
    # paths is a k2.RaggedInt with axes [seq][path][arc_pos]
    paths = k2.random_paths(lats, num_paths=num_paths, use_double_scores=True)

    # token_seqs is a k2.RaggedInt sharing the same shape as `paths`,
    # but it contains token IDs. Note that it also contains 0s and -1s.
    # The last entry in each sublist is -1.
    # Its axes are [seq][path][token_id]
    token_seqs = k2.index(lats.tokens, paths)

    # Remove epsilons (0s) and -1 from token_seqs
    token_seqs = k2.ragged.remove_values_leq(token_seqs, 0)

    # unique_token_seqs is still a k2.RaggedInt with axes
    # [seq][path][token_id], but the number of paths in each sequence
    # may be different.
    unique_token_seqs, _, _ = k2.ragged.unique_sequences(
        token_seqs, need_num_repeats=False, need_new2old_indexes=False)

    seq_to_path_shape = k2.ragged.get_layer(unique_token_seqs.shape(), 0)

    # Remove the seq axis.
    # Now unique_token_seqs has only two axes [path][token_id]
    unique_token_seqs = k2.ragged.remove_axis(unique_token_seqs, 0)

    token_fsas = k2.linear_fsa(unique_token_seqs)

    return Nbest(fsa=token_fsas, shape=seq_to_path_shape)
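

# A minimal end-to-end sketch of how these pieces compose for multi-round
# n-best rescoring (illustrative only; `lats` and `G_4gram` are assumed
# inputs, not defined in this commit):
#
#     rescored_lats = whole_lattice_rescoring(lats, G_4gram)
#     nbest = generate_nbest_list(rescored_lats, num_paths=100)
#     nbest = nbest.top_k(10)          # make the shape regular
#     nbest = nbest.intersect(rescored_lats)
#     scores = nbest.total_scores()    # ragged, axes [utt][path_scores]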

icefall/utils.py
@@ -5,6 +5,7 @@ import subprocess
from collections import defaultdict
from contextlib import contextmanager
from datetime import datetime
from nbest import Nbest
from pathlib import Path
from typing import Dict, Iterable, List, TextIO, Tuple, Union

@@ -381,3 +382,75 @@ def write_error_stats(
        print(f"{word} {corr} {tot_errs} {ref_count} {hyp_count}", file=f)
    return float(tot_err_rate)


def get_best_matching_stats(keys: Nbest, queries: Nbest,
                            max_order: int) -> torch.Tensor:
    '''Get best matching stats on query positions.

    Args:
      keys:
        The nbest after doing second pass rescoring.
      queries:
        Another nbest before doing second pass rescoring.
      max_order:
        The maximum n-gram order ever returned by
        `k2.get_best_matching_stats`.

    Returns:
      A tensor with shape [queries.fsa.num_elements, 5]; each row
      contains the stats (init_score, mean, var, counts_out, ngram_order)
      of the token in the corresponding position in queries.
    '''
    assert keys.shape.dim0() == queries.shape.dim0(), \
        'The number of utterances in keys and queries should be equal: ' \
        f'{keys.shape.dim0()} vs {queries.shape.dim0()}'

    # keys_tokens_shape has axes [utt][path][token]
    keys_tokens_shape = k2.ragged.compose_ragged_shapes(
        keys.shape, k2.ragged.remove_axis(keys.fsa.arcs.shape(), 1))
    # queries_tokens_shape has axes [utt][path][token]
    queries_tokens_shape = k2.ragged.compose_ragged_shapes(
        queries.shape, k2.ragged.remove_axis(queries.fsa.arcs.shape(), 1))

    keys_tokens = k2.RaggedInt(keys_tokens_shape, keys.fsa.labels.clone())
    queries_tokens = k2.RaggedInt(queries_tokens_shape,
                                  queries.fsa.labels.clone())
    # tokens has axes [utt][path][token]
    tokens = k2.ragged.cat([keys_tokens, queries_tokens], axis=1)

    keys_token_num = keys.fsa.labels.size()[0]
    queries_tokens_num = queries.fsa.labels.size()[0]
    # counts on key positions are ones
    keys_counts = k2.RaggedInt(keys_tokens_shape,
                               torch.ones(keys_token_num,
                                          dtype=torch.int32))
    # counts on query positions are zeros
    queries_counts = k2.RaggedInt(queries_tokens_shape,
                                  torch.zeros(queries_tokens_num,
                                              dtype=torch.int32))
    counts = k2.ragged.cat([keys_counts, queries_counts], axis=1).values()

    # scores on key positions are the scores inherited from the nbest path
    keys_scores = k2.RaggedFloat(keys_tokens_shape, keys.fsa.scores.clone())
    # scores on query positions MUST be zeros
    queries_scores = k2.RaggedFloat(queries_tokens_shape,
                                    torch.zeros(queries_tokens_num,
                                                dtype=torch.float32))
    scores = k2.ragged.cat([keys_scores, queries_scores], axis=1).values()

    # we didn't remove -1 labels before
    min_token = -1
    eos = -1
    max_token = torch.max(torch.max(keys.fsa.labels),
                          torch.max(queries.fsa.labels))
    mean, var, counts_out, ngram = k2.get_best_matching_stats(
        tokens, scores, counts, eos, min_token, max_token, max_order)

    queries_init_scores = queries.fsa.scores.clone()
    # only return the stats on query positions
    masking = counts == 0
    # shape [queries_tokens_num, 5]
    return torch.transpose(torch.stack((queries_init_scores, mean[masking],
                                        var[masking], counts_out[masking],
                                        ngram[masking])), 0, 1)
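
# How the key/query separation works (added recap, not in the original
# commit): positions given count 1 (the keys) contribute statistics,
# while positions given count 0 (the queries) only receive them, so
# `masking = counts == 0` picks out exactly the query positions. Each
# row of the returned [queries_tokens_num, 5] tensor therefore lines up
# with one token of `queries` and holds
# (init_score, mean, var, counts_out, ngram_order).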

110 test/test_nbest.py Normal file
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
#
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# To run this single test, use
#
#   ctest --verbose -R nbest_test_py

import unittest

import k2
import torch

from icefall.nbest import Nbest


class TestNbest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        cls.devices = [torch.device('cpu')]
        if torch.cuda.is_available() and k2.with_cuda:
            cls.devices.append(torch.device('cuda', 0))
            if torch.cuda.device_count() > 1:
                torch.cuda.set_device(1)
                cls.devices.append(torch.device('cuda', 1))

    def test_nbest_constructor(self):
        fsa = k2.Fsa.from_str('''
            0 1 -1 0.1
            1
        ''')

        fsa_vec = k2.create_fsa_vec([fsa, fsa, fsa])
        shape = k2.RaggedShape('[[x x] [x]]')
        Nbest(fsa_vec, shape)

    def test_top_k(self):
        fsa0 = k2.Fsa.from_str('''
            0 1 -1 0
            1
        ''')
        fsas = [fsa0.clone() for i in range(10)]
        fsa_vec = k2.create_fsa_vec(fsas)
        fsa_vec.scores = torch.tensor([3, 0, 1, 5, 4, 2, 8, 1, 9, 6],
                                      dtype=torch.float)
        # FSA indexes:  0 1    2 3 4    5 6 7 8 9
        # scores:     [ [3 0]  [1 5 4]  [2 8 1 9 6] ]
        shape = k2.RaggedShape('[ [x x] [x x x] [x x x x x] ]')
        nbest = Nbest(fsa_vec, shape)

        # top_k: k is 1
        nbest1 = nbest.top_k(1)
        expected_fsa = k2.create_fsa_vec([fsa_vec[0], fsa_vec[3], fsa_vec[8]])
        assert str(nbest1.fsa) == str(expected_fsa)

        expected_shape = k2.RaggedShape('[ [x] [x] [x] ]')
        assert nbest1.shape == expected_shape

        # top_k: k is 2
        nbest2 = nbest.top_k(2)
        expected_fsa = k2.create_fsa_vec([
            fsa_vec[0], fsa_vec[1], fsa_vec[3], fsa_vec[4], fsa_vec[8],
            fsa_vec[6]
        ])
        assert str(nbest2.fsa) == str(expected_fsa)

        expected_shape = k2.RaggedShape('[ [x x] [x x] [x x] ]')
        assert nbest2.shape == expected_shape

        # top_k: k is 3
        nbest3 = nbest.top_k(3)
        expected_fsa = k2.create_fsa_vec([
            fsa_vec[0], fsa_vec[1], fsa_vec[1], fsa_vec[3], fsa_vec[4],
            fsa_vec[2], fsa_vec[8], fsa_vec[6], fsa_vec[9]
        ])
        assert str(nbest3.fsa) == str(expected_fsa)

        expected_shape = k2.RaggedShape('[ [x x x] [x x x] [x x x] ]')
        assert nbest3.shape == expected_shape

        # top_k: k is 4
        nbest4 = nbest.top_k(4)
        expected_fsa = k2.create_fsa_vec([
            fsa_vec[0], fsa_vec[1], fsa_vec[1], fsa_vec[1], fsa_vec[3],
            fsa_vec[4], fsa_vec[2], fsa_vec[2], fsa_vec[8], fsa_vec[6],
            fsa_vec[9], fsa_vec[5]
        ])
        assert str(nbest4.fsa) == str(expected_fsa)

        expected_shape = k2.RaggedShape('[ [x x x x] [x x x x] [x x x x] ]')
        assert nbest4.shape == expected_shape


if __name__ == '__main__':
    unittest.main()