Support long audios recognition (#980)

* support long file transcription * rename recipe as long_file_recog * add docs * support multi-gpu decoding * style fix
2025-08-08 09:32:20 +00:00 · 2023-05-19 20:27:55 +08:00 · 2023-05-19 20:27:55 +08:00 · a7e142b7ff
commit a7e142b7ff
parent f18b539fbc
8 changed files with 1681 additions and 1 deletions
--- a/egs/librispeech/ASR/long_file_recog.sh
+++ b/egs/librispeech/ASR/long_file_recog.sh
@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+
+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+# GPUs made visible to the processes launched by this script.
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+# -e: exit on the first failing command; -u: treat unset variables as errors;
+# -o pipefail: a pipeline fails if any command in it fails.
+set -eou pipefail
+
+# This script is used to recognize long audios. The process is as follows:
+# 1) Split long audios into chunks with overlaps.
+# 2) Perform speech recognition on chunks, getting tokens and timestamps.
+# 3) Merge the overlapped chunks into utterances according to the timestamps.
+
+# Each chunk (except the first and the last) is padded with extra left side and right side.
+# The chunk length is: left_side + chunk_size + right_side.
+# $chunk and $extra are forwarded as --chunk / --extra in stage 2;
+# $extra is forwarded again as --extra in stage 4.
+chunk=30.0
+extra=2.0
+
+# Only the stages N with stage <= N <= stop_stage are executed.
+stage=1
+stop_stage=4
+
+# We assume that you have downloaded the LibriLight dataset
+# with audio files in $corpus_dir and texts in $text_dir
+corpus_dir=$PWD/download/libri-light
+text_dir=$PWD/download/librilight_text
+# Path to save the manifests
+output_dir=$PWD/data/librilight
+
+# Passed to recognize.py as --world-size in stage 3. When it is greater
+# than 1, the per-job manifests are combined after recognition.
+world_size=4
+
+
+log() {
+  # Print the given message prefixed with the current time and the
+  # caller's file name, line number and function name.
+  # This function is from espnet.
+  local caller_file=${BASH_SOURCE[1]##*/}
+  local now
+  now=$(date '+%Y-%m-%d %H:%M:%S')
+  echo -e "${now} (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  # We will get librilight_recordings_{subset}.jsonl.gz and librilight_supervisions_{subset}.jsonl.gz
+  # saved in $output_dir/manifests
+  log "Stage 1: Prepare LibriLight manifest"
+  lhotse prepare librilight $corpus_dir $text_dir $output_dir/manifests -j 10
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  # Chunk manifests are saved to $output_dir/manifests_chunk/librilight_cuts_{subset}.jsonl.gz
+  log "Stage 2: Split long audio into chunks"
+  ./long_file_recog/split_into_chunks.py \
+    --manifest-in-dir $output_dir/manifests \
+    --manifest-out-dir $output_dir/manifests_chunk \
+    --chunk $chunk \
+    --extra $extra  # Extra duration (in seconds) at both sides
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  # Recognized tokens and timestamps are saved to $output_dir/manifests_chunk_recog/librilight_cuts_{subset}.jsonl.gz
+
+  # This script loads torchscript models, exported by `torch.jit.script()`,
+  # and uses it to decode waves.
+  # You can download the jit model from https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
+
+  log "Stage 3: Perform speech recognition on splitted chunks"
+  # NOTE(review): the middle LibriLight subset is commonly called "medium";
+  # confirm that "median" matches the manifest file names produced in stage 1.
+  for subset in small median large; do
+    # Every option line except the last must end with a backslash; otherwise
+    # the remaining options are executed as a separate (failing) command.
+    # NOTE(review): `--master` presumably relies on argparse prefix matching
+    # of a longer option name in recognize.py -- confirm.
+    ./long_file_recog/recognize.py \
+      --world-size $world_size \
+      --num-workers 8 \
+      --subset $subset \
+      --manifest-in-dir $output_dir/manifests_chunk \
+      --manifest-out-dir $output_dir/manifests_chunk_recog \
+      --nn-model-filename long_file_recog/exp/jit_model.pt \
+      --bpe-model data/lang_bpe_500/bpe.model \
+      --max-duration 2400 \
+      --decoding-method greedy_search \
+      --master 12345
+
+    if [ $world_size -gt 1 ]; then
+      # Combine manifests from different jobs.
+      # The -name pattern is quoted so that `find`, not the shell, expands the glob.
+      lhotse combine $(find $output_dir/manifests_chunk_recog -name "librilight_cuts_${subset}_job_*.jsonl.gz" | tr "\n" " ") $output_dir/manifests_chunk_recog/librilight_cuts_${subset}.jsonl.gz
+    fi
+  done
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  # Final results are saved in $output_dir/manifests/librilight_cuts_{subset}.jsonl.gz
+  log "Stage 4: Merge splitted chunks into utterances."
+  ./long_file_recog/merge_chunks.py \
+    --manifest-in-dir $output_dir/manifests_chunk_recog \
+    --manifest-out-dir $output_dir/manifests \
+    --bpe-model data/lang_bpe_500/bpe.model \
+    --extra $extra
+fi
+
+
--- a/egs/librispeech/ASR/long_file_recog/asr_datamodule.py
+++ b/egs/librispeech/ASR/long_file_recog/asr_datamodule.py
@ -0,0 +1,189 @@
+# Copyright      2021  Piotr Żelasko
+# Copyright      2022  Xiaomi Corporation     (Author: Mingshuang Luo)
+# Copyright      2023  Xiaomi Corporation     (Author: Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import logging
+from functools import lru_cache
+from pathlib import Path
+from typing import Dict, List, Union
+
+import torch
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse.cut import Cut
+from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
+    CutConcatenate,
+    CutMix,
+    DynamicBucketingSampler,
+    K2SpeechRecognitionDataset,
+    PrecomputedFeatures,
+    SimpleCutSampler,
+    SpecAugment,
+)
+from lhotse.dataset.input_strategies import (  # noqa F401 For AudioSamples
+    AudioSamples,
+    BatchIO,
+    OnTheFlyFeatures,
+)
+from torch.utils.data import DataLoader
+
+from icefall.utils import str2bool
+
+
+class SpeechRecognitionDataset(K2SpeechRecognitionDataset):
+    """A `K2SpeechRecognitionDataset` whose `__getitem__` keeps the cuts in the
+    order they were sampled (no sorting by duration) and returns only the
+    inputs and the supervision intervals.
+    """
+
+    def __init__(
+        self,
+        return_cuts: bool = False,
+        input_strategy: BatchIO = PrecomputedFeatures(),
+    ):
+        # Only these two options are forwarded to the parent class.
+        super().__init__(return_cuts=return_cuts, input_strategy=input_strategy)
+
+    def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[Cut]]]:
+        """
+        Collate the given cuts into one batch.
+
+        Args:
+          cuts:
+            The cuts making up the batch, as produced by the sampler.
+        Returns:
+          A dict with the keys "inputs" (the output of the input strategy)
+          and "supervisions" (the supervision intervals computed by the
+          input strategy). If `self.return_cuts` is True,
+          batch["supervisions"]["cut"] additionally holds the list of cuts.
+        """
+        # Attribute inherited from the parent class.
+        # NOTE(review): presumably works around an HDF5 resource issue in
+        # long-running workers -- confirm against lhotse.
+        self.hdf5_fix.update()
+
+        # Note: don't sort cuts here
+        # Sort the cuts by duration so that the first one determines the batch time dimensions.
+        # cuts = cuts.sort_by_duration(ascending=False)
+
+        # Get a tensor with batched feature matrices, shape (B, T, F)
+        # Collation performs auto-padding, if necessary.
+        input_tpl = self.input_strategy(cuts)
+        if len(input_tpl) == 3:
+            # An input strategy with fault tolerant audio reading mode.
+            # "cuts" may be a subset of the original "cuts" variable,
+            # that only has cuts for which we successfully read the audio.
+            inputs, _, cuts = input_tpl
+        else:
+            inputs, _ = input_tpl
+
+        # Get a dict of tensors that encode the positional information about supervisions
+        # in the batch of feature matrices. The tensors are named "sequence_idx",
+        # "start_frame/sample" and "num_frames/samples".
+        supervision_intervals = self.input_strategy.supervision_intervals(cuts)
+
+        batch = {"inputs": inputs, "supervisions": supervision_intervals}
+        if self.return_cuts:
+            # Materialize the cuts so that they can travel with the batch.
+            batch["supervisions"]["cut"] = [cut for cut in cuts]
+
+        return batch
+
+
+class AsrDataModule:
+    """
+    Data module that builds the DataLoader used to recognize chunked cuts.
+
+    `dataloaders()` wraps a CutSet into a `SpeechRecognitionDataset` that
+    computes 80-dimensional fbank features on the fly, and samples it with a
+    `SimpleCutSampler` without shuffling and without dropping the last batch.
+
+    Only `--max-duration`, `--return-cuts` and `--num-workers` are read by this
+    class; the remaining options are registered but not used here.
+    """
+
+    def __init__(self, args: argparse.Namespace):
+        """
+        Args:
+          args:
+            Parsed command line arguments; see `add_arguments`.
+        """
+        self.args = args
+
+    @classmethod
+    def add_arguments(cls, parser: argparse.ArgumentParser):
+        """Register the data related command line options on `parser`."""
+        group = parser.add_argument_group(
+            title="ASR data related options",
+            description="These options are used for the preparation of "
+            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
+            "effective batch sizes, sampling strategies, applied data "
+            "augmentations, etc.",
+        )
+        group.add_argument(
+            "--manifest-dir",
+            type=Path,
+            default=Path("data/manifests_chunk"),
+            help="Path to directory with train/valid/test cuts.",
+        )
+        # Parsed as float so that the declared type agrees with the float
+        # default and fractional durations are accepted. Integer values given
+        # on the command line (e.g. 2400) are still valid.
+        group.add_argument(
+            "--max-duration",
+            type=float,
+            default=600.0,
+            help="Maximum pooled recordings duration (seconds) in a "
+            "single batch. You can reduce it if it causes CUDA OOM.",
+        )
+        group.add_argument(
+            "--bucketing-sampler",
+            type=str2bool,
+            default=True,
+            help="When enabled, the batches will come from buckets of "
+            "similar duration (saves padding frames).",
+        )
+        group.add_argument(
+            "--return-cuts",
+            type=str2bool,
+            default=True,
+            help="When enabled, each batch will have the "
+            "field: batch['supervisions']['cut'] with the cuts that "
+            "were used to construct it.",
+        )
+        group.add_argument(
+            "--num-workers",
+            type=int,
+            default=8,
+            help="The number of training dataloader workers that "
+            "collect the batches.",
+        )
+
+        group.add_argument(
+            "--input-strategy",
+            type=str,
+            default="PrecomputedFeatures",
+            help="AudioSamples or PrecomputedFeatures",
+        )
+
+    def dataloaders(self, cuts: CutSet) -> DataLoader:
+        """Build the DataLoader that iterates over `cuts` in their given order.
+
+        Args:
+          cuts:
+            The cuts to be recognized.
+        Returns:
+          A DataLoader yielding one batch per sampler step.
+        """
+        logging.debug("About to create test dataset")
+        test = SpeechRecognitionDataset(
+            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
+            return_cuts=self.args.return_cuts,
+        )
+
+        sampler = SimpleCutSampler(
+            cuts,
+            max_duration=self.args.max_duration,
+            shuffle=False,
+            drop_last=False,
+        )
+
+        logging.debug("About to create test dataloader")
+        # batch_size=None: batching is done by the sampler, not the DataLoader.
+        test_dl = DataLoader(
+            test,
+            batch_size=None,
+            sampler=sampler,
+            num_workers=self.args.num_workers,
+            persistent_workers=False,
+        )
+        return test_dl
+
+    # NOTE(review): lru_cache on an instance method also keys on `self` and
+    # keeps the instance alive for as long as the cache entry exists.
+    @lru_cache()
+    def load_subset(self, cuts_filename: Path) -> CutSet:
+        """Lazily open the cut manifest stored at `cuts_filename`."""
+        return load_manifest_lazy(cuts_filename)
--- a/egs/librispeech/ASR/long_file_recog/beam_search.py
+++ b/egs/librispeech/ASR/long_file_recog/beam_search.py
@ -0,0 +1,613 @@
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang
+#                                                  Xiaoyu Yang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Union
+
+import k2
+import torch
+
+from icefall.decode import one_best_decoding
+from icefall.utils import DecodingResults, get_texts, get_texts_with_timestamp
+
+
+def fast_beam_search(
+    model: torch.nn.Module,
+    decoding_graph: k2.Fsa,
+    encoder_out: torch.Tensor,
+    encoder_out_lens: torch.Tensor,
+    beam: float,
+    max_states: int,
+    max_contexts: int,
+    temperature: float = 1.0,
+) -> k2.Fsa:
+    """Build a decoding lattice frame by frame, emitting at most one symbol
+    per frame.
+
+    Args:
+      model:
+        An instance of `Transducer`.
+      decoding_graph:
+        Decoding graph used for decoding, may be a TrivialGraph or a LG.
+      encoder_out:
+        A tensor of shape (N, T, C) from the encoder.
+      encoder_out_lens:
+        A tensor of shape (N,) containing the number of frames in `encoder_out`
+        before padding.
+      beam:
+        Beam value, similar to the beam used in Kaldi.
+      max_states:
+        Max states per stream per frame.
+      max_contexts:
+        Max contexts per stream per frame.
+      temperature:
+        Softmax temperature.
+    Returns:
+      Return an FsaVec with axes [utt][state][arc] containing the decoded
+      lattice. Note: When the input graph is a TrivialGraph, the returned
+      lattice is actually an acceptor.
+    """
+    assert encoder_out.ndim == 3
+
+    decoder = model.decoder
+    joiner = model.joiner
+
+    num_utts, num_frames, _ = encoder_out.shape
+
+    config = k2.RnntDecodingConfig(
+        vocab_size=decoder.vocab_size,
+        decoder_history_len=decoder.context_size,
+        beam=beam,
+        max_contexts=max_contexts,
+        max_states=max_states,
+    )
+    # One decoding stream per utterance, all sharing the same graph.
+    streams = k2.RnntDecodingStreams(
+        [k2.RnntDecodingStream(decoding_graph) for _ in range(num_utts)],
+        config,
+    )
+
+    # Project the encoder output once; the joiner is then called with
+    # project_input=False.
+    projected = joiner.encoder_proj(encoder_out)
+
+    for t in range(num_frames):
+        # shape is a RaggedShape of shape (B, context)
+        # contexts is a Tensor of shape (shape.NumElements(), context_size)
+        shape, contexts = streams.get_contexts()
+        # `nn.Embedding()` in torch below v1.7.1 supports only torch.int64
+        contexts = contexts.to(torch.int64)
+        # decoder_out is of shape (shape.NumElements(), 1, decoder_out_dim)
+        decoder_out = joiner.decoder_proj(decoder(contexts, need_pad=False))
+        # Repeat frame t of every utterance once per active context:
+        # the result is of shape (shape.NumElements(), 1, joiner_dim)
+        frame = projected[:, t : t + 1, :]
+        current_encoder_out = frame.index_select(
+            0, shape.row_ids(1).to(torch.int64)
+        )
+        logits = joiner(
+            current_encoder_out.unsqueeze(2),
+            decoder_out.unsqueeze(1),
+            project_input=False,
+        )
+        logits = logits.squeeze(1).squeeze(1)
+        streams.advance((logits / temperature).log_softmax(dim=-1))
+
+    streams.terminate_and_flush_to_streams()
+    return streams.format_output(encoder_out_lens.tolist())
+
+
+def fast_beam_search_one_best(
+    model: torch.nn.Module,
+    decoding_graph: k2.Fsa,
+    encoder_out: torch.Tensor,
+    encoder_out_lens: torch.Tensor,
+    beam: float,
+    max_states: int,
+    max_contexts: int,
+    temperature: float = 1.0,
+    return_timestamps: bool = False,
+) -> Union[List[List[int]], DecodingResults]:
+    """Decode with fast beam search and keep only the best path.
+
+    It limits the maximum number of symbols per frame to 1. A lattice is
+    first obtained with `fast_beam_search`, then the path selected by
+    `one_best_decoding` is used as the final output.
+
+    Args:
+      model:
+        An instance of `Transducer`.
+      decoding_graph:
+        Decoding graph used for decoding, may be a TrivialGraph or a LG.
+      encoder_out:
+        A tensor of shape (N, T, C) from the encoder.
+      encoder_out_lens:
+        A tensor of shape (N,) containing the number of frames in `encoder_out`
+        before padding.
+      beam:
+        Beam value, similar to the beam used in Kaldi.
+      max_states:
+        Max states per stream per frame.
+      max_contexts:
+        Max contexts per stream per frame.
+      temperature:
+        Softmax temperature.
+      return_timestamps:
+        Whether to return timestamps.
+    Returns:
+      If return_timestamps is False, return the decoded result.
+      Else, return a DecodingResults object containing
+      decoded result and corresponding timestamps.
+    """
+    best_path = one_best_decoding(
+        fast_beam_search(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=beam,
+            max_states=max_states,
+            max_contexts=max_contexts,
+            temperature=temperature,
+        )
+    )
+
+    # Pick the extractor matching the requested output form.
+    extract = get_texts_with_timestamp if return_timestamps else get_texts
+    return extract(best_path)
+
+
+def greedy_search_batch(
+    model: torch.nn.Module,
+    encoder_out: torch.Tensor,
+    encoder_out_lens: torch.Tensor,
+    return_timestamps: bool = False,
+) -> Union[List[List[int]], DecodingResults]:
+    """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
+
+    The batch is packed with `pack_padded_sequence`, so at frame `t` only the
+    utterances that still have frames left are processed. Results are
+    returned in the order of the input batch.
+
+    Args:
+      model:
+        The transducer model.
+      encoder_out:
+        Output from the encoder. Its shape is (N, T, C), where N >= 1.
+      encoder_out_lens:
+        A 1-D tensor of shape (N,), containing number of valid frames in
+        encoder_out before padding. Every entry must be positive.
+      return_timestamps:
+        Whether to return timestamps.
+    Returns:
+      If return_timestamps is False, return the decoded result.
+      Else, return a DecodingResults object containing
+      decoded result, corresponding timestamps and per-token scores.
+    """
+    assert encoder_out.ndim == 3
+    assert encoder_out.size(0) >= 1, encoder_out.size(0)
+
+    # enforce_sorted=False: the utterances are reordered internally;
+    # `unsorted_indices` is used at the end to restore the input order.
+    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
+        input=encoder_out,
+        lengths=encoder_out_lens.cpu(),
+        batch_first=True,
+        enforce_sorted=False,
+    )
+
+    device = next(model.parameters()).device
+
+    blank_id = model.decoder.blank_id
+    # Falls back to blank_id when the model has no `unk_id` attribute.
+    unk_id = getattr(model, "unk_id", blank_id)
+    context_size = model.decoder.context_size
+
+    # batch_size_list[t] is the number of utterances that have a frame t.
+    batch_size_list = packed_encoder_out.batch_sizes.tolist()
+    N = encoder_out.size(0)
+    assert torch.all(encoder_out_lens > 0), encoder_out_lens
+    assert N == batch_size_list[0], (N, batch_size_list)
+
+    # Every hypothesis starts with `context_size` placeholder tokens
+    # (the last one being blank); they are stripped before returning.
+    hyps = [[-1] * (context_size - 1) + [blank_id] for _ in range(N)]
+
+    # timestamp[n][i] is the frame index after subsampling
+    # on which hyp[n][i] is decoded
+    timestamps = [[] for _ in range(N)]
+    # scores[n][i] is the log-prob (after log_softmax) of the token
+    # hyp[n][i] on the frame where it is decoded
+    scores = [[] for _ in range(N)]
+
+    decoder_input = torch.tensor(
+        hyps,
+        device=device,
+        dtype=torch.int64,
+    )  # (N, context_size)
+
+    decoder_out = model.decoder(decoder_input, need_pad=False)
+    decoder_out = model.joiner.decoder_proj(decoder_out)
+    # decoder_out: (N, 1, decoder_out_dim)
+
+    # Project all packed frames once; the joiner is called below with
+    # project_input=False.
+    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
+
+    offset = 0
+    for (t, batch_size) in enumerate(batch_size_list):
+        # Frames of time step t occupy rows [start, end) of the packed data.
+        start = offset
+        end = offset + batch_size
+        current_encoder_out = encoder_out.data[start:end]
+        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
+        # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim)
+        offset = end
+
+        # Drop the decoder outputs of utterances that have already ended.
+        decoder_out = decoder_out[:batch_size]
+
+        logits = model.joiner(
+            current_encoder_out, decoder_out.unsqueeze(1), project_input=False
+        )
+        # logits'shape (batch_size, 1, 1, vocab_size)
+
+        logits = logits.squeeze(1).squeeze(1)  # (batch_size, vocab_size)
+        log_probs = logits.log_softmax(dim=-1)
+        assert log_probs.ndim == 2, log_probs.shape
+        y = log_probs.argmax(dim=1).tolist()
+        emitted = False
+        for i, v in enumerate(y):
+            # blank and unk are never appended to the hypothesis
+            if v not in (blank_id, unk_id):
+                hyps[i].append(v)
+                timestamps[i].append(t)
+                scores[i].append(log_probs[i, v].item())
+                emitted = True
+        if emitted:
+            # update decoder output
+            # (only needed when at least one hypothesis has grown)
+            decoder_input = [h[-context_size:] for h in hyps[:batch_size]]
+            decoder_input = torch.tensor(
+                decoder_input,
+                device=device,
+                dtype=torch.int64,
+            )
+            decoder_out = model.decoder(decoder_input, need_pad=False)
+            decoder_out = model.joiner.decoder_proj(decoder_out)
+
+    # Strip the initial placeholder context; still in packed (sorted) order.
+    sorted_ans = [h[context_size:] for h in hyps]
+    ans = []
+    ans_timestamps = []
+    ans_scores = []
+    # Map the results back to the order of the input batch.
+    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
+    for i in range(N):
+        ans.append(sorted_ans[unsorted_indices[i]])
+        ans_timestamps.append(timestamps[unsorted_indices[i]])
+        ans_scores.append(scores[unsorted_indices[i]])
+
+    if not return_timestamps:
+        return ans
+    else:
+        return DecodingResults(
+            hyps=ans,
+            timestamps=ans_timestamps,
+            scores=ans_scores,
+        )
+
+
+@dataclass
+class Hypothesis:
+    # Token ids predicted so far; new tokens are appended at the end.
+    ys: List[int]
+
+    # Log-probability of `ys`, held in a tensor with a single entry.
+    log_prob: torch.Tensor
+
+    # timestamp[i] is the frame index after subsampling
+    # on which ys[i] is decoded
+    timestamp: List[int] = field(default_factory=list)
+
+    @property
+    def key(self) -> str:
+        """Return the token ids of `self.ys` joined by underscores."""
+        return "_".join(str(token) for token in self.ys)
+
+
+class HypothesisList(object):
+    """A collection of hypotheses, indexed by their `key`."""
+
+    def __init__(self, data: Optional[Dict[str, "Hypothesis"]] = None) -> None:
+        """
+        Args:
+          data:
+            A dict of Hypotheses. Its key is its `value.key`.
+            The dict is stored as is (not copied).
+        """
+        if data is None:
+            self._data = {}
+        else:
+            self._data = data
+
+    @property
+    def data(self) -> Dict[str, "Hypothesis"]:
+        """The underlying dict mapping `hyp.key` to `hyp`."""
+        return self._data
+
+    def add(self, hyp: "Hypothesis") -> None:
+        """Add a Hypothesis to `self`.
+
+        If `hyp` already exists in `self`, its probability is updated using
+        `log-sum-exp` with the existed one.
+
+        Args:
+          hyp:
+            The hypothesis to be added.
+        """
+        key = hyp.key
+        if key in self:
+            old_hyp = self._data[key]  # shallow copy
+            # The stored hypothesis is updated in place (note `out=`).
+            torch.logaddexp(old_hyp.log_prob, hyp.log_prob, out=old_hyp.log_prob)
+        else:
+            self._data[key] = hyp
+
+    def get_most_probable(self, length_norm: bool = False) -> "Hypothesis":
+        """Get the most probable hypothesis, i.e., the one with
+        the largest `log_prob`.
+
+        Args:
+          length_norm:
+            If True, the `log_prob` of a hypothesis is normalized by the
+            number of tokens in it, i.e., by `len(hyp.ys)`.
+        Returns:
+          Return the hypothesis that has the largest `log_prob`.
+        """
+        if length_norm:
+            return max(self._data.values(), key=lambda hyp: hyp.log_prob / len(hyp.ys))
+        else:
+            return max(self._data.values(), key=lambda hyp: hyp.log_prob)
+
+    def remove(self, hyp: "Hypothesis") -> None:
+        """Remove a given hypothesis.
+
+        Caution:
+          `self` is modified **in-place**.
+
+        Args:
+          hyp:
+            The hypothesis to be removed from `self`.
+            Note: It must be contained in `self`. Otherwise,
+            an exception is raised.
+        """
+        key = hyp.key
+        assert key in self, f"{key} does not exist"
+        del self._data[key]
+
+    def filter(self, threshold: torch.Tensor) -> "HypothesisList":
+        """Keep only the hypotheses whose log_prob is greater than threshold.
+
+        Caution:
+          `self` is not modified. Instead, a new HypothesisList is returned.
+
+        Returns:
+          Return a new HypothesisList containing all hypotheses from `self`
+          with `log_prob` being greater than the given `threshold`.
+        """
+        ans = HypothesisList()
+        for _, hyp in self._data.items():
+            if hyp.log_prob > threshold:
+                ans.add(hyp)  # shallow copy
+        return ans
+
+    def topk(self, k: int) -> "HypothesisList":
+        """Return the top-k hypothesis."""
+        hyps = list(self._data.items())
+
+        hyps = sorted(hyps, key=lambda h: h[1].log_prob, reverse=True)[:k]
+
+        ans = HypothesisList(dict(hyps))
+        return ans
+
+    def __contains__(self, key: str):
+        return key in self._data
+
+    def __iter__(self):
+        # Iterates over the hypotheses themselves, not over their keys.
+        return iter(self._data.values())
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def __str__(self) -> str:
+        # Join the keys of the dict. Iterating over `self` would yield
+        # Hypothesis objects (see __iter__), which `str.join` rejects.
+        return ", ".join(self._data)
+
+def get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape:
+    """Build the ragged shape [utt][num_hyps] describing `hyps`.
+
+    Args:
+      hyps:
+        len(hyps) == batch_size. It contains the current hypothesis for
+        each utterance in the batch.
+    Returns:
+      Return a ragged shape with 2 axes [utt][num_hyps]. Note that
+      the shape is on CPU.
+    """
+    # cumsum() is an inclusive sum; the leading 0 turns its result into
+    # row splits, i.e., an exclusive sum followed by the total.
+    counts = torch.tensor([0] + [len(h) for h in hyps])
+    row_splits = counts.cumsum(dim=0, dtype=torch.int32)
+    return k2.ragged.create_ragged_shape2(
+        row_splits=row_splits, cached_tot_size=row_splits[-1].item()
+    )
+
+
+def modified_beam_search(
+    model: torch.nn.Module,
+    encoder_out: torch.Tensor,
+    encoder_out_lens: torch.Tensor,
+    beam: int = 4,
+    temperature: float = 1.0,
+    return_timestamps: bool = False,
+) -> Union[List[List[int]], DecodingResults]:
+    """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded.
+
+    The batch is packed with `pack_padded_sequence`, so at frame `t` only the
+    utterances that still have frames left are processed. Results are
+    returned in the order of the input batch.
+
+    Args:
+      model:
+        The transducer model.
+      encoder_out:
+        Output from the encoder. Its shape is (N, T, C).
+      encoder_out_lens:
+        A 1-D tensor of shape (N,), containing number of valid frames in
+        encoder_out before padding. Every entry must be positive.
+      beam:
+        Number of active paths during the beam search.
+      temperature:
+        Softmax temperature.
+      return_timestamps:
+        Whether to return timestamps.
+    Returns:
+      If return_timestamps is False, return the decoded result.
+      Else, return a DecodingResults object containing
+      decoded result and corresponding timestamps.
+    """
+    assert encoder_out.ndim == 3, encoder_out.shape
+    assert encoder_out.size(0) >= 1, encoder_out.size(0)
+
+    # enforce_sorted=False: the utterances are reordered internally;
+    # `unsorted_indices` is used at the end to restore the input order.
+    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
+        input=encoder_out,
+        lengths=encoder_out_lens.cpu(),
+        batch_first=True,
+        enforce_sorted=False,
+    )
+
+    blank_id = model.decoder.blank_id
+    # Falls back to blank_id when the model has no `unk_id` attribute.
+    unk_id = getattr(model, "unk_id", blank_id)
+    context_size = model.decoder.context_size
+    device = next(model.parameters()).device
+
+    # batch_size_list[t] is the number of utterances that have a frame t.
+    batch_size_list = packed_encoder_out.batch_sizes.tolist()
+    N = encoder_out.size(0)
+    assert torch.all(encoder_out_lens > 0), encoder_out_lens
+    assert N == batch_size_list[0], (N, batch_size_list)
+
+    # B[i] holds the active hypotheses of the i-th utterance (packed order).
+    # Each starts with a single hypothesis made of `context_size` blanks
+    # and a log-prob of 0.
+    B = [HypothesisList() for _ in range(N)]
+    for i in range(N):
+        B[i].add(
+            Hypothesis(
+                ys=[blank_id] * context_size,
+                log_prob=torch.zeros(1, dtype=torch.float32, device=device),
+                timestamp=[],
+            )
+        )
+
+    # Project all packed frames once; the joiner is called below with
+    # project_input=False.
+    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
+
+    offset = 0
+    # Hypothesis lists of utterances that have run out of frames.
+    finalized_B = []
+    for (t, batch_size) in enumerate(batch_size_list):
+        # Frames of time step t occupy rows [start, end) of the packed data.
+        start = offset
+        end = offset + batch_size
+        current_encoder_out = encoder_out.data[start:end]
+        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
+        # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim)
+        offset = end
+
+        # Utterances beyond `batch_size` have ended: move them to the front
+        # of finalized_B so that `B + finalized_B` keeps the packed order.
+        finalized_B = B[batch_size:] + finalized_B
+        B = B[:batch_size]
+
+        hyps_shape = get_hyps_shape(B).to(device)
+
+        # A: hypotheses to be expanded at this frame; B: their successors.
+        A = [list(b) for b in B]
+        B = [HypothesisList() for _ in range(batch_size)]
+
+        ys_log_probs = torch.cat(
+            [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps]
+        )  # (num_hyps, 1)
+
+        decoder_input = torch.tensor(
+            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
+            device=device,
+            dtype=torch.int64,
+        )  # (num_hyps, context_size)
+
+        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
+        decoder_out = model.joiner.decoder_proj(decoder_out)
+        # decoder_out is of shape (num_hyps, 1, 1, joiner_dim)
+
+        # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor
+        # as index, so we use `to(torch.int64)` below.
+        # The frame of each utterance is repeated once per hypothesis.
+        current_encoder_out = torch.index_select(
+            current_encoder_out,
+            dim=0,
+            index=hyps_shape.row_ids(1).to(torch.int64),
+        )  # (num_hyps, 1, 1, encoder_out_dim)
+
+        logits = model.joiner(
+            current_encoder_out,
+            decoder_out,
+            project_input=False,
+        )  # (num_hyps, 1, 1, vocab_size)
+
+        logits = logits.squeeze(1).squeeze(1)  # (num_hyps, vocab_size)
+
+        log_probs = (logits / temperature).log_softmax(dim=-1)  # (num_hyps, vocab_size)
+
+        # Accumulate: score of each (hypothesis, next token) pair.
+        log_probs.add_(ys_log_probs)
+
+        vocab_size = log_probs.size(-1)
+
+        # Flatten so that each utterance owns one contiguous row of
+        # num_hyps * vocab_size candidates in the ragged tensor below.
+        log_probs = log_probs.reshape(-1)
+
+        row_splits = hyps_shape.row_splits(1) * vocab_size
+        log_probs_shape = k2.ragged.create_ragged_shape2(
+            row_splits=row_splits, cached_tot_size=log_probs.numel()
+        )
+        ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs)
+
+        for i in range(batch_size):
+            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
+
+            # A flat index encodes (hypothesis index, token id).
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
+                topk_token_indexes = (topk_indexes % vocab_size).tolist()
+
+            for k in range(len(topk_hyp_indexes)):
+                hyp_idx = topk_hyp_indexes[k]
+                hyp = A[i][hyp_idx]
+
+                # Copy so that the parent hypothesis is left untouched.
+                new_ys = hyp.ys[:]
+                new_token = topk_token_indexes[k]
+                new_timestamp = hyp.timestamp[:]
+                # blank and unk are never appended to the hypothesis
+                if new_token not in (blank_id, unk_id):
+                    new_ys.append(new_token)
+                    new_timestamp.append(t)
+
+                new_log_prob = topk_log_probs[k]
+                new_hyp = Hypothesis(
+                    ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp
+                )
+                # Hypotheses with identical `ys` are merged by `add`.
+                B[i].add(new_hyp)
+
+    B = B + finalized_B
+    best_hyps = [b.get_most_probable(length_norm=True) for b in B]
+
+    # Strip the initial blank context; still in packed (sorted) order.
+    sorted_ans = [h.ys[context_size:] for h in best_hyps]
+    sorted_timestamps = [h.timestamp for h in best_hyps]
+    ans = []
+    ans_timestamps = []
+    # Map the results back to the order of the input batch.
+    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
+    for i in range(N):
+        ans.append(sorted_ans[unsorted_indices[i]])
+        ans_timestamps.append(sorted_timestamps[unsorted_indices[i]])
+
+    if not return_timestamps:
+        return ans
+    else:
+        return DecodingResults(
+            hyps=ans,
+            timestamps=ans_timestamps,
+        )
--- a/egs/librispeech/ASR/long_file_recog/merge_chunks.py
+++ b/egs/librispeech/ASR/long_file_recog/merge_chunks.py
@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+# Copyright    2023  Xiaomi Corp.        (authors: Fangjun Kuang, Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file merges overlapped chunks into utterances according to recording ids.
+"""
+
+import argparse
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import List
+
+import sentencepiece as spm
+from lhotse import (
+    CutSet,
+    MonoCut,
+    SupervisionSegment,
+    SupervisionSet,
+    load_manifest,
+    load_manifest_lazy,
+)
+from lhotse.cut import Cut
+from lhotse.serialization import SequentialJsonlWriter
+
+
+def get_parser():
+    """Parse the command-line options of the chunk-merging script.
+
+    Despite its name, this function returns the parsed arguments
+    (an ``argparse.Namespace``), not the parser itself.
+    """
+    cli = argparse.ArgumentParser()
+
+    # (flag, keyword arguments) pairs, registered in this order.
+    options = (
+        (
+            "--bpe-model",
+            {
+                "type": str,
+                "default": "data/lang_bpe_500/bpe.model",
+                "help": "Path to the BPE model",
+            },
+        ),
+        (
+            "--manifest-in-dir",
+            {
+                "type": Path,
+                "default": Path("data/librilight/manifests_chunk_recog"),
+                "help": "Path to directory of chunk cuts with recognition results.",
+            },
+        ),
+        (
+            "--manifest-out-dir",
+            {
+                "type": Path,
+                "default": Path("data/manifests"),
+                "help": "Path to directory to save full utterance by merging overlapped chunks.",
+            },
+        ),
+        (
+            "--extra",
+            {
+                "type": float,
+                "default": 2.0,
+                "help": """Extra duration (in seconds) at both sides.""",
+            },
+        ),
+    )
+    for flag, spec in options:
+        cli.add_argument(flag, **spec)
+
+    return cli.parse_args()
+
+
+def merge_chunks(
+    cuts_chunk: CutSet,
+    supervisions: SupervisionSet,
+    cuts_writer: SequentialJsonlWriter,
+    sp: spm.SentencePieceProcessor,
+    extra: float,
+) -> int:
+    """Merge chunk-wise cuts into one cut per recording, according to recording ids.
+
+    Consecutive cuts in ``cuts_chunk`` that share a recording id are grouped.
+    For each group, an alignment item is kept only if its absolute start time
+    falls inside the part of its chunk that remains after dropping ``extra``
+    seconds at each inner border. The kept items are shifted to recording time
+    and stored in a single supervision that spans the whole recording.
+
+    Args:
+      cuts_chunk:
+        The chunk-wise cuts opened in a lazy mode. Cuts of the same recording
+        must be adjacent, because a group is closed as soon as the recording
+        id changes.
+      supervisions:
+        The supervision manifest, looked up by recording id. The ``language``,
+        ``speaker`` and ``book`` fields of the matching supervision are copied
+        to the merged cut.
+      cuts_writer:
+        Writer to save the merged cuts.
+      sp:
+        The BPE model. It is not used inside this function.
+      extra:
+        Extra duration (in seconds) to drop at both sides of each chunk.
+
+    Returns:
+      The number of merged cuts (one per recording) handed to the writer.
+    """
+
+    # Background worker that saves one merged cut to disk.
+    def _save_worker(utt_cut: Cut, flush=False):
+        cuts_writer.write(utt_cut, flush=flush)
+
+    def _merge(cut_list: List[Cut], rec_id: str, utt_idx: int):
+        """Merge the chunks of one recording into a single cut.
+
+        ``utt_idx`` is accepted but not used here; the supervision is looked
+        up by ``rec_id``.
+        """
+        for cut in cut_list:
+            assert cut.recording.id == rec_id, (cut.recording.id, rec_id)
+
+        # Sort the chunks of this recording according to their start time.
+        # In fact, we don't need to do this since the cuts have been sorted
+        # according to the start time
+        cut_list = sorted(cut_list, key=(lambda cut: cut.start))
+
+        rec = cut_list[0].recording
+        alignments = []
+        cur_end = 0
+        for cut in cut_list:
+            # Get left and right borders of the region to keep.
+            # The first chunk keeps its left side; a chunk that reaches the end
+            # of the recording keeps its right side.
+            left = cut.start + extra if cut.start > 0 else 0
+            chunk_end = cut.start + cut.duration
+            right = chunk_end - extra if chunk_end < rec.duration else rec.duration
+
+            # Assert the chunks are continuous: each kept region must start
+            # exactly where the previous one ended (exact float comparison).
+            assert left == cur_end, (left, cur_end)
+            cur_end = right
+
+            assert len(cut.supervisions) == 1, len(cut.supervisions)
+            for ali in cut.supervisions[0].alignment["symbol"]:
+                # Alignment start times are relative to the chunk; convert to
+                # recording time before testing against the borders.
+                t = ali.start + cut.start
+                if left <= t < right:
+                    alignments.append(ali.with_offset(cut.start))
+
+        old_sup = supervisions[rec_id]
+        # Alternative lookup by position, which would assume the supervisions
+        # are sorted in the same recording order as in cuts_chunk:
+        # old_sup = supervisions[utt_idx]
+        assert old_sup.recording_id == rec_id, (old_sup.recording_id, rec_id)
+
+        new_sup = SupervisionSegment(
+            id=rec_id,
+            recording_id=rec_id,
+            start=0,
+            duration=rec.duration,
+            alignment={"symbol": alignments},
+            language=old_sup.language,
+            speaker=old_sup.speaker,
+        )
+
+        utt_cut = MonoCut(
+            id=rec_id,
+            start=0,
+            duration=rec.duration,
+            channel=0,
+            recording=rec,
+            supervisions=[new_sup],
+        )
+        # Set a custom attribute to the cut
+        utt_cut.text_path = old_sup.book
+
+        return utt_cut
+
+    last_rec_id = None
+    cut_list = []
+    utt_idx = 0
+
+    futures = []
+    # A single background worker, so cuts are written in submission order.
+    with ThreadPoolExecutor(max_workers=1) as executor:
+
+        for cut in cuts_chunk:
+            cur_rec_id = cut.recording.id
+            if len(cut_list) == 0:
+                # Case of the first cut
+                last_rec_id = cur_rec_id
+                cut_list.append(cut)
+            elif cur_rec_id == last_rec_id:
+                cut_list.append(cut)
+            else:
+                # Case of a cut belonging to a new recording:
+                # merge and save the group collected so far.
+                utt_cut = _merge(cut_list, last_rec_id, utt_idx)
+                utt_idx += 1
+
+                futures.append(executor.submit(_save_worker, utt_cut))
+
+                last_rec_id = cur_rec_id
+                cut_list = [cut]
+
+                if utt_idx % 5000 == 0:
+                    logging.info(f"Procesed {utt_idx} utterances.")
+
+        # For the cuts belonging to the last recording
+        if len(cut_list) != 0:
+            utt_cut = _merge(cut_list, last_rec_id, utt_idx)
+            utt_idx += 1
+
+            futures.append(executor.submit(_save_worker, utt_cut))
+            logging.info("Finished")
+
+        # Wait for all writes; this also re-raises any exception from the worker.
+        for f in futures:
+            f.result()
+
+    return utt_idx
+
+
+def main():
+    """Merge the recognized chunk cuts of every LibriLight subset.
+
+    For each subset, the chunk cuts with recognition results are read from
+    ``--manifest-in-dir``, merged per recording with :func:`merge_chunks`,
+    and written to ``--manifest-out-dir``. A subset whose output manifest
+    already exists is skipped.
+    """
+    args = get_parser()
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(args.bpe_model)
+
+    # This directory is expected to contain "librilight_supervisions_*.jsonl.gz";
+    # the merged cuts are written next to them.
+    manifest_out_dir = args.manifest_out_dir
+
+    # The subset names must match the manifest file names produced by the
+    # previous stages, i.e. "medium" (not "median").
+    subsets = ["small", "medium", "large"]
+
+    for subset in subsets:
+        logging.info(f"Processing {subset} subset")
+
+        manifest_out = manifest_out_dir / f"librilight_cuts_{subset}.jsonl.gz"
+        if manifest_out.is_file():
+            logging.info(f"{manifest_out} already exists - skipping.")
+            continue
+
+        supervisions = load_manifest(
+            manifest_out_dir / f"librilight_supervisions_{subset}.jsonl.gz"
+        )  # We will use the text path from supervisions
+
+        cuts_chunk = load_manifest_lazy(
+            args.manifest_in_dir / f"librilight_cuts_{subset}.jsonl.gz"
+        )
+
+        # The context manager closes the writer even if merging raises,
+        # so the output file handle is never leaked.
+        with CutSet.open_writer(manifest_out, overwrite=True) as cuts_writer:
+            num_utt = merge_chunks(
+                cuts_chunk,
+                supervisions,
+                cuts_writer=cuts_writer,
+                sp=sp,
+                extra=args.extra,
+            )
+        logging.info(f"{num_utt} cuts saved to {manifest_out}")
+
+
+if __name__ == "__main__":
+    # Log lines carry a timestamp, the level and the emitting file:line.
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+        level=logging.INFO,
+    )
+
+    main()
--- a/egs/librispeech/ASR/long_file_recog/recognize.py
+++ b/egs/librispeech/ASR/long_file_recog/recognize.py
@ -0,0 +1,435 @@
+#!/usr/bin/env python3
+# Copyright 2023 Xiaomi Corporation (Author: Fangjun Kuang, Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script loads torchscript models, exported by `torch.jit.script()`,
+and uses them to decode waves.
+You can use the following command to get the exported models:
+
+./pruned_transducer_stateless7/export.py \
+  --exp-dir ./pruned_transducer_stateless7/exp \
+  --bpe-model data/lang_bpe_500/bpe.model \
+  --epoch 20 \
+  --avg 10 \
+  --jit 1
+
+You can also download the jit model from
+https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
+"""
+
+import argparse
+import torch.multiprocessing as mp
+import torch
+import torch.nn as nn
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, Optional, Tuple
+
+from pathlib import Path
+
+import k2
+import sentencepiece as spm
+from asr_datamodule import AsrDataModule
+from beam_search import (
+    fast_beam_search_one_best,
+    greedy_search_batch,
+    modified_beam_search,
+)
+from icefall.utils import AttributeDict, convert_timestamp, setup_logger
+from lhotse import CutSet, load_manifest_lazy
+from lhotse.cut import Cut
+from lhotse.supervision import AlignmentItem
+from lhotse.serialization import SequentialJsonlWriter
+
+
+def get_parser():
+    """Build the argument parser of the chunk recognition script.
+
+    Returns:
+      The ``argparse.ArgumentParser``; the caller may register further
+      options on it before parsing.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--world-size",
+        type=int,
+        default=1,
+        help="Number of decoding processes to run in parallel (one GPU each).",
+    )
+
+    parser.add_argument(
+        "--master-port",
+        type=int,
+        default=12354,
+        help="Master port to use for DDP training.",
+    )
+
+    # `choices` rejects a mistyped subset at parse time instead of relying on
+    # a later assertion.
+    parser.add_argument(
+        "--subset",
+        type=str,
+        default="small",
+        choices=["small", "medium", "large"],
+        help="Subset to process. Possible values are 'small', 'medium', 'large'",
+    )
+
+    parser.add_argument(
+        "--manifest-in-dir",
+        type=Path,
+        default=Path("data/librilight/manifests_chunk"),
+        help="Path to directory with chunks cuts.",
+    )
+
+    parser.add_argument(
+        "--manifest-out-dir",
+        type=Path,
+        default=Path("data/librilight/manifests_chunk_recog"),
+        help="Path to directory to save the chunk cuts with recognition results.",
+    )
+
+    parser.add_argument(
+        "--log-dir",
+        type=Path,
+        default=Path("long_file_recog/log"),
+        help="Path to directory to save logs.",
+    )
+
+    parser.add_argument(
+        "--nn-model-filename",
+        type=str,
+        required=True,
+        help="Path to the torchscript model cpu_jit.pt",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    # Only these three methods are handled by the decoding code.
+    parser.add_argument(
+        "--decoding-method",
+        type=str,
+        default="greedy_search",
+        choices=["greedy_search", "modified_beam_search", "fast_beam_search"],
+        help="""Possible values are:
+          - greedy_search
+          - modified_beam_search
+          - fast_beam_search
+        """,
+    )
+
+    return parser
+
+
+def get_params() -> AttributeDict:
+    """Return the fixed decoding parameters wrapped in an AttributeDict."""
+    defaults = {
+        # Frontend / model timing constants, used to turn frame indexes into seconds.
+        "subsampling_factor": 4,
+        "frame_shift_ms": 10,
+        # Number of active hypotheses kept by modified_beam_search.
+        "beam_size": 4,
+        # Search settings passed to fast_beam_search. `beam` is a floating
+        # point value to calculate the cutoff score during beam search
+        # (i.e., `cutoff = max-score - beam`), which is the same as the
+        # `beam` in Kaldi.
+        "beam": 4,
+        "max_contexts": 4,
+        "max_states": 8,
+    }
+    return AttributeDict(defaults)
+
+
+def decode_one_batch(
+    params: AttributeDict,
+    model: nn.Module,
+    batch: dict,
+    decoding_graph: Optional[k2.Fsa] = None,
+) -> Tuple[List[List[int]], List[List[float]], List[List[Optional[float]]]]:
+    """Decode one batch.
+
+    Args:
+      params:
+        It's the return value of :func:`get_params`.
+      model:
+        The neural model.
+      batch:
+        It is the return value from iterating
+        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
+        for the format of the `batch`.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or LG, Used
+        only when --decoding_method is fast_beam_search.
+
+    Returns:
+      Return a tuple of three lists, each with one entry per utterance:
+        - the decoded token ids
+        - the token timestamps, converted with :func:`convert_timestamp`
+        - the per-token scores; when the search method does not provide
+          scores, every token gets ``None``
+
+    Raises:
+      ValueError: If ``params.decoding_method`` is not supported.
+    """
+    device = next(model.parameters()).device
+    feature = batch["inputs"]
+    assert feature.ndim == 3
+
+    feature = feature.to(device)
+    # at entry, feature is (N, T, C)
+
+    supervisions = batch["supervisions"]
+    feature_lens = supervisions["num_frames"].to(device)
+
+    encoder_out, encoder_out_lens = model.encoder(x=feature, x_lens=feature_lens)
+
+    if params.decoding_method == "fast_beam_search":
+        res = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+            return_timestamps=True,
+        )
+    elif params.decoding_method == "greedy_search":
+        res = greedy_search_batch(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            return_timestamps=True,
+        )
+    elif params.decoding_method == "modified_beam_search":
+        res = modified_beam_search(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam_size,
+            return_timestamps=True,
+        )
+    else:
+        raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
+
+    num_utts = feature.shape[0]
+
+    hyps = [res.hyps[i] for i in range(num_utts)]
+    timestamps = [
+        convert_timestamp(
+            res.timestamps[i], params.subsampling_factor, params.frame_shift_ms
+        )
+        for i in range(num_utts)
+    ]
+
+    if res.scores is None:
+        # Not every search method fills in per-token scores. Use one `None`
+        # per token so the three lists stay aligned, instead of failing on
+        # indexing `None`.
+        scores = [[None] * len(hyp) for hyp in hyps]
+    else:
+        scores = [res.scores[i] for i in range(num_utts)]
+
+    return hyps, timestamps, scores
+
+
+def decode_dataset(
+    dl: torch.utils.data.DataLoader,
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+    cuts_writer: SequentialJsonlWriter,
+    decoding_graph: Optional[k2.Fsa] = None,
+) -> None:
+    """Decode dataset and store the recognition results to manifest.
+
+    Each batch is decoded with :func:`decode_one_batch`. The decoded tokens,
+    timestamps and scores are attached to the single supervision of each cut
+    as a "symbol" alignment, and the cut is written with ``cuts_writer`` by a
+    background worker.
+
+    Args:
+      dl:
+        PyTorch's dataloader containing the dataset to decode. Each batch
+        must provide the cuts in ``batch["supervisions"]["cut"]``.
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The neural model.
+      sp:
+        The BPE model, used to map token ids to pieces.
+      cuts_writer:
+        Writer to save the cuts with recognition results.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or LG, Used
+        only when --decoding_method is fast_beam_search.
+
+    Returns:
+      None. The results are only written through ``cuts_writer``.
+    """
+    # Background worker to add alignment and save cuts to disk.
+    def _save_worker(
+        cuts: List[Cut],
+        hyps: List[List[str]],
+        timestamps: List[List[float]],
+        scores: List[List[float]],
+    ):
+        for cut, symbol_list, time_list, score_list in zip(
+            cuts, hyps, timestamps, scores
+        ):
+            # Map the decoded token ids to their BPE pieces.
+            symbol_list = sp.id_to_piece(symbol_list)
+            ali = [
+                AlignmentItem(symbol=symbol, start=start, duration=None, score=score)
+                for symbol, start, score in zip(symbol_list, time_list, score_list)
+            ]
+            assert len(cut.supervisions) == 1, len(cut.supervisions)
+            cut.supervisions[0].alignment = {"symbol": ali}
+            cuts_writer.write(cut, flush=True)
+
+    num_cuts = 0
+    log_interval = 10
+    futures = []
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        # We only want one background worker so that serialization is deterministic.
+
+        for batch_idx, batch in enumerate(dl):
+            cuts = batch["supervisions"]["cut"]
+
+            hyps, timestamps, scores = decode_one_batch(
+                params=params,
+                model=model,
+                decoding_graph=decoding_graph,
+                batch=batch,
+            )
+
+            futures.append(
+                executor.submit(_save_worker, cuts, hyps, timestamps, scores)
+            )
+
+            num_cuts += len(cuts)
+            if batch_idx % log_interval == 0:
+                logging.info(f"cuts processed until now is {num_cuts}")
+
+        # Wait for all writes; this also re-raises any exception from the worker.
+        for f in futures:
+            f.result()
+
+@torch.no_grad()
+def run(rank, world_size, args, in_cuts):
+    """Decode one share of the chunk cuts and save them with recognition results.
+
+    Args:
+      rank:
+        It is a value between 0 and `world_size-1`.
+      world_size:
+        Number of decoding processes.
+      args:
+        The return value of get_parser().parse_args(), extended by the caller
+        with ``suffix`` and ``cuts_filename``.
+      in_cuts:
+        The cuts to decode. When `world_size > 1`, it is a sequence of cut
+        sets and this process decodes ``in_cuts[rank]``.
+    """
+    params = get_params()
+    params.update(vars(args))
+
+    setup_logger(f"{params.log_dir}/log-decode")
+    logging.info("Decoding started")
+
+    assert params.decoding_method in (
+        "greedy_search",
+        "fast_beam_search",
+        "modified_beam_search",
+    ), params.decoding_method
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.unk_id = sp.piece_to_id("<unk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(f"{params}")
+
+    # Each process uses the GPU whose index equals its rank, if CUDA is available.
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", rank)
+    logging.info(f"device: {device}")
+
+    logging.info("Loading jit model")
+    model = torch.jit.load(params.nn_model_filename)
+    model.to(device)
+    model.eval()
+
+    if params.decoding_method == "fast_beam_search":
+        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+    else:
+        decoding_graph = None
+
+    # we will store new cuts with recognition results.
+    args.return_cuts = True
+    asr_data_module = AsrDataModule(args)
+
+    if world_size > 1:
+        # Every process writes its own manifest, tagged with its rank.
+        in_cuts = in_cuts[rank]
+        out_cuts_filename = params.manifest_out_dir / (
+            f"{params.cuts_filename}_job_{rank}" + params.suffix
+        )
+    else:
+        out_cuts_filename = params.manifest_out_dir / (
+            f"{params.cuts_filename}" + params.suffix
+        )
+
+    dl = asr_data_module.dataloaders(in_cuts)
+
+    # The context manager closes the writer even if decoding raises,
+    # so the output file handle is never leaked.
+    with CutSet.open_writer(out_cuts_filename, overwrite=True) as cuts_writer:
+        decode_dataset(
+            dl=dl,
+            params=params,
+            model=model,
+            sp=sp,
+            decoding_graph=decoding_graph,
+            cuts_writer=cuts_writer,
+        )
+    logging.info(f"Cuts saved to {out_cuts_filename}")
+
+    logging.info("Done!")
+
+
+def main():
+    """Parse the options and decode the chunk cuts of the selected subset.
+
+    With ``--world-size 1`` the whole manifest is decoded in this process.
+    Otherwise the manifest is split into ``world_size`` parts and one
+    process per part is spawned.
+    """
+    parser = get_parser()
+    AsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+
+    assert args.subset in ["small", "medium", "large"], args.subset
+
+    args.manifest_out_dir.mkdir(parents=True, exist_ok=True)
+
+    # These two attributes are read again by the worker processes.
+    args.suffix = ".jsonl.gz"
+    args.cuts_filename = f"librilight_cuts_{args.subset}"
+    manifest_name = args.cuts_filename + args.suffix
+
+    recog_manifest = args.manifest_out_dir / manifest_name
+    if recog_manifest.is_file():
+        logging.info(f"{recog_manifest} already exists - skipping.")
+        return
+
+    in_cuts = load_manifest_lazy(args.manifest_in_dir / manifest_name)
+
+    num_jobs = args.world_size
+    assert num_jobs >= 1
+
+    if num_jobs == 1:
+        run(rank=0, world_size=num_jobs, args=args, in_cuts=in_cuts)
+        return
+
+    # Number of cuts per part, rounded up so that exactly `num_jobs` parts
+    # are expected.
+    cuts_per_job = (len(in_cuts) + (num_jobs - 1)) // num_jobs
+    # Each manifest is saved at: ``{output_dir}/{prefix}.{split_idx}.jsonl.gz``
+    parts = in_cuts.split_lazy(
+        output_dir=args.manifest_in_dir / "split",
+        chunk_size=cuts_per_job,
+        prefix=args.cuts_filename,
+    )
+    assert len(parts) == num_jobs, (len(parts), num_jobs)
+    mp.spawn(run, args=(num_jobs, args, parts), nprocs=num_jobs, join=True)
+
+
+# Restrict PyTorch to one intra-op and one inter-op CPU thread. This runs at
+# import time, so it also applies to spawned worker processes that import this
+# module.
+# NOTE(review): presumably this avoids CPU oversubscription when several
+# decoding processes run side by side - confirm.
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+if __name__ == "__main__":
+    main()
--- a/egs/librispeech/ASR/long_file_recog/split_into_chunks.py
+++ b/egs/librispeech/ASR/long_file_recog/split_into_chunks.py
@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+# Copyright    2023  Xiaomi Corp.        (authors: Fangjun Kuang, Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This script splits long utterances into chunks with overlaps.
+Each chunk (except the first and the last) is padded with extra left side and right side.
+The chunk length is: left_side + chunk_size + right_side.
+"""
+
+import argparse
+import logging
+from pathlib import Path
+
+from lhotse import CutSet, load_manifest
+
+
+def get_args():
+    """Parse and return the command-line options of the chunk-splitting script."""
+    cli = argparse.ArgumentParser()
+
+    # Input and output manifest directories.
+    for flag, default_dir, description in (
+        (
+            "--manifest-in-dir",
+            "data/librilight/manifests",
+            "Path to directory of full utterances.",
+        ),
+        (
+            "--manifest-out-dir",
+            "data/librilight/manifests_chunk",
+            "Path to directory to save splitted chunks.",
+        ),
+    ):
+        cli.add_argument(flag, type=Path, default=Path(default_dir), help=description)
+
+    # Durations, in seconds.
+    for flag, default_seconds, description in (
+        ("--chunk", 300.0, """Duration (in seconds) of each chunk."""),
+        ("--extra", 2.0, """Extra duration (in seconds) at both sides."""),
+    ):
+        cli.add_argument(flag, type=float, default=default_seconds, help=description)
+
+    return cli.parse_args()
+
+
+def main():
+    """Cut the recordings of every LibriLight subset into overlapping windows.
+
+    Each window lasts ``--chunk`` seconds and windows start every
+    ``chunk - 2 * extra`` seconds, so neighbouring windows overlap by
+    ``2 * extra`` seconds. A subset whose output manifest already exists
+    is skipped.
+    """
+    args = get_args()
+    logging.info(vars(args))
+
+    out_dir = args.manifest_out_dir
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    for subset in ["small", "medium", "large"]:
+        logging.info(f"Processing {subset} subset")
+
+        manifest_out = out_dir / f"librilight_cuts_{subset}.jsonl.gz"
+        if manifest_out.is_file():
+            logging.info(f"{manifest_out} already exists - skipping.")
+            continue
+
+        recordings = load_manifest(
+            args.manifest_in_dir / f"librilight_recordings_{subset}.jsonl.gz"
+        )
+
+        # One cut per recording -> fixed-size windows -> one supervision per window.
+        full_cuts = CutSet.from_manifests(recordings=recordings)
+        windows = full_cuts.cut_into_windows(
+            duration=args.chunk, hop=args.chunk - args.extra * 2
+        )
+        chunk_cuts = windows.fill_supervisions()
+
+        chunk_cuts.to_file(manifest_out)
+        logging.info(f"Cuts saved to {manifest_out}")
+
+
+if __name__ == "__main__":
+    # Log lines carry a timestamp, the level and the emitting file:line.
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+        level=logging.INFO,
+    )
+
+    main()
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
@ -670,6 +670,8 @@ def greedy_search_batch(
    # timestamp[n][i] is the frame index after subsampling
    # on which hyp[n][i] is decoded
    timestamps = [[] for _ in range(N)]
+    # scores[n][i] is the logit of hyp[n][i] on the frame where it is decoded
+    scores = [[] for _ in range(N)]

    decoder_input = torch.tensor(
        hyps,
@ -707,6 +709,7 @@ def greedy_search_batch(
            if v not in (blank_id, unk_id):
                hyps[i].append(v)
                timestamps[i].append(t)
+                scores[i].append(logits[i, v].item())
                emitted = True
        if emitted:
            # update decoder output
@ -722,10 +725,12 @@ def greedy_search_batch(
    sorted_ans = [h[context_size:] for h in hyps]
    ans = []
    ans_timestamps = []
+    ans_scores = []
    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
    for i in range(N):
        ans.append(sorted_ans[unsorted_indices[i]])
        ans_timestamps.append(timestamps[unsorted_indices[i]])
+        ans_scores.append(scores[unsorted_indices[i]])

    if not return_timestamps:
        return ans
@ -733,6 +738,7 @@ def greedy_search_batch(
        return DecodingResults(
            hyps=ans,
            timestamps=ans_timestamps,
+            scores=ans_scores,
        )


--- a/icefall/utils.py
+++ b/icefall/utils.py
@ -272,6 +272,9 @@ class DecodingResults:
    # for the i-th utterance with fast_beam_search_nbest_LG.
    hyps: Union[List[List[int]], k2.RaggedTensor]

+    # scores[i][k] contains the log-prob of tokens[i][k]
+    scores: Optional[List[List[float]]] = None
+

 def get_texts_with_timestamp(
    best_paths: k2.Fsa, return_ragged: bool = False
@ -1442,7 +1445,7 @@ def convert_timestamp(
    frame_shift = frame_shift_ms / 1000.0
    time = []
    for f in frames:
-        time.append(f * subsampling_factor * frame_shift)
+        time.append(round(f * subsampling_factor * frame_shift, ndigits=3))

    return time