Add cut_id to the recognition results of all recipes

2022-08-10 14:52:12 +08:00 · 2022-08-10 14:52:12 +08:00 · f51e64dada
commit f51e64dada
parent 6217b5e64c
53 changed files with 267 additions and 88 deletions
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/decode.py
@ -367,6 +367,7 @@ def decode_dataset(
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
        texts = [list(str(text).replace(" ", "")) for text in texts]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -379,8 +380,8 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
-                this_batch.append((ref_text, hyp_words))
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
+                this_batch.append((cut_id, ref_text, hyp_words))

            results[name].extend(this_batch)

@ -405,6 +406,7 @@ def save_results(
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
+        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

@ -528,6 +530,8 @@ def main():
    from lhotse import CutSet
    from lhotse.dataset.webdataset import export_to_webdataset

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    aidatatang_200zh = Aidatatang_200zhAsrDataModule(args)

    dev = "dev"
--- a/egs/aishell/ASR/conformer_ctc/decode.py
+++ b/egs/aishell/ASR/conformer_ctc/decode.py
@ -374,6 +374,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -389,9 +390,9 @@ def decode_dataset(
        for lm_scale, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[lm_scale].extend(this_batch)

@ -538,6 +539,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    aishell = AishellAsrDataModule(args)
    test_cuts = aishell.test_cuts()
    test_dl = aishell.test_dataloaders(test_cuts)
--- a/egs/aishell/ASR/conformer_mmi/decode.py
+++ b/egs/aishell/ASR/conformer_mmi/decode.py
@ -386,6 +386,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -401,9 +402,9 @@ def decode_dataset(
        for lm_scale, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[lm_scale].extend(this_batch)

@ -557,6 +558,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    aishell = AishellAsrDataModule(args)
    test_cuts = aishell.test_cuts()
    test_dl = aishell.test_dataloaders(test_cuts)
--- a/egs/aishell/ASR/pruned_transducer_stateless3/decode.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless3/decode.py
@ -377,6 +377,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -389,9 +390,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -607,6 +608,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    asr_datamodule = AsrDataModule(args)
    aishell = AIShell(manifest_dir=args.manifest_dir)
    test_cuts = aishell.test_cuts()
--- a/egs/aishell/ASR/tdnn_lstm_ctc/decode.py
+++ b/egs/aishell/ASR/tdnn_lstm_ctc/decode.py
@ -241,6 +241,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -253,9 +254,9 @@ def decode_dataset(
        for lm_scale, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[lm_scale].extend(this_batch)

@ -366,6 +367,8 @@ def main():
    model.to(device)
    model.eval()

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    aishell = AishellAsrDataModule(args)
    test_cuts = aishell.test_cuts()
    test_dl = aishell.test_dataloaders(test_cuts)
--- a/egs/aishell/ASR/transducer_stateless/decode.py
+++ b/egs/aishell/ASR/transducer_stateless/decode.py
@ -296,6 +296,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -307,9 +308,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -439,6 +440,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    aishell = AishellAsrDataModule(args)
    test_cuts = aishell.test_cuts()
    test_dl = aishell.test_dataloaders(test_cuts)
--- a/egs/aishell/ASR/transducer_stateless_modified-2/decode.py
+++ b/egs/aishell/ASR/transducer_stateless_modified-2/decode.py
@ -341,6 +341,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -353,9 +354,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -497,6 +498,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    asr_datamodule = AsrDataModule(args)
    aishell = AIShell(manifest_dir=args.manifest_dir)
    test_cuts = aishell.test_cuts()
--- a/egs/aishell/ASR/transducer_stateless_modified/decode.py
+++ b/egs/aishell/ASR/transducer_stateless_modified/decode.py
@ -345,6 +345,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -357,9 +358,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -499,6 +500,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    aishell = AishellAsrDataModule(args)
    test_cuts = aishell.test_cuts()
    test_dl = aishell.test_dataloaders(test_cuts)
--- a/egs/aishell2/ASR/pruned_transducer_stateless5/decode.py
+++ b/egs/aishell2/ASR/pruned_transducer_stateless5/decode.py
@ -514,6 +514,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -527,8 +528,8 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
-                this_batch.append((ref_text, hyp_words))
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
+                this_batch.append((cut_id, ref_text, hyp_words))

            results[name].extend(this_batch)

@ -553,6 +554,7 @@ def save_results(
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
+        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

@ -756,6 +758,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    aishell2 = AiShell2AsrDataModule(args)

    valid_cuts = aishell2.valid_cuts()
--- a/egs/aishell4/ASR/pruned_transducer_stateless5/decode.py
+++ b/egs/aishell4/ASR/pruned_transducer_stateless5/decode.py
@ -378,6 +378,7 @@ def decode_dataset(
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
        texts = [list(str(text).replace(" ", "")) for text in texts]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -390,8 +391,8 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
-                this_batch.append((ref_text, hyp_words))
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
+                this_batch.append((cut_id, ref_text, hyp_words))

            results[name].extend(this_batch)

@ -416,6 +417,7 @@ def save_results(
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
+        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

@ -607,6 +609,8 @@ def main():
        c.supervisions[0].text = text_normalize(text)
        return c

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    aishell4 = Aishell4AsrDataModule(args)
    test_cuts = aishell4.test_cuts()
    test_cuts = test_cuts.map(text_normalize_for_cut)
--- a/egs/alimeeting/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/alimeeting/ASR/pruned_transducer_stateless2/decode.py
@ -367,6 +367,7 @@ def decode_dataset(
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
        texts = [list(str(text).replace(" ", "")) for text in texts]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -379,8 +380,8 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
-                this_batch.append((ref_text, hyp_words))
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
+                this_batch.append((cut_id, ref_text, hyp_words))

            results[name].extend(this_batch)

@ -405,6 +406,7 @@ def save_results(
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
+        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

@ -535,6 +537,8 @@ def main():
    from lhotse import CutSet
    from lhotse.dataset.webdataset import export_to_webdataset

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    alimeeting = AlimeetingAsrDataModule(args)

    dev = "eval"
--- a/egs/gigaspeech/ASR/conformer_ctc/decode.py
+++ b/egs/gigaspeech/ASR/conformer_ctc/decode.py
@ -451,6 +451,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -469,9 +470,9 @@ def decode_dataset(
            for lm_scale, hyps in hyps_dict.items():
                this_batch = []
                assert len(hyps) == len(texts)
-                for hyp_words, ref_text in zip(hyps, texts):
+                for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                    ref_words = ref_text.split()
-                    this_batch.append((ref_words, hyp_words))
+                    this_batch.append((cut_id, ref_words, hyp_words))

                results[lm_scale].extend(this_batch)
        else:
@ -677,6 +678,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    gigaspeech = GigaSpeechAsrDataModule(args)

    dev_cuts = gigaspeech.dev_cuts()
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/decode.py
@ -374,6 +374,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -386,9 +387,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -545,6 +546,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    gigaspeech = GigaSpeechAsrDataModule(args)

    dev_cuts = gigaspeech.dev_cuts()
--- a/egs/librispeech/ASR/conformer_ctc/decode.py
+++ b/egs/librispeech/ASR/conformer_ctc/decode.py
@ -525,6 +525,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -544,9 +545,9 @@ def decode_dataset(
            for lm_scale, hyps in hyps_dict.items():
                this_batch = []
                assert len(hyps) == len(texts)
-                for hyp_words, ref_text in zip(hyps, texts):
+                for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                    ref_words = ref_text.split()
-                    this_batch.append((ref_words, hyp_words))
+                    this_batch.append((cut_id, ref_words, hyp_words))

                results[lm_scale].extend(this_batch)
        else:
@ -780,6 +781,8 @@ def main():
            )
        rnn_lm_model.eval()

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/conformer_ctc2/decode.py
+++ b/egs/librispeech/ASR/conformer_ctc2/decode.py
@ -632,6 +632,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -651,9 +652,9 @@ def decode_dataset(
            for lm_scale, hyps in hyps_dict.items():
                this_batch = []
                assert len(hyps) == len(texts)
-                for hyp_words, ref_text in zip(hyps, texts):
+                for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                    ref_words = ref_text.split()
-                    this_batch.append((ref_words, hyp_words))
+                    this_batch.append((cut_id, ref_words, hyp_words))

                results[lm_scale].extend(this_batch)
        else:
@ -956,6 +957,8 @@ def main():
            )
        rnn_lm_model.eval()

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/conformer_mmi/decode.py
+++ b/egs/librispeech/ASR/conformer_mmi/decode.py
@ -449,6 +449,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -466,9 +467,9 @@ def decode_dataset(
        for lm_scale, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[lm_scale].extend(this_batch)

@ -662,6 +663,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)
    # CAUTION: `test_sets` is for displaying only.
    # If you want to skip test-clean, you have to skip
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/decode.py
@ -403,6 +403,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -415,9 +416,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -625,6 +626,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/stream.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/stream.py
@ -29,6 +29,7 @@ class Stream(object):
    def __init__(
        self,
        params: AttributeDict,
+        cut_id: str,
        decoding_graph: Optional[k2.Fsa] = None,
        device: torch.device = torch.device("cpu"),
        LOG_EPS: float = math.log(1e-10),
@ -44,6 +45,7 @@ class Stream(object):
            The device to run this stream.
        """
        self.LOG_EPS = LOG_EPS
+        self.cut_id = cut_id

        # Containing attention caches and convolution caches
        self.states: Optional[
@ -138,6 +140,10 @@ class Stream(object):
        """Return True if all feature frames are processed."""
        return self._done

+    @property
+    def id(self) -> str:
+        return self.cut_id
+
    def decoding_result(self) -> List[int]:
        """Obtain current decoding result."""
        if self.decoding_method == "greedy_search":
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/streaming_decode.py
@ -74,7 +74,6 @@ from pathlib import Path
 from typing import Dict, List, Optional, Tuple

 import k2
-from lhotse import CutSet
 import numpy as np
 import sentencepiece as spm
 import torch
@ -83,6 +82,7 @@ from asr_datamodule import LibriSpeechAsrDataModule
 from beam_search import Hypothesis, HypothesisList, get_hyps_shape
 from emformer import LOG_EPSILON, stack_states, unstack_states
 from kaldifeat import Fbank, FbankOptions
+from lhotse import CutSet
 from stream import Stream
 from torch.nn.utils.rnn import pad_sequence
 from train import add_model_arguments, get_params, get_transducer_model
@ -678,6 +678,7 @@ def decode_dataset(
        # Each utterance has a Stream.
        stream = Stream(
            params=params,
+            cut_id=cut.id,
            decoding_graph=decoding_graph,
            device=device,
            LOG_EPS=LOG_EPSILON,
@ -711,6 +712,7 @@ def decode_dataset(
            for i in sorted(finished_streams, reverse=True):
                decode_results.append(
                    (
+                        streams[i].id,
                        streams[i].ground_truth.split(),
                        sp.decode(streams[i].decoding_result()).split(),
                    )
@ -731,6 +733,7 @@ def decode_dataset(
        for i in sorted(finished_streams, reverse=True):
            decode_results.append(
                (
+                    streams[i].id,
                    streams[i].ground_truth.split(),
                    sp.decode(streams[i].decoding_result()).split(),
                )
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/decode.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/decode.py
@ -403,6 +403,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -415,9 +416,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -625,6 +626,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/streaming_decode.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/streaming_decode.py
@ -74,7 +74,6 @@ from pathlib import Path
 from typing import Dict, List, Optional, Tuple

 import k2
-from lhotse import CutSet
 import numpy as np
 import sentencepiece as spm
 import torch
@ -83,6 +82,7 @@ from asr_datamodule import LibriSpeechAsrDataModule
 from beam_search import Hypothesis, HypothesisList, get_hyps_shape
 from emformer import LOG_EPSILON, stack_states, unstack_states
 from kaldifeat import Fbank, FbankOptions
+from lhotse import CutSet
 from stream import Stream
 from torch.nn.utils.rnn import pad_sequence
 from train import add_model_arguments, get_params, get_transducer_model
@ -678,6 +678,7 @@ def decode_dataset(
        # Each utterance has a Stream.
        stream = Stream(
            params=params,
+            cut_id=cut.id,
            decoding_graph=decoding_graph,
            device=device,
            LOG_EPS=LOG_EPSILON,
@ -711,6 +712,7 @@ def decode_dataset(
            for i in sorted(finished_streams, reverse=True):
                decode_results.append(
                    (
+                        streams[i].id,
                        streams[i].ground_truth.split(),
                        sp.decode(streams[i].decoding_result()).split(),
                    )
@ -731,6 +733,7 @@ def decode_dataset(
        for i in sorted(finished_streams, reverse=True):
            decode_results.append(
                (
+                    streams[i].id,
                    streams[i].ground_truth.split(),
                    sp.decode(streams[i].decoding_result()).split(),
                )
--- a/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/decode.py
+++ b/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/decode.py
@ -391,6 +391,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -403,9 +404,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -613,6 +614,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
@ -758,6 +758,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/pruned_transducer_stateless/decode_stream.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/decode_stream.py
@ -28,6 +28,7 @@ class DecodeStream(object):
    def __init__(
        self,
        params: AttributeDict,
+        cut_id: str,
        initial_states: List[torch.Tensor],
        decoding_graph: Optional[k2.Fsa] = None,
        device: torch.device = torch.device("cpu"),
@ -48,6 +49,7 @@ class DecodeStream(object):
            assert device == decoding_graph.device

        self.params = params
+        self.cut_id = cut_id
        self.LOG_EPS = math.log(1e-10)

        self.states = initial_states
@ -102,6 +104,10 @@ class DecodeStream(object):
        """Return True if all the features are processed."""
        return self._done

+    @property
+    def id(self) -> str:
+        return self.cut_id
+
    def set_features(
        self,
        features: torch.Tensor,
--- a/egs/librispeech/ASR/pruned_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/streaming_decode.py
@ -356,6 +356,7 @@ def decode_dataset(
        # each utterance has a DecodeStream.
        decode_stream = DecodeStream(
            params=params,
+            cut_id=cut.id,
            initial_states=initial_states,
            decoding_graph=decoding_graph,
            device=device,
@ -385,6 +386,7 @@ def decode_dataset(
            for i in sorted(finished_streams, reverse=True):
                decode_results.append(
                    (
+                        decode_streams[i].id,
                        decode_streams[i].ground_truth.split(),
                        sp.decode(decode_streams[i].decoding_result()).split(),
                    )
@ -402,6 +404,7 @@ def decode_dataset(
        for i in sorted(finished_streams, reverse=True):
            decode_results.append(
                (
+                    decode_streams[i].id,
                    decode_streams[i].ground_truth.split(),
                    sp.decode(decode_streams[i].decoding_result()).split(),
                )
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/decode.py
@ -574,6 +574,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -587,9 +588,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -778,6 +779,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/streaming_decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/streaming_decode.py
@ -358,6 +358,7 @@ def decode_dataset(
        # each utterance has a DecodeStream.
        decode_stream = DecodeStream(
            params=params,
+            cut_id=cut.id,
            initial_states=initial_states,
            decoding_graph=decoding_graph,
            device=device,
@ -388,6 +389,7 @@ def decode_dataset(
            for i in sorted(finished_streams, reverse=True):
                decode_results.append(
                    (
+                        decode_streams[i].id,
                        decode_streams[i].ground_truth.split(),
                        sp.decode(decode_streams[i].decoding_result()).split(),
                    )
@ -405,6 +407,7 @@ def decode_dataset(
        for i in sorted(finished_streams, reverse=True):
            decode_results.append(
                (
+                    decode_streams[i].id,
                    decode_streams[i].ground_truth.split(),
                    sp.decode(decode_streams[i].decoding_result()).split(),
                )
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/decode-giga.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/decode-giga.py
@ -422,6 +422,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -434,9 +435,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -610,6 +611,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    asr_datamodule = AsrDataModule(args)
    gigaspeech = GigaSpeech(manifest_dir=args.manifest_dir)

--- a/egs/librispeech/ASR/pruned_transducer_stateless3/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/decode.py
@ -745,6 +745,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -760,9 +761,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -1068,6 +1069,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    asr_datamodule = AsrDataModule(args)
    librispeech = LibriSpeech(manifest_dir=args.manifest_dir)

--- a/egs/librispeech/ASR/pruned_transducer_stateless3/streaming_decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/streaming_decode.py
@ -359,6 +359,7 @@ def decode_dataset(
        # each utterance has a DecodeStream.
        decode_stream = DecodeStream(
            params=params,
+            cut_id=cut.id,
            initial_states=initial_states,
            decoding_graph=decoding_graph,
            device=device,
@ -389,6 +390,7 @@ def decode_dataset(
            for i in sorted(finished_streams, reverse=True):
                decode_results.append(
                    (
+                        decode_streams[i].id,
                        decode_streams[i].ground_truth.split(),
                        sp.decode(decode_streams[i].decoding_result()).split(),
                    )
@ -406,6 +408,7 @@ def decode_dataset(
        for i in sorted(finished_streams, reverse=True):
            decode_results.append(
                (
+                    decode_streams[i].id,
                    decode_streams[i].ground_truth.split(),
                    sp.decode(decode_streams[i].decoding_result()).split(),
                )
--- a/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py
@ -578,6 +578,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -591,9 +592,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -832,6 +833,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/pruned_transducer_stateless4/streaming_decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless4/streaming_decode.py
@ -371,6 +371,7 @@ def decode_dataset(
        # each utterance has a DecodeStream.
        decode_stream = DecodeStream(
            params=params,
+            cut_id=cut.id,
            initial_states=initial_states,
            decoding_graph=decoding_graph,
            device=device,
@ -401,6 +402,7 @@ def decode_dataset(
            for i in sorted(finished_streams, reverse=True):
                decode_results.append(
                    (
+                        decode_streams[i].id,
                        decode_streams[i].ground_truth.split(),
                        sp.decode(decode_streams[i].decoding_result()).split(),
                    )
@ -418,6 +420,7 @@ def decode_dataset(
        for i in sorted(finished_streams, reverse=True):
            decode_results.append(
                (
+                    decode_streams[i].id,
                    decode_streams[i].ground_truth.split(),
                    sp.decode(decode_streams[i].decoding_result()).split(),
                )
--- a/egs/librispeech/ASR/pruned_transducer_stateless5/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless5/decode.py
@ -564,6 +564,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -577,9 +578,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -818,6 +819,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/pruned_transducer_stateless5/streaming_decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless5/streaming_decode.py
@ -371,6 +371,7 @@ def decode_dataset(
        # each utterance has a DecodeStream.
        decode_stream = DecodeStream(
            params=params,
+            cut_id=cut.id,
            initial_states=initial_states,
            decoding_graph=decoding_graph,
            device=device,
@ -401,6 +402,7 @@ def decode_dataset(
            for i in sorted(finished_streams, reverse=True):
                decode_results.append(
                    (
+                        decode_streams[i].id,
                        decode_streams[i].ground_truth.split(),
                        sp.decode(decode_streams[i].decoding_result()).split(),
                    )
@ -418,6 +420,7 @@ def decode_dataset(
        for i in sorted(finished_streams, reverse=True):
            decode_results.append(
                (
+                    decode_streams[i].id,
                    decode_streams[i].ground_truth.split(),
                    sp.decode(decode_streams[i].decoding_result()).split(),
                )
--- a/egs/librispeech/ASR/pruned_transducer_stateless6/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless6/decode.py
@ -387,6 +387,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -399,9 +400,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -609,6 +610,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/decode.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/decode.py
@ -311,6 +311,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -324,9 +325,9 @@ def decode_dataset(
        for lm_scale, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[lm_scale].extend(this_batch)

@ -474,6 +475,8 @@ def main():
    model.to(device)
    model.eval()

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/transducer/decode.py
+++ b/egs/librispeech/ASR/transducer/decode.py
@ -295,6 +295,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -306,9 +307,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -425,6 +426,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/transducer_lstm/decode.py
+++ b/egs/librispeech/ASR/transducer_lstm/decode.py
@ -292,6 +292,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -303,9 +304,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -423,6 +424,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/transducer_stateless/decode.py
@ -350,6 +350,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -362,9 +363,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -501,6 +502,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/transducer_stateless2/decode.py
+++ b/egs/librispeech/ASR/transducer_stateless2/decode.py
@ -350,6 +350,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -362,9 +363,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -501,6 +502,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)

    test_clean_cuts = librispeech.test_clean_cuts()
--- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/decode.py
+++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/decode.py
@ -351,6 +351,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -363,9 +364,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -504,6 +505,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    asr_datamodule = AsrDataModule(args)
    librispeech = LibriSpeech(manifest_dir=args.manifest_dir)

--- a/egs/spgispeech/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/spgispeech/ASR/pruned_transducer_stateless2/decode.py
@ -365,6 +365,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -377,9 +378,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -405,6 +406,7 @@ def save_results(
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
+        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

@ -561,6 +563,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    spgispeech = SPGISpeechAsrDataModule(args)

    dev_cuts = spgispeech.dev_cuts()
--- a/egs/tal_csasr/ASR/pruned_transducer_stateless5/decode.py
+++ b/egs/tal_csasr/ASR/pruned_transducer_stateless5/decode.py
@ -453,6 +453,7 @@ def decode_dataset(
    zh_char = "[\u4e00-\u9fa5]+"  # Chinese chars
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
        zh_texts = []
        en_texts = []
        for i in range(len(texts)):
@ -487,14 +488,14 @@ def decode_dataset(
            # print(hyps_texts)
            hyps, zh_hyps, en_hyps = hyps_texts
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
-                this_batch.append((ref_text, hyp_words))
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
+                this_batch.append((cut_id, ref_text, hyp_words))

-            for hyp_words, ref_text in zip(zh_hyps, zh_texts):
-                this_batch_zh.append((ref_text, hyp_words))
+            for cut_id, hyp_words, ref_text in zip(cut_ids, zh_hyps, zh_texts):
+                this_batch_zh.append((cut_id, ref_text, hyp_words))

-            for hyp_words, ref_text in zip(en_hyps, en_texts):
-                this_batch_en.append((ref_text, hyp_words))
+            for cut_id, hyp_words, ref_text in zip(cut_ids, en_hyps, en_texts):
+                this_batch_en.append((cut_id, ref_text, hyp_words))

            results[name].extend(this_batch)
            zh_results[name + "_zh"].extend(this_batch_zh)
@ -521,6 +522,7 @@ def save_results(
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
+        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

@ -710,6 +712,8 @@ def main():
        c.supervisions[0].text = text_normalize(text)
        return c

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    tal_csasr = TAL_CSASRAsrDataModule(args)

    dev_cuts = tal_csasr.valid_cuts()
--- a/egs/tedlium3/ASR/pruned_transducer_stateless/decode.py
+++ b/egs/tedlium3/ASR/pruned_transducer_stateless/decode.py
@ -350,6 +350,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -362,9 +363,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -389,6 +390,7 @@ def save_results(
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
+        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

@ -498,6 +500,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    tedlium = TedLiumAsrDataModule(args)
    dev_cuts = tedlium.dev_cuts()
    test_cuts = tedlium.test_cuts()
--- a/egs/tedlium3/ASR/transducer_stateless/decode.py
+++ b/egs/tedlium3/ASR/transducer_stateless/decode.py
@ -325,6 +325,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -336,9 +337,9 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

@ -363,6 +364,7 @@ def save_results(
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
+        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

@ -462,6 +464,8 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    tedlium = TedLiumAsrDataModule(args)
    dev_cuts = tedlium.dev_cuts()
    test_cuts = tedlium.test_cuts()
--- a/egs/timit/ASR/tdnn_ligru_ctc/decode.py
+++ b/egs/timit/ASR/tdnn_ligru_ctc/decode.py
@ -311,6 +311,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -324,9 +325,9 @@ def decode_dataset(
        for lm_scale, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[lm_scale].extend(this_batch)

@ -349,6 +350,7 @@ def save_results(
    test_set_wers = dict()
    for key, results in results_dict.items():
        recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
+        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

@ -468,6 +470,8 @@ def main():
    model.to(device)
    model.eval()

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    timit = TimitAsrDataModule(args)
    test_set = "TEST"
    test_dl = timit.test_dataloaders()
--- a/egs/timit/ASR/tdnn_lstm_ctc/decode.py
+++ b/egs/timit/ASR/tdnn_lstm_ctc/decode.py
@ -310,6 +310,7 @@ def decode_dataset(
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -323,9 +324,9 @@ def decode_dataset(
        for lm_scale, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))

            results[lm_scale].extend(this_batch)

@ -348,6 +349,7 @@ def save_results(
    test_set_wers = dict()
    for key, results in results_dict.items():
        recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
+        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

@ -467,6 +469,8 @@ def main():
    model.to(device)
    model.eval()

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    timit = TimitAsrDataModule(args)
    test_set = "TEST"
    test_dl = timit.test_dataloaders()
--- a/egs/wenetspeech/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/wenetspeech/ASR/pruned_transducer_stateless2/decode.py
@ -491,6 +491,7 @@ def decode_dataset(
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
        texts = [list(str(text)) for text in texts]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -504,8 +505,8 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
-                this_batch.append((ref_text, hyp_words))
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
+                this_batch.append((cut_id, ref_text, hyp_words))

            results[name].extend(this_batch)

@ -679,6 +680,8 @@ def main():
    from lhotse import CutSet
    from lhotse.dataset.webdataset import export_to_webdataset

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    wenetspeech = WenetSpeechAsrDataModule(args)

    dev = "dev"
--- a/egs/wenetspeech/ASR/pruned_transducer_stateless5/decode.py
+++ b/egs/wenetspeech/ASR/pruned_transducer_stateless5/decode.py
@ -461,6 +461,7 @@ def decode_dataset(
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
        texts = [list(str(text)) for text in texts]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
@ -473,8 +474,8 @@ def decode_dataset(
        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
-                this_batch.append((ref_text, hyp_words))
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
+                this_batch.append((cut_id, ref_text, hyp_words))

            results[name].extend(this_batch)

@ -683,6 +684,8 @@ def main():
    from lhotse import CutSet
    from lhotse.dataset.webdataset import export_to_webdataset

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    wenetspeech = WenetSpeechAsrDataModule(args)

    dev = "dev"
--- a/egs/wenetspeech/ASR/pruned_transducer_stateless5/decode_stream.py
+++ b/egs/wenetspeech/ASR/pruned_transducer_stateless5/decode_stream.py
@ -28,6 +28,7 @@ class DecodeStream(object):
    def __init__(
        self,
        params: AttributeDict,
+        cut_id: str,
        initial_states: List[torch.Tensor],
        decoding_graph: Optional[k2.Fsa] = None,
        device: torch.device = torch.device("cpu"),
@ -48,6 +49,7 @@ class DecodeStream(object):
            assert device == decoding_graph.device

        self.params = params
+        self.cut_id = cut_id
        self.LOG_EPS = math.log(1e-10)

        self.states = initial_states
@ -102,6 +104,10 @@ class DecodeStream(object):
        """Return True if all the features are processed."""
        return self._done

+    @property
+    def id(self) -> str:  # alias for `cut_id`, so callers can write `stream.id`
+        return self.cut_id  # the cut id that was passed to __init__
+
    def set_features(
        self,
        features: torch.Tensor,
--- a/egs/wenetspeech/ASR/pruned_transducer_stateless5/streaming_decode.py
+++ b/egs/wenetspeech/ASR/pruned_transducer_stateless5/streaming_decode.py
@ -396,6 +396,7 @@ def decode_dataset(
        # each utterance has a DecodeStream.
        decode_stream = DecodeStream(
            params=params,
+            cut_id=cut.id,
            initial_states=initial_states,
            decoding_graph=decoding_graph,
            device=device,
@ -423,6 +424,7 @@ def decode_dataset(
                hyp = decode_streams[i].decoding_result()
                decode_results.append(
                    (
+                        decode_streams[i].id,
                        list(decode_streams[i].ground_truth),
                        [lexicon.token_table[idx] for idx in hyp],
                    )
@ -441,6 +443,7 @@ def decode_dataset(
            hyp = decode_streams[i].decoding_result()
            decode_results.append(
                (
+                    decode_streams[i].id,
                    list(decode_streams[i].ground_truth),
                    [lexicon.token_table[idx] for idx in hyp],
                )
--- a/egs/yesno/ASR/tdnn/decode.py
+++ b/egs/yesno/ASR/tdnn/decode.py
@ -178,6 +178,7 @@ def decode_dataset(
    results = []
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps = decode_one_batch(
            params=params,
@ -189,9 +190,9 @@ def decode_dataset(

        this_batch = []
        assert len(hyps) == len(texts)
-        for hyp_words, ref_text in zip(hyps, texts):
+        for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
            ref_words = ref_text.split()
-            this_batch.append((ref_words, hyp_words))
+            this_batch.append((cut_id, ref_words, hyp_words))

        results.extend(this_batch)

@ -237,6 +238,7 @@ def save_results(
      Return None.
    """
    recog_path = exp_dir / f"recogs-{test_set_name}.txt"
+    results = sorted(results)
    store_transcripts(filename=recog_path, texts=results)
    logging.info(f"The transcripts are stored in {recog_path}")

@ -303,6 +305,8 @@ def main():
    model.to(device)
    model.eval()

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    yes_no = YesNoAsrDataModule(args)
    test_dl = yes_no.test_dataloaders()
    results = decode_dataset(
--- a/egs/yesno/ASR/transducer/decode.py
+++ b/egs/yesno/ASR/transducer/decode.py
@ -165,6 +165,7 @@ def decode_dataset(
    results = []
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps = decode_one_batch(
            params=params,
@ -174,9 +175,9 @@ def decode_dataset(

        this_batch = []
        assert len(hyps) == len(texts)
-        for hyp_words, ref_text in zip(hyps, texts):
+        for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
            ref_words = ref_text.split()
-            this_batch.append((ref_words, hyp_words))
+            this_batch.append((cut_id, ref_words, hyp_words))

        results.extend(this_batch)

@ -222,6 +223,7 @@ def save_results(
      Return None.
    """
    recog_path = exp_dir / f"recogs-{test_set_name}.txt"
+    results = sorted(results)
    store_transcripts(filename=recog_path, texts=results)
    logging.info(f"The transcripts are stored in {recog_path}")

@ -291,6 +293,8 @@ def main():
    model.eval()
    model.device = device

+    # we need cut ids to display recognition results.
+    args.return_cuts = True
    yes_no = YesNoAsrDataModule(args)
    test_dl = yes_no.test_dataloaders()
    results = decode_dataset(