minor updates

This commit is contained in:
JinZr 2023-09-04 11:09:32 +08:00
parent 1715567f57
commit fae87b3009
4 changed files with 137 additions and 4 deletions


@ -11,10 +11,8 @@ Switchboard is a collection of about 2,400 two-sided telephone conversations amo
## TODO List
- [x] Incorporate Lhotse for data processing
- [x] Further text normalization
- [ ] Refer to Global Mapping Rules when computing Word Error Rate
- [x] Detailed Word Error Rate summary for the eval2000 (callhome, swbd) and rt03 (fsh, swbd) test sets
- [ ] Fisher corpus (LDC2004T19, LDC2005T19, LDC2004S13, LDC2005S13) for LM training
- [x] Switchboard transcript train/dev split for LM training
## Performance Record
| | eval2000 | rt03 |
@ -30,3 +28,5 @@ The training script for `conformer_ctc` comes from the LibriSpeech `conformer_ct
Many of the data processing scripts come from first-generation Kaldi and the ESPnet project, tailored to work with Lhotse and Icefall.
Some of the text normalization scripts come from stale pull requests by [Piotr Żelasko](https://github.com/pzelasko) and [Nagendra Goel](https://github.com/ngoel17).
The `sclite_scoring.py` script comes from the GigaSpeech recipe and handles post-processing and GLM-like scoring, which is admittedly not an elegant way to do it.
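For a sense of what that post-processing does, here is a minimal sketch using the `asr_text_post_processing` function added in this commit (the input string is made up for illustration):

```python
from sclite_scoring import asr_text_post_processing

# Fillers such as "UH" and garbage tags such as "[NOISE]" are dropped,
# and the remaining words are uppercased before scoring.
print(asr_text_post_processing("uh I [NOISE] think so"))  # -> "I THINK SO"
```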


@ -30,6 +30,8 @@ import torch.nn as nn
from asr_datamodule import SwitchBoardAsrDataModule
from conformer import Conformer
from sclite_scoring import asr_text_post_processing
from icefall.bpe_graph_compiler import BpeCtcTrainingGraphCompiler
from icefall.checkpoint import load_checkpoint
from icefall.decode import (
@ -233,6 +235,17 @@ def get_params() -> AttributeDict:
    return params


def post_processing(
    results: List[Tuple[str, List[str], List[str]]],
) -> List[Tuple[str, List[str], List[str]]]:
    """Apply the Switchboard text post-processing to both the reference
    and the hypothesis of every utterance before scoring."""
    new_results = []
    for key, ref, hyp in results:
        new_ref = asr_text_post_processing(" ".join(ref)).split()
        new_hyp = asr_text_post_processing(" ".join(hyp)).split()
        new_results.append((key, new_ref, new_hyp))
    return new_results


def decode_one_batch(
    params: AttributeDict,
    model: nn.Module,
@ -591,6 +604,7 @@ def save_results(
    test_set_wers = dict()
    for key, results in results_dict.items():
        recog_path = params.exp_dir / f"recogs-{test_set_name}-{subset}-{key}.txt"
        results = post_processing(results)
        results = (
            sorted(list(filter(lambda x: x[0].startswith(prefix), results)))
            if subset != "avg"
@ -605,7 +619,11 @@ def save_results(
        errs_filename = params.exp_dir / f"errs-{test_set_name}-{subset}-{key}.txt"
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
                f,
                f"{test_set_name}-{subset}-{key}",
                results,
                enable_log=enable_log,
                sclite_mode=True,
            )
            test_set_wers[key] = wer


@ -0,0 +1,111 @@
#!/usr/bin/env python3
# Copyright 2021 Jiayu Du
# Copyright 2022 Johns Hopkins University (Author: Guanbo Wang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

conversational_filler = [
    "UH",
    "UHH",
    "UM",
    "EH",
    "MM",
    "HM",
    "AH",
    "HUH",
    "HA",
    "ER",
    "OOF",
    "HEE",
    "ACH",
    "EEE",
    "EW",
    "MHM",
    "HUM",
    "AW",
    "OH",
]
unk_tags = ["<UNK>", "<unk>"]
switchboard_garbage_utterance_tags = [
    "[LAUGHTER]",
    "[NOISE]",
    "[VOCALIZED-NOISE]",
    "[SILENCE]",
]
non_scoring_words = (
    conversational_filler + unk_tags + switchboard_garbage_utterance_tags
)


def asr_text_post_processing(text: str) -> str:
    # 1. convert to uppercase
    text = text.upper()

    # 2. remove non-scoring words from evaluation
    remaining_words = []
    for word in text.split():
        if word in non_scoring_words:
            continue
        remaining_words.append(word)

    return " ".join(remaining_words)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="This script evaluates GigaSpeech ASR result via"
"SCTK's tool sclite"
)
parser.add_argument(
"ref",
type=str,
help="sclite's standard transcription(trn) reference file",
)
parser.add_argument(
"hyp",
type=str,
help="sclite's standard transcription(trn) hypothesis file",
)
parser.add_argument(
"work_dir",
type=str,
help="working dir",
)
args = parser.parse_args()
if not os.path.isdir(args.work_dir):
os.mkdir(args.work_dir)
REF = os.path.join(args.work_dir, "REF")
HYP = os.path.join(args.work_dir, "HYP")
RESULT = os.path.join(args.work_dir, "RESULT")
for io in [(args.ref, REF), (args.hyp, HYP)]:
with open(io[0], "r", encoding="utf8") as fi:
with open(io[1], "w+", encoding="utf8") as fo:
for line in fi:
line = line.strip()
if line:
cols = line.split()
text = asr_text_post_processing(" ".join(cols[0:-1]))
uttid_field = cols[-1]
print(f"{text} {uttid_field}", file=fo)
# GigaSpeech's uttid comforms to swb
os.system(f"sclite -r {REF} trn -h {HYP} trn -i swb | tee {RESULT}")


@ -108,6 +108,10 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    data/manifests/eval2000/eval2000_cuts_all.jsonl.gz \
    data/manifests/eval2000/eval2000_cuts_all_trimmed.jsonl.gz

  sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' \
    $eval2000_dir/LDC2002T43/reference/hub5e00.english.000405.stm > data/manifests/eval2000/stm
  cp $eval2000_dir/LDC2002T43/reference/en20000405_hub5.glm $dir/glm

  # ./local/rt03_data_prep.sh $rt03_dir
  # normalize eval2000 and rt03 texts by