From 11fe0004f4e5fd6f39610cd4d01da94fd6d75dba Mon Sep 17 00:00:00 2001
From: zr_jin <60612200+JinZr@users.noreply.github.com>
Date: Tue, 1 Aug 2023 17:15:01 +0800
Subject: [PATCH] minor updates

---
 egs/swbd/ASR/local/eval2000_data_prep.sh | 118 -----------------------
 egs/swbd/ASR/local/normalize_eval2000.py |   4 +
 egs/swbd/ASR/prepare.sh                  |  21 +++-
 3 files changed, 20 insertions(+), 123 deletions(-)
 delete mode 100755 egs/swbd/ASR/local/eval2000_data_prep.sh
diff --git a/egs/swbd/ASR/local/eval2000_data_prep.sh b/egs/swbd/ASR/local/eval2000_data_prep.sh
deleted file mode 100755
index a5bd3ebcf..000000000
--- a/egs/swbd/ASR/local/eval2000_data_prep.sh
+++ /dev/null
@@ -1,118 +0,0 @@
-#!/bin/bash
-
-# Hub-5 Eval 2000 data preparation
-# Author:  Arnab Ghoshal (Jan 2013)
-
-# To be run from one directory above this script.
-
-# The input is two directory names (possibly the same) containing the
-# 2000 Hub5 english evaluation test set and transcripts, which are
-# respectively: LDC2002S09  LDC2002T43
-# e.g. see
-# http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2002S09
-# http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002T43
-#
-# Example usage:
-# local/eval2000_data_prep_edin.sh /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000 /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000/transcr
-# The first directory ($sdir) contains the speech data, and the directory
-# $sdir/english/ must exist.
-# The second directory ($tdir) contains the transcripts, and the directory
-# $tdir/reference must exist; in particular we need the file
-# $tdir/reference/hub5e00.english.000405.stm
-
-if [ $# -ne 2 ]; then
-    echo "Usage: "$(basename $0)" <speech-dir> <transcription-dir>"
-    echo "See comments in the script for more details"
-    exit 1
-fi
-
-sdir=$1
-tdir=$2
-[ ! -d $sdir/english ] &&
-    echo Expecting directory $sdir/english to be present && exit 1
-[ -d $tdir/2000_hub5_eng_eval_tr ] &&
-    tdir=$tdir/2000_hub5_eng_eval_tr
-[ ! -d $tdir/reference ] &&
-    echo Expecting directory $tdir/reference to be present && exit 1
-
-dir=data/local/eval2000
-mkdir -p $dir
-
-find $sdir/english -iname '*.sph' | sort >$dir/sph.flist
-sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
-    >$dir/sph.scp
-
-sph2pipe=sph2pipe
-[ ! -x $sph2pipe ] &&
-    echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1
-
-awk -v sph2pipe=$sph2pipe '{
-            printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
-            printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
-        }' <$dir/sph.scp | sort >$dir/wav.scp || exit 1
-#side A - channel 1, side B - channel 2
-
-# Get segments file...
-# segments file format is: utt-id side-id start-time end-time, e.g.:
-# sw02001-A_000098-001156 sw02001-A 0.98 11.56
-pem=$sdir/english/hub5e_00.pem
-[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1
-# pem file has lines like:
-# en_4156 A unknown_speaker 301.85 302.48
-
-# we ignore the warnings below for now, although they seem to indicate some problems
-# with the data.
-grep -v ';;' $pem |
-    awk '{
-    spk=$1"-"$2;
-    utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
-    print utt,spk,$4,$5;}' |
-    sort -u | local/extend_segments.pl 0.1 >$dir/segments
-
-# stm file has lines like:
-# en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F>  HE IS A POLICE OFFICER
-# TODO(arnab): We should really be lowercasing this since the Edinburgh
-# recipe uses lowercase. This is not used in the actual scoring.
-grep -v ';;' $tdir/reference/hub5e00.english.000405.stm |
-    awk '{
-    spk=$1"-"$2;
-    utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
-    printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' |
-    sort >$dir/text.all
-
-# We'll use the stm file for sclite scoring.  There seem to be various errors
-# in the stm file that upset hubscr.pl, and we fix them here.
-sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' \
-    $tdir/reference/hub5e00.english.000405.stm >$dir/stm
-cp $tdir/reference/en20000405_hub5.glm $dir/glm
-
-# next line uses command substitution
-# Just checking that the segments are the same in pem vs. stm.
-! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) &&
-    echo "Segments from pem file and stm file do not match." && exit 1
-
-grep -v IGNORE_TIME_SEGMENT_ $dir/text.all >$dir/text
-
-# create an utt2spk file that assumes each conversation side is
-# a separate speaker.
-awk '{print $1,$2;}' $dir/segments >$dir/utt2spk
-utils/utt2spk_to_spk2utt.pl $dir/utt2spk >$dir/spk2utt
-
-# cp $dir/segments $dir/segments.tmp
-# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \
-#   $dir/segments.tmp > $dir/segments
-
-awk '{print $1}' $dir/wav.scp |
-    perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
-    print "$1-$2 $1 $2\n"; ' \
-        >$dir/reco2file_and_channel || exit 1
-
-echo Data preparation and formatting completed for Eval 2000
-echo "(but not MFCC extraction)"
-
-utils/fix_data_dir.sh $dir
-
-if [ $(wc -l <$dir/wav.scp) -ne 80 ]; then
-    echo "$0: error: expected 80 lines in wav.scp, got $(wc -l <$dir/wav.scp)"
-    exit 1
-fi
diff --git a/egs/swbd/ASR/local/normalize_eval2000.py b/egs/swbd/ASR/local/normalize_eval2000.py
index de19aa436..4de3ab0f8 100644
--- a/egs/swbd/ASR/local/normalize_eval2000.py
+++ b/egs/swbd/ASR/local/normalize_eval2000.py
@@ -177,6 +177,10 @@ def replace_silphone(text: str) -> str:
     text = text.replace("{SNORT}", " ")
     text = text.replace("{SHARP EXHALATION}", " ")
     text = text.replace("{BREATH LAUGH}", " ")
+
+    text = text.replace("[LAUGHTER]", " ")
+    text = text.replace("[NOISE]", " ")
+    text = text.replace("[VOCALIZED-NOISE]", " ")
     return text
 
 
diff --git a/egs/swbd/ASR/prepare.sh b/egs/swbd/ASR/prepare.sh
index 6940d7186..7c9156abe 100755
--- a/egs/swbd/ASR/prepare.sh
+++ b/egs/swbd/ASR/prepare.sh
@@ -24,8 +24,15 @@ stop_stage=100
 
 dl_dir=./download
 swbd1_dir="/export/corpora3/LDC/LDC97S62"
-eval2000_dir="/export/corpora2/LDC/LDC2002S09/hub5e_00"
-eval2000_ref_dir="/export/corpora2/LDC/LDC2002T43"
+
+# eval2000_dir contains the following files and directories
+# downloaded from the LDC website:
+#  - LDC2002S09
+#       - hub5e_00
+#  - LDC2002T43
+#       - 2000_hub5_eng_eval_tr
+eval2000_dir="/export/corpora2/LDC/eval2000"
+
 rt03_dir="/export/corpora/LDC/LDC2007S10"
 fisher_dir="/export/corpora3/LDC/LDC2004T19"
 
@@ -52,7 +59,7 @@ log() {
 }
 
 log "swbd1_dir: $swbd1_dir"
-log "eval2000_dir: $eval2000_dir $eval2000_ref_dir"
+log "eval2000_dir: $eval2000_dir"
 log "rt03_dir: $rt03_dir"
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
@@ -68,7 +75,11 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
         mv data/manifests_train/recordings.jsonl.gz data/manifests_train/swbd_recordings_all.jsonl.gz
         mv data/manifests_train/supervisions.jsonl.gz data/manifests_train/swbd_supervisions_all.jsonl.gz
 
-        ./local/eval2000_data_prep.sh $eval2000_dir $eval2000_ref_dir
+        lhotse prepare eval2000 "$eval2000_dir" data/manifests_eval2000
+        ./local/normalize_eval2000.py \
+            data/manifests_eval2000/eval2000_supervisions_unnorm.jsonl.gz \
+            data/manifests_eval2000/eval2000_supervisions.jsonl.gz
+
         ./local/rt03_data_prep.sh $rt03_dir
 
         # normalize eval2000 and rt03 texts by
@@ -76,7 +87,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
         # 2) remove tags (%AH) (%HESITATION) (%UH)
         # 3) remove <B_ASIDE> <E_ASIDE>
         # 4) remove "(" or ")"
-        for x in eval2000 rt03; do
+        for x in  rt03; do
             cp data/local/${x}/text data/local/${x}/text.org
             paste -d "" \
                 <(cut -f 1 -d" " data/local/${x}/text.org) \