From 11fe0004f4e5fd6f39610cd4d01da94fd6d75dba Mon Sep 17 00:00:00 2001 From: zr_jin <60612200+JinZr@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:15:01 +0800 Subject: [PATCH] minor updates --- egs/swbd/ASR/local/eval2000_data_prep.sh | 118 ----------------------- egs/swbd/ASR/local/normalize_eval2000.py | 4 + egs/swbd/ASR/prepare.sh | 21 +++- 3 files changed, 20 insertions(+), 123 deletions(-) delete mode 100755 egs/swbd/ASR/local/eval2000_data_prep.sh diff --git a/egs/swbd/ASR/local/eval2000_data_prep.sh b/egs/swbd/ASR/local/eval2000_data_prep.sh deleted file mode 100755 index a5bd3ebcf..000000000 --- a/egs/swbd/ASR/local/eval2000_data_prep.sh +++ /dev/null @@ -1,118 +0,0 @@ -#!/bin/bash - -# Hub-5 Eval 2000 data preparation -# Author: Arnab Ghoshal (Jan 2013) - -# To be run from one directory above this script. - -# The input is two directory names (possibly the same) containing the -# 2000 Hub5 english evaluation test set and transcripts, which are -# respectively: LDC2002S09 LDC2002T43 -# e.g. see -# http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2002S09 -# http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002T43 -# -# Example usage: -# local/eval2000_data_prep_edin.sh /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000 /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000/transcr -# The first directory ($sdir) contains the speech data, and the directory -# $sdir/english/ must exist. -# The second directory ($tdir) contains the transcripts, and the directory -# $tdir/reference must exist; in particular we need the file -# $tdir/reference/hub5e00.english.000405.stm - -if [ $# -ne 2 ]; then - echo "Usage: "$(basename $0)" <speech-dir> <transcription-dir>" - echo "See comments in the script for more details" - exit 1 -fi - -sdir=$1 -tdir=$2 -[ ! -d $sdir/english ] && - echo Expecting directory $sdir/english to be present && exit 1 -[ -d $tdir/2000_hub5_eng_eval_tr ] && - tdir=$tdir/2000_hub5_eng_eval_tr -[ !
-d $tdir/reference ] && - echo Expecting directory $tdir/reference to be present && exit 1 - -dir=data/local/eval2000 -mkdir -p $dir - -find $sdir/english -iname '*.sph' | sort >$dir/sph.flist -sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \ - >$dir/sph.scp - -sph2pipe=sph2pipe -[ ! -x $sph2pipe ] && - echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1 - -awk -v sph2pipe=$sph2pipe '{ - printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); - printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); - }' <$dir/sph.scp | sort >$dir/wav.scp || exit 1 -#side A - channel 1, side B - channel 2 - -# Get segments file... -# segments file format is: utt-id side-id start-time end-time, e.g.: -# sw02001-A_000098-001156 sw02001-A 0.98 11.56 -pem=$sdir/english/hub5e_00.pem -[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1 -# pem file has lines like: -# en_4156 A unknown_speaker 301.85 302.48 - -# we ignore the warnings below for now, although they seem to indicate some problems -# with the data. -grep -v ';;' $pem | - awk '{ - spk=$1"-"$2; - utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); - print utt,spk,$4,$5;}' | - sort -u | local/extend_segments.pl 0.1 >$dir/segments - -# stm file has lines like: -# en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER -# TODO(arnab): We should really be lowercasing this since the Edinburgh -# recipe uses lowercase. This is not used in the actual scoring. -grep -v ';;' $tdir/reference/hub5e00.english.000405.stm | - awk '{ - spk=$1"-"$2; - utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); - printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' | - sort >$dir/text.all - -# We'll use the stm file for sclite scoring. There seem to be various errors -# in the stm file that upset hubscr.pl, and we fix them here.
-sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' \ - $tdir/reference/hub5e00.english.000405.stm >$dir/stm -cp $tdir/reference/en20000405_hub5.glm $dir/glm - -# next line uses command substitution -# Just checking that the segments are the same in pem vs. stm. -! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && - echo "Segments from pem file and stm file do not match." && exit 1 - -grep -v IGNORE_TIME_SEGMENT_ $dir/text.all >$dir/text - -# create an utt2spk file that assumes each conversation side is -# a separate speaker. -awk '{print $1,$2;}' $dir/segments >$dir/utt2spk -utils/utt2spk_to_spk2utt.pl $dir/utt2spk >$dir/spk2utt - -# cp $dir/segments $dir/segments.tmp -# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \ -# $dir/segments.tmp > $dir/segments - -awk '{print $1}' $dir/wav.scp | - perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; - print "$1-$2 $1 $2\n"; ' \ - >$dir/reco2file_and_channel || exit 1 - -echo Data preparation and formatting completed for Eval 2000 -echo "(but not MFCC extraction)" - -utils/fix_data_dir.sh $dir - -if [ $(wc -l <$dir/wav.scp) -ne 80 ]; then - echo "$0: error: expected 80 lines in wav.scp, got $(wc -l <$dir/wav.scp)" - exit 1 -fi diff --git a/egs/swbd/ASR/local/normalize_eval2000.py b/egs/swbd/ASR/local/normalize_eval2000.py index de19aa436..4de3ab0f8 100644 --- a/egs/swbd/ASR/local/normalize_eval2000.py +++ b/egs/swbd/ASR/local/normalize_eval2000.py @@ -177,6 +177,10 @@ def replace_silphone(text: str) -> str: text = text.replace("{SNORT}", " ") text = text.replace("{SHARP EXHALATION}", " ") text = text.replace("{BREATH LAUGH}", " ") + + text = text.replace("[LAUGHTER]", " ") + text = text.replace("[NOISE]", " ") + text = text.replace("[VOCALIZED-NOISE]", " ") return text diff --git a/egs/swbd/ASR/prepare.sh b/egs/swbd/ASR/prepare.sh index 6940d7186..7c9156abe 100755 --- a/egs/swbd/ASR/prepare.sh +++ b/egs/swbd/ASR/prepare.sh @@ -24,8 +24,15 @@ stop_stage=100 dl_dir=./download
swbd1_dir="/export/corpora3/LDC/LDC97S62" -eval2000_dir="/export/corpora2/LDC/LDC2002S09/hub5e_00" -eval2000_ref_dir="/export/corpora2/LDC/LDC2002T43" + +# eval2000_dir contains the following files and directories +# downloaded from LDC website: +# - LDC2002S09 +# - hub5e_00 +# - LDC2002T43 +# - 2000_hub5_eng_eval_tr +eval2000_dir="/export/corpora2/LDC/eval2000" + rt03_dir="/export/corpora/LDC/LDC2007S10" fisher_dir="/export/corpora3/LDC/LDC2004T19" @@ -52,7 +59,7 @@ log() { } log "swbd1_dir: $swbd1_dir" -log "eval2000_dir: $eval2000_dir $eval2000_ref_dir" +log "eval2000_dir: $eval2000_dir" log "rt03_dir: $rt03_dir" if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then @@ -68,7 +75,11 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then mv data/manifests_train/recordings.jsonl.gz data/manifests_train/swbd_recordings_all.jsonl.gz mv data/manifests_train/supervisions.jsonl.gz data/manifests_train/swbd_supervisions_all.jsonl.gz - ./local/eval2000_data_prep.sh $eval2000_dir $eval2000_ref_dir + lhotse prepare eval2000 $eval2000_dir data/manifests_eval2000 + ./local/normalize_eval2000.py \ + data/manifests_eval2000/eval2000_supervisions_unnorm.jsonl.gz \ + data/manifests_eval2000/eval2000_supervisions.jsonl.gz + ./local/rt03_data_prep.sh $rt03_dir # normalize eval2000 and rt03 texts by @@ -76,7 +87,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then # 2) remove tags (%AH) (%HESITATION) (%UH) # 3) remove <B_ASIDE> <E_ASIDE> # 4) remove "(" or ")" - for x in eval2000 rt03; do + for x in rt03; do cp data/local/${x}/text data/local/${x}/text.org paste -d "" \ <(cut -f 1 -d" " data/local/${x}/text.org) \