diff --git a/egs/swbd/ASR/local/eval2000_data_prep.sh b/egs/swbd/ASR/local/eval2000_data_prep.sh index f88198328..a5bd3ebcf 100755 --- a/egs/swbd/ASR/local/eval2000_data_prep.sh +++ b/egs/swbd/ASR/local/eval2000_data_prep.sh @@ -21,100 +21,98 @@ # $tdir/reference/hub5e00.english.000405.stm if [ $# -ne 2 ]; then - echo "Usage: "`basename $0`" " + echo "Usage: "$(basename $0)" " echo "See comments in the script for more details" exit 1 fi sdir=$1 tdir=$2 -[ ! -d $sdir/english ] \ - && echo Expecting directory $sdir/english to be present && exit 1; - [ -d $tdir/2000_hub5_eng_eval_tr ] \ - && tdir=$tdir/2000_hub5_eng_eval_tr - [ ! -d $tdir/reference ] \ - && echo Expecting directory $tdir/reference to be present && exit 1; +[ ! -d $sdir/english ] && + echo Expecting directory $sdir/english to be present && exit 1 +[ -d $tdir/2000_hub5_eng_eval_tr ] && + tdir=$tdir/2000_hub5_eng_eval_tr +[ ! -d $tdir/reference ] && + echo Expecting directory $tdir/reference to be present && exit 1 +dir=data/local/eval2000 +mkdir -p $dir - dir=data/local/eval2000 - mkdir -p $dir +find $sdir/english -iname '*.sph' | sort >$dir/sph.flist +sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \ + >$dir/sph.scp - find $sdir/english -iname '*.sph' | sort > $dir/sph.flist - sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \ - > $dir/sph.scp +sph2pipe=sph2pipe +[ ! -x $sph2pipe ] && + echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1 - sph2pipe=sph2pipe - [ ! -x $sph2pipe ] \ - && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - - awk -v sph2pipe=$sph2pipe '{ +awk -v sph2pipe=$sph2pipe '{ printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); - }' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; - #side A - channel 1, side B - channel 2 + }' <$dir/sph.scp | sort >$dir/wav.scp || exit 1 +#side A - channel 1, side B - channel 2 # Get segments file... # segments file format is: utt-id side-id start-time end-time, e.g.: # sw02001-A_000098-001156 sw02001-A 0.98 11.56 pem=$sdir/english/hub5e_00.pem -[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1; +[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1 # pem file has lines like: # en_4156 A unknown_speaker 301.85 302.48 # we ignore the warnings below for now, although they seem to indicate some problems # with the data. -grep -v ';;' $pem \ - | awk '{ +grep -v ';;' $pem | + awk '{ spk=$1"-"$2; utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); - print utt,spk,$4,$5;}' \ - | sort -u | local/extend_segments.pl 0.1 > $dir/segments + print utt,spk,$4,$5;}' | + sort -u | local/extend_segments.pl 0.1 >$dir/segments # stm file has lines like: # en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER # TODO(arnab): We should really be lowercasing this since the Edinburgh # recipe uses lowercase. This is not used in the actual scoring. -grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ - | awk '{ +grep -v ';;' $tdir/reference/hub5e00.english.000405.stm | + awk '{ spk=$1"-"$2; utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); - printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \ - | sort > $dir/text.all + printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' | + sort >$dir/text.all # We'll use the stm file for sclite scoring. There seem to be various errors # in the stm file that upset hubscr.pl, and we fix them here. sed -e 's:((:(:' -e 's:::g' -e 's:::g' \ - $tdir/reference/hub5e00.english.000405.stm > $dir/stm - cp $tdir/reference/en20000405_hub5.glm $dir/glm + $tdir/reference/hub5e00.english.000405.stm >$dir/stm +cp $tdir/reference/en20000405_hub5.glm $dir/glm # next line uses command substitution # Just checking that the segments are the same in pem vs. stm. -! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \ - echo "Segments from pem file and stm file do not match." && exit 1; +! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && + echo "Segments from pem file and stm file do not match." && exit 1 -grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text +grep -v IGNORE_TIME_SEGMENT_ $dir/text.all >$dir/text # create an utt2spk file that assumes each conversation side is # a separate speaker. -awk '{print $1,$2;}' $dir/segments > $dir/utt2spk -utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt +awk '{print $1,$2;}' $dir/segments >$dir/utt2spk +utils/utt2spk_to_spk2utt.pl $dir/utt2spk >$dir/spk2utt # cp $dir/segments $dir/segments.tmp # awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \ # $dir/segments.tmp > $dir/segments -awk '{print $1}' $dir/wav.scp \ - | perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; +awk '{print $1}' $dir/wav.scp | + perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2\n"; ' \ - > $dir/reco2file_and_channel || exit 1; + >$dir/reco2file_and_channel || exit 1 +echo Data preparation and formatting completed for Eval 2000 +echo "(but not MFCC extraction)" - echo Data preparation and formatting completed for Eval 2000 - echo "(but not MFCC extraction)" +utils/fix_data_dir.sh $dir - utils/fix_data_dir.sh $dir - - if [ $(wc -l < $dir/wav.scp) -ne 80 ]; then - echo "$0: error: expected 80 lines in wav.scp, got $(wc -l < $dir/wav.scp)" - exit 1; - fi +if [ $(wc -l <$dir/wav.scp) -ne 80 ]; then + echo "$0: error: expected 80 lines in wav.scp, got $(wc -l <$dir/wav.scp)" + exit 1 +fi diff --git a/egs/swbd/ASR/local/rt03_data_prep.sh b/egs/swbd/ASR/local/rt03_data_prep.sh index 1545b7809..8a5f64324 100755 --- a/egs/swbd/ASR/local/rt03_data_prep.sh +++ b/egs/swbd/ASR/local/rt03_data_prep.sh @@ -15,30 +15,30 @@ if [ $# -ne 1 ]; then fi sdir=$1 -[ ! -d $sdir/data/audio/eval03/english/cts ] \ - && echo Expecting directory $sdir/data/audio/eval03/english/cts to be present && exit 1; - [ ! -d $sdir/data/references/eval03/english/cts ] \ - && echo Expecting directory $tdir/data/references/eval03/english/cts to be present && exit 1; +[ ! -d $sdir/data/audio/eval03/english/cts ] && + echo Expecting directory $sdir/data/audio/eval03/english/cts to be present && exit 1 +[ ! -d $sdir/data/references/eval03/english/cts ] && + echo Expecting directory $tdir/data/references/eval03/english/cts to be present && exit 1 - dir=data/local/rt03 - mkdir -p $dir +dir=data/local/rt03 +mkdir -p $dir - rtroot=$sdir - tdir=$sdir/data/references/eval03/english/cts - sdir=$sdir/data/audio/eval03/english/cts +rtroot=$sdir +tdir=$sdir/data/references/eval03/english/cts +sdir=$sdir/data/audio/eval03/english/cts - find -L $sdir -iname '*.sph' | sort > $dir/sph.flist - sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \ - > $dir/sph.scp +find -L $sdir -iname '*.sph' | sort >$dir/sph.flist +sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \ + >$dir/sph.scp - sph2pipe=sph2pipe - ! command -v "${sph2pipe}" &> /dev/null \ - && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; +sph2pipe=sph2pipe +! command -v "${sph2pipe}" &>/dev/null && + echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1 - awk -v sph2pipe=$sph2pipe '{ +awk -v sph2pipe=$sph2pipe '{ printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); -}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; +}' <$dir/sph.scp | sort >$dir/wav.scp || exit 1 #side A - channel 1, side B - channel 2 # Get segments file... @@ -50,58 +50,58 @@ sdir=$1 # en_4156 A unknown_speaker 301.85 302.48 #grep -v ';;' $pem \ -cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ - | awk '{ +cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap | + awk '{ spk=$1"-"(($2==1)?"A":"B"); utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); - print utt,spk,$4,$5;}' \ - | sort -u > $dir/segments + print utt,spk,$4,$5;}' | + sort -u >$dir/segments # stm file has lines like: # en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER # TODO(arnab): We should really be lowercasing this since the Edinburgh # recipe uses lowercase. This is not used in the actual scoring. #grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ -cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ - | awk '{ +cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap | + awk '{ spk=$1"-"(($2==1)?"A":"B"); utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); - printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \ - | sort > $dir/text.all + printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' | + sort >$dir/text.all # We'll use the stm file for sclite scoring. There seem to be various errors # in the stm file that upset hubscr.pl, and we fix them here. -cat $tdir/*.stm | \ - sed -e 's:((:(:' -e 's:::g' -e 's:::g' | \ - grep -v inter_segment_gap | \ +cat $tdir/*.stm | + sed -e 's:((:(:' -e 's:::g' -e 's:::g' | + grep -v inter_segment_gap | awk '{ - printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }'\ - > $dir/stm - #$tdir/reference/hub5e00.english.000405.stm > $dir/stm - cp $rtroot/data/trans_rules/en20030506.glm $dir/glm + printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }' \ + >$dir/stm +#$tdir/reference/hub5e00.english.000405.stm > $dir/stm +cp $rtroot/data/trans_rules/en20030506.glm $dir/glm # next line uses command substitution # Just checking that the segments are the same in pem vs. stm. -! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \ - echo "Segments from pem file and stm file do not match." && exit 1; +! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && + echo "Segments from pem file and stm file do not match." && exit 1 -grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text +grep -v IGNORE_TIME_SEGMENT_ $dir/text.all >$dir/text # create an utt2spk file that assumes each conversation side is # a separate speaker. -awk '{print $1,$2;}' $dir/segments > $dir/utt2spk -utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt +awk '{print $1,$2;}' $dir/segments >$dir/utt2spk +utils/utt2spk_to_spk2utt.pl $dir/utt2spk >$dir/spk2utt # cp $dir/segments $dir/segments.tmp # awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \ # $dir/segments.tmp > $dir/segments -awk '{print $1}' $dir/wav.scp \ - | perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; +awk '{print $1}' $dir/wav.scp | + perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2\n"; ' \ - > $dir/reco2file_and_channel || exit 1; + >$dir/reco2file_and_channel || exit 1 - ./utils/fix_data_dir.sh $dir +./utils/fix_data_dir.sh $dir - echo Data preparation and formatting completed for RT-03 - echo "(but not MFCC extraction)" +echo Data preparation and formatting completed for RT-03 +echo "(but not MFCC extraction)" diff --git a/egs/swbd/ASR/local/swbd1_data_prep.sh b/egs/swbd/ASR/local/swbd1_data_prep.sh index 002740354..159359491 100755 --- a/egs/swbd/ASR/local/swbd1_data_prep.sh +++ b/egs/swbd/ASR/local/swbd1_data_prep.sh @@ -17,11 +17,10 @@ ## will be using "find" to locate this file so we don't make any assumptions ## on the directory structure. (Peng Qi, Aug 2014) - #check existing directories if [ $# != 1 -a $# != 2 ]; then echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]" - exit 1; + exit 1 fi SWBD_DIR=$1 @@ -29,29 +28,27 @@ SWBD_DIR=$1 dir=data/local/train mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" - exit 1; + exit 1 fi sph2pipe=sph2pipe -! command -v "${sph2pipe}" &> /dev/null \ - && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; +! command -v "${sph2pipe}" &>/dev/null && + echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1 # Option A: SWBD dictionary file check -[ ! -f ./swb_ms98_transcriptions/sw-ms98-dict.text ] && \ - echo "SWBD dictionary file does not exist" && exit 1; +[ ! -f ./swb_ms98_transcriptions/sw-ms98-dict.text ] && + echo "SWBD dictionary file does not exist" && exit 1 # find sph audio files -find -L $SWBD_DIR -iname '*.sph' | sort > $dir/sph.flist +find -L $SWBD_DIR -iname '*.sph' | sort >$dir/sph.flist -n=`cat $dir/sph.flist | wc -l` -[ $n -ne 2435 ] && [ $n -ne 2438 ] && \ +n=$(cat $dir/sph.flist | wc -l) +[ $n -ne 2435 ] && [ $n -ne 2438 ] && echo Warning: expected 2435 or 2438 data data files, found $n - # (1a) Transcriptions preparation # make basic transcription file (add segments info) # **NOTE: In the default Kaldi recipe, everything is made uppercase, while we @@ -64,11 +61,11 @@ stime=$2; etime=$3; printf("%s-%s_%06.0f-%06.0f", name, side, int(100*stime+0.5), int(100*etime+0.5)); for(i=4;i<=NF;i++) printf(" %s", $i); printf "\n" -}' ./swb_ms98_transcriptions/*/*/*-trans.text > $dir/transcripts1.txt +}' ./swb_ms98_transcriptions/*/*/*-trans.text >$dir/transcripts1.txt # test if trans. file is sorted -export LC_ALL=C; -sort -c $dir/transcripts1.txt || exit 1; # check it's sorted. +export LC_ALL=C +sort -c $dir/transcripts1.txt || exit 1 # check it's sorted. # Remove SILENCE, and . @@ -77,22 +74,21 @@ sort -c $dir/transcripts1.txt || exit 1; # check it's sorted. # speech to somone; we will give phones to the other three (NSN, SPN, LAU). # There will also be a silence phone, SIL. # **NOTE: modified the pattern matches to make them case insensitive -cat $dir/transcripts1.txt \ - | perl -ane 's:\s\[SILENCE\](\s|$):$1:gi; +cat $dir/transcripts1.txt | + perl -ane 's:\s\[SILENCE\](\s|$):$1:gi; s///gi; s///gi; - print;' \ - | awk '{if(NF > 1) { print; } } ' > $dir/transcripts2.txt - + print;' | + awk '{if(NF > 1) { print; } } ' >$dir/transcripts2.txt # **NOTE: swbd1_map_words.pl has been modified to make the pattern matches # case insensitive -local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text +local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt >$dir/text # format acronyms in text python3 local/map_acronyms_transcripts.py -i $dir/text -o $dir/text_map \ -M data/local/dict_nosp/acronyms.map - mv $dir/text_map $dir/text +mv $dir/text_map $dir/text # (1c) Make segment files from transcript #segments file format is: utt-id side-id start-time end-time, e.g.: @@ -102,15 +98,15 @@ segment=$1; split(segment,S,"[_-]"); side=S[2]; audioname=S[1]; startf=S[3]; endf=S[4]; print segment " " audioname "-" side " " startf/100 " " endf/100 -}' < $dir/text > $dir/segments +}' <$dir/text >$dir/segments sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \ - > $dir/sph.scp + >$dir/sph.scp awk -v sph2pipe=$sph2pipe '{ printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); -}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; +}' <$dir/sph.scp | sort >$dir/wav.scp || exit 1 #side A - channel 1, side B - channel 2 # this file reco2file_and_channel maps recording-id (e.g. sw02001-A) @@ -118,15 +114,15 @@ printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); # sw02001-A sw02001 A # In this case it's trivial, but in other corpora the information might # be less obvious. Later it will be needed for ctm scoring. -awk '{print $1}' $dir/wav.scp \ - | perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; +awk '{print $1}' $dir/wav.scp | + perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2\n"; ' \ - > $dir/reco2file_and_channel || exit 1; + >$dir/reco2file_and_channel || exit 1 - awk '{spk=substr($1,1,9); print $1 " " spk}' $dir/segments > $dir/utt2spk \ - || exit 1; - sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; +awk '{spk=substr($1,1,9); print $1 " " spk}' $dir/segments >$dir/utt2spk || + exit 1 +sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl >$dir/spk2utt || exit 1 - echo Switchboard-1 data preparation succeeded. +echo Switchboard-1 data preparation succeeded. - utils/fix_data_dir.sh data/local/train +utils/fix_data_dir.sh data/local/train diff --git a/egs/swbd/ASR/local/swbd1_prepare_dict.sh b/egs/swbd/ASR/local/swbd1_prepare_dict.sh index 0c38a72dc..0bb98903f 100755 --- a/egs/swbd/ASR/local/swbd1_prepare_dict.sh +++ b/egs/swbd/ASR/local/swbd1_prepare_dict.sh @@ -5,32 +5,36 @@ # To be run from one directory above this script. - #check existing directories -[ $# != 0 ] && echo "Usage: local/swbd1_data_prep.sh" && exit 1; +[ $# != 0 ] && echo "Usage: local/swbd1_data_prep.sh" && exit 1 -srcdir=. # This is where we downloaded some stuff.. +srcdir=. # This is where we downloaded some stuff.. dir=./data/local/dict_nosp mkdir -p $dir srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text # assume swbd_p1_data_prep.sh was done already. -[ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1; +[ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1 -cp $srcdict $dir/lexicon0.txt || exit 1; +cp $srcdict $dir/lexicon0.txt || exit 1 chmod a+w $dir/lexicon0.txt -patch 0' | sort > $dir/lexicon1.txt || exit 1; +grep -v '^#' $dir/lexicon0.txt | awk 'NF>0' | sort >$dir/lexicon1.txt || exit 1 -cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ - grep -v sil > $dir/nonsilence_phones.txt || exit 1; +cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | + grep -v sil >$dir/nonsilence_phones.txt || exit 1 -( echo sil; echo spn; echo nsn; echo lau ) > $dir/silence_phones.txt +( + echo sil + echo spn + echo nsn + echo lau +) >$dir/silence_phones.txt -echo sil > $dir/optional_silence.txt +echo sil >$dir/optional_silence.txt # No "extra questions" in the input to this setup, as we don't # have stress or tone. @@ -41,9 +45,14 @@ cp local/MSU_single_letter.txt $dir/ # Add single letter lexicon # The original swbd lexicon does not have precise single letter lexicion # e.g. it does not have entry of W -( echo '!sil sil'; echo '[vocalized-noise] spn'; echo '[noise] nsn'; \ - echo '[laughter] lau'; echo ' spn' ) \ - | cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt > $dir/lexicon2.txt || exit 1; +( + echo '!sil sil' + echo '[vocalized-noise] spn' + echo '[noise] nsn' + echo '[laughter] lau' + echo ' spn' +) | + cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt >$dir/lexicon2.txt || exit 1 # Map the words in the lexicon. That is-- for each word in the lexicon, we map it # to a new written form. The transformations we do are: @@ -77,16 +86,16 @@ cp local/MSU_single_letter.txt $dir/ # in the lexicon. local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt | sort -u \ - > $dir/lexicon3.txt || exit 1; + >$dir/lexicon3.txt || exit 1 python3 local/format_acronyms_dict.py -i $dir/lexicon3.txt -o $dir/lexicon4.txt \ -L $dir/MSU_single_letter.txt -M $dir/acronyms_raw.map - cat $dir/acronyms_raw.map | sort -u > $dir/acronyms.map +cat $dir/acronyms_raw.map | sort -u >$dir/acronyms.map - ( echo 'i ay' )| cat - $dir/lexicon4.txt | tr '[A-Z]' '[a-z]' | sort -u > $dir/lexicon5.txt +(echo 'i ay') | cat - $dir/lexicon4.txt | tr '[A-Z]' '[a-z]' | sort -u >$dir/lexicon5.txt - pushd $dir >&/dev/null - ln -sf lexicon5.txt lexicon.txt # This is the final lexicon. - popd >&/dev/null - rm $dir/lexiconp.txt 2>/dev/null - echo Prepared input dictionary and phone-sets for Switchboard phase 1. +pushd $dir >&/dev/null +ln -sf lexicon5.txt lexicon.txt # This is the final lexicon. +popd >&/dev/null +rm $dir/lexiconp.txt 2>/dev/null +echo Prepared input dictionary and phone-sets for Switchboard phase 1. diff --git a/egs/swbd/ASR/prepare.sh b/egs/swbd/ASR/prepare.sh index ae0b278b1..6ac7dbf9e 100755 --- a/egs/swbd/ASR/prepare.sh +++ b/egs/swbd/ASR/prepare.sh @@ -78,28 +78,28 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then cp data/local/${x}/text data/local/${x}/text.org paste -d "" \ <(cut -f 1 -d" " data/local/${x}/text.org) \ - <(awk '{$1=""; print tolower($0)}' data/local/${x}/text.org | perl -pe 's| \(\%.*\)||g' | perl -pe 's| \<.*\>||g' | sed -e "s/(//g" -e "s/)//g") \ - | sed -e 's/\s\+/ /g' > data/local/${x}/text - rm data/local/${x}/text.org - done + <(awk '{$1=""; print tolower($0)}' data/local/${x}/text.org | perl -pe 's| \(\%.*\)||g' | perl -pe 's| \<.*\>||g' | sed -e "s/(//g" -e "s/)//g") | + sed -e 's/\s\+/ /g' >data/local/${x}/text + rm data/local/${x}/text.org + done - python ./local/filter_empty_text.py --kaldi-data-dir data/local/eval2000 - ./utils/fix_data_dir.sh data/local/eval2000 - lhotse kaldi import data/local/eval2000 8000 data/manifests_eval2000 - mv data/manifests_eval2000/recordings.jsonl.gz data/manifests_eval2000/swbd_recordings_eval2000.jsonl.gz - mv data/manifests_eval2000/supervisions.jsonl.gz data/manifests_eval2000/swbd_supervisions_eval2000.jsonl.gz + python ./local/filter_empty_text.py --kaldi-data-dir data/local/eval2000 + ./utils/fix_data_dir.sh data/local/eval2000 + lhotse kaldi import data/local/eval2000 8000 data/manifests_eval2000 + mv data/manifests_eval2000/recordings.jsonl.gz data/manifests_eval2000/swbd_recordings_eval2000.jsonl.gz + mv data/manifests_eval2000/supervisions.jsonl.gz data/manifests_eval2000/swbd_supervisions_eval2000.jsonl.gz - python ./local/filter_empty_text.py --kaldi-data-dir data/local/rt03 - ./utils/fix_data_dir.sh data/local/rt03 - lhotse kaldi import data/local/rt03 8000 data/manifests_rt03 - mv data/manifests_rt03/recordings.jsonl.gz data/manifests_rt03/swbd_recordings_rt03.jsonl.gz - mv data/manifests_rt03/supervisions.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz + python ./local/filter_empty_text.py --kaldi-data-dir data/local/rt03 + ./utils/fix_data_dir.sh data/local/rt03 + lhotse kaldi import data/local/rt03 8000 data/manifests_rt03 + mv data/manifests_rt03/recordings.jsonl.gz data/manifests_rt03/swbd_recordings_rt03.jsonl.gz + mv data/manifests_rt03/supervisions.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz - lhotse fix data/manifests_train/swbd_recordings_all.jsonl.gz data/manifests_train/swbd_supervisions_all.jsonl.gz data/manifests - lhotse fix data/manifests_eval2000/swbd_recordings_eval2000.jsonl.gz data/manifests_eval2000/swbd_supervisions_eval2000.jsonl.gz data/manifests - lhotse fix data/manifests_rt03/swbd_recordings_rt03.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz data/manifests + lhotse fix data/manifests_train/swbd_recordings_all.jsonl.gz data/manifests_train/swbd_supervisions_all.jsonl.gz data/manifests + lhotse fix data/manifests_eval2000/swbd_recordings_eval2000.jsonl.gz data/manifests_eval2000/swbd_supervisions_eval2000.jsonl.gz data/manifests + lhotse fix data/manifests_rt03/swbd_recordings_rt03.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz data/manifests - touch data/manifests/.swbd.done + touch data/manifests/.swbd.done fi fi @@ -260,11 +260,11 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then -ngram-order 3 \ -text ${lang_dir}/input.txt \ -lm data/lm/3-gram.arpa - python3 -m kaldilm \ - --read-symbol-table="data/lang_phone/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - data/lm/3-gram.arpa >data/lm/G_3_gram.fst.txt + python3 -m kaldilm \ + --read-symbol-table="data/lang_phone/words.txt" \ + --disambig-symbol='#0' \ + --max-order=3 \ + data/lm/3-gram.arpa >data/lm/G_3_gram.fst.txt fi if [ ! -f data/lm/G_4_gram.fst.txt ]; then @@ -273,11 +273,11 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then -ngram-order 4 \ -text ${lang_dir}/input.txt \ -lm data/lm/4-gram.arpa - python3 -m kaldilm \ - --read-symbol-table="data/lang_phone/words.txt" \ - --disambig-symbol='#0' \ - --max-order=4 \ - data/lm/4-gram.arpa >data/lm/G_4_gram.fst.txt + python3 -m kaldilm \ + --read-symbol-table="data/lang_phone/words.txt" \ + --disambig-symbol='#0' \ + --max-order=4 \ + data/lm/4-gram.arpa >data/lm/G_4_gram.fst.txt fi fi @@ -325,7 +325,7 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then --bpe-model $lang_dir/bpe.model \ --lm-data data/lang_phone/input.txt \ --lm-archive $out_dir/lm_data.pt - done + done fi # if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then @@ -373,8 +373,8 @@ if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then --bpe-model $lang_dir/bpe.model \ --lm-data $out_dir/${testset}.txt \ --lm-archive $out_dir/lm_data-${testset}.pt - done done + done fi if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then @@ -393,11 +393,11 @@ if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then --in-lm-data $out_dir/lm_data.pt \ --out-lm-data $out_dir/sorted_lm_data.pt \ --out-statistics $out_dir/statistics.txt - for testset in ${testsets[@]}; do - ./local/sort_lm_training_data.py \ - --in-lm-data $out_dir/lm_data-${testset}.pt \ - --out-lm-data $out_dir/sorted_lm_data-${testset}.pt \ - --out-statistics $out_dir/statistics-test-${testset}.txt - done - done + for testset in ${testsets[@]}; do + ./local/sort_lm_training_data.py \ + --in-lm-data $out_dir/lm_data-${testset}.pt \ + --out-lm-data $out_dir/sorted_lm_data-${testset}.pt \ + --out-statistics $out_dir/statistics-test-${testset}.txt + done + done fi