Fixed formatting issues in bash scripts.

This commit is contained in:
jinzr 2023-06-26 19:32:50 +08:00
parent 96738b538a
commit 439855e3f3
5 changed files with 184 additions and 181 deletions

View File

@ -21,20 +21,19 @@
# $tdir/reference/hub5e00.english.000405.stm # $tdir/reference/hub5e00.english.000405.stm
if [ $# -ne 2 ]; then if [ $# -ne 2 ]; then
echo "Usage: "`basename $0`" <speech-dir> <transcription-dir>" echo "Usage: "$(basename $0)" <speech-dir> <transcription-dir>"
echo "See comments in the script for more details" echo "See comments in the script for more details"
exit 1 exit 1
fi fi
sdir=$1 sdir=$1
tdir=$2 tdir=$2
[ ! -d $sdir/english ] \ [ ! -d $sdir/english ] &&
&& echo Expecting directory $sdir/english to be present && exit 1; echo Expecting directory $sdir/english to be present && exit 1
[ -d $tdir/2000_hub5_eng_eval_tr ] \ [ -d $tdir/2000_hub5_eng_eval_tr ] &&
&& tdir=$tdir/2000_hub5_eng_eval_tr tdir=$tdir/2000_hub5_eng_eval_tr
[ ! -d $tdir/reference ] \ [ ! -d $tdir/reference ] &&
&& echo Expecting directory $tdir/reference to be present && exit 1; echo Expecting directory $tdir/reference to be present && exit 1
dir=data/local/eval2000 dir=data/local/eval2000
mkdir -p $dir mkdir -p $dir
@ -44,42 +43,42 @@ tdir=$2
>$dir/sph.scp >$dir/sph.scp
sph2pipe=sph2pipe sph2pipe=sph2pipe
[ ! -x $sph2pipe ] \ [ ! -x $sph2pipe ] &&
&& echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1
awk -v sph2pipe=$sph2pipe '{ awk -v sph2pipe=$sph2pipe '{
printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; }' <$dir/sph.scp | sort >$dir/wav.scp || exit 1
#side A - channel 1, side B - channel 2 #side A - channel 1, side B - channel 2
# Get segments file... # Get segments file...
# segments file format is: utt-id side-id start-time end-time, e.g.: # segments file format is: utt-id side-id start-time end-time, e.g.:
# sw02001-A_000098-001156 sw02001-A 0.98 11.56 # sw02001-A_000098-001156 sw02001-A 0.98 11.56
pem=$sdir/english/hub5e_00.pem pem=$sdir/english/hub5e_00.pem
[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1; [ ! -f $pem ] && echo "$0: No such file $pem" && exit 1
# pem file has lines like: # pem file has lines like:
# en_4156 A unknown_speaker 301.85 302.48 # en_4156 A unknown_speaker 301.85 302.48
# we ignore the warnings below for now, although they seem to indicate some problems # we ignore the warnings below for now, although they seem to indicate some problems
# with the data. # with the data.
grep -v ';;' $pem \ grep -v ';;' $pem |
| awk '{ awk '{
spk=$1"-"$2; spk=$1"-"$2;
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
print utt,spk,$4,$5;}' \ print utt,spk,$4,$5;}' |
| sort -u | local/extend_segments.pl 0.1 > $dir/segments sort -u | local/extend_segments.pl 0.1 >$dir/segments
# stm file has lines like: # stm file has lines like:
# en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER # en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER
# TODO(arnab): We should really be lowercasing this since the Edinburgh # TODO(arnab): We should really be lowercasing this since the Edinburgh
# recipe uses lowercase. This is not used in the actual scoring. # recipe uses lowercase. This is not used in the actual scoring.
grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ grep -v ';;' $tdir/reference/hub5e00.english.000405.stm |
| awk '{ awk '{
spk=$1"-"$2; spk=$1"-"$2;
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \ printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' |
| sort > $dir/text.all sort >$dir/text.all
# We'll use the stm file for sclite scoring. There seem to be various errors # We'll use the stm file for sclite scoring. There seem to be various errors
# in the stm file that upset hubscr.pl, and we fix them here. # in the stm file that upset hubscr.pl, and we fix them here.
@ -89,8 +88,8 @@ sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' \
# next line uses process substitution # next line uses process substitution
# Just checking that the segments are the same in pem vs. stm. # Just checking that the segments are the same in pem vs. stm.
! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \ ! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) &&
echo "Segments from pem file and stm file do not match." && exit 1; echo "Segments from pem file and stm file do not match." && exit 1
grep -v IGNORE_TIME_SEGMENT_ $dir/text.all >$dir/text grep -v IGNORE_TIME_SEGMENT_ $dir/text.all >$dir/text
@ -103,11 +102,10 @@ utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \ # awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \
# $dir/segments.tmp > $dir/segments # $dir/segments.tmp > $dir/segments
awk '{print $1}' $dir/wav.scp \ awk '{print $1}' $dir/wav.scp |
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
print "$1-$2 $1 $2\n"; ' \ print "$1-$2 $1 $2\n"; ' \
> $dir/reco2file_and_channel || exit 1; >$dir/reco2file_and_channel || exit 1
echo Data preparation and formatting completed for Eval 2000 echo Data preparation and formatting completed for Eval 2000
echo "(but not MFCC extraction)" echo "(but not MFCC extraction)"
@ -116,5 +114,5 @@ awk '{print $1}' $dir/wav.scp \
if [ $(wc -l <$dir/wav.scp) -ne 80 ]; then if [ $(wc -l <$dir/wav.scp) -ne 80 ]; then
echo "$0: error: expected 80 lines in wav.scp, got $(wc -l <$dir/wav.scp)" echo "$0: error: expected 80 lines in wav.scp, got $(wc -l <$dir/wav.scp)"
exit 1; exit 1
fi fi

View File

@ -15,10 +15,10 @@ if [ $# -ne 1 ]; then
fi fi
sdir=$1 sdir=$1
[ ! -d $sdir/data/audio/eval03/english/cts ] \ [ ! -d $sdir/data/audio/eval03/english/cts ] &&
&& echo Expecting directory $sdir/data/audio/eval03/english/cts to be present && exit 1; echo Expecting directory $sdir/data/audio/eval03/english/cts to be present && exit 1
[ ! -d $sdir/data/references/eval03/english/cts ] \ [ ! -d $sdir/data/references/eval03/english/cts ] &&
&& echo Expecting directory $tdir/data/references/eval03/english/cts to be present && exit 1; echo Expecting directory $tdir/data/references/eval03/english/cts to be present && exit 1
dir=data/local/rt03 dir=data/local/rt03
mkdir -p $dir mkdir -p $dir
@ -32,13 +32,13 @@ sdir=$1
>$dir/sph.scp >$dir/sph.scp
sph2pipe=sph2pipe sph2pipe=sph2pipe
! command -v "${sph2pipe}" &> /dev/null \ ! command -v "${sph2pipe}" &>/dev/null &&
&& echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1
awk -v sph2pipe=$sph2pipe '{ awk -v sph2pipe=$sph2pipe '{
printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; }' <$dir/sph.scp | sort >$dir/wav.scp || exit 1
#side A - channel 1, side B - channel 2 #side A - channel 1, side B - channel 2
# Get segments file... # Get segments file...
@ -50,30 +50,30 @@ sdir=$1
# en_4156 A unknown_speaker 301.85 302.48 # en_4156 A unknown_speaker 301.85 302.48
#grep -v ';;' $pem \ #grep -v ';;' $pem \
cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap |
| awk '{ awk '{
spk=$1"-"(($2==1)?"A":"B"); spk=$1"-"(($2==1)?"A":"B");
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
print utt,spk,$4,$5;}' \ print utt,spk,$4,$5;}' |
| sort -u > $dir/segments sort -u >$dir/segments
# stm file has lines like: # stm file has lines like:
# en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER # en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER
# TODO(arnab): We should really be lowercasing this since the Edinburgh # TODO(arnab): We should really be lowercasing this since the Edinburgh
# recipe uses lowercase. This is not used in the actual scoring. # recipe uses lowercase. This is not used in the actual scoring.
#grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ #grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \
cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap |
| awk '{ awk '{
spk=$1"-"(($2==1)?"A":"B"); spk=$1"-"(($2==1)?"A":"B");
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \ printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' |
| sort > $dir/text.all sort >$dir/text.all
# We'll use the stm file for sclite scoring. There seem to be various errors # We'll use the stm file for sclite scoring. There seem to be various errors
# in the stm file that upset hubscr.pl, and we fix them here. # in the stm file that upset hubscr.pl, and we fix them here.
cat $tdir/*.stm | \ cat $tdir/*.stm |
sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' | \ sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' |
grep -v inter_segment_gap | \ grep -v inter_segment_gap |
awk '{ awk '{
printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }' \ printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }' \
>$dir/stm >$dir/stm
@ -82,8 +82,8 @@ cat $tdir/*.stm | \
# next line uses process substitution # next line uses process substitution
# Just checking that the segments are the same in pem vs. stm. # Just checking that the segments are the same in pem vs. stm.
! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \ ! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) &&
echo "Segments from pem file and stm file do not match." && exit 1; echo "Segments from pem file and stm file do not match." && exit 1
grep -v IGNORE_TIME_SEGMENT_ $dir/text.all >$dir/text grep -v IGNORE_TIME_SEGMENT_ $dir/text.all >$dir/text
@ -96,10 +96,10 @@ utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \ # awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \
# $dir/segments.tmp > $dir/segments # $dir/segments.tmp > $dir/segments
awk '{print $1}' $dir/wav.scp \ awk '{print $1}' $dir/wav.scp |
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
print "$1-$2 $1 $2\n"; ' \ print "$1-$2 $1 $2\n"; ' \
> $dir/reco2file_and_channel || exit 1; >$dir/reco2file_and_channel || exit 1
./utils/fix_data_dir.sh $dir ./utils/fix_data_dir.sh $dir

View File

@ -17,11 +17,10 @@
## will be using "find" to locate this file so we don't make any assumptions ## will be using "find" to locate this file so we don't make any assumptions
## on the directory structure. (Peng Qi, Aug 2014) ## on the directory structure. (Peng Qi, Aug 2014)
#check existing directories #check existing directories
if [ $# != 1 -a $# != 2 ]; then if [ $# != 1 -a $# != 2 ]; then
echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]" echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]"
exit 1; exit 1
fi fi
SWBD_DIR=$1 SWBD_DIR=$1
@ -29,29 +28,27 @@ SWBD_DIR=$1
dir=data/local/train dir=data/local/train
mkdir -p $dir mkdir -p $dir
# Audio data directory check # Audio data directory check
if [ ! -d $SWBD_DIR ]; then if [ ! -d $SWBD_DIR ]; then
echo "Error: run.sh requires a directory argument" echo "Error: run.sh requires a directory argument"
exit 1; exit 1
fi fi
sph2pipe=sph2pipe sph2pipe=sph2pipe
! command -v "${sph2pipe}" &> /dev/null \ ! command -v "${sph2pipe}" &>/dev/null &&
&& echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1
# Option A: SWBD dictionary file check # Option A: SWBD dictionary file check
[ ! -f ./swb_ms98_transcriptions/sw-ms98-dict.text ] && \ [ ! -f ./swb_ms98_transcriptions/sw-ms98-dict.text ] &&
echo "SWBD dictionary file does not exist" && exit 1; echo "SWBD dictionary file does not exist" && exit 1
# find sph audio files # find sph audio files
find -L $SWBD_DIR -iname '*.sph' | sort >$dir/sph.flist find -L $SWBD_DIR -iname '*.sph' | sort >$dir/sph.flist
n=`cat $dir/sph.flist | wc -l` n=$(cat $dir/sph.flist | wc -l)
[ $n -ne 2435 ] && [ $n -ne 2438 ] && \ [ $n -ne 2435 ] && [ $n -ne 2438 ] &&
echo Warning: expected 2435 or 2438 data data files, found $n echo Warning: expected 2435 or 2438 data data files, found $n
# (1a) Transcriptions preparation # (1a) Transcriptions preparation
# make basic transcription file (add segments info) # make basic transcription file (add segments info)
# **NOTE: In the default Kaldi recipe, everything is made uppercase, while we # **NOTE: In the default Kaldi recipe, everything is made uppercase, while we
@ -67,8 +64,8 @@ for(i=4;i<=NF;i++) printf(" %s", $i); printf "\n"
}' ./swb_ms98_transcriptions/*/*/*-trans.text >$dir/transcripts1.txt }' ./swb_ms98_transcriptions/*/*/*-trans.text >$dir/transcripts1.txt
# test if trans. file is sorted # test if trans. file is sorted
export LC_ALL=C; export LC_ALL=C
sort -c $dir/transcripts1.txt || exit 1; # check it's sorted. sort -c $dir/transcripts1.txt || exit 1 # check it's sorted.
# Remove SILENCE, <B_ASIDE> and <E_ASIDE>. # Remove SILENCE, <B_ASIDE> and <E_ASIDE>.
@ -77,13 +74,12 @@ sort -c $dir/transcripts1.txt || exit 1; # check it's sorted.
# speech to someone; we will give phones to the other three (NSN, SPN, LAU). # speech to someone; we will give phones to the other three (NSN, SPN, LAU).
# There will also be a silence phone, SIL. # There will also be a silence phone, SIL.
# **NOTE: modified the pattern matches to make them case insensitive # **NOTE: modified the pattern matches to make them case insensitive
cat $dir/transcripts1.txt \ cat $dir/transcripts1.txt |
| perl -ane 's:\s\[SILENCE\](\s|$):$1:gi; perl -ane 's:\s\[SILENCE\](\s|$):$1:gi;
s/<B_ASIDE>//gi; s/<B_ASIDE>//gi;
s/<E_ASIDE>//gi; s/<E_ASIDE>//gi;
print;' \ print;' |
| awk '{if(NF > 1) { print; } } ' > $dir/transcripts2.txt awk '{if(NF > 1) { print; } } ' >$dir/transcripts2.txt
# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches # **NOTE: swbd1_map_words.pl has been modified to make the pattern matches
# case insensitive # case insensitive
@ -110,7 +106,7 @@ sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
awk -v sph2pipe=$sph2pipe '{ awk -v sph2pipe=$sph2pipe '{
printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; }' <$dir/sph.scp | sort >$dir/wav.scp || exit 1
#side A - channel 1, side B - channel 2 #side A - channel 1, side B - channel 2
# this file reco2file_and_channel maps recording-id (e.g. sw02001-A) # this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
@ -118,14 +114,14 @@ printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
# sw02001-A sw02001 A # sw02001-A sw02001 A
# In this case it's trivial, but in other corpora the information might # In this case it's trivial, but in other corpora the information might
# be less obvious. Later it will be needed for ctm scoring. # be less obvious. Later it will be needed for ctm scoring.
awk '{print $1}' $dir/wav.scp \ awk '{print $1}' $dir/wav.scp |
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
print "$1-$2 $1 $2\n"; ' \ print "$1-$2 $1 $2\n"; ' \
> $dir/reco2file_and_channel || exit 1; >$dir/reco2file_and_channel || exit 1
awk '{spk=substr($1,1,9); print $1 " " spk}' $dir/segments > $dir/utt2spk \ awk '{spk=substr($1,1,9); print $1 " " spk}' $dir/segments >$dir/utt2spk ||
|| exit 1; exit 1
sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl >$dir/spk2utt || exit 1
echo Switchboard-1 data preparation succeeded. echo Switchboard-1 data preparation succeeded.

View File

@ -5,9 +5,8 @@
# To be run from one directory above this script. # To be run from one directory above this script.
#check existing directories #check existing directories
[ $# != 0 ] && echo "Usage: local/swbd1_data_prep.sh" && exit 1; [ $# != 0 ] && echo "Usage: local/swbd1_data_prep.sh" && exit 1
srcdir=. # This is where we downloaded some stuff.. srcdir=. # This is where we downloaded some stuff..
dir=./data/local/dict_nosp dir=./data/local/dict_nosp
@ -15,20 +14,25 @@ mkdir -p $dir
srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text
# assume swbd_p1_data_prep.sh was done already. # assume swbd_p1_data_prep.sh was done already.
[ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1; [ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1
cp $srcdict $dir/lexicon0.txt || exit 1; cp $srcdict $dir/lexicon0.txt || exit 1
chmod a+w $dir/lexicon0.txt chmod a+w $dir/lexicon0.txt
patch <local/dict.patch $dir/lexicon0.txt || exit 1; patch <local/dict.patch $dir/lexicon0.txt || exit 1
#(2a) Dictionary preparation: #(2a) Dictionary preparation:
# Pre-processing (remove comments) # Pre-processing (remove comments)
grep -v '^#' $dir/lexicon0.txt | awk 'NF>0' | sort > $dir/lexicon1.txt || exit 1; grep -v '^#' $dir/lexicon0.txt | awk 'NF>0' | sort >$dir/lexicon1.txt || exit 1
cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' |
grep -v sil > $dir/nonsilence_phones.txt || exit 1; grep -v sil >$dir/nonsilence_phones.txt || exit 1
( echo sil; echo spn; echo nsn; echo lau ) > $dir/silence_phones.txt (
echo sil
echo spn
echo nsn
echo lau
) >$dir/silence_phones.txt
echo sil >$dir/optional_silence.txt echo sil >$dir/optional_silence.txt
@ -41,9 +45,14 @@ cp local/MSU_single_letter.txt $dir/
# Add single letter lexicon # Add single letter lexicon
# The original swbd lexicon does not have a precise single-letter lexicon, # The original swbd lexicon does not have a precise single-letter lexicon,
# e.g. it does not have an entry for W # e.g. it does not have an entry for W
( echo '!sil sil'; echo '[vocalized-noise] spn'; echo '[noise] nsn'; \ (
echo '[laughter] lau'; echo '<unk> spn' ) \ echo '!sil sil'
| cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt > $dir/lexicon2.txt || exit 1; echo '[vocalized-noise] spn'
echo '[noise] nsn'
echo '[laughter] lau'
echo '<unk> spn'
) |
cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt >$dir/lexicon2.txt || exit 1
# Map the words in the lexicon. That is-- for each word in the lexicon, we map it # Map the words in the lexicon. That is-- for each word in the lexicon, we map it
# to a new written form. The transformations we do are: # to a new written form. The transformations we do are:
@ -77,7 +86,7 @@ cp local/MSU_single_letter.txt $dir/
# in the lexicon. # in the lexicon.
local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt | sort -u \ local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt | sort -u \
> $dir/lexicon3.txt || exit 1; >$dir/lexicon3.txt || exit 1
python3 local/format_acronyms_dict.py -i $dir/lexicon3.txt -o $dir/lexicon4.txt \ python3 local/format_acronyms_dict.py -i $dir/lexicon3.txt -o $dir/lexicon4.txt \
-L $dir/MSU_single_letter.txt -M $dir/acronyms_raw.map -L $dir/MSU_single_letter.txt -M $dir/acronyms_raw.map

View File

@ -78,8 +78,8 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
cp data/local/${x}/text data/local/${x}/text.org cp data/local/${x}/text data/local/${x}/text.org
paste -d "" \ paste -d "" \
<(cut -f 1 -d" " data/local/${x}/text.org) \ <(cut -f 1 -d" " data/local/${x}/text.org) \
<(awk '{$1=""; print tolower($0)}' data/local/${x}/text.org | perl -pe 's| \(\%.*\)||g' | perl -pe 's| \<.*\>||g' | sed -e "s/(//g" -e "s/)//g") \ <(awk '{$1=""; print tolower($0)}' data/local/${x}/text.org | perl -pe 's| \(\%.*\)||g' | perl -pe 's| \<.*\>||g' | sed -e "s/(//g" -e "s/)//g") |
| sed -e 's/\s\+/ /g' > data/local/${x}/text sed -e 's/\s\+/ /g' >data/local/${x}/text
rm data/local/${x}/text.org rm data/local/${x}/text.org
done done