mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-08 00:24:19 +00:00
Fixed formatting issues in bash scripts.
This commit is contained in:
parent
96738b538a
commit
439855e3f3
@ -21,100 +21,98 @@
|
||||
# $tdir/reference/hub5e00.english.000405.stm
|
||||
|
||||
if [ $# -ne 2 ]; then
|
||||
echo "Usage: "`basename $0`" <speech-dir> <transcription-dir>"
|
||||
echo "Usage: "$(basename $0)" <speech-dir> <transcription-dir>"
|
||||
echo "See comments in the script for more details"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
sdir=$1
|
||||
tdir=$2
|
||||
[ ! -d $sdir/english ] \
|
||||
&& echo Expecting directory $sdir/english to be present && exit 1;
|
||||
[ -d $tdir/2000_hub5_eng_eval_tr ] \
|
||||
&& tdir=$tdir/2000_hub5_eng_eval_tr
|
||||
[ ! -d $tdir/reference ] \
|
||||
&& echo Expecting directory $tdir/reference to be present && exit 1;
|
||||
[ ! -d $sdir/english ] &&
|
||||
echo Expecting directory $sdir/english to be present && exit 1
|
||||
[ -d $tdir/2000_hub5_eng_eval_tr ] &&
|
||||
tdir=$tdir/2000_hub5_eng_eval_tr
|
||||
[ ! -d $tdir/reference ] &&
|
||||
echo Expecting directory $tdir/reference to be present && exit 1
|
||||
|
||||
dir=data/local/eval2000
|
||||
mkdir -p $dir
|
||||
|
||||
dir=data/local/eval2000
|
||||
mkdir -p $dir
|
||||
find $sdir/english -iname '*.sph' | sort >$dir/sph.flist
|
||||
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
|
||||
>$dir/sph.scp
|
||||
|
||||
find $sdir/english -iname '*.sph' | sort > $dir/sph.flist
|
||||
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
|
||||
> $dir/sph.scp
|
||||
sph2pipe=sph2pipe
|
||||
[ ! -x $sph2pipe ] &&
|
||||
echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1
|
||||
|
||||
sph2pipe=sph2pipe
|
||||
[ ! -x $sph2pipe ] \
|
||||
&& echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1;
|
||||
|
||||
awk -v sph2pipe=$sph2pipe '{
|
||||
awk -v sph2pipe=$sph2pipe '{
|
||||
printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
|
||||
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
|
||||
}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1;
|
||||
#side A - channel 1, side B - channel 2
|
||||
}' <$dir/sph.scp | sort >$dir/wav.scp || exit 1
|
||||
#side A - channel 1, side B - channel 2
|
||||
|
||||
# Get segments file...
|
||||
# segments file format is: utt-id side-id start-time end-time, e.g.:
|
||||
# sw02001-A_000098-001156 sw02001-A 0.98 11.56
|
||||
pem=$sdir/english/hub5e_00.pem
|
||||
[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1;
|
||||
[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1
|
||||
# pem file has lines like:
|
||||
# en_4156 A unknown_speaker 301.85 302.48
|
||||
|
||||
# we ignore the warnings below for now, although they seem to indicate some problems
|
||||
# with the data.
|
||||
grep -v ';;' $pem \
|
||||
| awk '{
|
||||
grep -v ';;' $pem |
|
||||
awk '{
|
||||
spk=$1"-"$2;
|
||||
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
|
||||
print utt,spk,$4,$5;}' \
|
||||
| sort -u | local/extend_segments.pl 0.1 > $dir/segments
|
||||
print utt,spk,$4,$5;}' |
|
||||
sort -u | local/extend_segments.pl 0.1 >$dir/segments
|
||||
|
||||
# stm file has lines like:
|
||||
# en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER
|
||||
# TODO(arnab): We should really be lowercasing this since the Edinburgh
|
||||
# recipe uses lowercase. This is not used in the actual scoring.
|
||||
grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \
|
||||
| awk '{
|
||||
grep -v ';;' $tdir/reference/hub5e00.english.000405.stm |
|
||||
awk '{
|
||||
spk=$1"-"$2;
|
||||
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
|
||||
printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \
|
||||
| sort > $dir/text.all
|
||||
printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' |
|
||||
sort >$dir/text.all
|
||||
|
||||
# We'll use the stm file for sclite scoring. There seem to be various errors
|
||||
# in the stm file that upset hubscr.pl, and we fix them here.
|
||||
sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' \
|
||||
$tdir/reference/hub5e00.english.000405.stm > $dir/stm
|
||||
cp $tdir/reference/en20000405_hub5.glm $dir/glm
|
||||
$tdir/reference/hub5e00.english.000405.stm >$dir/stm
|
||||
cp $tdir/reference/en20000405_hub5.glm $dir/glm
|
||||
|
||||
# next line uses process substitution
|
||||
# Just checking that the segments are the same in pem vs. stm.
|
||||
! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \
|
||||
echo "Segments from pem file and stm file do not match." && exit 1;
|
||||
! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) &&
|
||||
echo "Segments from pem file and stm file do not match." && exit 1
|
||||
|
||||
grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text
|
||||
grep -v IGNORE_TIME_SEGMENT_ $dir/text.all >$dir/text
|
||||
|
||||
# create an utt2spk file that assumes each conversation side is
|
||||
# a separate speaker.
|
||||
awk '{print $1,$2;}' $dir/segments > $dir/utt2spk
|
||||
utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
|
||||
awk '{print $1,$2;}' $dir/segments >$dir/utt2spk
|
||||
utils/utt2spk_to_spk2utt.pl $dir/utt2spk >$dir/spk2utt
|
||||
|
||||
# cp $dir/segments $dir/segments.tmp
|
||||
# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \
|
||||
# $dir/segments.tmp > $dir/segments
|
||||
|
||||
awk '{print $1}' $dir/wav.scp \
|
||||
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
|
||||
awk '{print $1}' $dir/wav.scp |
|
||||
perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
|
||||
print "$1-$2 $1 $2\n"; ' \
|
||||
> $dir/reco2file_and_channel || exit 1;
|
||||
>$dir/reco2file_and_channel || exit 1
|
||||
|
||||
echo Data preparation and formatting completed for Eval 2000
|
||||
echo "(but not MFCC extraction)"
|
||||
|
||||
echo Data preparation and formatting completed for Eval 2000
|
||||
echo "(but not MFCC extraction)"
|
||||
utils/fix_data_dir.sh $dir
|
||||
|
||||
utils/fix_data_dir.sh $dir
|
||||
|
||||
if [ $(wc -l < $dir/wav.scp) -ne 80 ]; then
|
||||
echo "$0: error: expected 80 lines in wav.scp, got $(wc -l < $dir/wav.scp)"
|
||||
exit 1;
|
||||
fi
|
||||
if [ $(wc -l <$dir/wav.scp) -ne 80 ]; then
|
||||
echo "$0: error: expected 80 lines in wav.scp, got $(wc -l <$dir/wav.scp)"
|
||||
exit 1
|
||||
fi
|
||||
|
@ -15,30 +15,30 @@ if [ $# -ne 1 ]; then
|
||||
fi
|
||||
|
||||
sdir=$1
|
||||
[ ! -d $sdir/data/audio/eval03/english/cts ] \
|
||||
&& echo Expecting directory $sdir/data/audio/eval03/english/cts to be present && exit 1;
|
||||
[ ! -d $sdir/data/references/eval03/english/cts ] \
|
||||
&& echo Expecting directory $tdir/data/references/eval03/english/cts to be present && exit 1;
|
||||
[ ! -d $sdir/data/audio/eval03/english/cts ] &&
|
||||
echo Expecting directory $sdir/data/audio/eval03/english/cts to be present && exit 1
|
||||
[ ! -d $sdir/data/references/eval03/english/cts ] &&
|
||||
echo Expecting directory $tdir/data/references/eval03/english/cts to be present && exit 1
|
||||
|
||||
dir=data/local/rt03
|
||||
mkdir -p $dir
|
||||
dir=data/local/rt03
|
||||
mkdir -p $dir
|
||||
|
||||
rtroot=$sdir
|
||||
tdir=$sdir/data/references/eval03/english/cts
|
||||
sdir=$sdir/data/audio/eval03/english/cts
|
||||
rtroot=$sdir
|
||||
tdir=$sdir/data/references/eval03/english/cts
|
||||
sdir=$sdir/data/audio/eval03/english/cts
|
||||
|
||||
find -L $sdir -iname '*.sph' | sort > $dir/sph.flist
|
||||
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
|
||||
> $dir/sph.scp
|
||||
find -L $sdir -iname '*.sph' | sort >$dir/sph.flist
|
||||
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
|
||||
>$dir/sph.scp
|
||||
|
||||
sph2pipe=sph2pipe
|
||||
! command -v "${sph2pipe}" &> /dev/null \
|
||||
&& echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1;
|
||||
sph2pipe=sph2pipe
|
||||
! command -v "${sph2pipe}" &>/dev/null &&
|
||||
echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1
|
||||
|
||||
awk -v sph2pipe=$sph2pipe '{
|
||||
awk -v sph2pipe=$sph2pipe '{
|
||||
printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
|
||||
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
|
||||
}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1;
|
||||
}' <$dir/sph.scp | sort >$dir/wav.scp || exit 1
|
||||
#side A - channel 1, side B - channel 2
|
||||
|
||||
# Get segments file...
|
||||
@ -50,58 +50,58 @@ sdir=$1
|
||||
# en_4156 A unknown_speaker 301.85 302.48
|
||||
|
||||
#grep -v ';;' $pem \
|
||||
cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \
|
||||
| awk '{
|
||||
cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap |
|
||||
awk '{
|
||||
spk=$1"-"(($2==1)?"A":"B");
|
||||
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
|
||||
print utt,spk,$4,$5;}' \
|
||||
| sort -u > $dir/segments
|
||||
print utt,spk,$4,$5;}' |
|
||||
sort -u >$dir/segments
|
||||
|
||||
# stm file has lines like:
|
||||
# en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER
|
||||
# TODO(arnab): We should really be lowercasing this since the Edinburgh
|
||||
# recipe uses lowercase. This is not used in the actual scoring.
|
||||
#grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \
|
||||
cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \
|
||||
| awk '{
|
||||
cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap |
|
||||
awk '{
|
||||
spk=$1"-"(($2==1)?"A":"B");
|
||||
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
|
||||
printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \
|
||||
| sort > $dir/text.all
|
||||
printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' |
|
||||
sort >$dir/text.all
|
||||
|
||||
# We'll use the stm file for sclite scoring. There seem to be various errors
|
||||
# in the stm file that upset hubscr.pl, and we fix them here.
|
||||
cat $tdir/*.stm | \
|
||||
sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' | \
|
||||
grep -v inter_segment_gap | \
|
||||
cat $tdir/*.stm |
|
||||
sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' |
|
||||
grep -v inter_segment_gap |
|
||||
awk '{
|
||||
printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }'\
|
||||
> $dir/stm
|
||||
#$tdir/reference/hub5e00.english.000405.stm > $dir/stm
|
||||
cp $rtroot/data/trans_rules/en20030506.glm $dir/glm
|
||||
printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }' \
|
||||
>$dir/stm
|
||||
#$tdir/reference/hub5e00.english.000405.stm > $dir/stm
|
||||
cp $rtroot/data/trans_rules/en20030506.glm $dir/glm
|
||||
|
||||
# next line uses process substitution
|
||||
# Just checking that the segments are the same in pem vs. stm.
|
||||
! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \
|
||||
echo "Segments from pem file and stm file do not match." && exit 1;
|
||||
! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) &&
|
||||
echo "Segments from pem file and stm file do not match." && exit 1
|
||||
|
||||
grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text
|
||||
grep -v IGNORE_TIME_SEGMENT_ $dir/text.all >$dir/text
|
||||
|
||||
# create an utt2spk file that assumes each conversation side is
|
||||
# a separate speaker.
|
||||
awk '{print $1,$2;}' $dir/segments > $dir/utt2spk
|
||||
utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
|
||||
awk '{print $1,$2;}' $dir/segments >$dir/utt2spk
|
||||
utils/utt2spk_to_spk2utt.pl $dir/utt2spk >$dir/spk2utt
|
||||
|
||||
# cp $dir/segments $dir/segments.tmp
|
||||
# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \
|
||||
# $dir/segments.tmp > $dir/segments
|
||||
|
||||
awk '{print $1}' $dir/wav.scp \
|
||||
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
|
||||
awk '{print $1}' $dir/wav.scp |
|
||||
perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
|
||||
print "$1-$2 $1 $2\n"; ' \
|
||||
> $dir/reco2file_and_channel || exit 1;
|
||||
>$dir/reco2file_and_channel || exit 1
|
||||
|
||||
./utils/fix_data_dir.sh $dir
|
||||
./utils/fix_data_dir.sh $dir
|
||||
|
||||
echo Data preparation and formatting completed for RT-03
|
||||
echo "(but not MFCC extraction)"
|
||||
echo Data preparation and formatting completed for RT-03
|
||||
echo "(but not MFCC extraction)"
|
||||
|
@ -17,11 +17,10 @@
|
||||
## will be using "find" to locate this file so we don't make any assumptions
|
||||
## on the directory structure. (Peng Qi, Aug 2014)
|
||||
|
||||
|
||||
#check existing directories
|
||||
if [ $# != 1 -a $# != 2 ]; then
|
||||
echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]"
|
||||
exit 1;
|
||||
exit 1
|
||||
fi
|
||||
|
||||
SWBD_DIR=$1
|
||||
@ -29,29 +28,27 @@ SWBD_DIR=$1
|
||||
dir=data/local/train
|
||||
mkdir -p $dir
|
||||
|
||||
|
||||
# Audio data directory check
|
||||
if [ ! -d $SWBD_DIR ]; then
|
||||
echo "Error: run.sh requires a directory argument"
|
||||
exit 1;
|
||||
exit 1
|
||||
fi
|
||||
|
||||
sph2pipe=sph2pipe
|
||||
! command -v "${sph2pipe}" &> /dev/null \
|
||||
&& echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1;
|
||||
! command -v "${sph2pipe}" &>/dev/null &&
|
||||
echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1
|
||||
|
||||
# Option A: SWBD dictionary file check
|
||||
[ ! -f ./swb_ms98_transcriptions/sw-ms98-dict.text ] && \
|
||||
echo "SWBD dictionary file does not exist" && exit 1;
|
||||
[ ! -f ./swb_ms98_transcriptions/sw-ms98-dict.text ] &&
|
||||
echo "SWBD dictionary file does not exist" && exit 1
|
||||
|
||||
# find sph audio files
|
||||
find -L $SWBD_DIR -iname '*.sph' | sort > $dir/sph.flist
|
||||
find -L $SWBD_DIR -iname '*.sph' | sort >$dir/sph.flist
|
||||
|
||||
n=`cat $dir/sph.flist | wc -l`
|
||||
[ $n -ne 2435 ] && [ $n -ne 2438 ] && \
|
||||
n=$(cat $dir/sph.flist | wc -l)
|
||||
[ $n -ne 2435 ] && [ $n -ne 2438 ] &&
|
||||
echo Warning: expected 2435 or 2438 data data files, found $n
|
||||
|
||||
|
||||
# (1a) Transcriptions preparation
|
||||
# make basic transcription file (add segments info)
|
||||
# **NOTE: In the default Kaldi recipe, everything is made uppercase, while we
|
||||
@ -64,11 +61,11 @@ stime=$2; etime=$3;
|
||||
printf("%s-%s_%06.0f-%06.0f",
|
||||
name, side, int(100*stime+0.5), int(100*etime+0.5));
|
||||
for(i=4;i<=NF;i++) printf(" %s", $i); printf "\n"
|
||||
}' ./swb_ms98_transcriptions/*/*/*-trans.text > $dir/transcripts1.txt
|
||||
}' ./swb_ms98_transcriptions/*/*/*-trans.text >$dir/transcripts1.txt
|
||||
|
||||
# test if trans. file is sorted
|
||||
export LC_ALL=C;
|
||||
sort -c $dir/transcripts1.txt || exit 1; # check it's sorted.
|
||||
export LC_ALL=C
|
||||
sort -c $dir/transcripts1.txt || exit 1 # check it's sorted.
|
||||
|
||||
# Remove SILENCE, <B_ASIDE> and <E_ASIDE>.
|
||||
|
||||
@ -77,22 +74,21 @@ sort -c $dir/transcripts1.txt || exit 1; # check it's sorted.
|
||||
# speech to someone; we will give phones to the other three (NSN, SPN, LAU).
|
||||
# There will also be a silence phone, SIL.
|
||||
# **NOTE: modified the pattern matches to make them case insensitive
|
||||
cat $dir/transcripts1.txt \
|
||||
| perl -ane 's:\s\[SILENCE\](\s|$):$1:gi;
|
||||
cat $dir/transcripts1.txt |
|
||||
perl -ane 's:\s\[SILENCE\](\s|$):$1:gi;
|
||||
s/<B_ASIDE>//gi;
|
||||
s/<E_ASIDE>//gi;
|
||||
print;' \
|
||||
| awk '{if(NF > 1) { print; } } ' > $dir/transcripts2.txt
|
||||
|
||||
print;' |
|
||||
awk '{if(NF > 1) { print; } } ' >$dir/transcripts2.txt
|
||||
|
||||
# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches
|
||||
# case insensitive
|
||||
local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text
|
||||
local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt >$dir/text
|
||||
|
||||
# format acronyms in text
|
||||
python3 local/map_acronyms_transcripts.py -i $dir/text -o $dir/text_map \
|
||||
-M data/local/dict_nosp/acronyms.map
|
||||
mv $dir/text_map $dir/text
|
||||
mv $dir/text_map $dir/text
|
||||
|
||||
# (1c) Make segment files from transcript
|
||||
#segments file format is: utt-id side-id start-time end-time, e.g.:
|
||||
@ -102,15 +98,15 @@ segment=$1;
|
||||
split(segment,S,"[_-]");
|
||||
side=S[2]; audioname=S[1]; startf=S[3]; endf=S[4];
|
||||
print segment " " audioname "-" side " " startf/100 " " endf/100
|
||||
}' < $dir/text > $dir/segments
|
||||
}' <$dir/text >$dir/segments
|
||||
|
||||
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
|
||||
> $dir/sph.scp
|
||||
>$dir/sph.scp
|
||||
|
||||
awk -v sph2pipe=$sph2pipe '{
|
||||
printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
|
||||
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
|
||||
}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1;
|
||||
}' <$dir/sph.scp | sort >$dir/wav.scp || exit 1
|
||||
#side A - channel 1, side B - channel 2
|
||||
|
||||
# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
|
||||
@ -118,15 +114,15 @@ printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
|
||||
# sw02001-A sw02001 A
|
||||
# In this case it's trivial, but in other corpora the information might
|
||||
# be less obvious. Later it will be needed for ctm scoring.
|
||||
awk '{print $1}' $dir/wav.scp \
|
||||
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
|
||||
awk '{print $1}' $dir/wav.scp |
|
||||
perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
|
||||
print "$1-$2 $1 $2\n"; ' \
|
||||
> $dir/reco2file_and_channel || exit 1;
|
||||
>$dir/reco2file_and_channel || exit 1
|
||||
|
||||
awk '{spk=substr($1,1,9); print $1 " " spk}' $dir/segments > $dir/utt2spk \
|
||||
|| exit 1;
|
||||
sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1;
|
||||
awk '{spk=substr($1,1,9); print $1 " " spk}' $dir/segments >$dir/utt2spk ||
|
||||
exit 1
|
||||
sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl >$dir/spk2utt || exit 1
|
||||
|
||||
echo Switchboard-1 data preparation succeeded.
|
||||
echo Switchboard-1 data preparation succeeded.
|
||||
|
||||
utils/fix_data_dir.sh data/local/train
|
||||
utils/fix_data_dir.sh data/local/train
|
||||
|
@ -5,32 +5,36 @@
|
||||
|
||||
# To be run from one directory above this script.
|
||||
|
||||
|
||||
#check existing directories
|
||||
[ $# != 0 ] && echo "Usage: local/swbd1_data_prep.sh" && exit 1;
|
||||
[ $# != 0 ] && echo "Usage: local/swbd1_data_prep.sh" && exit 1
|
||||
|
||||
srcdir=. # This is where we downloaded some stuff..
|
||||
srcdir=. # This is where we downloaded some stuff..
|
||||
dir=./data/local/dict_nosp
|
||||
mkdir -p $dir
|
||||
srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text
|
||||
|
||||
# assume swbd_p1_data_prep.sh was done already.
|
||||
[ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1;
|
||||
[ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1
|
||||
|
||||
cp $srcdict $dir/lexicon0.txt || exit 1;
|
||||
cp $srcdict $dir/lexicon0.txt || exit 1
|
||||
chmod a+w $dir/lexicon0.txt
|
||||
patch <local/dict.patch $dir/lexicon0.txt || exit 1;
|
||||
patch <local/dict.patch $dir/lexicon0.txt || exit 1
|
||||
|
||||
#(2a) Dictionary preparation:
|
||||
# Pre-processing (remove comments)
|
||||
grep -v '^#' $dir/lexicon0.txt | awk 'NF>0' | sort > $dir/lexicon1.txt || exit 1;
|
||||
grep -v '^#' $dir/lexicon0.txt | awk 'NF>0' | sort >$dir/lexicon1.txt || exit 1
|
||||
|
||||
cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
|
||||
grep -v sil > $dir/nonsilence_phones.txt || exit 1;
|
||||
cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' |
|
||||
grep -v sil >$dir/nonsilence_phones.txt || exit 1
|
||||
|
||||
( echo sil; echo spn; echo nsn; echo lau ) > $dir/silence_phones.txt
|
||||
(
|
||||
echo sil
|
||||
echo spn
|
||||
echo nsn
|
||||
echo lau
|
||||
) >$dir/silence_phones.txt
|
||||
|
||||
echo sil > $dir/optional_silence.txt
|
||||
echo sil >$dir/optional_silence.txt
|
||||
|
||||
# No "extra questions" in the input to this setup, as we don't
|
||||
# have stress or tone.
|
||||
@ -41,9 +45,14 @@ cp local/MSU_single_letter.txt $dir/
|
||||
# Add single letter lexicon
|
||||
# The original swbd lexicon does not have a precise single letter lexicon
|
||||
# e.g. it does not have an entry for W
|
||||
( echo '!sil sil'; echo '[vocalized-noise] spn'; echo '[noise] nsn'; \
|
||||
echo '[laughter] lau'; echo '<unk> spn' ) \
|
||||
| cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt > $dir/lexicon2.txt || exit 1;
|
||||
(
|
||||
echo '!sil sil'
|
||||
echo '[vocalized-noise] spn'
|
||||
echo '[noise] nsn'
|
||||
echo '[laughter] lau'
|
||||
echo '<unk> spn'
|
||||
) |
|
||||
cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt >$dir/lexicon2.txt || exit 1
|
||||
|
||||
# Map the words in the lexicon. That is-- for each word in the lexicon, we map it
|
||||
# to a new written form. The transformations we do are:
|
||||
@ -77,16 +86,16 @@ cp local/MSU_single_letter.txt $dir/
|
||||
# in the lexicon.
|
||||
|
||||
local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt | sort -u \
|
||||
> $dir/lexicon3.txt || exit 1;
|
||||
>$dir/lexicon3.txt || exit 1
|
||||
|
||||
python3 local/format_acronyms_dict.py -i $dir/lexicon3.txt -o $dir/lexicon4.txt \
|
||||
-L $dir/MSU_single_letter.txt -M $dir/acronyms_raw.map
|
||||
cat $dir/acronyms_raw.map | sort -u > $dir/acronyms.map
|
||||
cat $dir/acronyms_raw.map | sort -u >$dir/acronyms.map
|
||||
|
||||
( echo 'i ay' )| cat - $dir/lexicon4.txt | tr '[A-Z]' '[a-z]' | sort -u > $dir/lexicon5.txt
|
||||
(echo 'i ay') | cat - $dir/lexicon4.txt | tr '[A-Z]' '[a-z]' | sort -u >$dir/lexicon5.txt
|
||||
|
||||
pushd $dir >&/dev/null
|
||||
ln -sf lexicon5.txt lexicon.txt # This is the final lexicon.
|
||||
popd >&/dev/null
|
||||
rm $dir/lexiconp.txt 2>/dev/null
|
||||
echo Prepared input dictionary and phone-sets for Switchboard phase 1.
|
||||
pushd $dir >&/dev/null
|
||||
ln -sf lexicon5.txt lexicon.txt # This is the final lexicon.
|
||||
popd >&/dev/null
|
||||
rm $dir/lexiconp.txt 2>/dev/null
|
||||
echo Prepared input dictionary and phone-sets for Switchboard phase 1.
|
||||
|
@ -78,28 +78,28 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
||||
cp data/local/${x}/text data/local/${x}/text.org
|
||||
paste -d "" \
|
||||
<(cut -f 1 -d" " data/local/${x}/text.org) \
|
||||
<(awk '{$1=""; print tolower($0)}' data/local/${x}/text.org | perl -pe 's| \(\%.*\)||g' | perl -pe 's| \<.*\>||g' | sed -e "s/(//g" -e "s/)//g") \
|
||||
| sed -e 's/\s\+/ /g' > data/local/${x}/text
|
||||
rm data/local/${x}/text.org
|
||||
done
|
||||
<(awk '{$1=""; print tolower($0)}' data/local/${x}/text.org | perl -pe 's| \(\%.*\)||g' | perl -pe 's| \<.*\>||g' | sed -e "s/(//g" -e "s/)//g") |
|
||||
sed -e 's/\s\+/ /g' >data/local/${x}/text
|
||||
rm data/local/${x}/text.org
|
||||
done
|
||||
|
||||
python ./local/filter_empty_text.py --kaldi-data-dir data/local/eval2000
|
||||
./utils/fix_data_dir.sh data/local/eval2000
|
||||
lhotse kaldi import data/local/eval2000 8000 data/manifests_eval2000
|
||||
mv data/manifests_eval2000/recordings.jsonl.gz data/manifests_eval2000/swbd_recordings_eval2000.jsonl.gz
|
||||
mv data/manifests_eval2000/supervisions.jsonl.gz data/manifests_eval2000/swbd_supervisions_eval2000.jsonl.gz
|
||||
python ./local/filter_empty_text.py --kaldi-data-dir data/local/eval2000
|
||||
./utils/fix_data_dir.sh data/local/eval2000
|
||||
lhotse kaldi import data/local/eval2000 8000 data/manifests_eval2000
|
||||
mv data/manifests_eval2000/recordings.jsonl.gz data/manifests_eval2000/swbd_recordings_eval2000.jsonl.gz
|
||||
mv data/manifests_eval2000/supervisions.jsonl.gz data/manifests_eval2000/swbd_supervisions_eval2000.jsonl.gz
|
||||
|
||||
python ./local/filter_empty_text.py --kaldi-data-dir data/local/rt03
|
||||
./utils/fix_data_dir.sh data/local/rt03
|
||||
lhotse kaldi import data/local/rt03 8000 data/manifests_rt03
|
||||
mv data/manifests_rt03/recordings.jsonl.gz data/manifests_rt03/swbd_recordings_rt03.jsonl.gz
|
||||
mv data/manifests_rt03/supervisions.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz
|
||||
python ./local/filter_empty_text.py --kaldi-data-dir data/local/rt03
|
||||
./utils/fix_data_dir.sh data/local/rt03
|
||||
lhotse kaldi import data/local/rt03 8000 data/manifests_rt03
|
||||
mv data/manifests_rt03/recordings.jsonl.gz data/manifests_rt03/swbd_recordings_rt03.jsonl.gz
|
||||
mv data/manifests_rt03/supervisions.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz
|
||||
|
||||
lhotse fix data/manifests_train/swbd_recordings_all.jsonl.gz data/manifests_train/swbd_supervisions_all.jsonl.gz data/manifests
|
||||
lhotse fix data/manifests_eval2000/swbd_recordings_eval2000.jsonl.gz data/manifests_eval2000/swbd_supervisions_eval2000.jsonl.gz data/manifests
|
||||
lhotse fix data/manifests_rt03/swbd_recordings_rt03.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz data/manifests
|
||||
lhotse fix data/manifests_train/swbd_recordings_all.jsonl.gz data/manifests_train/swbd_supervisions_all.jsonl.gz data/manifests
|
||||
lhotse fix data/manifests_eval2000/swbd_recordings_eval2000.jsonl.gz data/manifests_eval2000/swbd_supervisions_eval2000.jsonl.gz data/manifests
|
||||
lhotse fix data/manifests_rt03/swbd_recordings_rt03.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz data/manifests
|
||||
|
||||
touch data/manifests/.swbd.done
|
||||
touch data/manifests/.swbd.done
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -260,11 +260,11 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
|
||||
-ngram-order 3 \
|
||||
-text ${lang_dir}/input.txt \
|
||||
-lm data/lm/3-gram.arpa
|
||||
python3 -m kaldilm \
|
||||
--read-symbol-table="data/lang_phone/words.txt" \
|
||||
--disambig-symbol='#0' \
|
||||
--max-order=3 \
|
||||
data/lm/3-gram.arpa >data/lm/G_3_gram.fst.txt
|
||||
python3 -m kaldilm \
|
||||
--read-symbol-table="data/lang_phone/words.txt" \
|
||||
--disambig-symbol='#0' \
|
||||
--max-order=3 \
|
||||
data/lm/3-gram.arpa >data/lm/G_3_gram.fst.txt
|
||||
fi
|
||||
|
||||
if [ ! -f data/lm/G_4_gram.fst.txt ]; then
|
||||
@ -273,11 +273,11 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
|
||||
-ngram-order 4 \
|
||||
-text ${lang_dir}/input.txt \
|
||||
-lm data/lm/4-gram.arpa
|
||||
python3 -m kaldilm \
|
||||
--read-symbol-table="data/lang_phone/words.txt" \
|
||||
--disambig-symbol='#0' \
|
||||
--max-order=4 \
|
||||
data/lm/4-gram.arpa >data/lm/G_4_gram.fst.txt
|
||||
python3 -m kaldilm \
|
||||
--read-symbol-table="data/lang_phone/words.txt" \
|
||||
--disambig-symbol='#0' \
|
||||
--max-order=4 \
|
||||
data/lm/4-gram.arpa >data/lm/G_4_gram.fst.txt
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -325,7 +325,7 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
|
||||
--bpe-model $lang_dir/bpe.model \
|
||||
--lm-data data/lang_phone/input.txt \
|
||||
--lm-archive $out_dir/lm_data.pt
|
||||
done
|
||||
done
|
||||
fi
|
||||
|
||||
# if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
|
||||
@ -373,8 +373,8 @@ if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
|
||||
--bpe-model $lang_dir/bpe.model \
|
||||
--lm-data $out_dir/${testset}.txt \
|
||||
--lm-archive $out_dir/lm_data-${testset}.pt
|
||||
done
|
||||
done
|
||||
done
|
||||
fi
|
||||
|
||||
if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
|
||||
@ -393,11 +393,11 @@ if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
|
||||
--in-lm-data $out_dir/lm_data.pt \
|
||||
--out-lm-data $out_dir/sorted_lm_data.pt \
|
||||
--out-statistics $out_dir/statistics.txt
|
||||
for testset in ${testsets[@]}; do
|
||||
./local/sort_lm_training_data.py \
|
||||
--in-lm-data $out_dir/lm_data-${testset}.pt \
|
||||
--out-lm-data $out_dir/sorted_lm_data-${testset}.pt \
|
||||
--out-statistics $out_dir/statistics-test-${testset}.txt
|
||||
done
|
||||
done
|
||||
for testset in ${testsets[@]}; do
|
||||
./local/sort_lm_training_data.py \
|
||||
--in-lm-data $out_dir/lm_data-${testset}.pt \
|
||||
--out-lm-data $out_dir/sorted_lm_data-${testset}.pt \
|
||||
--out-statistics $out_dir/statistics-test-${testset}.txt
|
||||
done
|
||||
done
|
||||
fi
|
||||
|
Loading…
x
Reference in New Issue
Block a user