mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-27 02:34:21 +00:00
support BPE based lang
This commit is contained in:
parent
1d58765bd5
commit
3dbb15bda2
1
egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py
Symbolic link
1
egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py
Symbolic link
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/local/convert_transcript_words_to_tokens.py
|
1
egs/gigaspeech/ASR/local/prepare_lang.py
Symbolic link
1
egs/gigaspeech/ASR/local/prepare_lang.py
Symbolic link
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/local/prepare_lang.py
|
1
egs/gigaspeech/ASR/local/prepare_lang_bpe.py
Symbolic link
1
egs/gigaspeech/ASR/local/prepare_lang_bpe.py
Symbolic link
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/local/prepare_lang_bpe.py
|
1
egs/gigaspeech/ASR/local/train_bpe_model.py
Symbolic link
1
egs/gigaspeech/ASR/local/train_bpe_model.py
Symbolic link
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/local/train_bpe_model.py
|
@ -31,9 +31,9 @@ dl_dir=$PWD/download
|
||||
# data/lang_bpe_yyy if the array contains xxx, yyy
|
||||
vocab_sizes=(
|
||||
5000
|
||||
2000
|
||||
1000
|
||||
500
|
||||
# 2000
|
||||
# 1000
|
||||
# 500
|
||||
)
|
||||
|
||||
# All files generated by this script are saved in "data".
|
||||
@ -125,15 +125,61 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
||||
lang_dir=data/lang_phone
|
||||
mkdir -p $lang_dir
|
||||
|
||||
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
|
||||
cat - $dl_dir/lm/librispeech-lexicon.txt |
|
||||
sort | uniq > $lang_dir/lexicon.txt
|
||||
# (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
|
||||
# cat - $dl_dir/lm/librispeech-lexicon.txt |
|
||||
# sort | uniq > $lang_dir/lexicon.txt
|
||||
|
||||
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
||||
./local/prepare_lang.py --lang-dir $lang_dir
|
||||
fi
|
||||
# if [ ! -f $lang_dir/L_disambig.pt ]; then
|
||||
# ./local/prepare_lang.py --lang-dir $lang_dir
|
||||
# fi
|
||||
if [ ! -f $lang_dir/transcript_words.txt ]; then
|
||||
gunzip -c "data/manifests/gigaspeech_supervisions_XL.jsonl.gz" \
|
||||
| jq '.text' \
|
||||
| sed 's/"//g' \
|
||||
> $lang_dir/transcript_words.txt
|
||||
|
||||
# Delete every utterance that contains a garbage meta tag
|
||||
garbage_utterance_tags="<SIL> <MUSIC> <NOISE> <OTHER>"
|
||||
for tag in $garbage_utterance_tags; do
|
||||
sed -i "/${tag}/d" $lang_dir/transcript_words.txt
|
||||
done
|
||||
|
||||
# Remove punctuation tags from the remaining utterances
|
||||
punctuation_tags="<COMMA> <EXCLAMATIONPOINT> <PERIOD> <QUESTIONMARK>"
|
||||
for tag in $punctuation_tags; do
|
||||
sed -i "s/${tag}//g" $lang_dir/transcript_words.txt
|
||||
done
|
||||
|
||||
# Replace tabs with spaces and collapse runs of spaces into a single space
|
||||
sed -i 's/\t/ /g' $lang_dir/transcript_words.txt
|
||||
sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt
|
||||
fi
|
||||
|
||||
cat $lang_dir/transcript_words.txt | sed 's| |\n|g' \
|
||||
| sort -u | sed '/^$/d' > $lang_dir/words.txt
|
||||
(echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) |
|
||||
cat - $lang_dir/words.txt | sort | uniq | awk '
|
||||
BEGIN {
|
||||
print "<eps> 0";
|
||||
}
|
||||
{
|
||||
if ($1 == "<s>") {
|
||||
print "<s> is in the vocabulary!" | "cat 1>&2"
|
||||
exit 1;
|
||||
}
|
||||
if ($1 == "</s>") {
|
||||
print "</s> is in the vocabulary!" | "cat 1>&2"
|
||||
exit 1;
|
||||
}
|
||||
printf("%s %d\n", $1, NR);
|
||||
}
|
||||
END {
|
||||
printf("#0 %d\n", NR+1);
|
||||
printf("<s> %d\n", NR+2);
|
||||
printf("</s> %d\n", NR+3);
|
||||
}' > $lang_dir/words || exit 1;
|
||||
mv $lang_dir/words $lang_dir/words.txt
|
||||
fi
|
||||
|
||||
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
||||
log "Stage 6: Prepare BPE based lang"
|
||||
@ -141,26 +187,24 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
||||
for vocab_size in ${vocab_sizes[@]}; do
|
||||
lang_dir=data/lang_bpe_${vocab_size}
|
||||
mkdir -p $lang_dir
|
||||
# We reuse words.txt from phone based lexicon
|
||||
# so that the two can share G.pt later.
|
||||
cp data/lang_phone/words.txt $lang_dir
|
||||
# # We reuse words.txt from phone based lexicon
|
||||
# # so that the two can share G.pt later.
|
||||
cp data/lang_phone/{words.txt,transcript_words.txt} $lang_dir
|
||||
|
||||
if [ ! -f $lang_dir/transcript_words.txt ]; then
|
||||
log "Generate data for BPE training"
|
||||
files=$(
|
||||
find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt"
|
||||
find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt"
|
||||
find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt"
|
||||
)
|
||||
for f in ${files[@]}; do
|
||||
cat $f | cut -d " " -f 2-
|
||||
done > $lang_dir/transcript_words.txt
|
||||
gunzip -c "data/manifests/gigaspeech_supervisions_XL.jsonl.gz" \
|
||||
| jq '.text' \
|
||||
| sed 's/"//g' \
|
||||
> $lang_dir/transcript_words.txt
|
||||
fi
|
||||
|
||||
if [ ! -f $lang_dir/bpe.model ]; then
|
||||
./local/train_bpe_model.py \
|
||||
--lang-dir $lang_dir \
|
||||
--vocab-size $vocab_size \
|
||||
--transcript $lang_dir/transcript_words.txt
|
||||
fi
|
||||
|
||||
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
||||
./local/prepare_lang_bpe.py --lang-dir $lang_dir
|
||||
|
Loading…
x
Reference in New Issue
Block a user