mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-27 02:34:21 +00:00
support BPE based lang
This commit is contained in:
parent
1d58765bd5
commit
3dbb15bda2
1
egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py
Symbolic link
1
egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/local/convert_transcript_words_to_tokens.py
|
1
egs/gigaspeech/ASR/local/prepare_lang.py
Symbolic link
1
egs/gigaspeech/ASR/local/prepare_lang.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/local/prepare_lang.py
|
1
egs/gigaspeech/ASR/local/prepare_lang_bpe.py
Symbolic link
1
egs/gigaspeech/ASR/local/prepare_lang_bpe.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/local/prepare_lang_bpe.py
|
1
egs/gigaspeech/ASR/local/train_bpe_model.py
Symbolic link
1
egs/gigaspeech/ASR/local/train_bpe_model.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/local/train_bpe_model.py
|
@ -31,9 +31,9 @@ dl_dir=$PWD/download
|
|||||||
# data/lang_bpe_yyy if the array contains xxx, yyy
|
# data/lang_bpe_yyy if the array contains xxx, yyy
|
||||||
vocab_sizes=(
|
vocab_sizes=(
|
||||||
5000
|
5000
|
||||||
2000
|
# 2000
|
||||||
1000
|
# 1000
|
||||||
500
|
# 500
|
||||||
)
|
)
|
||||||
|
|
||||||
# All files generated by this script are saved in "data".
|
# All files generated by this script are saved in "data".
|
||||||
@ -125,15 +125,61 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|||||||
lang_dir=data/lang_phone
|
lang_dir=data/lang_phone
|
||||||
mkdir -p $lang_dir
|
mkdir -p $lang_dir
|
||||||
|
|
||||||
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
|
# (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
|
||||||
cat - $dl_dir/lm/librispeech-lexicon.txt |
|
# cat - $dl_dir/lm/librispeech-lexicon.txt |
|
||||||
sort | uniq > $lang_dir/lexicon.txt
|
# sort | uniq > $lang_dir/lexicon.txt
|
||||||
|
|
||||||
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
# if [ ! -f $lang_dir/L_disambig.pt ]; then
|
||||||
./local/prepare_lang.py --lang-dir $lang_dir
|
# ./local/prepare_lang.py --lang-dir $lang_dir
|
||||||
|
# fi
|
||||||
|
if [ ! -f $lang_dir/transcript_words.txt ]; then
|
||||||
|
gunzip -c "data/manifests/gigaspeech_supervisions_XL.jsonl.gz" \
|
||||||
|
| jq '.text' \
|
||||||
|
| sed 's/"//g' \
|
||||||
|
> $lang_dir/transcript_words.txt
|
||||||
|
|
||||||
|
# Delete utterances with garbage meta tags
|
||||||
|
garbage_utterance_tags="<SIL> <MUSIC> <NOISE> <OTHER>"
|
||||||
|
for tag in $garbage_utterance_tags; do
|
||||||
|
sed -i "/${tag}/d" $lang_dir/transcript_words.txt
|
||||||
|
done
|
||||||
|
|
||||||
|
# Delete punctuations in utterances
|
||||||
|
punctuation_tags="<COMMA> <EXCLAMATIONPOINT> <PERIOD> <QUESTIONMARK>"
|
||||||
|
for tag in $punctuation_tags; do
|
||||||
|
sed -i "s/${tag}//g" $lang_dir/transcript_words.txt
|
||||||
|
done
|
||||||
|
|
||||||
|
# Ensure space only appears once
|
||||||
|
sed -i 's/\t/ /g' $lang_dir/transcript_words.txt
|
||||||
|
sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt
|
||||||
fi
|
fi
|
||||||
fi
|
|
||||||
|
|
||||||
|
cat $lang_dir/transcript_words.txt | sed 's| |\n|g' \
|
||||||
|
| sort -u | sed '/^$/d' > $lang_dir/words.txt
|
||||||
|
(echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) |
|
||||||
|
cat - $lang_dir/words.txt | sort | uniq | awk '
|
||||||
|
BEGIN {
|
||||||
|
print "<eps> 0";
|
||||||
|
}
|
||||||
|
{
|
||||||
|
if ($1 == "<s>") {
|
||||||
|
print "<s> is in the vocabulary!" | "cat 1>&2"
|
||||||
|
exit 1;
|
||||||
|
}
|
||||||
|
if ($1 == "</s>") {
|
||||||
|
print "</s> is in the vocabulary!" | "cat 1>&2"
|
||||||
|
exit 1;
|
||||||
|
}
|
||||||
|
printf("%s %d\n", $1, NR);
|
||||||
|
}
|
||||||
|
END {
|
||||||
|
printf("#0 %d\n", NR+1);
|
||||||
|
printf("<s> %d\n", NR+2);
|
||||||
|
printf("</s> %d\n", NR+3);
|
||||||
|
}' > $lang_dir/words || exit 1;
|
||||||
|
mv $lang_dir/words $lang_dir/words.txt
|
||||||
|
fi
|
||||||
|
|
||||||
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
||||||
log "Stage 6: Prepare BPE based lang"
|
log "Stage 6: Prepare BPE based lang"
|
||||||
@ -141,26 +187,24 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
|||||||
for vocab_size in ${vocab_sizes[@]}; do
|
for vocab_size in ${vocab_sizes[@]}; do
|
||||||
lang_dir=data/lang_bpe_${vocab_size}
|
lang_dir=data/lang_bpe_${vocab_size}
|
||||||
mkdir -p $lang_dir
|
mkdir -p $lang_dir
|
||||||
# We reuse words.txt from phone based lexicon
|
# # We reuse words.txt from phone based lexicon
|
||||||
# so that the two can share G.pt later.
|
# # so that the two can share G.pt later.
|
||||||
cp data/lang_phone/words.txt $lang_dir
|
cp data/lang_phone/{words.txt,transcript_words.txt} $lang_dir
|
||||||
|
|
||||||
if [ ! -f $lang_dir/transcript_words.txt ]; then
|
if [ ! -f $lang_dir/transcript_words.txt ]; then
|
||||||
log "Generate data for BPE training"
|
log "Generate data for BPE training"
|
||||||
files=$(
|
gunzip -c "data/manifests/gigaspeech_supervisions_XL.jsonl.gz" \
|
||||||
find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt"
|
| jq '.text' \
|
||||||
find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt"
|
| sed 's/"//g' \
|
||||||
find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt"
|
> $lang_dir/transcript_words.txt
|
||||||
)
|
|
||||||
for f in ${files[@]}; do
|
|
||||||
cat $f | cut -d " " -f 2-
|
|
||||||
done > $lang_dir/transcript_words.txt
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
./local/train_bpe_model.py \
|
if [ ! -f $lang_dir/bpe.model ]; then
|
||||||
--lang-dir $lang_dir \
|
./local/train_bpe_model.py \
|
||||||
--vocab-size $vocab_size \
|
--lang-dir $lang_dir \
|
||||||
--transcript $lang_dir/transcript_words.txt
|
--vocab-size $vocab_size \
|
||||||
|
--transcript $lang_dir/transcript_words.txt
|
||||||
|
fi
|
||||||
|
|
||||||
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
||||||
./local/prepare_lang_bpe.py --lang-dir $lang_dir
|
./local/prepare_lang_bpe.py --lang-dir $lang_dir
|
||||||
|
Loading…
x
Reference in New Issue
Block a user