icefall/egs/mgb2/ASR/local/prep_mgb2_lexicon.sh
Amir Hussein 6f71981667
MGB2 (#396)
* mgb2

* mgb2

* adding pruned transducer stateless to mgb2

* update display_manifest_statistics.py

* .

* stateless transducer MGB-2

* Update README.md

* Update RESULTS.md

* Update prepare_lang_bpe.py

* Update asr_datamodule.py

* .nfs removed

* Adding symlink

* .

* resolving conflicts

* Update .gitignore

* black formatting

* Update compile_hlg.py

* Update compute_fbank_musan.py

* Update convert_transcript_words_to_tokens.py

* Update download_lm.py

* Update generate_unique_lexicon.py

* adding simlinks

* fixing symbolic links
2022-12-02 10:58:34 +08:00

31 lines
1.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# Copyright 2022 QCRI (author: Amir Hussein)
# Apache 2.0
# This script prepares the graphemic lexicon.
dir=data/local/dict
lexicon_url1="https://arabicspeech.org/arabicspeech-portal-resources/lexicon/ar-ar_grapheme_lexicon_20160209.bz2";
lexicon_url2="https://arabicspeech.org/arabicspeech-portal-resources/lexicon/ar-ar_phoneme_lexicon_20140317.bz2";
stage=0
lang_dir=download/lm
mkdir -p $lang_dir
if [ $stage -le 0 ]; then
echo "$0: Downloading text for lexicon... $(date)."
wget --no-check-certificate -P $lang_dir $lexicon_url1
wget --no-check-certificate -P $lang_dir $lexicon_url2
bzcat $lang_dir/ar-ar_grapheme_lexicon_20160209.bz2 | sed '1,3d' | awk '{print $1}' > $lang_dir/grapheme_lexicon
bzcat $lang_dir/ar-ar_phoneme_lexicon_20140317.bz2 | sed '1,3d' | awk '{print $1}' >> $lang_dir/phoneme_lexicon
cat download/lm/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> $lang_dir/uniq_words
fi
if [ $stage -le 0 ]; then
echo "$0: processing lexicon text and creating lexicon... $(date)."
# remove vowels and rare alef wasla
cat $lang_dir/uniq_words | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sed -r '/^\s*$/d' | sort -u > $lang_dir/grapheme_lexicon.txt
fi
echo "$0: Lexicon preparation succeeded"