icefall/egs/mgb2/ASR/local/prep_mgb2_lexicon.sh
2024-01-14 08:05:49 -05:00

31 lines
1.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# Copyright 2022 QCRI (author: Amir Hussein)
# Apache 2.0
# This script prepares the graphemic lexicon.
# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails. Without this, a failed download or
# decompression would be ignored and the script would still print its
# success message.
set -euo pipefail

# NOTE(review): `dir` is not referenced anywhere else in this script; it is
# kept in case an external caller sources this file and relies on it.
dir=data/local/dict

# Archives holding the grapheme- and phoneme-based Arabic lexicons.
lexicon_url1="https://arabicspeech.org/arabicspeech-portal-resources/lexicon/ar-ar_grapheme_lexicon_20160209.bz2"
lexicon_url2="https://arabicspeech.org/arabicspeech-portal-resources/lexicon/ar-ar_phoneme_lexicon_20140317.bz2"

# First stage to execute; every guard below compares against this value.
stage=0

# Directory that receives the downloaded archives and all generated files.
lang_dir=download/lm
mkdir -p "$lang_dir"
# Print the first whitespace-separated field of every line of the
# bzip2-compressed file $1, skipping its first three lines.
extract_lexicon_words() {
  bzcat "$1" | sed '1,3d' | awk '{print $1}'
}

# Print the sorted, de-duplicated words of the transcript file $1.
# The first space-separated field of each line is dropped before the
# remaining fields are split into one word per line.
unique_transcript_words() {
  cut -d ' ' -f 2- "$1" | tr -s " " "\n" | sort -u
}

# Stage 0: download both lexicon archives, extract their word columns and
# collect the unique words of the training transcript.
if [ "$stage" -le 0 ]; then
  echo "$0: Downloading text for lexicon... $(date)."

  # Fail before touching the network if the transcript is not there.
  if [ ! -f download/lm/train/text ]; then
    echo "$0: missing transcript download/lm/train/text" >&2
    exit 1
  fi

  # Local archive paths use the basename of each URL. Writing with -O
  # overwrites a previous copy; with -P a rerun would save the new download
  # as "<name>.1" and the stale archive would be the one decompressed below.
  grapheme_archive="$lang_dir/${lexicon_url1##*/}"
  phoneme_archive="$lang_dir/${lexicon_url2##*/}"

  # NOTE(review): --no-check-certificate disables TLS certificate
  # verification; kept from the original, confirm it is still required.
  for pair in "$lexicon_url1|$grapheme_archive" "$lexicon_url2|$phoneme_archive"; do
    if ! wget --no-check-certificate -O "${pair#*|}" "${pair%%|*}"; then
      echo "$0: failed to download ${pair%%|*}" >&2
      exit 1
    fi
  done

  # Truncate (>) rather than append (>>) so that rerunning the stage does
  # not accumulate content from earlier runs.
  extract_lexicon_words "$grapheme_archive" > "$lang_dir/grapheme_lexicon" || exit 1
  extract_lexicon_words "$phoneme_archive" > "$lang_dir/phoneme_lexicon" || exit 1
  unique_transcript_words download/lm/train/text > "$lang_dir/uniq_words" || exit 1
fi
# Filter stdin to stdout: normalise a word list into the grapheme lexicon.
#   1. delete every F, N, K, a, u, i, ~, o and ` character
#      (presumably Buckwalter-style vowel/diacritic marks -- confirm);
#   2. replace every "{" with "}" (the "alef wasla" case noted upstream);
#   3. drop lines that are empty or whitespace-only;
#   4. sort and de-duplicate.
# NOTE(review): the backslashes inside the bracket expression are kept
# exactly as in the original; whether they also match a literal backslash
# depends on the sed implementation.
normalize_lexicon_words() {
  sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' \
    | sed -r '/^\s*$/d' \
    | sort -u
}

# Stage 1: build grapheme_lexicon.txt from the unique-word list.
# The guard used to repeat "-le 0", so this step could not be run without
# also repeating the download stage; with the default stage=0 both stages
# still run exactly as before.
if [ "$stage" -le 1 ]; then
  echo "$0: processing lexicon text and creating lexicon... $(date)."

  # An absent or empty word list would otherwise yield an empty lexicon
  # followed by a success message.
  if [ ! -s "$lang_dir/uniq_words" ]; then
    echo "$0: missing or empty $lang_dir/uniq_words" >&2
    exit 1
  fi

  normalize_lexicon_words < "$lang_dir/uniq_words" \
    > "$lang_dir/grapheme_lexicon.txt" || exit 1
fi

echo "$0: Lexicon preparation succeeded"