mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 09:32:20 +00:00
* mgb2 * mgb2 * adding pruned transducer stateless to mgb2 * update display_manifest_statistics.py * . * stateless transducer MGB-2 * Update README.md * Update RESULTS.md * Update prepare_lang_bpe.py * Update asr_datamodule.py * .nfs removed * Adding symlink * . * resolving conflicts * Update .gitignore * black formatting * Update compile_hlg.py * Update compute_fbank_musan.py * Update convert_transcript_words_to_tokens.py * Update download_lm.py * Update generate_unique_lexicon.py * adding simlinks * fixing symbolic links
31 lines
1.2 KiB
Bash
Executable File
31 lines
1.2 KiB
Bash
Executable File
#!/usr/bin/env bash

# Copyright 2022 QCRI (author: Amir Hussein)
# Apache 2.0

# This script prepares the graphemic lexicon.
#
# Usage: run with no arguments. All paths are relative, so they resolve
# against the current working directory.
#
# Input:
#   $lang_dir/train/text      one entry per line; the first space-separated
#                             field is dropped (presumably an utterance id --
#                             TODO confirm) and the remaining fields are
#                             treated as words.
# Outputs (all under $lang_dir):
#   grapheme_lexicon          first column of the downloaded grapheme lexicon
#   phoneme_lexicon           first column of the downloaded phoneme lexicon
#   uniq_words                sorted unique words taken from train/text
#   grapheme_lexicon.txt      uniq_words after character clean-up (stage 1)
#
# Stages (edit `stage` below to resume part-way):
#   0  download the lexicons and collect the unique training words
#   1  clean the unique words into grapheme_lexicon.txt

# Abort on any failed command, unset variable, or failed pipeline stage, so a
# broken download cannot end with the success message at the bottom.
set -euo pipefail

lexicon_url1="https://arabicspeech.org/arabicspeech-portal-resources/lexicon/ar-ar_grapheme_lexicon_20160209.bz2"
lexicon_url2="https://arabicspeech.org/arabicspeech-portal-resources/lexicon/ar-ar_phoneme_lexicon_20140317.bz2"
stage=0
lang_dir=download/lm

# Local archive paths are derived from the URLs so the two never drift apart.
grapheme_bz2="$lang_dir/${lexicon_url1##*/}"
phoneme_bz2="$lang_dir/${lexicon_url2##*/}"
train_text="$lang_dir/train/text"

mkdir -p "$lang_dir"

if [[ "$stage" -le 0 ]]; then
  # Fail fast, before spending time on the downloads.
  if [[ ! -f "$train_text" ]]; then
    echo "$0: expected training text at $train_text" >&2
    exit 1
  fi

  echo "$0: Downloading text for lexicon... $(date)."
  # -O writes to a fixed path, so a rerun replaces the archive instead of
  # leaving a numbered copy next to a stale one.
  # NOTE(review): --no-check-certificate turns off TLS verification; it is
  # kept because removing it may break the download -- confirm whether the
  # server certificate validates and drop the flag if it does.
  wget --no-check-certificate -O "$grapheme_bz2" "$lexicon_url1"
  wget --no-check-certificate -O "$phoneme_bz2" "$lexicon_url2"

  # Drop the first three lines of each lexicon and keep only the first
  # column. Outputs are truncated (>) so reruns do not pile up duplicates.
  bzcat "$grapheme_bz2" | sed '1,3d' | awk '{print $1}' \
    > "$lang_dir/grapheme_lexicon"
  bzcat "$phoneme_bz2" | sed '1,3d' | awk '{print $1}' \
    > "$lang_dir/phoneme_lexicon"

  # One word per line: strip the first field, split on (runs of) spaces,
  # then de-duplicate.
  cut -d ' ' -f 2- "$train_text" | tr -s " " "\n" | sort -u \
    > "$lang_dir/uniq_words"
fi

# Guarded by stage 1 so that setting stage=1 re-runs only the clean-up;
# with the default stage=0 both stages run.
if [[ "$stage" -le 1 ]]; then
  echo "$0: processing lexicon text and creating lexicon... $(date)."
  # remove vowels and rare alef wasla
  # Concretely: delete the characters listed in the bracket expression and
  # rewrite '{' as '}' (presumably transliterated diacritics / alef wasla --
  # TODO confirm the transliteration scheme), then drop blank lines.
  # The blank-line filter stays in its own sed because it needs -r (extended
  # regex) while the substitutions above are written as basic regex.
  sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' "$lang_dir/uniq_words" \
    | sed -r '/^\s*$/d' \
    | sort -u > "$lang_dir/grapheme_lexicon.txt"
fi

echo "$0: Lexicon preparation succeeded"