icefall/egs/iwslt22_ta/ST/local/prep_lexicon.sh
2023-11-01 06:39:24 +03:00

22 lines
736 B
Bash
Executable File

#!/usr/bin/env bash
# Copyright 2022 QCRI (author: Amir Hussein)
# Apache 2.0
# This script prepares the graphemic lexicon.
dir=data/local/dict
stage=0
lang_dir_src=$1
lang_dir_tgt=$2
cat $lang_dir_src/transcript_words.txt | tr -s " " "\n" | sort -u > $lang_dir_src/uniq_words
cat $lang_dir_tgt/transcript_words.txt | tr -s " " "\n" | sort -u > $lang_dir_tgt/uniq_words
echo "$0: processing lexicon text and creating lexicon... $(date)."
# remove vowels and rare alef wasla
cat $lang_dir_src/uniq_words | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sed -r '/^\s*$/d' | sort -u > $lang_dir_src/words.txt
cat $lang_dir_tgt/uniq_words | sed -r '/^\s*$/d' | sort -u > $lang_dir_tgt/words.txt
echo "$0: Lexicon preparation succeeded"