From b21cd0bb73551c98f016090f31404ae82ec4fff1 Mon Sep 17 00:00:00 2001
From: luomingshuang <739314837@qq.com>
Date: Sun, 13 Mar 2022 20:40:07 +0800
Subject: [PATCH] change for prepare.sh

Add local/prepare_lexicon.py, which writes lexicon_words.txt from the
words found in data/manifests/supervisions_train.json, and call it from
stage 5 of prepare.sh instead of filtering
download/tedlium3/TEDLIUM.152k.dic.

---
Review notes (ignored by git am):
- prepare_lexicon.py: --manifests-dir and --lang-dir are now required, both
  files are opened as UTF-8, and the docstring states that the function
  writes the lexicon file and returns nothing.
- The removed grep patterns and the echo arguments in the prepare.sh hunk
  are empty in the copy this patch was restored from; angle-bracketed
  tokens may have been lost there -- TODO confirm before applying.

 egs/tedlium3/ASR/local/prepare_lexicon.py | 108 ++++++++++++++++++++++
 egs/tedlium3/ASR/prepare.sh               |  14 +--
 2 files changed, 115 insertions(+), 7 deletions(-)
 create mode 100644 egs/tedlium3/ASR/local/prepare_lexicon.py

diff --git a/egs/tedlium3/ASR/local/prepare_lexicon.py b/egs/tedlium3/ASR/local/prepare_lexicon.py
new file mode 100644
index 000000000..59377b5aa
--- /dev/null
+++ b/egs/tedlium3/ASR/local/prepare_lexicon.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+# Copyright 2022 Xiaomi Corp. (authors: Mingshuang Luo)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This script takes as input supervisions json dir "data/manifests"
+consisting of supervisions_train.json and does the following:
+
+1. Generate lexicon_words.txt.
+
+"""
+import argparse
+import json
+import logging
+from pathlib import Path
+
+
+def get_args():
+    """Parse the command-line options --manifests-dir and --lang-dir."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--manifests-dir",
+        type=str,
+        required=True,
+        help="""Input directory.
+        """,
+    )
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        required=True,
+        help="""Output directory.
+        """,
+    )
+
+    return parser.parse_args()
+
+
+def prepare_lexicon(manifests_dir: str, lang_dir: str):
+    """Write lexicon_words.txt from the words of the training transcripts.
+
+    Every distinct whitespace-separated word found in the "text" fields of
+    supervisions_train.json is written as one line "word word", sorted.
+
+    Args:
+      manifests_dir:
+        The manifests directory, e.g., data/manifests.
+      lang_dir:
+        The language directory, e.g., data/lang_phone.
+
+    Return:
+      None. The result is written to lang_dir/lexicon_words.txt.
+    """
+    words = set()
+
+    supervisions_train = Path(manifests_dir) / "supervisions_train.json"
+    lexicon = Path(lang_dir) / "lexicon_words.txt"
+
+    logging.info(f"Loading {supervisions_train}!")
+    # Read and write as UTF-8 explicitly so that the result does not depend
+    # on the locale of the machine running the recipe.
+    with open(supervisions_train, "r", encoding="utf-8") as load_f:
+        load_dicts = json.load(load_f)
+
+    for load_dict in load_dicts:
+        # str.split() without arguments never yields empty items, so no
+        # extra filtering of empty strings is needed.
+        for word in load_dict["text"].split():
+            # NOTE(review): as written this comparison is always true; a
+            # special token may have been lost from the literal -- confirm.
+            if word != "":
+                words.add(word)
+
+    with open(lexicon, "w", encoding="utf-8") as f:
+        for word in sorted(words):
+            f.write(word + " " + word + "\n")
+
+
+def main():
+    """Parse the arguments and generate lexicon_words.txt."""
+    args = get_args()
+
+    logging.info("Generating lexicon_words.txt")
+    prepare_lexicon(args.manifests_dir, args.lang_dir)
+
+
+if __name__ == "__main__":
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    main()
diff --git a/egs/tedlium3/ASR/prepare.sh b/egs/tedlium3/ASR/prepare.sh
index 4f2269430..1f238b940 100644
--- a/egs/tedlium3/ASR/prepare.sh
+++ b/egs/tedlium3/ASR/prepare.sh
@@ -108,13 +108,13 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
       --lang-dir $lang_dir \
       --manifests-dir data/manifests
   fi
-  cat download/tedlium3/TEDLIUM.152k.dic | \
-    grep -v -w "" | \
-    grep -v -w "" | \
-    grep -v -w "" | \
-    LANG= LC_ALL= sort | \
-    sed 's:([0-9])::g' > $lang_dir/lexicon_words.txt
 
-  (echo ' '; ) |
+  if [ ! -f $lang_dir/lexicon_words.txt ]; then
+    ./local/prepare_lexicon.py \
+      --manifests-dir data/manifests \
+      --lang-dir $lang_dir
+  fi
+
+  (echo '!SIL SIL'; echo ' '; ) |
     cat - $lang_dir/lexicon_words.txt |
     sort | uniq > $lang_dir/lexicon.txt