From 7f1cb9e991e5cfd79529b195313bbd70f7556873 Mon Sep 17 00:00:00 2001
From: jinzr
Date: Fri, 8 Mar 2024 19:34:06 +0800
Subject: [PATCH] misc. fixes

---
 egs/mdcc/ASR/local/preprocess_mdcc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/egs/mdcc/ASR/local/preprocess_mdcc.py b/egs/mdcc/ASR/local/preprocess_mdcc.py
index 436bcad11..29925c63c 100755
--- a/egs/mdcc/ASR/local/preprocess_mdcc.py
+++ b/egs/mdcc/ASR/local/preprocess_mdcc.py
@@ -118,7 +118,7 @@ def get_word_segments(lines: List[str]) -> List[str]:
 def get_words(lines: List[str]) -> List[str]:
     words = set()
     for line in tqdm(lines, desc="Getting words"):
-        words.update(line.replace("\n", "").split(" "))
+        words.update(line.strip().split(" "))
     return list(words)
 
 
@@ -142,7 +142,7 @@ if __name__ == "__main__":
     with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f:
         f.writelines(text_words_segments)
 
-    words = get_words(text_words_segments)
+    words = get_words(text_words_segments)[1:]  # remove "\n" from words
     with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
         f.writelines([word + "\n" for word in sorted(words)])
 