From 7f1cb9e991e5cfd79529b195313bbd70f7556873 Mon Sep 17 00:00:00 2001
From: jinzr <zengrui.jin0@gmail.com>
Date: Fri, 8 Mar 2024 19:34:06 +0800
Subject: [PATCH] Fix newline handling when building the MDCC word list

---
 egs/mdcc/ASR/local/preprocess_mdcc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/egs/mdcc/ASR/local/preprocess_mdcc.py b/egs/mdcc/ASR/local/preprocess_mdcc.py
index 436bcad11..29925c63c 100755
--- a/egs/mdcc/ASR/local/preprocess_mdcc.py
+++ b/egs/mdcc/ASR/local/preprocess_mdcc.py
@@ -118,7 +118,7 @@ def get_word_segments(lines: List[str]) -> List[str]:
 def get_words(lines: List[str]) -> List[str]:
     words = set()
     for line in tqdm(lines, desc="Getting words"):
-        words.update(line.replace("\n", "").split(" "))
+        words.update(line.strip().split(" "))  # trim line ends, then split on spaces
     return list(words)
 
 
@@ -142,7 +142,7 @@ if __name__ == "__main__":
     with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f:
         f.writelines(text_words_segments)
 
-    words = get_words(text_words_segments)
+    words = [w for w in get_words(text_words_segments) if w]  # drop empty tokens
     with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
         f.writelines([word + "\n" for word in sorted(words)])