mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 10:16:14 +00:00
misc. fixes
This commit is contained in:
parent
123cd9cb6d
commit
7f1cb9e991
@ -118,7 +118,7 @@ def get_word_segments(lines: List[str]) -> List[str]:
|
|||||||
def get_words(lines: List[str]) -> List[str]:
|
def get_words(lines: List[str]) -> List[str]:
|
||||||
words = set()
|
words = set()
|
||||||
for line in tqdm(lines, desc="Getting words"):
|
for line in tqdm(lines, desc="Getting words"):
|
||||||
words.update(line.replace("\n", "").split(" "))
|
words.update(line.strip().split(" "))
|
||||||
return list(words)
|
return list(words)
|
||||||
|
|
||||||
|
|
||||||
@ -142,7 +142,7 @@ if __name__ == "__main__":
|
|||||||
with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f:
|
with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f:
|
||||||
f.writelines(text_words_segments)
|
f.writelines(text_words_segments)
|
||||||
|
|
||||||
words = get_words(text_words_segments)
|
words = get_words(text_words_segments)[1:] # remove "\n" from words
|
||||||
with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
|
with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
|
||||||
f.writelines([word + "\n" for word in sorted(words)])
|
f.writelines([word + "\n" for word in sorted(words)])
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user