misc. fixes

This commit is contained in:
jinzr 2024-03-08 19:28:47 +08:00
parent 7d22fef6f2
commit 27c9a90dc6

View File

@@ -70,11 +70,14 @@ def get_norm_lines(lines: List[str]) -> List[str]:
# about, for example, {梁佳佳},我是{}人. # about, for example, {梁佳佳},我是{}人.
return ( return (
text.strip() text.strip()
.upper()
.replace("(music)", "") .replace("(music)", "")
.replace("(music", "") .replace("(music", "")
.replace("{", "") .replace("{", "")
.replace("}", "") .replace("}", "")
.replace(
"BB所以就指腹為親喇", "BB 所以就指腹為親喇"
) # manually fix the error
.upper()
) )
return [_text_norm(line) for line in lines] return [_text_norm(line) for line in lines]
@@ -139,7 +142,7 @@ if __name__ == "__main__":
with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f: with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f:
f.writelines(text_words_segments) f.writelines(text_words_segments)
words = get_words(norm_lines) words = get_words(text_words_segments)
with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f: with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
f.writelines([word + "\n" for word in sorted(words)]) f.writelines([word + "\n" for word in sorted(words)])