mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
misc. fixes
This commit is contained in:
parent
7d22fef6f2
commit
27c9a90dc6
@ -70,11 +70,14 @@ def get_norm_lines(lines: List[str]) -> List[str]:
|
|||||||
# about, for example, {梁佳佳},我是{}人.
|
# about, for example, {梁佳佳},我是{}人.
|
||||||
return (
|
return (
|
||||||
text.strip()
|
text.strip()
|
||||||
.upper()
|
|
||||||
.replace("(music)", "")
|
.replace("(music)", "")
|
||||||
.replace("(music", "")
|
.replace("(music", "")
|
||||||
.replace("{", "")
|
.replace("{", "")
|
||||||
.replace("}", "")
|
.replace("}", "")
|
||||||
|
.replace(
|
||||||
|
"BB所以就指腹為親喇", "BB 所以就指腹為親喇"
|
||||||
|
) # manually fix the error
|
||||||
|
.upper()
|
||||||
)
|
)
|
||||||
|
|
||||||
return [_text_norm(line) for line in lines]
|
return [_text_norm(line) for line in lines]
|
||||||
@ -139,7 +142,7 @@ if __name__ == "__main__":
|
|||||||
with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f:
|
with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f:
|
||||||
f.writelines(text_words_segments)
|
f.writelines(text_words_segments)
|
||||||
|
|
||||||
words = get_words(norm_lines)
|
words = get_words(text_words_segments)
|
||||||
with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
|
with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
|
||||||
f.writelines([word + "\n" for word in sorted(words)])
|
f.writelines([word + "\n" for word in sorted(words)])
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user