mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-27 10:44:19 +00:00
Update preprocess_mdcc.py
This commit is contained in:
parent
1c509a24f8
commit
ba01616103
@ -26,6 +26,7 @@ files in the directory "data/lang_char":
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -86,6 +87,7 @@ def get_word_segments(lines: List[str]) -> List[str]:
|
|||||||
new_lines = []
|
new_lines = []
|
||||||
|
|
||||||
for line in tqdm(lines, desc="Segmenting lines"):
|
for line in tqdm(lines, desc="Segmenting lines"):
|
||||||
|
try:
|
||||||
# code switching
|
# code switching
|
||||||
if len(line.split(" ")) > 1:
|
if len(line.split(" ")) > 1:
|
||||||
segments = []
|
segments = []
|
||||||
@ -98,6 +100,8 @@ def get_word_segments(lines: List[str]) -> List[str]:
|
|||||||
# not code switching
|
# not code switching
|
||||||
else:
|
else:
|
||||||
new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
|
new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
|
||||||
|
except:
|
||||||
|
logging.error(f"Failed to process line: {line}")
|
||||||
return new_lines
|
return new_lines
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user