handle code switching cases
commit 1c509a24f8
parent bca12ed23c
@@ -32,6 +32,8 @@ from typing import List
 import pycantonese
 from tqdm.auto import tqdm
 
+from icefall.utils import is_cjk
+
 
 def get_parser():
     parser = argparse.ArgumentParser(
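The newly imported is_cjk helper is what the segmentation logic further down uses to tell an English chunk from a Cantonese one: it takes a single character and reports whether it falls in a CJK Unicode block. A rough sketch of that kind of check, for illustration only (not the actual icefall.utils implementation, which may cover more ranges):

def is_cjk_sketch(ch: str) -> bool:
    # Illustrative approximation of a CJK-character test; not the icefall code.
    code = ord(ch)
    return any(
        start <= code <= end
        for start, end in [
            (0x3400, 0x4DBF),  # CJK Unified Ideographs Extension A
            (0x4E00, 0x9FFF),  # CJK Unified Ideographs
            (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
        ]
    )


assert is_cjk_sketch("梁") is True
assert is_cjk_sketch("B") is False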
@@ -67,6 +69,7 @@ def get_norm_lines(lines: List[str]) -> List[str]:
     # about, for example, {梁佳佳},我是{}人.
     return (
         text.strip()
+        .upper()
         .replace("(music)", "")
         .replace("(music", "")
         .replace("{", "")
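The inserted .upper() uppercases English text during normalization, so later word collection is case-insensitive; it sits right after text.strip() and before the literal replacements. A toy illustration of just the part of the chain visible in this hunk (assumed input; the closing brace and other symbols are removed by replacements further down the function, outside this hunk):

text = " 我是{梁佳佳}人 ok "
norm = (
    text.strip()
    .upper()
    .replace("(music)", "")
    .replace("(music", "")
    .replace("{", "")
)
print(norm)  # 我是梁佳佳}人 OK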
@@ -77,16 +80,31 @@ def get_norm_lines(lines: List[str]) -> List[str]:
 
 
 def get_word_segments(lines: List[str]) -> List[str]:
-    return [
-        " ".join(pycantonese.segment(line)) + "\n"
-        for line in tqdm(lines, desc="Segmenting lines")
-    ]
+    # the current pycantonese segmenter does not handle the case when the input
+    # is code switching, so we need to handle it separately
+
+    new_lines = []
+
+    for line in tqdm(lines, desc="Segmenting lines"):
+        # code switching
+        if len(line.split(" ")) > 1:
+            segments = []
+            for segment in line.split(" "):
+                if not is_cjk(segment[0]):  # en segment
+                    segments.append(segment)
+                else:  # zh segment
+                    segments.extend(pycantonese.segment(segment))
+            new_lines.append(" ".join(segments) + "\n")
+        # not code switching
+        else:
+            new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
+    return new_lines
 
 
 def get_words(lines: List[str]) -> List[str]:
     words = set()
     for line in tqdm(lines, desc="Getting words"):
-        words.update(pycantonese.segment(line))
+        words.update(line.strip().split(" "))
     return list(words)
 
 
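The rewritten get_word_segments treats any whitespace in a normalized line as evidence of code switching: each space-delimited chunk is either kept whole when its first character is not CJK (an English token) or passed through pycantonese.segment when it is (a Cantonese span), and the changed get_words can then rebuild the vocabulary with a plain split on spaces. A hedged walk-through of that flow, assuming pycantonese and icefall are installed; the exact Cantonese token boundaries depend on the segmenter:

import pycantonese

from icefall.utils import is_cjk

line = "我好鍾意 BEYOND 嘅歌"  # an already-normalized, code-switched line (assumed example)

segments = []
for chunk in line.split(" "):
    if not is_cjk(chunk[0]):  # English chunk: keep it as one token
        segments.append(chunk)
    else:  # Cantonese chunk: let pycantonese split it into words
        segments.extend(pycantonese.segment(chunk))

segmented = " ".join(segments)
print(segmented)  # e.g. "我 好 鍾意 BEYOND 嘅 歌" (Cantonese boundaries may differ)

# get_words now only needs the pre-segmented text:
words = set(segmented.strip().split(" "))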
@@ -106,6 +124,10 @@ if __name__ == "__main__":
     with open(output_dir / "text_norm", "w+", encoding="utf-8") as f:
         f.writelines([line + "\n" for line in norm_lines])
 
+    text_words_segments = get_word_segments(norm_lines)
+    with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f:
+        f.writelines(text_words_segments)
+
     words = get_words(norm_lines)
     with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
         f.writelines([word + "\n" for word in sorted(words)])
@@ -118,7 +140,3 @@ if __name__ == "__main__":
 
     with open(output_dir / "words.txt", "w+", encoding="utf-8") as f:
         f.writelines([f"{word} {i}\n" for i, word in enumerate(words)])
-
-    text_words_segments = get_word_segments(norm_lines)
-    with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f:
-        f.writelines(text_words_segments)
@@ -172,6 +172,9 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   if [ ! -f $lang_char_dir/text_words_segmentation ]; then
     ./local/preprocess_mdcc.py --input-file $lang_char_dir/text \
       --output-dir $lang_char_dir
+
+    mv $lang_char_dir/text $lang_char_dir/_text
+    cp $lang_char_dir/text_words_segmentation $lang_char_dir/text
   fi
 
   if [ ! -f $lang_char_dir/tokens.txt ]; then