From 1c509a24f831a8eaacf09b90ac9399d94d424699 Mon Sep 17 00:00:00 2001
From: jinzr
Date: Fri, 8 Mar 2024 19:14:09 +0800
Subject: [PATCH] handle code switching cases

---
 egs/mdcc/ASR/local/preprocess_mdcc.py | 36 ++++++++++++++++++++-------
 egs/mdcc/ASR/prepare.sh               |  3 +++
 2 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/egs/mdcc/ASR/local/preprocess_mdcc.py b/egs/mdcc/ASR/local/preprocess_mdcc.py
index e7a87f0d5..751b0a8b8 100755
--- a/egs/mdcc/ASR/local/preprocess_mdcc.py
+++ b/egs/mdcc/ASR/local/preprocess_mdcc.py
@@ -32,6 +32,8 @@ from typing import List
 import pycantonese
 from tqdm.auto import tqdm
 
+from icefall.utils import is_cjk
+
 
 def get_parser():
     parser = argparse.ArgumentParser(
@@ -67,6 +69,7 @@ def get_norm_lines(lines: List[str]) -> List[str]:
         # about, for example, {梁佳佳},我是{}人.
         return (
             text.strip()
+            .upper()
             .replace("(music)", "")
             .replace("(music", "")
             .replace("{", "")
@@ -77,16 +80,31 @@
 
 
 def get_word_segments(lines: List[str]) -> List[str]:
-    return [
-        " ".join(pycantonese.segment(line)) + "\n"
-        for line in tqdm(lines, desc="Segmenting lines")
-    ]
+    # the current pycantonese segmenter does not handle the case when the input
+    # is code switching, so we need to handle it separately
+
+    new_lines = []
+
+    for line in tqdm(lines, desc="Segmenting lines"):
+        # code switching
+        if len(line.split(" ")) > 1:
+            segments = []
+            for segment in line.split(" "):
+                if not is_cjk(segment[0]):  # en segment
+                    segments.append(segment)
+                else:  # zh segment
+                    segments.extend(pycantonese.segment(segment))
+            new_lines.append(" ".join(segments) + "\n")
+        # not code switching
+        else:
+            new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
+    return new_lines
 
 
 def get_words(lines: List[str]) -> List[str]:
     words = set()
     for line in tqdm(lines, desc="Getting words"):
-        words.update(pycantonese.segment(line))
+        words.update(line.strip().split(" "))
     return list(words)
 
 
@@ -106,6 +124,10 @@ if __name__ == "__main__":
     with open(output_dir / "text_norm", "w+", encoding="utf-8") as f:
         f.writelines([line + "\n" for line in norm_lines])
 
+    text_words_segments = get_word_segments(norm_lines)
+    with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f:
+        f.writelines(text_words_segments)
+
     words = get_words(norm_lines)
     with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
         f.writelines([word + "\n" for word in sorted(words)])
@@ -118,7 +140,3 @@ if __name__ == "__main__":
 
     with open(output_dir / "words.txt", "w+", encoding="utf-8") as f:
         f.writelines([f"{word} {i}\n" for i, word in enumerate(words)])
-
-    text_words_segments = get_word_segments(norm_lines)
-    with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f:
-        f.writelines(text_words_segments)
diff --git a/egs/mdcc/ASR/prepare.sh b/egs/mdcc/ASR/prepare.sh
index d892c04d8..f4d9bc47e 100755
--- a/egs/mdcc/ASR/prepare.sh
+++ b/egs/mdcc/ASR/prepare.sh
@@ -172,6 +172,9 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   if [ ! -f $lang_char_dir/text_words_segmentation ]; then
     ./local/preprocess_mdcc.py --input-file $lang_char_dir/text \
       --output-dir $lang_char_dir
+
+    mv $lang_char_dir/text $lang_char_dir/_text
+    cp $lang_char_dir/text_words_segmentation $lang_char_dir/text
   fi
 
   if [ ! -f $lang_char_dir/tokens.txt ]; then
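
A minimal standalone sketch of the segmentation behaviour the patched get_word_segments() implements, assuming pycantonese is installed. The is_cjk below is a simplified Unicode-range stand-in for icefall.utils.is_cjk, and segment_line is an illustrative helper name, so treat this as a sketch of the code-switching logic rather than the exact icefall code path:

#!/usr/bin/env python3
# Illustration only: simplified stand-in for the patched get_word_segments().
import pycantonese


def is_cjk(char: str) -> bool:
    # Rough check: basic CJK Unified Ideographs count as Chinese characters.
    return "\u4e00" <= char <= "\u9fff"


def segment_line(line: str) -> str:
    # A normalized line containing spaces is treated as code-switched:
    # English chunks are kept as-is, Chinese chunks go through the
    # Cantonese word segmenter.
    if len(line.split(" ")) > 1:
        segments = []
        for chunk in line.split(" "):
            if not is_cjk(chunk[0]):  # en chunk
                segments.append(chunk)
            else:  # zh chunk
                segments.extend(pycantonese.segment(chunk))
        return " ".join(segments)
    # A line without spaces is pure Cantonese and is segmented directly.
    return " ".join(pycantonese.segment(line))


if __name__ == "__main__":
    # The Chinese chunks get word-segmented; "CANTON" and "ROAD" pass through.
    print(segment_line("我哋去咗 CANTON ROAD 食飯"))

As in the patch, any space in a normalized line is taken as a code-switching boundary, so a line with no spaces falls straight through to pycantonese.segment.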