added scripts for char-based lang prep

2025-12-10 22:45:27 +00:00 · 2024-03-12 12:12:35 +08:00 · 2024-03-12 12:12:35 +08:00 · 4a1d4be94a
commit 4a1d4be94a
parent ddefabcb7a
5 changed files with 251 additions and 84 deletions
--- a/egs/commonvoice/ASR/local/prepare_char.py
+++ b/egs/commonvoice/ASR/local/prepare_char.py
@ -0,0 +1 @@
+../../../aishell/ASR/local/prepare_char.py
--- a/egs/commonvoice/ASR/local/prepare_lang.py
+++ b/egs/commonvoice/ASR/local/prepare_lang.py
@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang.py
--- a/egs/commonvoice/ASR/local/prepare_lang_fst.py
+++ b/egs/commonvoice/ASR/local/prepare_lang_fst.py
@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang_fst.py
--- a/egs/commonvoice/ASR/local/word_segment_yue.py
+++ b/egs/commonvoice/ASR/local/word_segment_yue.py
@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+# Copyright    2024  Xiaomi Corp.        (authors: Zengrui Jin)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script takes a text file (by default "data/yue/lang_char/text") as input.
+Each line of the file contains one transcript. The script segments every
+transcript into words and generates the following files in the output directory
+(by default "data/yue/lang_char"):
+    - transcript_words.txt
+    - words.txt
+    - words_no_ids.txt
+"""
+
+import argparse
+import logging
+from pathlib import Path
+from typing import List
+
+import pycantonese
+from tqdm.auto import tqdm
+
+from icefall.utils import is_cjk
+
+
+def get_parser():
+    """Build the command-line parser for this script.
+
+    Two string options are registered, each with a long and a short flag:
+
+      --input-file / -i : the input text file
+      --output-dir / -o : the output directory
+
+    Returns:
+      An argparse.ArgumentParser that shows option defaults in its help text.
+    """
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="Prepare char lexicon",
+    )
+
+    # (flags, default value, help text) for every supported option.
+    option_table = (
+        (("--input-file", "-i"), "data/yue/lang_char/text", "The input text file"),
+        (("--output-dir", "-o"), "data/yue/lang_char/", "The output directory"),
+    )
+    for flags, default_value, help_text in option_table:
+        cli.add_argument(*flags, default=default_value, type=str, help=help_text)
+
+    return cli
+
+
+def get_word_segments(lines: List[str]) -> List[str]:
+    """Segment every transcript line into space-separated words.
+
+    The current pycantonese segmenter does not handle code-switching input,
+    so a line that contains spaces is split on them first: a piece whose
+    first character is not CJK is kept verbatim (treated as a foreign word),
+    while a piece starting with a CJK character is passed to
+    ``pycantonese.segment``. A line without spaces is segmented as a whole.
+
+    Args:
+      lines:
+        The transcripts, one per element.
+
+    Returns:
+      One string per input line, consisting of the words joined by a single
+      space and terminated by a newline. A blank input line yields "\n".
+
+    Raises:
+      Any exception raised while processing a line is logged together with
+      the offending segment/line and then re-raised unchanged.
+    """
+    new_lines = []
+
+    for line in tqdm(lines, desc="Segmenting lines"):
+        # Strip once so that both branches below work on the same text; the
+        # segmenter must not see leading/trailing whitespace.
+        text = line.strip()
+        try:
+            if " " in text:
+                # code switching
+                segments = []
+                for segment in text.split(" "):
+                    if not segment.strip():
+                        # produced by consecutive spaces
+                        continue
+                    try:
+                        if is_cjk(segment[0]):  # zh segment
+                            segments.extend(pycantonese.segment(segment))
+                        else:  # en segment
+                            segments.append(segment)
+                    except Exception:
+                        logging.error(f"Failed to process segment: {segment}")
+                        raise
+            elif text:
+                # not code switching
+                segments = pycantonese.segment(text)
+            else:
+                # blank line: keep an empty output line so that the output
+                # stays aligned with the input, without calling the segmenter
+                segments = []
+            new_lines.append(" ".join(segments) + "\n")
+        except Exception:
+            logging.error(f"Failed to process line: {line}")
+            raise
+    return new_lines
+
+
+def get_words(lines: List[str]) -> List[str]:
+    """Collect the distinct space-separated tokens found in ``lines``.
+
+    Every line is stripped and split on single spaces. A blank line therefore
+    contributes the empty string "" as a token.
+
+    Args:
+      lines:
+        Word-segmented transcripts, one per element.
+
+    Returns:
+      The unique tokens as a list, in the (arbitrary) iteration order of the
+      underlying set.
+    """
+    vocabulary = set()
+    progress = tqdm(lines, desc="Getting words")
+    for text in progress:
+        for token in text.strip().split(" "):
+            vocabulary.add(token)
+    return list(vocabulary)
+
+
+if __name__ == "__main__":
+    # Script entry point: segment the transcripts in --input-file and write
+    # transcript_words.txt, words_no_ids.txt and words.txt to --output-dir.
+    parser = get_parser()
+    args = parser.parse_args()
+
+    input_file = Path(args.input_file)
+    output_dir = Path(args.output_dir)
+
+    # Validate explicitly: `assert` statements are removed under `python -O`.
+    if not input_file.is_file():
+        raise FileNotFoundError(f"{input_file} does not exist")
+    if not output_dir.is_dir():
+        raise NotADirectoryError(f"{output_dir} does not exist")
+
+    lines = input_file.read_text(encoding="utf-8").strip().split("\n")
+
+    text_words_segments = get_word_segments(lines)
+    with open(output_dir / "transcript_words.txt", "w", encoding="utf-8") as f:
+        f.writelines(text_words_segments)
+
+    # get_words() returns the content of a set, so its order is arbitrary and
+    # slicing off the first element may drop a real word. Filter out the empty
+    # (or whitespace-only) token that blank transcript lines produce instead.
+    words = sorted(word for word in get_words(text_words_segments) if word.strip())
+    with open(output_dir / "words_no_ids.txt", "w", encoding="utf-8") as f:
+        f.writelines(word + "\n" for word in words)
+
+    # Special symbols surround the sorted vocabulary; the id of a word is its
+    # position in this list. The end-of-sentence symbol is "</s>".
+    words = (
+        ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>"]
+        + words
+        + ["#0", "<s>", "</s>"]
+    )
+
+    with open(output_dir / "words.txt", "w", encoding="utf-8") as f:
+        f.writelines(f"{word} {i}\n" for i, word in enumerate(words))
--- a/egs/commonvoice/ASR/prepare.sh
+++ b/egs/commonvoice/ASR/prepare.sh
@ -172,8 +172,41 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
 fi

 if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
-  log "Stage 9: Prepare BPE based lang"
+  if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then
+    log "Stage 9: Prepare Char based lang"
+    lang_dir=data/${lang}/lang_char/
+    mkdir -p $lang_dir

+    # Generate the word-segmented transcript (and words.txt / words_no_ids.txt)
+    # unless a previous run has already produced it.
+    if [ ! -f "$lang_dir/transcript_words.txt" ]; then
+      # Fail early, before any extraction work, for languages that have no
+      # word segmentation script yet.
+      if [ "$lang" != "yue" ]; then
+        log "word_segment_${lang}.py not implemented yet"
+        exit 1
+      fi
+
+      log "Generate data for lang preparation"
+      file="data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
+      # Without this check a missing manifest would leave gunzip without a
+      # file operand, in which case it reads from stdin and blocks.
+      if [ ! -f "$file" ]; then
+        log "$file does not exist"
+        exit 1
+      fi
+      # NOTE(review): takes the 30th '"'-delimited field of every manifest
+      # line as the transcript; this presumably depends on the key order of
+      # the JSON lines -- confirm against the manifest format.
+      gunzip -c "$file" | awk -F '"' '{print $30}' > "$lang_dir/text"
+
+      # Ensure space only appears once
+      sed -i 's/\t/ /g' "$lang_dir/text"
+      sed -i 's/[ ][ ]*/ /g' "$lang_dir/text"
+
+      # Get words.txt and words_no_ids.txt
+      ./local/word_segment_yue.py \
+        --input-file "$lang_dir/text" \
+        --output-dir "$lang_dir"
+
+      # Keep the raw transcript as _text; the segmented one becomes text.
+      mv "$lang_dir/text" "$lang_dir/_text"
+      cp "$lang_dir/transcript_words.txt" "$lang_dir/text"
+    fi
+
+    # Checked independently of the guard above, so that a run in which
+    # prepare_char.py failed is retried on the next invocation.
+    if [ ! -f "$lang_dir/tokens.txt" ]; then
+      ./local/prepare_char.py --lang-dir "$lang_dir"
+    fi
+  else
+    log "Stage 9: Prepare BPE based lang"
    for vocab_size in ${vocab_sizes[@]}; do
      lang_dir=data/${lang}/lang_bpe_${vocab_size}
      mkdir -p $lang_dir
@ -249,6 +282,7 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
          $lang_dir/L_disambig.fst
      fi
    done
+  fi
 fi

 if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
@ -256,6 +290,9 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
  # We assume you have install kaldilm, if not, please install
  # it using: pip install kaldilm

+  if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then
+    echo "TO BE IMPLEMENTED"
+  else
    for vocab_size in ${vocab_sizes[@]}; do
      lang_dir=data/${lang}/lang_bpe_${vocab_size}
      mkdir -p $lang_dir/lm
@ -277,6 +314,7 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
        fi
      done
    done
+  fi
 fi

 if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
				`@ -0,0 +1 @@`
				`../../../librispeech/ASR/local/prepare_lang.py`