added scripts for char-based lang prep

2025-12-10 22:45:27 +00:00 · 2024-03-12 12:12:35 +08:00 · 2024-03-12 12:12:35 +08:00 · 4a1d4be94a
commit 4a1d4be94a
parent ddefabcb7a
5 changed files with 251 additions and 84 deletions
--- a/egs/commonvoice/ASR/local/prepare_char.py
+++ b/egs/commonvoice/ASR/local/prepare_char.py
@ -0,0 +1 @@
 ../../../aishell/ASR/local/prepare_char.py
--- a/egs/commonvoice/ASR/local/prepare_lang.py
+++ b/egs/commonvoice/ASR/local/prepare_lang.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/prepare_lang.py
--- a/egs/commonvoice/ASR/local/prepare_lang_fst.py
+++ b/egs/commonvoice/ASR/local/prepare_lang_fst.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/prepare_lang_fst.py
--- a/egs/commonvoice/ASR/local/word_segment_yue.py
+++ b/egs/commonvoice/ASR/local/word_segment_yue.py
@ -0,0 +1,126 @@
 #!/usr/bin/env python3
 # Copyright    2024  Xiaomi Corp.        (authors: Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes a text file (default: "data/yue/lang_char/text") as input, in
 which each line contains one transcript, applies word segmentation and generates
 the following files in the output directory (default: "data/yue/lang_char"):
    - transcript_words.txt
    - words.txt
    - words_no_ids.txt
 """
 import argparse
 import logging
 from pathlib import Path
 from typing import List
 import pycantonese
 from tqdm.auto import tqdm
 from icefall.utils import is_cjk
 def get_parser():
    """Build the command-line parser for this script.

    Returns:
      An ``argparse.ArgumentParser`` accepting ``--input-file``/``-i`` and
      ``--output-dir``/``-o``. Both options are strings whose defaults live
      under ``data/yue/lang_char``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Prepare char lexicon",
    )
    # One row per option: (flags, default value, help text).
    options = (
        (("--input-file", "-i"), "data/yue/lang_char/text", "The input text file"),
        (("--output-dir", "-o"), "data/yue/lang_char/", "The output directory"),
    )
    for flags, default_value, help_text in options:
        cli.add_argument(*flags, default=default_value, type=str, help=help_text)
    return cli
 def get_word_segments(lines: List[str]) -> List[str]:
    """Segment every transcript line into space-separated words.

    A line that contains more than one space-separated piece is treated as
    code switching: each non-blank piece whose first character is not
    reported as CJK by ``is_cjk`` is kept verbatim, and every other piece is
    passed to ``pycantonese.segment``. A line with a single piece is passed
    to ``pycantonese.segment`` as a whole (un-stripped), because the
    segmenter does not handle code-switched input on its own.

    Args:
      lines:
        The transcripts, one per element.

    Returns:
      One string per input line: the words joined by single spaces and
      terminated by a newline.

    Raises:
      Exception: any error raised while processing a piece or a line is
        logged and then re-raised.
    """
    segmented_lines = []
    for line in tqdm(lines, desc="Segmenting lines"):
        try:
            pieces = line.strip().split(" ")
            if len(pieces) <= 1:
                # not code switching: segment the whole line in one go
                line_words = pycantonese.segment(line)
            else:
                # code switching: decide piece by piece
                line_words = []
                for piece in pieces:
                    if not piece.strip():
                        # blank piece left over from repeated separators
                        continue
                    try:
                        if is_cjk(piece[0]):  # zh segment
                            line_words.extend(pycantonese.segment(piece))
                        else:  # en segment
                            line_words.append(piece)
                    except Exception as e:
                        logging.error(f"Failed to process segment: {piece}")
                        raise e
            segmented_lines.append(" ".join(line_words) + "\n")
        except Exception as e:
            logging.error(f"Failed to process line: {line}")
            raise e
    return segmented_lines
 def get_words(lines: List[str]) -> List[str]:
    """Collect the distinct space-separated tokens found in ``lines``.

    Each line is stripped of surrounding whitespace and split on single
    spaces, so an empty line contributes the empty string as a token.

    Args:
      lines:
        Word-segmented transcripts, one per element.

    Returns:
      The unique tokens as a list. The list is built from a set, so its
      order is unspecified.
    """
    vocabulary = set()
    for line in tqdm(lines, desc="Getting words"):
        for token in line.strip().split(" "):
            vocabulary.add(token)
    return list(vocabulary)
 if __name__ == "__main__":
    # Script entry point: read the transcripts, word-segment them, and write
    # transcript_words.txt, words_no_ids.txt and words.txt to the output dir.
    parser = get_parser()
    args = parser.parse_args()

    input_file = Path(args.input_file)
    output_dir = Path(args.output_dir)

    # Validate with explicit exceptions: `assert` statements are stripped
    # when Python runs with -O, which would silently skip these checks.
    if not input_file.is_file():
        raise FileNotFoundError(f"{input_file} does not exist")
    if not output_dir.is_dir():
        raise NotADirectoryError(f"{output_dir} does not exist")

    lines = input_file.read_text(encoding="utf-8").strip().split("\n")

    text_words_segments = get_word_segments(lines)
    with open(output_dir / "transcript_words.txt", "w", encoding="utf-8") as f:
        f.writelines(text_words_segments)

    # get_words() returns a list built from a set, so its order is arbitrary
    # and slicing off the first element would drop a random word. The only
    # entry that must not reach the symbol table is the empty string (produced
    # by empty lines), so filter it out explicitly and sort once.
    words = sorted(word for word in get_words(text_words_segments) if word)

    with open(output_dir / "words_no_ids.txt", "w", encoding="utf-8") as f:
        f.writelines([word + "\n" for word in words])

    # Special symbols surround the vocabulary; ids are the list positions.
    # The sentence-end symbol is "</s>" ("<\s>" is an invalid escape sequence
    # and would write a literal backslash into words.txt).
    words = (
        ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>"]
        + words
        + ["#0", "<s>", "</s>"]
    )

    with open(output_dir / "words.txt", "w", encoding="utf-8") as f:
        f.writelines([f"{word} {i}\n" for i, word in enumerate(words)])
--- a/egs/commonvoice/ASR/prepare.sh
+++ b/egs/commonvoice/ASR/prepare.sh
@ -172,83 +172,117 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
 fi
 if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
-  log "Stage 9: Prepare BPE based lang"
+  if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then
-
+    log "Stage 9: Prepare Char based lang"
-  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/${lang}/lang_char/
    lang_dir=data/${lang}/lang_bpe_${vocab_size}
    mkdir -p $lang_dir
    if [ ! -f $lang_dir/transcript_words.txt ]; then
-      log "Generate data for BPE training"
+        log "Generate data for lang preparation"
-      file=$(
+        file=$(
-        find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
+          find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
-      )
+        )
-      gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt
+        gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/text
-      # Ensure space only appears once
+        # Ensure space only appears once
-      sed -i 's/\t/ /g' $lang_dir/transcript_words.txt
+        sed -i 's/\t/ /g' $lang_dir/text
-      sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt
+        sed -i 's/[ ][ ]*/ /g' $lang_dir/text
    fi
-    if [ ! -f $lang_dir/words.txt ]; then
+        if [ $lang == "yue" ]; then
-      cat $lang_dir/transcript_words.txt | sed 's/ /\n/g' \
+          # Get words.txt and words_no_ids.txt
-        | sort -u | sed '/^$/d' > $lang_dir/words.txt
+          ./local/word_segment_yue.py \
-      (echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) |
+            --input-file $lang_dir/text \
-        cat - $lang_dir/words.txt | sort | uniq | awk '
+            --output-dir $lang_dir
-        BEGIN {
+
-          print "<eps> 0";
+          mv $lang_dir/text $lang_dir/_text
-        }
+          cp $lang_dir/transcript_words.txt $lang_dir/text
-        {
+
-          if ($1 == "<s>") {
+          if [ ! -f $lang_dir/tokens.txt ]; then
-            print "<s> is in the vocabulary!" | "cat 1>&2"
+            ./local/prepare_char.py --lang-dir $lang_dir
-            exit 1;
+          fi
        else
          log "word_segment_${lang}.py not implemented yet"
          exit 1
        fi
      fi
  else
    log "Stage 9: Prepare BPE based lang"
    for vocab_size in ${vocab_sizes[@]}; do
      lang_dir=data/${lang}/lang_bpe_${vocab_size}
      mkdir -p $lang_dir
      if [ ! -f $lang_dir/transcript_words.txt ]; then
        log "Generate data for BPE training"
        file=$(
          find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
        )
        gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt
        # Ensure space only appears once
        sed -i 's/\t/ /g' $lang_dir/transcript_words.txt
        sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt
      fi
      if [ ! -f $lang_dir/words.txt ]; then
        cat $lang_dir/transcript_words.txt | sed 's/ /\n/g' \
          | sort -u | sed '/^$/d' > $lang_dir/words.txt
        (echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) |
          cat - $lang_dir/words.txt | sort | uniq | awk '
          BEGIN {
            print "<eps> 0";
          }
-          if ($1 == "</s>") {
+          {
-            print "</s> is in the vocabulary!" | "cat 1>&2"
+            if ($1 == "<s>") {
-            exit 1;
+              print "<s> is in the vocabulary!" | "cat 1>&2"
              exit 1;
            }
            if ($1 == "</s>") {
              print "</s> is in the vocabulary!" | "cat 1>&2"
              exit 1;
            }
            printf("%s %d\n", $1, NR);
          }
-          printf("%s %d\n", $1, NR);
+          END {
-        }
+            printf("#0 %d\n", NR+1);
-        END {
+            printf("<s> %d\n", NR+2);
-          printf("#0 %d\n", NR+1);
+            printf("</s> %d\n", NR+3);
-          printf("<s> %d\n", NR+2);
+          }' > $lang_dir/words || exit 1;
-          printf("</s> %d\n", NR+3);
+        mv $lang_dir/words $lang_dir/words.txt
-        }' > $lang_dir/words || exit 1;
+      fi
      mv $lang_dir/words $lang_dir/words.txt
    fi
-    if [ ! -f $lang_dir/bpe.model ]; then
+      if [ ! -f $lang_dir/bpe.model ]; then
-      ./local/train_bpe_model.py \
+        ./local/train_bpe_model.py \
-        --lang-dir $lang_dir \
+          --lang-dir $lang_dir \
-        --vocab-size $vocab_size \
+          --vocab-size $vocab_size \
-        --transcript $lang_dir/transcript_words.txt
+          --transcript $lang_dir/transcript_words.txt
-    fi
+      fi
-    if [ ! -f $lang_dir/L_disambig.pt ]; then
+      if [ ! -f $lang_dir/L_disambig.pt ]; then
-      ./local/prepare_lang_bpe.py --lang-dir $lang_dir
+        ./local/prepare_lang_bpe.py --lang-dir $lang_dir
-      log "Validating $lang_dir/lexicon.txt"
+        log "Validating $lang_dir/lexicon.txt"
-      ./local/validate_bpe_lexicon.py \
+        ./local/validate_bpe_lexicon.py \
-        --lexicon $lang_dir/lexicon.txt \
+          --lexicon $lang_dir/lexicon.txt \
-        --bpe-model $lang_dir/bpe.model
+          --bpe-model $lang_dir/bpe.model
-    fi
+      fi
-    if [ ! -f $lang_dir/L.fst ]; then
+      if [ ! -f $lang_dir/L.fst ]; then
-      log "Converting L.pt to L.fst"
+        log "Converting L.pt to L.fst"
-      ./shared/convert-k2-to-openfst.py \
+        ./shared/convert-k2-to-openfst.py \
-        --olabels aux_labels \
+          --olabels aux_labels \
-        $lang_dir/L.pt \
+          $lang_dir/L.pt \
-        $lang_dir/L.fst
+          $lang_dir/L.fst
-    fi
+      fi
-    if [ ! -f $lang_dir/L_disambig.fst ]; then
+      if [ ! -f $lang_dir/L_disambig.fst ]; then
-      log "Converting L_disambig.pt to L_disambig.fst"
+        log "Converting L_disambig.pt to L_disambig.fst"
-      ./shared/convert-k2-to-openfst.py \
+        ./shared/convert-k2-to-openfst.py \
-        --olabels aux_labels \
+          --olabels aux_labels \
-        $lang_dir/L_disambig.pt \
+          $lang_dir/L_disambig.pt \
-        $lang_dir/L_disambig.fst
+          $lang_dir/L_disambig.fst
-    fi
+      fi
-  done
+    done
  fi
 fi
 if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
@ -256,27 +290,31 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
  # We assume you have install kaldilm, if not, please install
  # it using: pip install kaldilm
-  for vocab_size in ${vocab_sizes[@]}; do
+  if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then
-    lang_dir=data/${lang}/lang_bpe_${vocab_size}
+    echo "TO BE IMPLEMENTED"
-    mkdir -p $lang_dir/lm
+  else
-    #3-gram used in building HLG, 4-gram used for LM rescoring
+    for vocab_size in ${vocab_sizes[@]}; do
-    for ngram in 3 4; do
+      lang_dir=data/${lang}/lang_bpe_${vocab_size}
-      if [ ! -f $lang_dir/lm/${ngram}gram.arpa ]; then
+      mkdir -p $lang_dir/lm
-        ./shared/make_kn_lm.py \
+      #3-gram used in building HLG, 4-gram used for LM rescoring
-          -ngram-order ${ngram} \
+      for ngram in 3 4; do
-          -text $lang_dir/transcript_words.txt \
+        if [ ! -f $lang_dir/lm/${ngram}gram.arpa ]; then
-          -lm $lang_dir/lm/${ngram}gram.arpa
+          ./shared/make_kn_lm.py \
-      fi
+            -ngram-order ${ngram} \
            -text $lang_dir/transcript_words.txt \
            -lm $lang_dir/lm/${ngram}gram.arpa
        fi
-      if [ ! -f $lang_dir/lm/${ngram}gram.fst.txt ]; then
+        if [ ! -f $lang_dir/lm/${ngram}gram.fst.txt ]; then
-        python3 -m kaldilm \
+          python3 -m kaldilm \
-          --read-symbol-table="$lang_dir/words.txt" \
+            --read-symbol-table="$lang_dir/words.txt" \
-          --disambig-symbol='#0' \
+            --disambig-symbol='#0' \
-          --max-order=${ngram} \
+            --max-order=${ngram} \
-          $lang_dir/lm/${ngram}gram.arpa > $lang_dir/lm/G_${ngram}_gram.fst.txt
+            $lang_dir/lm/${ngram}gram.arpa > $lang_dir/lm/G_${ngram}_gram.fst.txt
-      fi
+        fi
      done
    done
-  done
+  fi
 fi
 if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/prepare_lang.py`