diff --git a/egs/commonvoice/ASR/local/prepare_char.py b/egs/commonvoice/ASR/local/prepare_char.py
new file mode 120000
index 000000000..42743b544
--- /dev/null
+++ b/egs/commonvoice/ASR/local/prepare_char.py
@@ -0,0 +1 @@
+../../../aishell/ASR/local/prepare_char.py
\ No newline at end of file
diff --git a/egs/commonvoice/ASR/local/prepare_lang.py b/egs/commonvoice/ASR/local/prepare_lang.py
new file mode 120000
index 000000000..747f2ab39
--- /dev/null
+++ b/egs/commonvoice/ASR/local/prepare_lang.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang.py
\ No newline at end of file
diff --git a/egs/commonvoice/ASR/local/prepare_lang_fst.py b/egs/commonvoice/ASR/local/prepare_lang_fst.py
new file mode 120000
index 000000000..c5787c534
--- /dev/null
+++ b/egs/commonvoice/ASR/local/prepare_lang_fst.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang_fst.py
\ No newline at end of file
diff --git a/egs/commonvoice/ASR/local/word_segment_yue.py b/egs/commonvoice/ASR/local/word_segment_yue.py
new file mode 100755
index 000000000..e5f645d80
--- /dev/null
+++ b/egs/commonvoice/ASR/local/word_segment_yue.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+# Copyright    2024  Xiaomi Corp.        (authors: Zengrui Jin)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
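+#
+# Usage (an illustrative invocation; the paths simply mirror the argparse
+# defaults defined below):
+#
+#   ./local/word_segment_yue.py \
+#     --input-file data/yue/lang_char/text \
+#     --output-dir data/yue/lang_char/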
+ +""" +This script takes a text file "data/lang_char/text" as input, the file consist of +lines each containing a transcript, applies text norm and generates the following +files in the directory "data/lang_char": + - transcript_words.txt + - words.txt + - words_no_ids.txt +""" + +import argparse +import logging +from pathlib import Path +from typing import List + +import pycantonese +from tqdm.auto import tqdm + +from icefall.utils import is_cjk + + +def get_parser(): + parser = argparse.ArgumentParser( + description="Prepare char lexicon", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--input-file", + "-i", + default="data/yue/lang_char/text", + type=str, + help="The input text file", + ) + parser.add_argument( + "--output-dir", + "-o", + default="data/yue/lang_char/", + type=str, + help="The output directory", + ) + return parser + + +def get_word_segments(lines: List[str]) -> List[str]: + # the current pycantonese segmenter does not handle the case when the input + # is code switching, so we need to handle it separately + + new_lines = [] + + for line in tqdm(lines, desc="Segmenting lines"): + try: + # code switching + if len(line.strip().split(" ")) > 1: + segments = [] + for segment in line.strip().split(" "): + if segment.strip() == "": + continue + try: + if not is_cjk(segment[0]): # en segment + segments.append(segment) + else: # zh segment + segments.extend(pycantonese.segment(segment)) + except Exception as e: + logging.error(f"Failed to process segment: {segment}") + raise e + new_lines.append(" ".join(segments) + "\n") + # not code switching + else: + new_lines.append(" ".join(pycantonese.segment(line)) + "\n") + except Exception as e: + logging.error(f"Failed to process line: {line}") + raise e + return new_lines + + +def get_words(lines: List[str]) -> List[str]: + words = set() + for line in tqdm(lines, desc="Getting words"): + words.update(line.strip().split(" ")) + return list(words) + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + + input_file = Path(args.input_file) + output_dir = Path(args.output_dir) + + assert input_file.is_file(), f"{input_file} does not exist" + assert output_dir.is_dir(), f"{output_dir} does not exist" + + lines = input_file.read_text(encoding="utf-8").strip().split("\n") + + text_words_segments = get_word_segments(lines) + with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f: + f.writelines(text_words_segments) + + words = get_words(text_words_segments)[1:] # remove "\n" from words + with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f: + f.writelines([word + "\n" for word in sorted(words)]) + + words = ( + ["", "!SIL", "", ""] + + sorted(words) + + ["#0", "", "<\s>"] + ) + + with open(output_dir / "words.txt", "w+", encoding="utf-8") as f: + f.writelines([f"{word} {i}\n" for i, word in enumerate(words)]) diff --git a/egs/commonvoice/ASR/prepare.sh b/egs/commonvoice/ASR/prepare.sh index edac0e8e6..dcd09c90b 100755 --- a/egs/commonvoice/ASR/prepare.sh +++ b/egs/commonvoice/ASR/prepare.sh @@ -172,83 +172,117 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then fi if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then - log "Stage 9: Prepare BPE based lang" - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/${lang}/lang_bpe_${vocab_size} + if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then + log "Stage 9: Prepare Char based lang" + lang_dir=data/${lang}/lang_char/ mkdir -p 
 
     if [ ! -f $lang_dir/transcript_words.txt ]; then
-      log "Generate data for BPE training"
-      file=$(
-        find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
-      )
-      gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt
+      log "Generate data for lang preparation"
+      file=$(
+        find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
+      )
+      gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/text
 
-      # Ensure space only appears once
-      sed -i 's/\t/ /g' $lang_dir/transcript_words.txt
-      sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt
-    fi
+      # Ensure space only appears once
+      sed -i 's/\t/ /g' $lang_dir/text
+      sed -i 's/[ ][ ]*/ /g' $lang_dir/text
 
-    if [ ! -f $lang_dir/words.txt ]; then
-      cat $lang_dir/transcript_words.txt | sed 's/ /\n/g' \
-        | sort -u | sed '/^$/d' > $lang_dir/words.txt
-      (echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) |
-        cat - $lang_dir/words.txt | sort | uniq | awk '
-        BEGIN {
-          print "<eps> 0";
-        }
-        {
-          if ($1 == "<s>") {
-            print "<s> is in the vocabulary!" | "cat 1>&2"
-            exit 1;
+      if [ $lang == "yue" ]; then
+        # Get words.txt and words_no_ids.txt
+        ./local/word_segment_yue.py \
+          --input-file $lang_dir/text \
+          --output-dir $lang_dir
+
+        mv $lang_dir/text $lang_dir/_text
+        cp $lang_dir/transcript_words.txt $lang_dir/text
+
+        if [ ! -f $lang_dir/tokens.txt ]; then
+          ./local/prepare_char.py --lang-dir $lang_dir
+        fi
+      else
+        log "word_segment_${lang}.py not implemented yet"
+        exit 1
+      fi
+    fi
+  else
+    log "Stage 9: Prepare BPE based lang"
+    for vocab_size in ${vocab_sizes[@]}; do
+      lang_dir=data/${lang}/lang_bpe_${vocab_size}
+      mkdir -p $lang_dir
+
+      if [ ! -f $lang_dir/transcript_words.txt ]; then
+        log "Generate data for BPE training"
+        file=$(
+          find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
+        )
+        gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt
+
+        # Ensure space only appears once
+        sed -i 's/\t/ /g' $lang_dir/transcript_words.txt
+        sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt
+      fi
+
+      if [ ! -f $lang_dir/words.txt ]; then
+        cat $lang_dir/transcript_words.txt | sed 's/ /\n/g' \
+          | sort -u | sed '/^$/d' > $lang_dir/words.txt
+        (echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) |
+          cat - $lang_dir/words.txt | sort | uniq | awk '
+          BEGIN {
+            print "<eps> 0";
           }
-          if ($1 == "</s>") {
-            print "</s> is in the vocabulary!" | "cat 1>&2"
-            exit 1;
+          {
+            if ($1 == "<s>") {
+              print "<s> is in the vocabulary!" | "cat 1>&2"
+              exit 1;
+            }
+            if ($1 == "</s>") {
+              print "</s> is in the vocabulary!" | "cat 1>&2"
+              exit 1;
+            }
+            printf("%s %d\n", $1, NR);
           }
-          printf("%s %d\n", $1, NR);
-        }
-        END {
-          printf("#0 %d\n", NR+1);
-          printf("<s> %d\n", NR+2);
-          printf("</s> %d\n", NR+3);
-        }' > $lang_dir/words || exit 1;
-      mv $lang_dir/words $lang_dir/words.txt
-    fi
+          END {
+            printf("#0 %d\n", NR+1);
+            printf("<s> %d\n", NR+2);
+            printf("</s> %d\n", NR+3);
+          }' > $lang_dir/words || exit 1;
+        mv $lang_dir/words $lang_dir/words.txt
+      fi
 
-    if [ ! -f $lang_dir/bpe.model ]; then
-      ./local/train_bpe_model.py \
-        --lang-dir $lang_dir \
-        --vocab-size $vocab_size \
-        --transcript $lang_dir/transcript_words.txt
-    fi
+      if [ ! -f $lang_dir/bpe.model ]; then
+        ./local/train_bpe_model.py \
+          --lang-dir $lang_dir \
+          --vocab-size $vocab_size \
+          --transcript $lang_dir/transcript_words.txt
+      fi
 
-    if [ ! -f $lang_dir/L_disambig.pt ]; then
-      ./local/prepare_lang_bpe.py --lang-dir $lang_dir
+      if [ ! -f $lang_dir/L_disambig.pt ]; then
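+        # prepare_lang_bpe.py is expected to write lexicon.txt together with
+        # the L.pt/L_disambig.pt lexicon FSTs into $lang_dir; the steps below
+        # convert those .pt files to OpenFst format.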
+        ./local/prepare_lang_bpe.py --lang-dir $lang_dir
 
-      log "Validating $lang_dir/lexicon.txt"
-      ./local/validate_bpe_lexicon.py \
-        --lexicon $lang_dir/lexicon.txt \
-        --bpe-model $lang_dir/bpe.model
-    fi
+        log "Validating $lang_dir/lexicon.txt"
+        ./local/validate_bpe_lexicon.py \
+          --lexicon $lang_dir/lexicon.txt \
+          --bpe-model $lang_dir/bpe.model
+      fi
 
-    if [ ! -f $lang_dir/L.fst ]; then
-      log "Converting L.pt to L.fst"
-      ./shared/convert-k2-to-openfst.py \
-        --olabels aux_labels \
-        $lang_dir/L.pt \
-        $lang_dir/L.fst
-    fi
+      if [ ! -f $lang_dir/L.fst ]; then
+        log "Converting L.pt to L.fst"
+        ./shared/convert-k2-to-openfst.py \
+          --olabels aux_labels \
+          $lang_dir/L.pt \
+          $lang_dir/L.fst
+      fi
 
-    if [ ! -f $lang_dir/L_disambig.fst ]; then
-      log "Converting L_disambig.pt to L_disambig.fst"
-      ./shared/convert-k2-to-openfst.py \
-        --olabels aux_labels \
-        $lang_dir/L_disambig.pt \
-        $lang_dir/L_disambig.fst
-    fi
-  done
+      if [ ! -f $lang_dir/L_disambig.fst ]; then
+        log "Converting L_disambig.pt to L_disambig.fst"
+        ./shared/convert-k2-to-openfst.py \
+          --olabels aux_labels \
+          $lang_dir/L_disambig.pt \
+          $lang_dir/L_disambig.fst
+      fi
+    done
+  fi
 fi
 
 if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
@@ -256,27 +290,31 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
   # We assume you have install kaldilm, if not, please install
   # it using: pip install kaldilm
 
-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/${lang}/lang_bpe_${vocab_size}
-    mkdir -p $lang_dir/lm
-    #3-gram used in building HLG, 4-gram used for LM rescoring
-    for ngram in 3 4; do
-      if [ ! -f $lang_dir/lm/${ngram}gram.arpa ]; then
-        ./shared/make_kn_lm.py \
-          -ngram-order ${ngram} \
-          -text $lang_dir/transcript_words.txt \
-          -lm $lang_dir/lm/${ngram}gram.arpa
-      fi
+  if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then
+    echo "TO BE IMPLEMENTED"
+  else
+    for vocab_size in ${vocab_sizes[@]}; do
+      lang_dir=data/${lang}/lang_bpe_${vocab_size}
+      mkdir -p $lang_dir/lm
+      #3-gram used in building HLG, 4-gram used for LM rescoring
+      for ngram in 3 4; do
+        if [ ! -f $lang_dir/lm/${ngram}gram.arpa ]; then
+          ./shared/make_kn_lm.py \
+            -ngram-order ${ngram} \
+            -text $lang_dir/transcript_words.txt \
+            -lm $lang_dir/lm/${ngram}gram.arpa
+        fi
 
-      if [ ! -f $lang_dir/lm/${ngram}gram.fst.txt ]; then
-        python3 -m kaldilm \
-          --read-symbol-table="$lang_dir/words.txt" \
-          --disambig-symbol='#0' \
-          --max-order=${ngram} \
-          $lang_dir/lm/${ngram}gram.arpa > $lang_dir/lm/G_${ngram}_gram.fst.txt
-      fi
+        if [ ! -f $lang_dir/lm/${ngram}gram.fst.txt ]; then
+          python3 -m kaldilm \
+            --read-symbol-table="$lang_dir/words.txt" \
+            --disambig-symbol='#0' \
+            --max-order=${ngram} \
+            $lang_dir/lm/${ngram}gram.arpa > $lang_dir/lm/G_${ngram}_gram.fst.txt
+        fi
+      done
     done
-  done
+  fi
 fi
 
 if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then