added scripts for char-based lang prep

This commit is contained in:
jinzr 2024-03-12 12:12:35 +08:00
parent ddefabcb7a
commit 4a1d4be94a
5 changed files with 251 additions and 84 deletions

View File

@ -0,0 +1 @@
../../../aishell/ASR/local/prepare_char.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/local/prepare_lang.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/local/prepare_lang_fst.py

View File

@ -0,0 +1,126 @@
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Zengrui Jin)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script takes a text file (by default "data/yue/lang_char/text") as input. The
file consists of lines, each containing a transcript. The script applies word
segmentation and generates the following files in the output directory (by default
"data/yue/lang_char"):
- transcript_words.txt
- words.txt
- words_no_ids.txt
"""
import argparse
import logging
from pathlib import Path
from typing import List
import pycantonese
from tqdm.auto import tqdm
from icefall.utils import is_cjk
def get_parser():
    """Build the command-line parser for this script.

    Returns:
      An ``argparse.ArgumentParser`` that accepts ``--input-file``/``-i`` and
      ``--output-dir``/``-o``; both are strings with defaults under
      ``data/yue/lang_char``.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Prepare char lexicon",
    )

    # (long flag, short flag, default value, help text)
    option_table = (
        ("--input-file", "-i", "data/yue/lang_char/text", "The input text file"),
        ("--output-dir", "-o", "data/yue/lang_char/", "The output directory"),
    )
    for long_flag, short_flag, default_value, help_text in option_table:
        arg_parser.add_argument(
            long_flag,
            short_flag,
            default=default_value,
            type=str,
            help=help_text,
        )

    return arg_parser
def _segment_code_switched(pieces: List[str]) -> List[str]:
    """Segment the space-separated pieces of one code-switched line.

    Pieces that are empty after stripping are skipped. A piece whose first
    character is not CJK (per ``is_cjk``) is kept as a single token; any other
    piece is passed to ``pycantonese.segment`` and its output is appended.
    """
    tokens = []
    for segment in pieces:
        if segment.strip() == "":
            continue
        try:
            if is_cjk(segment[0]):  # zh segment
                tokens.extend(pycantonese.segment(segment))
            else:  # en segment
                tokens.append(segment)
        except Exception:
            logging.error(f"Failed to process segment: {segment}")
            raise
    return tokens


def get_word_segments(lines: List[str]) -> List[str]:
    """Split every transcript line into space-separated words.

    The current pycantonese segmenter does not handle code-switched input, so
    a line containing spaces is split on single spaces first and each piece is
    handled on its own; a line without spaces goes to the segmenter directly.

    Args:
      lines:
        Transcript lines, one utterance per entry.

    Returns:
      One string per input line: the tokens joined by a single space and
      terminated by a newline.

    Raises:
      Exception: any error raised while processing a line is logged (with the
        offending segment and/or line) and then re-raised.
    """
    segmented_lines = []
    for line in tqdm(lines, desc="Segmenting lines"):
        try:
            pieces = line.strip().split(" ")
            if len(pieces) > 1:
                # code switching
                tokens = _segment_code_switched(pieces)
            else:
                # not code switching
                tokens = pycantonese.segment(line)
            segmented_lines.append(" ".join(tokens) + "\n")
        except Exception:
            logging.error(f"Failed to process line: {line}")
            raise
    return segmented_lines
def get_words(lines: List[str]) -> List[str]:
    """Collect the distinct space-separated tokens found in ``lines``.

    Args:
      lines:
        Segmented transcript lines; each is stripped and split on single
        spaces.

    Returns:
      The unique tokens as a list. The list is built from a set, so its order
      is unspecified, and an empty line contributes the empty string.
    """
    vocabulary = set()
    for text in tqdm(lines, desc="Getting words"):
        for token in text.strip().split(" "):
            vocabulary.add(token)
    return list(vocabulary)
if __name__ == "__main__":
    # Script entry point: segment the transcripts, then write
    # transcript_words.txt, words_no_ids.txt and words.txt to the output dir.
    parser = get_parser()
    args = parser.parse_args()

    input_file = Path(args.input_file)
    output_dir = Path(args.output_dir)

    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if not input_file.is_file():
        raise FileNotFoundError(f"{input_file} does not exist")
    if not output_dir.is_dir():
        raise NotADirectoryError(f"{output_dir} does not exist")

    lines = input_file.read_text(encoding="utf-8").strip().split("\n")

    text_words_segments = get_word_segments(lines)
    with open(output_dir / "transcript_words.txt", "w", encoding="utf-8") as f:
        f.writelines(text_words_segments)

    # get_words() returns the contents of a set in arbitrary order, so slicing
    # off the first element would drop a random word. The only junk token it
    # can contain is the empty string (from blank lines); filter that instead.
    words = sorted(
        word for word in get_words(text_words_segments) if word.strip()
    )
    with open(output_dir / "words_no_ids.txt", "w", encoding="utf-8") as f:
        f.writelines([word + "\n" for word in words])

    # Symbol table layout: special symbols first, then the sorted vocabulary,
    # then "#0", "<s>" and "</s>"; ids are the positions in this list.
    # The end-of-sentence symbol is "</s>" (the original "<\s>" was a typo and
    # an invalid escape sequence).
    words = (
        ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>"]
        + words
        + ["#0", "<s>", "</s>"]
    )

    with open(output_dir / "words.txt", "w", encoding="utf-8") as f:
        f.writelines([f"{word} {i}\n" for i, word in enumerate(words)])

View File

@ -172,83 +172,117 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
fi fi
if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
log "Stage 9: Prepare BPE based lang" if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then
log "Stage 9: Prepare Char based lang"
for vocab_size in ${vocab_sizes[@]}; do lang_dir=data/${lang}/lang_char/
lang_dir=data/${lang}/lang_bpe_${vocab_size}
mkdir -p $lang_dir mkdir -p $lang_dir
if [ ! -f $lang_dir/transcript_words.txt ]; then if [ ! -f $lang_dir/transcript_words.txt ]; then
log "Generate data for BPE training" log "Generate data for lang preparation"
file=$( file=$(
find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz" find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
) )
gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/text
# Ensure space only appears once # Ensure space only appears once
sed -i 's/\t/ /g' $lang_dir/transcript_words.txt sed -i 's/\t/ /g' $lang_dir/text
sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt sed -i 's/[ ][ ]*/ /g' $lang_dir/text
fi
if [ ! -f $lang_dir/words.txt ]; then if [ $lang == "yue" ]; then
cat $lang_dir/transcript_words.txt | sed 's/ /\n/g' \ # Get words.txt and words_no_ids.txt
| sort -u | sed '/^$/d' > $lang_dir/words.txt ./local/word_segment_yue.py \
(echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) | --input-file $lang_dir/text \
cat - $lang_dir/words.txt | sort | uniq | awk ' --output-dir $lang_dir
BEGIN {
print "<eps> 0"; mv $lang_dir/text $lang_dir/_text
} cp $lang_dir/transcript_words.txt $lang_dir/text
{
if ($1 == "<s>") { if [ ! -f $lang_dir/tokens.txt ]; then
print "<s> is in the vocabulary!" | "cat 1>&2" ./local/prepare_char.py --lang-dir $lang_dir
exit 1; fi
else
log "word_segment_${lang}.py not implemented yet"
exit 1
fi
fi
else
log "Stage 9: Prepare BPE based lang"
for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/${lang}/lang_bpe_${vocab_size}
mkdir -p $lang_dir
if [ ! -f $lang_dir/transcript_words.txt ]; then
log "Generate data for BPE training"
file=$(
find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
)
gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt
# Ensure space only appears once
sed -i 's/\t/ /g' $lang_dir/transcript_words.txt
sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt
fi
if [ ! -f $lang_dir/words.txt ]; then
cat $lang_dir/transcript_words.txt | sed 's/ /\n/g' \
| sort -u | sed '/^$/d' > $lang_dir/words.txt
(echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) |
cat - $lang_dir/words.txt | sort | uniq | awk '
BEGIN {
print "<eps> 0";
} }
if ($1 == "</s>") { {
print "</s> is in the vocabulary!" | "cat 1>&2" if ($1 == "<s>") {
exit 1; print "<s> is in the vocabulary!" | "cat 1>&2"
exit 1;
}
if ($1 == "</s>") {
print "</s> is in the vocabulary!" | "cat 1>&2"
exit 1;
}
printf("%s %d\n", $1, NR);
} }
printf("%s %d\n", $1, NR); END {
} printf("#0 %d\n", NR+1);
END { printf("<s> %d\n", NR+2);
printf("#0 %d\n", NR+1); printf("</s> %d\n", NR+3);
printf("<s> %d\n", NR+2); }' > $lang_dir/words || exit 1;
printf("</s> %d\n", NR+3); mv $lang_dir/words $lang_dir/words.txt
}' > $lang_dir/words || exit 1; fi
mv $lang_dir/words $lang_dir/words.txt
fi
if [ ! -f $lang_dir/bpe.model ]; then if [ ! -f $lang_dir/bpe.model ]; then
./local/train_bpe_model.py \ ./local/train_bpe_model.py \
--lang-dir $lang_dir \ --lang-dir $lang_dir \
--vocab-size $vocab_size \ --vocab-size $vocab_size \
--transcript $lang_dir/transcript_words.txt --transcript $lang_dir/transcript_words.txt
fi fi
if [ ! -f $lang_dir/L_disambig.pt ]; then if [ ! -f $lang_dir/L_disambig.pt ]; then
./local/prepare_lang_bpe.py --lang-dir $lang_dir ./local/prepare_lang_bpe.py --lang-dir $lang_dir
log "Validating $lang_dir/lexicon.txt" log "Validating $lang_dir/lexicon.txt"
./local/validate_bpe_lexicon.py \ ./local/validate_bpe_lexicon.py \
--lexicon $lang_dir/lexicon.txt \ --lexicon $lang_dir/lexicon.txt \
--bpe-model $lang_dir/bpe.model --bpe-model $lang_dir/bpe.model
fi fi
if [ ! -f $lang_dir/L.fst ]; then if [ ! -f $lang_dir/L.fst ]; then
log "Converting L.pt to L.fst" log "Converting L.pt to L.fst"
./shared/convert-k2-to-openfst.py \ ./shared/convert-k2-to-openfst.py \
--olabels aux_labels \ --olabels aux_labels \
$lang_dir/L.pt \ $lang_dir/L.pt \
$lang_dir/L.fst $lang_dir/L.fst
fi fi
if [ ! -f $lang_dir/L_disambig.fst ]; then if [ ! -f $lang_dir/L_disambig.fst ]; then
log "Converting L_disambig.pt to L_disambig.fst" log "Converting L_disambig.pt to L_disambig.fst"
./shared/convert-k2-to-openfst.py \ ./shared/convert-k2-to-openfst.py \
--olabels aux_labels \ --olabels aux_labels \
$lang_dir/L_disambig.pt \ $lang_dir/L_disambig.pt \
$lang_dir/L_disambig.fst $lang_dir/L_disambig.fst
fi fi
done done
fi
fi fi
if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
@ -256,27 +290,31 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
# We assume you have install kaldilm, if not, please install # We assume you have install kaldilm, if not, please install
# it using: pip install kaldilm # it using: pip install kaldilm
for vocab_size in ${vocab_sizes[@]}; do if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then
lang_dir=data/${lang}/lang_bpe_${vocab_size} echo "TO BE IMPLEMENTED"
mkdir -p $lang_dir/lm else
#3-gram used in building HLG, 4-gram used for LM rescoring for vocab_size in ${vocab_sizes[@]}; do
for ngram in 3 4; do lang_dir=data/${lang}/lang_bpe_${vocab_size}
if [ ! -f $lang_dir/lm/${ngram}gram.arpa ]; then mkdir -p $lang_dir/lm
./shared/make_kn_lm.py \ #3-gram used in building HLG, 4-gram used for LM rescoring
-ngram-order ${ngram} \ for ngram in 3 4; do
-text $lang_dir/transcript_words.txt \ if [ ! -f $lang_dir/lm/${ngram}gram.arpa ]; then
-lm $lang_dir/lm/${ngram}gram.arpa ./shared/make_kn_lm.py \
fi -ngram-order ${ngram} \
-text $lang_dir/transcript_words.txt \
-lm $lang_dir/lm/${ngram}gram.arpa
fi
if [ ! -f $lang_dir/lm/${ngram}gram.fst.txt ]; then if [ ! -f $lang_dir/lm/${ngram}gram.fst.txt ]; then
python3 -m kaldilm \ python3 -m kaldilm \
--read-symbol-table="$lang_dir/words.txt" \ --read-symbol-table="$lang_dir/words.txt" \
--disambig-symbol='#0' \ --disambig-symbol='#0' \
--max-order=${ngram} \ --max-order=${ngram} \
$lang_dir/lm/${ngram}gram.arpa > $lang_dir/lm/G_${ngram}_gram.fst.txt $lang_dir/lm/${ngram}gram.arpa > $lang_dir/lm/G_${ngram}_gram.fst.txt
fi fi
done
done done
done fi
fi fi
if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then