mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
added scripts for char-based lang prep
This commit is contained in:
parent
ddefabcb7a
commit
4a1d4be94a
1
egs/commonvoice/ASR/local/prepare_char.py
Symbolic link
1
egs/commonvoice/ASR/local/prepare_char.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../aishell/ASR/local/prepare_char.py
|
1
egs/commonvoice/ASR/local/prepare_lang.py
Symbolic link
1
egs/commonvoice/ASR/local/prepare_lang.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/local/prepare_lang.py
|
1
egs/commonvoice/ASR/local/prepare_lang_fst.py
Symbolic link
1
egs/commonvoice/ASR/local/prepare_lang_fst.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/local/prepare_lang_fst.py
|
126
egs/commonvoice/ASR/local/word_segment_yue.py
Executable file
126
egs/commonvoice/ASR/local/word_segment_yue.py
Executable file
@ -0,0 +1,126 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2024 Xiaomi Corp. (authors: Zengrui Jin)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
This script takes a text file "data/lang_char/text" as input; the file consists of
|
||||||
|
lines each containing a transcript, applies text norm and generates the following
|
||||||
|
files in the directory "data/lang_char":
|
||||||
|
- transcript_words.txt
|
||||||
|
- words.txt
|
||||||
|
- words_no_ids.txt
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import pycantonese
|
||||||
|
from tqdm.auto import tqdm
|
||||||
|
|
||||||
|
from icefall.utils import is_cjk
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser():
    """Construct the command-line argument parser for char lexicon preparation.

    Returns:
        An ``argparse.ArgumentParser`` exposing ``--input-file``/``-i`` and
        ``--output-dir``/``-o``, with defaults shown in ``--help`` output.
    """
    parser = argparse.ArgumentParser(
        description="Prepare char lexicon",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # (long flag, short flag, default value, help text)
    options = [
        ("--input-file", "-i", "data/yue/lang_char/text", "The input text file"),
        ("--output-dir", "-o", "data/yue/lang_char/", "The output directory"),
    ]
    for long_flag, short_flag, default, help_text in options:
        parser.add_argument(
            long_flag,
            short_flag,
            default=default,
            type=str,
            help=help_text,
        )
    return parser
|
||||||
|
|
||||||
|
|
||||||
|
def get_word_segments(lines: List[str]) -> List[str]:
    """Word-segment each transcript line, returning newline-terminated lines.

    The current pycantonese segmenter does not handle code-switching input,
    so lines that already contain spaces are split first: chunks starting
    with a non-CJK character are treated as English and passed through
    unchanged, while CJK chunks are segmented individually.

    Args:
        lines: Raw transcript lines (no trailing newline required).

    Returns:
        One segmented line per input line, each ending with ``"\n"``.

    Raises:
        Exception: Re-raised from pycantonese after logging the offending
            line/segment, so a bad transcript is easy to locate.
    """
    segmented_lines = []

    for line in tqdm(lines, desc="Segmenting lines"):
        try:
            chunks = line.strip().split(" ")
            if len(chunks) > 1:
                # Code-switching line: handle each space-separated chunk.
                pieces = []
                for chunk in chunks:
                    if chunk.strip() == "":
                        continue
                    try:
                        if is_cjk(chunk[0]):
                            # zh chunk: segment it.
                            pieces.extend(pycantonese.segment(chunk))
                        else:
                            # en chunk: keep as-is.
                            pieces.append(chunk)
                    except Exception as e:
                        logging.error(f"Failed to process segment: {chunk}")
                        raise e
                segmented_lines.append(" ".join(pieces) + "\n")
            else:
                # Pure-Cantonese line: segment it whole.
                segmented_lines.append(" ".join(pycantonese.segment(line)) + "\n")
        except Exception as e:
            logging.error(f"Failed to process line: {line}")
            raise e
    return segmented_lines
|
||||||
|
|
||||||
|
|
||||||
|
def get_words(lines: List[str]) -> List[str]:
    """Collect the unique space-separated tokens across all lines.

    Args:
        lines: Segmented transcript lines.

    Returns:
        The vocabulary as a list (order is unspecified, as it comes
        from a set).
    """
    vocab = set()
    for line in tqdm(lines, desc="Getting words"):
        vocab.update(line.strip().split(" "))
    return list(vocab)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()

    input_file = Path(args.input_file)
    output_dir = Path(args.output_dir)

    # NOTE(review): asserts are stripped under `python -O`; acceptable for a
    # local prep script, but explicit raises would be more robust.
    assert input_file.is_file(), f"{input_file} does not exist"
    assert output_dir.is_dir(), f"{output_dir} does not exist"

    lines = input_file.read_text(encoding="utf-8").strip().split("\n")

    # Segment every transcript line and save the segmented transcripts.
    text_words_segments = get_word_segments(lines)
    with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f:
        f.writelines(text_words_segments)

    # Drop empty tokens (blank segmented lines yield "" after strip/split).
    # The previous code sliced `get_words(...)[1:]`, which removed an
    # ARBITRARY element because list(set) has no defined order — a real
    # word could be silently lost.
    words = [word for word in get_words(text_words_segments) if word.strip() != ""]
    with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
        f.writelines([word + "\n" for word in sorted(words)])

    # Standard symbol-table layout: <eps> first, then special symbols, the
    # sorted vocabulary, and the disambiguation/sentence-boundary symbols.
    # Fixed: the end-of-sentence symbol was written as "<\s>" (typo); the
    # convention used downstream (e.g. words.txt consumers) is "</s>".
    words = (
        ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>"]
        + sorted(words)
        + ["#0", "<s>", "</s>"]
    )

    with open(output_dir / "words.txt", "w+", encoding="utf-8") as f:
        f.writelines([f"{word} {i}\n" for i, word in enumerate(words)])
|
@ -172,83 +172,117 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
|
if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
|
||||||
log "Stage 9: Prepare BPE based lang"
|
if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then
|
||||||
|
log "Stage 9: Prepare Char based lang"
|
||||||
for vocab_size in ${vocab_sizes[@]}; do
|
lang_dir=data/${lang}/lang_char/
|
||||||
lang_dir=data/${lang}/lang_bpe_${vocab_size}
|
|
||||||
mkdir -p $lang_dir
|
mkdir -p $lang_dir
|
||||||
|
|
||||||
if [ ! -f $lang_dir/transcript_words.txt ]; then
|
if [ ! -f $lang_dir/transcript_words.txt ]; then
|
||||||
log "Generate data for BPE training"
|
log "Generate data for lang preparation"
|
||||||
file=$(
|
file=$(
|
||||||
find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
|
find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
|
||||||
)
|
)
|
||||||
gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt
|
gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/text
|
||||||
|
|
||||||
# Ensure space only appears once
|
# Ensure space only appears once
|
||||||
sed -i 's/\t/ /g' $lang_dir/transcript_words.txt
|
sed -i 's/\t/ /g' $lang_dir/text
|
||||||
sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt
|
sed -i 's/[ ][ ]*/ /g' $lang_dir/text
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! -f $lang_dir/words.txt ]; then
|
if [ $lang == "yue" ]; then
|
||||||
cat $lang_dir/transcript_words.txt | sed 's/ /\n/g' \
|
# Get words.txt and words_no_ids.txt
|
||||||
| sort -u | sed '/^$/d' > $lang_dir/words.txt
|
./local/word_segment_yue.py \
|
||||||
(echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) |
|
--input-file $lang_dir/text \
|
||||||
cat - $lang_dir/words.txt | sort | uniq | awk '
|
--output-dir $lang_dir
|
||||||
BEGIN {
|
|
||||||
print "<eps> 0";
|
mv $lang_dir/text $lang_dir/_text
|
||||||
}
|
cp $lang_dir/transcript_words.txt $lang_dir/text
|
||||||
{
|
|
||||||
if ($1 == "<s>") {
|
if [ ! -f $lang_dir/tokens.txt ]; then
|
||||||
print "<s> is in the vocabulary!" | "cat 1>&2"
|
./local/prepare_char.py --lang-dir $lang_dir
|
||||||
exit 1;
|
fi
|
||||||
|
else
|
||||||
|
log "word_segment_${lang}.py not implemented yet"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log "Stage 9: Prepare BPE based lang"
|
||||||
|
for vocab_size in ${vocab_sizes[@]}; do
|
||||||
|
lang_dir=data/${lang}/lang_bpe_${vocab_size}
|
||||||
|
mkdir -p $lang_dir
|
||||||
|
|
||||||
|
if [ ! -f $lang_dir/transcript_words.txt ]; then
|
||||||
|
log "Generate data for BPE training"
|
||||||
|
file=$(
|
||||||
|
find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
|
||||||
|
)
|
||||||
|
gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt
|
||||||
|
|
||||||
|
# Ensure space only appears once
|
||||||
|
sed -i 's/\t/ /g' $lang_dir/transcript_words.txt
|
||||||
|
sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f $lang_dir/words.txt ]; then
|
||||||
|
cat $lang_dir/transcript_words.txt | sed 's/ /\n/g' \
|
||||||
|
| sort -u | sed '/^$/d' > $lang_dir/words.txt
|
||||||
|
(echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) |
|
||||||
|
cat - $lang_dir/words.txt | sort | uniq | awk '
|
||||||
|
BEGIN {
|
||||||
|
print "<eps> 0";
|
||||||
}
|
}
|
||||||
if ($1 == "</s>") {
|
{
|
||||||
print "</s> is in the vocabulary!" | "cat 1>&2"
|
if ($1 == "<s>") {
|
||||||
exit 1;
|
print "<s> is in the vocabulary!" | "cat 1>&2"
|
||||||
|
exit 1;
|
||||||
|
}
|
||||||
|
if ($1 == "</s>") {
|
||||||
|
print "</s> is in the vocabulary!" | "cat 1>&2"
|
||||||
|
exit 1;
|
||||||
|
}
|
||||||
|
printf("%s %d\n", $1, NR);
|
||||||
}
|
}
|
||||||
printf("%s %d\n", $1, NR);
|
END {
|
||||||
}
|
printf("#0 %d\n", NR+1);
|
||||||
END {
|
printf("<s> %d\n", NR+2);
|
||||||
printf("#0 %d\n", NR+1);
|
printf("</s> %d\n", NR+3);
|
||||||
printf("<s> %d\n", NR+2);
|
}' > $lang_dir/words || exit 1;
|
||||||
printf("</s> %d\n", NR+3);
|
mv $lang_dir/words $lang_dir/words.txt
|
||||||
}' > $lang_dir/words || exit 1;
|
fi
|
||||||
mv $lang_dir/words $lang_dir/words.txt
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! -f $lang_dir/bpe.model ]; then
|
if [ ! -f $lang_dir/bpe.model ]; then
|
||||||
./local/train_bpe_model.py \
|
./local/train_bpe_model.py \
|
||||||
--lang-dir $lang_dir \
|
--lang-dir $lang_dir \
|
||||||
--vocab-size $vocab_size \
|
--vocab-size $vocab_size \
|
||||||
--transcript $lang_dir/transcript_words.txt
|
--transcript $lang_dir/transcript_words.txt
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
||||||
./local/prepare_lang_bpe.py --lang-dir $lang_dir
|
./local/prepare_lang_bpe.py --lang-dir $lang_dir
|
||||||
|
|
||||||
log "Validating $lang_dir/lexicon.txt"
|
log "Validating $lang_dir/lexicon.txt"
|
||||||
./local/validate_bpe_lexicon.py \
|
./local/validate_bpe_lexicon.py \
|
||||||
--lexicon $lang_dir/lexicon.txt \
|
--lexicon $lang_dir/lexicon.txt \
|
||||||
--bpe-model $lang_dir/bpe.model
|
--bpe-model $lang_dir/bpe.model
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -f $lang_dir/L.fst ]; then
|
if [ ! -f $lang_dir/L.fst ]; then
|
||||||
log "Converting L.pt to L.fst"
|
log "Converting L.pt to L.fst"
|
||||||
./shared/convert-k2-to-openfst.py \
|
./shared/convert-k2-to-openfst.py \
|
||||||
--olabels aux_labels \
|
--olabels aux_labels \
|
||||||
$lang_dir/L.pt \
|
$lang_dir/L.pt \
|
||||||
$lang_dir/L.fst
|
$lang_dir/L.fst
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -f $lang_dir/L_disambig.fst ]; then
|
if [ ! -f $lang_dir/L_disambig.fst ]; then
|
||||||
log "Converting L_disambig.pt to L_disambig.fst"
|
log "Converting L_disambig.pt to L_disambig.fst"
|
||||||
./shared/convert-k2-to-openfst.py \
|
./shared/convert-k2-to-openfst.py \
|
||||||
--olabels aux_labels \
|
--olabels aux_labels \
|
||||||
$lang_dir/L_disambig.pt \
|
$lang_dir/L_disambig.pt \
|
||||||
$lang_dir/L_disambig.fst
|
$lang_dir/L_disambig.fst
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
|
if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
|
||||||
@ -256,27 +290,31 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
|
|||||||
# We assume you have install kaldilm, if not, please install
|
# We assume you have install kaldilm, if not, please install
|
||||||
# it using: pip install kaldilm
|
# it using: pip install kaldilm
|
||||||
|
|
||||||
for vocab_size in ${vocab_sizes[@]}; do
|
if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then
|
||||||
lang_dir=data/${lang}/lang_bpe_${vocab_size}
|
echo "TO BE IMPLEMENTED"
|
||||||
mkdir -p $lang_dir/lm
|
else
|
||||||
#3-gram used in building HLG, 4-gram used for LM rescoring
|
for vocab_size in ${vocab_sizes[@]}; do
|
||||||
for ngram in 3 4; do
|
lang_dir=data/${lang}/lang_bpe_${vocab_size}
|
||||||
if [ ! -f $lang_dir/lm/${ngram}gram.arpa ]; then
|
mkdir -p $lang_dir/lm
|
||||||
./shared/make_kn_lm.py \
|
#3-gram used in building HLG, 4-gram used for LM rescoring
|
||||||
-ngram-order ${ngram} \
|
for ngram in 3 4; do
|
||||||
-text $lang_dir/transcript_words.txt \
|
if [ ! -f $lang_dir/lm/${ngram}gram.arpa ]; then
|
||||||
-lm $lang_dir/lm/${ngram}gram.arpa
|
./shared/make_kn_lm.py \
|
||||||
fi
|
-ngram-order ${ngram} \
|
||||||
|
-text $lang_dir/transcript_words.txt \
|
||||||
|
-lm $lang_dir/lm/${ngram}gram.arpa
|
||||||
|
fi
|
||||||
|
|
||||||
if [ ! -f $lang_dir/lm/${ngram}gram.fst.txt ]; then
|
if [ ! -f $lang_dir/lm/${ngram}gram.fst.txt ]; then
|
||||||
python3 -m kaldilm \
|
python3 -m kaldilm \
|
||||||
--read-symbol-table="$lang_dir/words.txt" \
|
--read-symbol-table="$lang_dir/words.txt" \
|
||||||
--disambig-symbol='#0' \
|
--disambig-symbol='#0' \
|
||||||
--max-order=${ngram} \
|
--max-order=${ngram} \
|
||||||
$lang_dir/lm/${ngram}gram.arpa > $lang_dir/lm/G_${ngram}_gram.fst.txt
|
$lang_dir/lm/${ngram}gram.arpa > $lang_dir/lm/G_${ngram}_gram.fst.txt
|
||||||
fi
|
fi
|
||||||
|
done
|
||||||
done
|
done
|
||||||
done
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
|
if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
|
||||||
|
Loading…
x
Reference in New Issue
Block a user