Add BPE training

This commit is contained in:
Yifan Yang 2023-07-25 12:35:17 +08:00
parent 8e560e9880
commit 3c54619f84
5 changed files with 9 additions and 4 deletions

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/prepare_lang_bpe.py

View File

@@ -38,8 +38,8 @@ def get_args():
def normalize_text(utt: str) -> str: def normalize_text(utt: str) -> str:
punc = "~`!#$%^&*()_+-=|';\":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》《{}" opr_and_punc = "=\+\-\*\/%<>×÷" + "।,;:\?!'\.\"-\[\]\{\}\(\)–—―~"
return re.sub(r"[{0}]+".format(punc), "", utt).upper() return re.sub(r"[{0}]+".format(opr_and_punc), "", utt).upper()
def preprocess_bengaliai_speech( def preprocess_bengaliai_speech(
@@ -109,7 +109,9 @@ def preprocess_bengaliai_speech(
def main(): def main():
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" formatter = (
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
)
logging.basicConfig(format=formatter, level=logging.INFO) logging.basicConfig(format=formatter, level=logging.INFO)
args = get_args() args = get_args()

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/train_bpe_model.py

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/validate_bpe_lexicon.py

View File

@@ -135,7 +135,7 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
--num-workers $nj \ --num-workers $nj \
--batch-duration 600 \ --batch-duration 600 \
--start 0 \ --start 0 \
--num-splits 300 --num-splits 233
touch data/fbank/.bengaliai_speech_train.done touch data/fbank/.bengaliai_speech_train.done
fi fi
fi fi