mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-19 05:54:20 +00:00
Add BPE training
This commit is contained in:
parent
8e560e9880
commit
3c54619f84
1
egs/bengaliai_speech/ASR/local/prepare_lang_bpe.py
Symbolic link
1
egs/bengaliai_speech/ASR/local/prepare_lang_bpe.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/local/prepare_lang_bpe.py
|
@ -38,8 +38,8 @@ def get_args():
|
|||||||
|
|
||||||
|
|
||||||
def normalize_text(utt: str) -> str:
|
def normalize_text(utt: str) -> str:
|
||||||
punc = "~`!#$%^&*()_+-=|';\":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》《{}"
|
opr_and_punc = "=\+\-\*\/%<>×÷" + "।,;:\?!'\.\"-\[\]\{\}\(\)–—―~"
|
||||||
return re.sub(r"[{0}]+".format(punc), "", utt).upper()
|
return re.sub(r"[{0}]+".format(opr_and_punc), "", utt).upper()
|
||||||
|
|
||||||
|
|
||||||
def preprocess_bengaliai_speech(
|
def preprocess_bengaliai_speech(
|
||||||
@ -109,7 +109,9 @@ def preprocess_bengaliai_speech(
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
formatter = (
|
||||||
|
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||||
|
)
|
||||||
|
|
||||||
logging.basicConfig(format=formatter, level=logging.INFO)
|
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||||
args = get_args()
|
args = get_args()
|
||||||
|
1
egs/bengaliai_speech/ASR/local/train_bpe_model.py
Symbolic link
1
egs/bengaliai_speech/ASR/local/train_bpe_model.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/local/train_bpe_model.py
|
1
egs/bengaliai_speech/ASR/local/validate_bpe_lexicon.py
Symbolic link
1
egs/bengaliai_speech/ASR/local/validate_bpe_lexicon.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/local/validate_bpe_lexicon.py
|
@ -135,7 +135,7 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
|||||||
--num-workers $nj \
|
--num-workers $nj \
|
||||||
--batch-duration 600 \
|
--batch-duration 600 \
|
||||||
--start 0 \
|
--start 0 \
|
||||||
--num-splits 300
|
--num-splits 233
|
||||||
touch data/fbank/.bengaliai_speech_train.done
|
touch data/fbank/.bengaliai_speech_train.done
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
Loading…
x
Reference in New Issue
Block a user