remove unwanted changes

This commit is contained in:
Desh Raj 2023-03-09 17:23:21 -05:00
parent 8a8e827317
commit f2d8bf632f
3 changed files with 42 additions and 19 deletions

View File

@ -2,7 +2,7 @@
lang_dir=data/lang_bpe_500 lang_dir=data/lang_bpe_500
for ngram in 2 3 5; do for ngram in 2 3 4 5; do
if [ ! -f $lang_dir/${ngram}gram.arpa ]; then if [ ! -f $lang_dir/${ngram}gram.arpa ]; then
./shared/make_kn_lm.py \ ./shared/make_kn_lm.py \
-ngram-order ${ngram} \ -ngram-order ${ngram} \

View File

@ -54,10 +54,20 @@ def get_args():
help="""Path to the bpe.model. If not None, we will remove short and help="""Path to the bpe.model. If not None, we will remove short and
long utterances before extracting features""", long utterances before extracting features""",
) )
parser.add_argument(
"--dataset",
type=str,
help="""Dataset parts to compute fbank. If None, we will use all""",
)
return parser.parse_args() return parser.parse_args()
def compute_fbank_librispeech(bpe_model: Optional[str] = None): def compute_fbank_librispeech(
bpe_model: Optional[str] = None,
dataset: Optional[str] = None,
):
src_dir = Path("data/manifests") src_dir = Path("data/manifests")
output_dir = Path("data/fbank") output_dir = Path("data/fbank")
num_jobs = min(15, os.cpu_count()) num_jobs = min(15, os.cpu_count())
@ -68,15 +78,19 @@ def compute_fbank_librispeech(bpe_model: Optional[str] = None):
sp = spm.SentencePieceProcessor() sp = spm.SentencePieceProcessor()
sp.load(bpe_model) sp.load(bpe_model)
dataset_parts = ( if dataset is None:
"dev-clean", dataset_parts = (
"dev-other", "dev-clean",
"test-clean", "dev-other",
"test-other", "test-clean",
"train-clean-100", "test-other",
"train-clean-360", "train-clean-100",
"train-other-500", "train-clean-360",
) "train-other-500",
)
else:
dataset_parts = dataset.split(" ", -1)
prefix = "librispeech" prefix = "librispeech"
suffix = "jsonl.gz" suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
@ -131,4 +145,4 @@ if __name__ == "__main__":
logging.basicConfig(format=formatter, level=logging.INFO) logging.basicConfig(format=formatter, level=logging.INFO)
args = get_args() args = get_args()
logging.info(vars(args)) logging.info(vars(args))
compute_fbank_librispeech(bpe_model=args.bpe_model) compute_fbank_librispeech(bpe_model=args.bpe_model, dataset=args.dataset)

View File

@ -123,10 +123,12 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
touch data/fbank/.librispeech.done touch data/fbank/.librispeech.done
fi fi
cat <(gunzip -c data/fbank/librispeech_cuts_train-clean-100.jsonl.gz) \ if [ ! -f data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz ]; then
<(gunzip -c data/fbank/librispeech_cuts_train-clean-360.jsonl.gz) \ cat <(gunzip -c data/fbank/librispeech_cuts_train-clean-100.jsonl.gz) \
<(gunzip -c data/fbank/librispeech_cuts_train-other-500.jsonl.gz) | \ <(gunzip -c data/fbank/librispeech_cuts_train-clean-360.jsonl.gz) \
shuf | gzip -c > data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz <(gunzip -c data/fbank/librispeech_cuts_train-other-500.jsonl.gz) | \
shuf | gzip -c > data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz
fi
if [ ! -e data/fbank/.librispeech-validated.done ]; then if [ ! -e data/fbank/.librispeech-validated.done ]; then
log "Validating data/fbank for LibriSpeech" log "Validating data/fbank for LibriSpeech"
@ -244,7 +246,7 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
fi fi
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Stage 7: Prepare bigram P" log "Stage 7: Prepare bigram token-level P for MMI training"
for vocab_size in ${vocab_sizes[@]}; do for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/lang_bpe_${vocab_size} lang_dir=data/lang_bpe_${vocab_size}
@ -302,13 +304,20 @@ fi
if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
log "Stage 9: Compile HLG" log "Stage 9: Compile HLG"
./local/compile_hlg.py --lang-dir data/lang_phone ./local/compile_hlg.py --lang-dir data/lang_phone
./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone
# Note: If ./local/compile_hlg.py throws OOM,
# please switch to the following command
#
# ./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone
for vocab_size in ${vocab_sizes[@]}; do for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/lang_bpe_${vocab_size} lang_dir=data/lang_bpe_${vocab_size}
./local/compile_hlg.py --lang-dir $lang_dir ./local/compile_hlg.py --lang-dir $lang_dir
./local/compile_hlg_using_openfst.py --lang-dir $lang_dir # Note: If ./local/compile_hlg.py throws OOM,
# please switch to the following command
#
# ./local/compile_hlg_using_openfst.py --lang-dir $lang_dir
done done
fi fi