# Patch summary (egs/libriheavy/ASR):
#   * local/compute_fbank_libriheavy.py: split-manifest prefix changes from
#     "librilight" to "libriheavy".
#   * local/prepare_validation_sets.py: --manifest becomes --in-manifest and a
#     new --out-manifest is added. main() gathers the book IDs read from every
#     data/manifests/<test_set>.txt, skips a per-set manifest that already
#     exists on disk, and finally writes the cuts whose book is in none of
#     those lists to --out-manifest. Book IDs are accumulated in a set because
#     the final filter tests membership once per cut of the input manifest.
#   * prepare.sh: librilight_* file names become libriheavy_*; new options
#     split_per_job and char_coverage; vocab size 1000 -> 750; subset
#     "large" -> "medium"; a new Stage 4 calls prepare_validation_sets.py and
#     BPE preparation moves to Stage 5, now passing --byte-fallback and
#     --character-coverage and using a lang dir suffixed with
#     _fallback_coverage_${char_coverage}.
# NOTE(review): prepare_validation_sets.py now calls os.path.exists; no
#   `import os` is visible in this patch -- confirm the file already has it.
# NOTE(review): Stage 4 and the BPE --transcript argument name the "medium"
#   manifests literally instead of using ${subset} -- presumably intentional;
#   confirm before running with another subset.
# NOTE(review): --out-manifest is not marked required, so omitting it hands
#   None to to_file() only after the per-set loop has run -- consider
#   required=True.
diff --git a/egs/libriheavy/ASR/local/compute_fbank_libriheavy.py b/egs/libriheavy/ASR/local/compute_fbank_libriheavy.py index 05ade450c..73c978b98 100755 --- a/egs/libriheavy/ASR/local/compute_fbank_libriheavy.py +++ b/egs/libriheavy/ASR/local/compute_fbank_libriheavy.py @@ -188,7 +188,7 @@ def compute_fbank_libriheavy_splits(args): extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device)) logging.info(f"device: {device}") - prefix = "librilight" + prefix = "libriheavy" num_digits = 8 # num_digits is fixed by lhotse split-lazy for i in range(start, stop): diff --git a/egs/libriheavy/ASR/local/prepare_validation_sets.py b/egs/libriheavy/ASR/local/prepare_validation_sets.py index 23dd4bbff..7ad4e6d81 100755 --- a/egs/libriheavy/ASR/local/prepare_validation_sets.py +++ b/egs/libriheavy/ASR/local/prepare_validation_sets.py @@ -36,7 +36,13 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument( - "--manifest", type=str, help="The original manifest coming from" + "--in-manifest", type=str, help="The original manifest coming from" + ) + + parser.add_argument( + "--out-manifest", + type=str, + help="Where to store the manifest after filtering out the test/dev sets", ) return parser.parse_args() @@ -44,8 +50,8 @@ def get_args(): def main(args): - logging.info(f"Loading manifest {args.manifest}") - cuts = load_manifest_lazy(args.manifest) + logging.info(f"Loading manifest {args.in_manifest}") + cuts = load_manifest_lazy(args.in_manifest) all_test_sets = [ "dev", @@ -53,19 +59,28 @@ def main(args): "test-other", ] + all_books = set() for test_set in all_test_sets: logging.info(f"Processing test set: {test_set}") with open(f"data/manifests/{test_set}.txt", "r") as f: books = f.read().split("\n") + all_books.update(books) + out_name = f"data/manifests/libriheavy_cuts_{test_set}.jsonl.gz" + if os.path.exists(out_name): + continue # find the cuts belonging to the given books selected_cuts = cuts.filter(lambda c: c.text_path.split("/")[-2] in books) 
selected_cuts.describe() - out_name = f"data/manifests/libriheavy_cuts_{test_set}.jsonl.gz" logging.info(f"Saving the cuts contained in the book list to {out_name}") selected_cuts.to_file(out_name) + filtered_cuts = cuts.filter(lambda c: c.text_path.split("/")[-2] not in all_books) + logging.info(f"Saving the filtered manifest to {args.out_manifest}.") + filtered_cuts.to_file(args.out_manifest) + logging.info("Done") + if __name__ == "__main__": formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" diff --git a/egs/libriheavy/ASR/prepare.sh b/egs/libriheavy/ASR/prepare.sh index 0aa6c91ae..dc00395cb 100755 --- a/egs/libriheavy/ASR/prepare.sh +++ b/egs/libriheavy/ASR/prepare.sh @@ -12,6 +12,8 @@ stop_stage=100 start=0 stop=-1 num_per_split=2000 +split_per_job=20 +char_coverage=0.99 . shared/parse_options.sh || exit 1 @@ -19,7 +21,7 @@ num_per_split=2000 # It will generate data/lang_bpe_xxx, # data/lang_bpe_yyy if the array contains xxx, yyy vocab_sizes=( - 1000 + 750 ) mkdir -p data @@ -35,20 +37,20 @@ fbank_dir=data/fbank mkdir -p $manifest_dir -subset="large" +subset="medium" if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then log "Stage 1: Split libri-heavy ${subset}" if [ $subset == "large" ]; then num_per_split=8000 - log "Change num_per_split to ${num_per_split} 8000 for large" + log "Change num_per_split to ${num_per_split} for large" fi split_dir=$fbank_dir/libriheavy_${subset}_split mkdir -p $split_dir if [ ! 
-e $split_dir/.split_completed ]; then - lhotse split-lazy $manifest_dir/librilight_cuts_${subset}_raw.jsonl.gz $split_dir $num_per_split + lhotse split-lazy $manifest_dir/libriheavy_cuts_${subset}_raw.jsonl.gz $split_dir $num_per_split touch $split_dir/.split_completed fi fi @@ -56,11 +58,18 @@ fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then log "Stage 2: Compute fbank for Libri-heavy ${subset}" mkdir -p $fbank_dir - num_splits=$(find $fbank_dir/libriheavy_${subset}_split -name "librilight_cuts_${subset}_raw.*.jsonl.gz" | wc -l) + num_splits=$(find $fbank_dir/libriheavy_${subset}_split -name "libriheavy_cuts_${subset}_raw.*.jsonl.gz" | wc -l) + if [ $subset == "large" ]; then + split_per_job=210 + log "Change split_per_job to ${split_per_job} for large" + elif [ $subset == "medium" ]; then + split_per_job=100 + log "Change split_per_job to ${split_per_job} for medium" + fi if [ ! -e $fbank_dir/.libriheavy.${subset}.done ]; then for i in $(seq 0 1 7); do - start=$(( i * 200 )) - end=$(( (i+1) * 200 )) + start=$(( i * $split_per_job )) + end=$(( (i+1) * $split_per_job )) ./local/compute_fbank_libriheavy.py \ --dataset ${subset} \ --fbank-dir $fbank_dir \ @@ -76,21 +85,29 @@ fi if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then log "Stage 3: Combine features for ${subset}" - if [ ! -f $fbank_dir/librilight_cuts_${subset}.jsonl.gz ]; then - pieces=$(find $fbank_dir/libriheavy_${subset}_split -name "librilight_cuts_${subset}.*.jsonl.gz") - lhotse combine $pieces $fbank_dir/librilight_cuts_${subset}.jsonl.gz + if [ ! 
-f $fbank_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then + pieces=$(find $fbank_dir/libriheavy_${subset}_split -name "libriheavy_cuts_${subset}.*.jsonl.gz") + lhotse combine $pieces $fbank_dir/libriheavy_cuts_${subset}.jsonl.gz fi fi if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Prepare BPE model" + log "Stage 4: Prepare the validation&test sets" + + ./local/prepare_validation_sets.py \ + --in-manifest $fbank_dir/libriheavy_cuts_medium.jsonl.gz \ + --out-manifest $fbank_dir/libriheavy_cuts_medium_filtered.jsonl.gz +fi + +if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then + log "Stage 5: Prepare BPE model" tmp_dir=data/tmp mkdir -p $tmp_dir if [ ! -f $tmp_dir/transcript_words.txt ]; then for part in "small" "medium" "large"; do - gunzip -c $manifest_dir/librilight_cuts_${part}_raw.jsonl.gz | + gunzip -c $manifest_dir/libriheavy_cuts_${part}_raw.jsonl.gz | jq '.supervisions[].custom.texts[]' | sed 's/" //' | sed 's/\(.*\)"/\1/' > $tmp_dir/transcript_words_${part}.txt done cat $tmp_dir/transcript_words_small.txt $tmp_dir/transcript_words_medium.txt $tmp_dir/transcript_words_large.txt > $tmp_dir/transcript_words.txt @@ -125,17 +142,19 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then fi for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} + lang_dir=data/lang_bpe_${vocab_size}_fallback_coverage_${char_coverage} mkdir -p $lang_dir cp $tmp_dir/words.txt $lang_dir/words.txt pushd $lang_dir ln -s ../$tmp_dir/transcript_words.txt transcript_words.txt popd - + if [ ! -f $lang_dir/bpe.model ]; then ./local/train_bpe_model.py \ --lang-dir $lang_dir \ --vocab-size $vocab_size \ + --byte-fallback True \ + --character-coverage $char_coverage \ --transcript $tmp_dir/transcript_words_medium.txt fi