From eb79f1eceb986ee63a34e3206c90a3b3ccaa312b Mon Sep 17 00:00:00 2001 From: Yuekai Zhang Date: Tue, 19 Dec 2023 18:16:50 +0800 Subject: [PATCH] update eval data preparation --- .../ASR/local/compute_fbank_icmcasr.py | 8 +-- .../ASR/local/prepare_icmc_enhanced.py | 4 +- egs/icmcasr/ASR/local/prepare_icmc_gss.sh | 8 +-- egs/icmcasr/ASR/prepare.sh | 68 ++----------------- 4 files changed, 17 insertions(+), 71 deletions(-) diff --git a/egs/icmcasr/ASR/local/compute_fbank_icmcasr.py b/egs/icmcasr/ASR/local/compute_fbank_icmcasr.py index e5623634b..3e77f1c9e 100755 --- a/egs/icmcasr/ASR/local/compute_fbank_icmcasr.py +++ b/egs/icmcasr/ASR/local/compute_fbank_icmcasr.py @@ -58,14 +58,14 @@ def compute_fbank_icmcasr(num_mel_bins: int = 80, perturb_speed: bool = False): suffix="jsonl.gz", ) manifests_sdm = read_manifests_if_cached( - dataset_parts=["train", "dev"], + dataset_parts=["train", "dev", "eval_track1"], output_dir=src_dir, prefix="icmcasr-sdm", suffix="jsonl.gz", ) # For GSS we already have cuts so we read them directly. 
manifests_gss = read_manifests_if_cached( - dataset_parts=["train", "dev"], + dataset_parts=["train", "dev", "eval_track1"], output_dir=src_dir, prefix="icmcasr-gss", suffix="jsonl.gz", @@ -96,7 +96,7 @@ def compute_fbank_icmcasr(num_mel_bins: int = 80, perturb_speed: bool = False): storage_path=storage_path, manifest_path=manifest_path, batch_duration=5000, - num_workers=8, + num_workers=4, storage_type=LilcomChunkyWriter, ) @@ -153,7 +153,7 @@ def compute_fbank_icmcasr(num_mel_bins: int = 80, perturb_speed: bool = False): ) logging.info("Preparing test cuts: IHM, SDM, GSS (optional)") - for split in ["dev"]: + for split in ["dev", "eval_track1"]: logging.info(f"Processing {split} IHM") cuts_ihm = ( CutSet.from_manifests(**manifests_ihm[split]) diff --git a/egs/icmcasr/ASR/local/prepare_icmc_enhanced.py b/egs/icmcasr/ASR/local/prepare_icmc_enhanced.py index 19f1fab72..6845ce91e 100644 --- a/egs/icmcasr/ASR/local/prepare_icmc_enhanced.py +++ b/egs/icmcasr/ASR/local/prepare_icmc_enhanced.py @@ -92,7 +92,7 @@ def main(args): # Load manifests from cache if they exist (saves time) manifests = read_manifests_if_cached( - dataset_parts=["train", "dev"], + dataset_parts=["train", "dev", "eval_track1"], output_dir=manifests_dir, prefix="icmcasr-sdm", suffix="jsonl.gz", @@ -103,7 +103,7 @@ def main(args): ) with ThreadPoolExecutor(args.num_jobs) as ex: - for part in ["train", "dev",]: + for part in ["train", "dev", "eval_track1"]: logging.info(f"Processing {part}...") supervisions_orig = manifests[part]["supervisions"].filter( lambda s: s.duration >= args.min_segment_duration diff --git a/egs/icmcasr/ASR/local/prepare_icmc_gss.sh b/egs/icmcasr/ASR/local/prepare_icmc_gss.sh index b3490f9ab..69dd42dfa 100644 --- a/egs/icmcasr/ASR/local/prepare_icmc_gss.sh +++ b/egs/icmcasr/ASR/local/prepare_icmc_gss.sh @@ -30,7 +30,7 @@ log() { if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then log "Stage 1: Prepare cut sets" - for part in train dev; do + for part in train dev eval_track1; do 
lhotse cut simple \ -r $DATA_DIR/icmcasr-mdm_recordings_${part}.jsonl.gz \ -s $DATA_DIR/icmcasr-mdm_supervisions_${part}.jsonl.gz \ @@ -40,7 +40,7 @@ fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then log "Stage 2: Trim cuts to supervisions (1 cut per supervision segment)" - for part in train dev; do + for part in train dev eval_track1; do lhotse cut trim-to-supervisions --discard-overlapping \ $EXP_DIR/cuts_${part}.jsonl.gz $EXP_DIR/cuts_per_segment_${part}.jsonl.gz done @@ -48,7 +48,7 @@ fi if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then log "Stage 3: Split manifests for multi-GPU processing (optional)" - for part in train dev; do + for part in train dev eval_track1; do gss utils split $nj $EXP_DIR/cuts_per_segment_${part}.jsonl.gz \ $EXP_DIR/cuts_per_segment_${part}_split$nj done @@ -75,7 +75,7 @@ fi if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then log "Stage 5: Enhance eval/test segments using GSS (using GPU)" # for eval/test, we use larger context and smaller batches to get better quality - for part in dev; do + for part in dev eval_track1; do for JOB in $(seq $nj); do gss enhance cuts $EXP_DIR/cuts_${part}.jsonl.gz \ $EXP_DIR/cuts_per_segment_${part}_split$nj/cuts_per_segment_${part}.$JOB.jsonl.gz \ diff --git a/egs/icmcasr/ASR/prepare.sh b/egs/icmcasr/ASR/prepare.sh index 9f1130a18..1de9562a9 100755 --- a/egs/icmcasr/ASR/prepare.sh +++ b/egs/icmcasr/ASR/prepare.sh @@ -6,8 +6,8 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python set -eou pipefail nj=15 -stage=3 -stop_stage=3 +stage=4 +stop_stage=4 # We assume dl_dir (download dir) contains the following # directories and files. If not, they will be downloaded @@ -83,19 +83,19 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then fi if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 3: Compute fbank for icmcasr" + log "Stage 4: Compute fbank for icmcasr" if [ ! 
-f data/fbank/.icmcasr.done ]; then mkdir -p data/fbank ./local/compute_fbank_icmcasr.py --perturb-speed True echo "Combining manifests" - lhotse combine data/manifests/cuts_train_{ihm,ihm_rvb,sdm,gss}.jsonl.gz - | shuf |\ - gzip -c > data/manifests/cuts_train_all.jsonl.gz + lhotse combine data/manifests/cuts_train_{ihm,ihm_rvb,sdm,gss}.jsonl.gz - | shuf |\ + gzip -c > data/manifests/cuts_train_all.jsonl.gz touch data/fbank/.icmcasr.done fi fi if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 4: Compute fbank for musan" + log "Stage 5: Compute fbank for musan" if [ ! -f data/fbank/.msuan.done ]; then mkdir -p data/fbank ./local/compute_fbank_musan.py @@ -105,7 +105,7 @@ fi lang_phone_dir=data/lang_phone if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then - log "Stage 5: Prepare phone based lang" + log "Stage 6: Prepare phone based lang" mkdir -p $lang_phone_dir (echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; ) | @@ -119,57 +119,3 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then fi fi -lang_char_dir=data/lang_char -if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then - log "Stage 6: Prepare char based lang" - mkdir -p $lang_char_dir - # We reuse words.txt from phone based lexicon - # so that the two can share G.pt later. - - # The transcripts in training set, generated in stage 5 - cp $lang_phone_dir/transcript_words.txt $lang_char_dir/transcript_words.txt - - cat $dl_dir/icmcasr/data_icmcasr/transcript/icmcasr_transcript_v0.8.txt | - cut -d " " -f 2- > $lang_char_dir/text - - (echo ' 0'; echo '!SIL 1'; echo ' 2'; echo ' 3';) \ - > $lang_char_dir/words.txt - - cat $lang_char_dir/text | sed 's/ /\n/g' | sort -u | sed '/^$/d' \ - | awk '{print $1" "NR+3}' >> $lang_char_dir/words.txt - - num_lines=$(< $lang_char_dir/words.txt wc -l) - (echo "#0 $num_lines"; echo " $(($num_lines + 1))"; echo " $(($num_lines + 2))";) \ - >> $lang_char_dir/words.txt - - if [ ! -f $lang_char_dir/L_disambig.pt ]; then - ./local/prepare_char.py --lang-dir $lang_char_dir - fi - - if [ !
-f $lang_char_dir/HLG.fst ]; then - ./local/prepare_lang_fst.py --lang-dir $lang_phone_dir --ngram-G ./data/lm/G_3_gram.fst.txt - fi -fi - -if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then - log "Stage 7: Prepare Byte BPE based lang" - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bbpe_${vocab_size} - mkdir -p $lang_dir - - cp $lang_char_dir/words.txt $lang_dir - cp $lang_char_dir/text $lang_dir - - if [ ! -f $lang_dir/bbpe.model ]; then - ./local/train_bbpe_model.py \ - --lang-dir $lang_dir \ - --vocab-size $vocab_size \ - --transcript $lang_dir/text - fi - - if [ ! -f $lang_dir/L_disambig.pt ]; then - ./local/prepare_lang_bbpe.py --lang-dir $lang_dir - fi - done -fi