update eval data preparation

Yuekai Zhang 2023-12-19 18:16:50 +08:00
parent 22f68dd344
commit eb79f1eceb
4 changed files with 17 additions and 71 deletions


@@ -58,14 +58,14 @@ def compute_fbank_icmcasr(num_mel_bins: int = 80, perturb_speed: bool = False):
         suffix="jsonl.gz",
     )
     manifests_sdm = read_manifests_if_cached(
-        dataset_parts=["train", "dev"],
+        dataset_parts=["train", "dev", "eval_track1"],
         output_dir=src_dir,
         prefix="icmcasr-sdm",
         suffix="jsonl.gz",
     )
     # For GSS we already have cuts so we read them directly.
     manifests_gss = read_manifests_if_cached(
-        dataset_parts=["train", "dev"],
+        dataset_parts=["train", "dev", "eval_track1"],
         output_dir=src_dir,
         prefix="icmcasr-gss",
         suffix="jsonl.gz",
@@ -96,7 +96,7 @@ def compute_fbank_icmcasr(num_mel_bins: int = 80, perturb_speed: bool = False):
                 storage_path=storage_path,
                 manifest_path=manifest_path,
                 batch_duration=5000,
-                num_workers=8,
+                num_workers=4,
                 storage_type=LilcomChunkyWriter,
             )
@@ -153,7 +153,7 @@ def compute_fbank_icmcasr(num_mel_bins: int = 80, perturb_speed: bool = False):
     )

     logging.info("Preparing test cuts: IHM, SDM, GSS (optional)")
-    for split in ["dev"]:
+    for split in ["dev", "eval_track1"]:
         logging.info(f"Processing {split} IHM")
         cuts_ihm = (
             CutSet.from_manifests(**manifests_ihm[split])
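For context, the manifest and fbank flow that these hunks touch looks roughly like the sketch below. This is a minimal sketch against the Lhotse API, not the recipe's actual script: the output paths are assumptions, and the real script may use a kaldifeat-based extractor rather than Lhotse's Fbank. The dataset parts, batch_duration, and num_workers values follow the diff.

    # Minimal sketch of the fbank flow modified above (Lhotse API).
    # Paths are assumptions; parts and batch settings follow the diff.
    from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
    from lhotse.recipes.utils import read_manifests_if_cached

    manifests = read_manifests_if_cached(
        dataset_parts=["train", "dev", "eval_track1"],
        output_dir="data/manifests",  # src_dir in the script (assumed)
        prefix="icmcasr-sdm",
        suffix="jsonl.gz",
    )
    extractor = Fbank(FbankConfig(num_mel_bins=80))

    for split in ["dev", "eval_track1"]:
        cuts = CutSet.from_manifests(**manifests[split])
        cuts = cuts.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=f"data/fbank/icmcasr-sdm_{split}_feats",  # assumed
            manifest_path=f"data/fbank/icmcasr-sdm_cuts_{split}.jsonl.gz",  # assumed
            batch_duration=5000,  # seconds of audio per batch, as in the diff
            num_workers=4,  # this commit lowers it from 8
            storage_type=LilcomChunkyWriter,
        )

Dropping num_workers from 8 to 4 only reduces CPU and memory pressure during extraction; the features themselves are unchanged. The same part-list extension is applied in the GSS manifest-preparation script: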


@@ -92,7 +92,7 @@ def main(args):

     # Load manifests from cache if they exist (saves time)
     manifests = read_manifests_if_cached(
-        dataset_parts=["train", "dev"],
+        dataset_parts=["train", "dev", "eval_track1"],
         output_dir=manifests_dir,
         prefix="icmcasr-sdm",
         suffix="jsonl.gz",
@@ -103,7 +103,7 @@ def main(args):
     )

     with ThreadPoolExecutor(args.num_jobs) as ex:
-        for part in ["train", "dev",]:
+        for part in ["train", "dev", "eval_track1"]:
             logging.info(f"Processing {part}...")
             supervisions_orig = manifests[part]["supervisions"].filter(
                 lambda s: s.duration >= args.min_segment_duration
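In isolation, the loop shown above filters each part's supervisions down to segments long enough to be useful for GSS, one part per thread. A sketch of just that step (the helper name and the 0.2 s default are illustrative; the predicate is from the diff):

    # Sketch of the per-part supervision filtering in the loop above.
    # filter_parts and the 0.2 s default are illustrative names/values.
    from concurrent.futures import ThreadPoolExecutor

    from lhotse import SupervisionSet

    def filter_parts(
        manifests: dict,  # part -> {"recordings": ..., "supervisions": SupervisionSet}
        min_segment_duration: float = 0.2,  # the script reads this from args
        num_jobs: int = 4,
    ) -> dict:
        """Drop supervision segments shorter than min_segment_duration."""
        def one_part(part: str) -> SupervisionSet:
            return manifests[part]["supervisions"].filter(
                lambda s: s.duration >= min_segment_duration
            )

        with ThreadPoolExecutor(num_jobs) as ex:
            futures = {p: ex.submit(one_part, p) for p in ["train", "dev", "eval_track1"]}
            return {p: f.result() for p, f in futures.items()}

The GSS enhancement shell pipeline is extended the same way: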


@@ -30,7 +30,7 @@ log() {

 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   log "Stage 1: Prepare cut sets"
-  for part in train dev; do
+  for part in train dev eval_track1; do
     lhotse cut simple \
       -r $DATA_DIR/icmcasr-mdm_recordings_${part}.jsonl.gz \
       -s $DATA_DIR/icmcasr-mdm_supervisions_${part}.jsonl.gz \
@@ -40,7 +40,7 @@ fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   log "Stage 2: Trim cuts to supervisions (1 cut per supervision segment)"
-  for part in train dev; do
+  for part in train dev eval_track1; do
     lhotse cut trim-to-supervisions --discard-overlapping \
       $EXP_DIR/cuts_${part}.jsonl.gz $EXP_DIR/cuts_per_segment_${part}.jsonl.gz
   done
@@ -48,7 +48,7 @@ fi

 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Split manifests for multi-GPU processing (optional)"
-  for part in train dev; do
+  for part in train dev eval_track1; do
     gss utils split $nj $EXP_DIR/cuts_per_segment_${part}.jsonl.gz \
       $EXP_DIR/cuts_per_segment_${part}_split$nj
   done
@@ -75,7 +75,7 @@ fi
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Enhance eval/test segments using GSS (using GPU)"
   # for eval/test, we use larger context and smaller batches to get better quality
-  for part in dev; do
+  for part in dev eval_track1; do
     for JOB in $(seq $nj); do
       gss enhance cuts $EXP_DIR/cuts_${part}.jsonl.gz \
         $EXP_DIR/cuts_per_segment_${part}_split$nj/cuts_per_segment_${part}.$JOB.jsonl.gz \
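These stages drive the lhotse and gss command-line tools. Stages 1 and 2 correspond roughly to the Python below (a sketch: "data/manifests" and "exp/gss" stand in for $DATA_DIR and $EXP_DIR, and keep_overlapping=False mirrors --discard-overlapping). Stages 3 and 5 then shard the per-segment cuts across $nj jobs and run the actual GSS enhancement on GPU.

    # Rough Python equivalent of Stages 1-2 above (Lhotse API).
    # "data/manifests" and "exp/gss" stand in for $DATA_DIR and $EXP_DIR.
    from lhotse import CutSet, RecordingSet, SupervisionSet

    part = "eval_track1"  # the part this commit adds; likewise train/dev

    recordings = RecordingSet.from_file(
        f"data/manifests/icmcasr-mdm_recordings_{part}.jsonl.gz"
    )
    supervisions = SupervisionSet.from_file(
        f"data/manifests/icmcasr-mdm_supervisions_{part}.jsonl.gz"
    )

    # Stage 1 ("lhotse cut simple"): cuts with their supervisions attached.
    cuts = CutSet.from_manifests(recordings=recordings, supervisions=supervisions)
    cuts.to_file(f"exp/gss/cuts_{part}.jsonl.gz")

    # Stage 2 ("lhotse cut trim-to-supervisions --discard-overlapping"):
    # one cut per supervision segment, discarding overlapping speech.
    cuts_per_segment = cuts.trim_to_supervisions(keep_overlapping=False)
    cuts_per_segment.to_file(f"exp/gss/cuts_per_segment_{part}.jsonl.gz")

Finally, the top-level preparation script moves its default stage forward and renumbers the stage log messages: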


@@ -6,8 +6,8 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail

 nj=15
-stage=3
-stop_stage=3
+stage=4
+stop_stage=4

 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
@@ -83,7 +83,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
 fi

 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 3: Compute fbank for icmcasr"
+  log "Stage 4: Compute fbank for icmcasr"
   if [ ! -f data/fbank/.icmcasr.done ]; then
     mkdir -p data/fbank
     ./local/compute_fbank_icmcasr.py --perturb-speed True
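The fbank stage invokes compute_fbank_icmcasr.py with --perturb-speed True. In icefall recipes this conventionally means 3-way speed perturbation of the training cuts before feature extraction; assuming this script follows the same convention, the Lhotse idiom is:

    # 3-way speed perturbation as conventionally done in icefall recipes
    # (assumed to be what --perturb-speed True enables here).
    from lhotse import CutSet

    def perturb_speed_3way(cuts: CutSet) -> CutSet:
        # Keep the originals and add 0.9x and 1.1x speed-perturbed copies.
        return cuts + cuts.perturb_speed(0.9) + cuts.perturb_speed(1.1)

The remaining hunks renumber the later stage logs and delete the char-lang and byte-BPE stages: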
@@ -95,7 +95,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 4: Compute fbank for musan"
+  log "Stage 5: Compute fbank for musan"
   if [ ! -f data/fbank/.msuan.done ]; then
     mkdir -p data/fbank
     ./local/compute_fbank_musan.py
@@ -105,7 +105,7 @@ fi

 lang_phone_dir=data/lang_phone
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 5: Prepare phone based lang"
+  log "Stage 6: Prepare G.fst"
   mkdir -p $lang_phone_dir

   (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
@@ -119,57 +119,3 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
   fi
 fi

-lang_char_dir=data/lang_char
-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-  log "Stage 6: Prepare char based lang"
-  mkdir -p $lang_char_dir
-  # We reuse words.txt from phone based lexicon
-  # so that the two can share G.pt later.
-
-  # The transcripts in training set, generated in stage 5
-  cp $lang_phone_dir/transcript_words.txt $lang_char_dir/transcript_words.txt
-
-  cat $dl_dir/icmcasr/data_icmcasr/transcript/icmcasr_transcript_v0.8.txt |
-    cut -d " " -f 2- > $lang_char_dir/text
-
-  (echo '<eps> 0'; echo '!SIL 1'; echo '<SPOKEN_NOISE> 2'; echo '<UNK> 3';) \
-    > $lang_char_dir/words.txt
-
-  cat $lang_char_dir/text | sed 's/ /\n/g' | sort -u | sed '/^$/d' \
-    | awk '{print $1" "NR+3}' >> $lang_char_dir/words.txt
-
-  num_lines=$(< $lang_char_dir/words.txt wc -l)
-  (echo "#0 $num_lines"; echo "<s> $(($num_lines + 1))"; echo "</s> $(($num_lines + 2))";) \
-    >> $lang_char_dir/words.txt
-
-  if [ ! -f $lang_char_dir/L_disambig.pt ]; then
-    ./local/prepare_char.py --lang-dir $lang_char_dir
-  fi
-
-  if [ ! -f $lang_char_dir/HLG.fst ]; then
-    ./local/prepare_lang_fst.py --lang-dir $lang_phone_dir --ngram-G ./data/lm/G_3_gram.fst.txt
-  fi
-fi
-
-if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-  log "Stage 7: Prepare Byte BPE based lang"
-
-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bbpe_${vocab_size}
-    mkdir -p $lang_dir
-
-    cp $lang_char_dir/words.txt $lang_dir
-    cp $lang_char_dir/text $lang_dir
-
-    if [ ! -f $lang_dir/bbpe.model ]; then
-      ./local/train_bbpe_model.py \
-        --lang-dir $lang_dir \
-        --vocab-size $vocab_size \
-        --transcript $lang_dir/text
-    fi
-
-    if [ ! -f $lang_dir/L_disambig.pt ]; then
-      ./local/prepare_lang_bbpe.py --lang-dir $lang_dir
-    fi
-  done
-fi
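The final hunk removes the char-lang and byte-BPE stages from this script. For reference, what those stages computed, in rough Python form; the vocab sizes are illustrative, and the recipe's train_bbpe_model.py additionally byte-encodes the text before training (the "byte" in byte BPE), which the plain SentencePiece call below omits:

    # Rough Python form of the removed Stages 7-8 (illustrative, not the recipe).
    from pathlib import Path

    import sentencepiece as spm

    lang_char_dir = Path("data/lang_char")

    # Stage 7: build words.txt from the unique tokens of the transcript.
    # Ids 0-3 are reserved; each token then gets the next id, matching the
    # shell's awk '{print $1" "NR+3}'.
    tokens = sorted(set((lang_char_dir / "text").read_text(encoding="utf-8").split()))
    lines = ["<eps> 0", "!SIL 1", "<SPOKEN_NOISE> 2", "<UNK> 3"]
    lines += [f"{tok} {i + 4}" for i, tok in enumerate(tokens)]
    n = len(lines)  # next free id, matching the shell's wc -l arithmetic
    lines += [f"#0 {n}", f"<s> {n + 1}", f"</s> {n + 2}"]
    (lang_char_dir / "words.txt").write_text("\n".join(lines) + "\n", encoding="utf-8")

    # Stage 8: train one BPE model per vocabulary size.
    for vocab_size in [500, 2000]:  # illustrative stand-ins for $vocab_sizes
        lang_dir = Path(f"data/lang_bbpe_{vocab_size}")
        lang_dir.mkdir(parents=True, exist_ok=True)
        spm.SentencePieceTrainer.train(
            input=str(lang_char_dir / "text"),
            model_prefix=str(lang_dir / "bbpe"),
            vocab_size=vocab_size,
            model_type="bpe",
        )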