mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-11 19:12:30 +00:00
update eval data preparation
This commit is contained in:
parent
22f68dd344
commit
eb79f1eceb
@ -58,14 +58,14 @@ def compute_fbank_icmcasr(num_mel_bins: int = 80, perturb_speed: bool = False):
|
||||
suffix="jsonl.gz",
|
||||
)
|
||||
manifests_sdm = read_manifests_if_cached(
|
||||
dataset_parts=["train", "dev"],
|
||||
dataset_parts=["train", "dev", "eval_track1"],
|
||||
output_dir=src_dir,
|
||||
prefix="icmcasr-sdm",
|
||||
suffix="jsonl.gz",
|
||||
)
|
||||
# For GSS we already have cuts so we read them directly.
|
||||
manifests_gss = read_manifests_if_cached(
|
||||
dataset_parts=["train", "dev"],
|
||||
dataset_parts=["train", "dev", "eval_track1"],
|
||||
output_dir=src_dir,
|
||||
prefix="icmcasr-gss",
|
||||
suffix="jsonl.gz",
|
||||
@ -96,7 +96,7 @@ def compute_fbank_icmcasr(num_mel_bins: int = 80, perturb_speed: bool = False):
|
||||
storage_path=storage_path,
|
||||
manifest_path=manifest_path,
|
||||
batch_duration=5000,
|
||||
num_workers=8,
|
||||
num_workers=4,
|
||||
storage_type=LilcomChunkyWriter,
|
||||
)
|
||||
|
||||
@ -153,7 +153,7 @@ def compute_fbank_icmcasr(num_mel_bins: int = 80, perturb_speed: bool = False):
|
||||
)
|
||||
|
||||
logging.info("Preparing test cuts: IHM, SDM, GSS (optional)")
|
||||
for split in ["dev"]:
|
||||
for split in ["dev", "eval_track1"]:
|
||||
logging.info(f"Processing {split} IHM")
|
||||
cuts_ihm = (
|
||||
CutSet.from_manifests(**manifests_ihm[split])
|
||||
|
@ -92,7 +92,7 @@ def main(args):
|
||||
|
||||
# Load manifests from cache if they exist (saves time)
|
||||
manifests = read_manifests_if_cached(
|
||||
dataset_parts=["train", "dev"],
|
||||
dataset_parts=["train", "dev", "eval_track1"],
|
||||
output_dir=manifests_dir,
|
||||
prefix="icmcasr-sdm",
|
||||
suffix="jsonl.gz",
|
||||
@ -103,7 +103,7 @@ def main(args):
|
||||
)
|
||||
|
||||
with ThreadPoolExecutor(args.num_jobs) as ex:
|
||||
for part in ["train", "dev",]:
|
||||
for part in ["train", "dev", "eval_track1"]:
|
||||
logging.info(f"Processing {part}...")
|
||||
supervisions_orig = manifests[part]["supervisions"].filter(
|
||||
lambda s: s.duration >= args.min_segment_duration
|
||||
|
@ -30,7 +30,7 @@ log() {
|
||||
|
||||
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
||||
log "Stage 1: Prepare cut sets"
|
||||
for part in train dev; do
|
||||
for part in train dev eval_track1; do
|
||||
lhotse cut simple \
|
||||
-r $DATA_DIR/icmcasr-mdm_recordings_${part}.jsonl.gz \
|
||||
-s $DATA_DIR/icmcasr-mdm_supervisions_${part}.jsonl.gz \
|
||||
@ -40,7 +40,7 @@ fi
|
||||
|
||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||
log "Stage 2: Trim cuts to supervisions (1 cut per supervision segment)"
|
||||
for part in train dev; do
|
||||
for part in train dev eval_track1; do
|
||||
lhotse cut trim-to-supervisions --discard-overlapping \
|
||||
$EXP_DIR/cuts_${part}.jsonl.gz $EXP_DIR/cuts_per_segment_${part}.jsonl.gz
|
||||
done
|
||||
@ -48,7 +48,7 @@ fi
|
||||
|
||||
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
log "Stage 3: Split manifests for multi-GPU processing (optional)"
|
||||
for part in train dev; do
|
||||
for part in train dev eval_track1; do
|
||||
gss utils split $nj $EXP_DIR/cuts_per_segment_${part}.jsonl.gz \
|
||||
$EXP_DIR/cuts_per_segment_${part}_split$nj
|
||||
done
|
||||
@ -75,7 +75,7 @@ fi
|
||||
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
||||
log "Stage 5: Enhance eval/test segments using GSS (using GPU)"
|
||||
# for eval/test, we use larger context and smaller batches to get better quality
|
||||
for part in dev; do
|
||||
for part in dev eval_track1; do
|
||||
for JOB in $(seq $nj); do
|
||||
gss enhance cuts $EXP_DIR/cuts_${part}.jsonl.gz \
|
||||
$EXP_DIR/cuts_per_segment_${part}_split$nj/cuts_per_segment_${part}.$JOB.jsonl.gz \
|
||||
|
@ -6,8 +6,8 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
|
||||
set -eou pipefail
|
||||
|
||||
nj=15
|
||||
stage=3
|
||||
stop_stage=3
|
||||
stage=4
|
||||
stop_stage=4
|
||||
|
||||
# We assume dl_dir (download dir) contains the following
|
||||
# directories and files. If not, they will be downloaded
|
||||
@ -83,19 +83,19 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
fi
|
||||
|
||||
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
||||
log "Stage 3: Compute fbank for icmcasr"
|
||||
log "Stage 4: Compute fbank for icmcasr"
|
||||
if [ ! -f data/fbank/.icmcasr.done ]; then
|
||||
mkdir -p data/fbank
|
||||
./local/compute_fbank_icmcasr.py --perturb-speed True
|
||||
echo "Combining manifests"
|
||||
lhotse combine data/manifests/cuts_train_{ihm,ihm_rvb,sdm,gss}.jsonl.gz - | shuf |\
|
||||
gzip -c > data/manifests/cuts_train_all.jsonl.gz
|
||||
lhotse combine data/manifests/cuts_train_{ihm,ihm_rvb,sdm,gss}.jsonl.gz - | shuf |\
|
||||
gzip -c > data/manifests/cuts_train_all.jsonl.gz
|
||||
touch data/fbank/.icmcasr.done
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
||||
log "Stage 4: Compute fbank for musan"
|
||||
log "Stage 5: Compute fbank for musan"
|
||||
if [ ! -f data/fbank/.msuan.done ]; then
|
||||
mkdir -p data/fbank
|
||||
./local/compute_fbank_musan.py
|
||||
@ -105,7 +105,7 @@ fi
|
||||
|
||||
lang_phone_dir=data/lang_phone
|
||||
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
||||
log "Stage 5: Prepare phone based lang"
|
||||
log "Stage 6: Prepare G.fst"
|
||||
mkdir -p $lang_phone_dir
|
||||
|
||||
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
|
||||
@ -119,57 +119,3 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
||||
fi
|
||||
fi
|
||||
|
||||
lang_char_dir=data/lang_char
|
||||
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
|
||||
log "Stage 6: Prepare char based lang"
|
||||
mkdir -p $lang_char_dir
|
||||
# We reuse words.txt from phone based lexicon
|
||||
# so that the two can share G.pt later.
|
||||
|
||||
# The transcripts in training set, generated in stage 5
|
||||
cp $lang_phone_dir/transcript_words.txt $lang_char_dir/transcript_words.txt
|
||||
|
||||
cat $dl_dir/icmcasr/data_icmcasr/transcript/icmcasr_transcript_v0.8.txt |
|
||||
cut -d " " -f 2- > $lang_char_dir/text
|
||||
|
||||
(echo '<eps> 0'; echo '!SIL 1'; echo '<SPOKEN_NOISE> 2'; echo '<UNK> 3';) \
|
||||
> $lang_char_dir/words.txt
|
||||
|
||||
cat $lang_char_dir/text | sed 's/ /\n/g' | sort -u | sed '/^$/d' \
|
||||
| awk '{print $1" "NR+3}' >> $lang_char_dir/words.txt
|
||||
|
||||
num_lines=$(< $lang_char_dir/words.txt wc -l)
|
||||
(echo "#0 $num_lines"; echo "<s> $(($num_lines + 1))"; echo "</s> $(($num_lines + 2))";) \
|
||||
>> $lang_char_dir/words.txt
|
||||
|
||||
if [ ! -f $lang_char_dir/L_disambig.pt ]; then
|
||||
./local/prepare_char.py --lang-dir $lang_char_dir
|
||||
fi
|
||||
|
||||
if [ ! -f $lang_char_dir/HLG.fst ]; then
|
||||
./local/prepare_lang_fst.py --lang-dir $lang_phone_dir --ngram-G ./data/lm/G_3_gram.fst.txt
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
|
||||
log "Stage 7: Prepare Byte BPE based lang"
|
||||
|
||||
for vocab_size in ${vocab_sizes[@]}; do
|
||||
lang_dir=data/lang_bbpe_${vocab_size}
|
||||
mkdir -p $lang_dir
|
||||
|
||||
cp $lang_char_dir/words.txt $lang_dir
|
||||
cp $lang_char_dir/text $lang_dir
|
||||
|
||||
if [ ! -f $lang_dir/bbpe.model ]; then
|
||||
./local/train_bbpe_model.py \
|
||||
--lang-dir $lang_dir \
|
||||
--vocab-size $vocab_size \
|
||||
--transcript $lang_dir/text
|
||||
fi
|
||||
|
||||
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
||||
./local/prepare_lang_bbpe.py --lang-dir $lang_dir
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
Loading…
x
Reference in New Issue
Block a user