Mirror of https://github.com/k2-fsa/icefall.git, synced 2025-08-11 11:02:29 +00:00
update eval data preparation
parent 22f68dd344
commit eb79f1eceb
@@ -58,14 +58,14 @@ def compute_fbank_icmcasr(num_mel_bins: int = 80, perturb_speed: bool = False):
         suffix="jsonl.gz",
     )
     manifests_sdm = read_manifests_if_cached(
-        dataset_parts=["train", "dev"],
+        dataset_parts=["train", "dev", "eval_track1"],
         output_dir=src_dir,
         prefix="icmcasr-sdm",
         suffix="jsonl.gz",
     )
     # For GSS we already have cuts so we read them directly.
     manifests_gss = read_manifests_if_cached(
-        dataset_parts=["train", "dev"],
+        dataset_parts=["train", "dev", "eval_track1"],
         output_dir=src_dir,
         prefix="icmcasr-gss",
         suffix="jsonl.gz",
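For context: read_manifests_if_cached comes from lhotse.recipes.utils and returns a dict keyed by dataset part, so adding "eval_track1" to dataset_parts makes the eval manifests available downstream. A minimal sketch of the call and its result, assuming the manifests live under data/manifests:

    from lhotse.recipes.utils import read_manifests_if_cached

    # Load cached SDM manifests; "eval_track1" is now one of the parts.
    manifests_sdm = read_manifests_if_cached(
        dataset_parts=["train", "dev", "eval_track1"],
        output_dir="data/manifests",  # assumed value of src_dir
        prefix="icmcasr-sdm",
        suffix="jsonl.gz",
    )
    # Each part maps to its manifest types, e.g.:
    #   manifests_sdm["eval_track1"]["recordings"]   -> RecordingSet
    #   manifests_sdm["eval_track1"]["supervisions"] -> SupervisionSet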
@@ -96,7 +96,7 @@ def compute_fbank_icmcasr(num_mel_bins: int = 80, perturb_speed: bool = False):
             storage_path=storage_path,
             manifest_path=manifest_path,
             batch_duration=5000,
-            num_workers=8,
+            num_workers=4,
             storage_type=LilcomChunkyWriter,
         )

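The num_workers change lands inside lhotse's CutSet.compute_and_store_features_batch, which extracts features in batches of up to batch_duration seconds of audio while num_workers CPU workers load and collate the recordings. A hedged sketch of the surrounding call, assuming the kaldifeat GPU fbank extractor that icefall recipes typically use and hypothetical input/output paths:

    from lhotse import CutSet, LilcomChunkyWriter
    from lhotse.features.kaldifeat import KaldifeatFbank, KaldifeatFbankConfig

    extractor = KaldifeatFbank(KaldifeatFbankConfig(device="cuda"))

    cut_set = CutSet.from_file("data/manifests/cuts_train_raw.jsonl.gz")  # hypothetical
    cut_set = cut_set.compute_and_store_features_batch(
        extractor=extractor,
        storage_path="data/fbank/icmcasr_feats_train",        # hypothetical
        manifest_path="data/manifests/cuts_train.jsonl.gz",   # hypothetical
        batch_duration=5000,
        num_workers=4,  # was 8; fewer dataloader workers per GPU job
        storage_type=LilcomChunkyWriter,
    )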
@@ -153,7 +153,7 @@ def compute_fbank_icmcasr(num_mel_bins: int = 80, perturb_speed: bool = False):
         )

     logging.info("Preparing test cuts: IHM, SDM, GSS (optional)")
-    for split in ["dev"]:
+    for split in ["dev", "eval_track1"]:
         logging.info(f"Processing {split} IHM")
         cuts_ihm = (
             CutSet.from_manifests(**manifests_ihm[split])
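With "eval_track1" added, the test-cut loop builds a CutSet for each split from the IHM recordings and supervisions. A sketch of the per-split pattern; the trim and save steps are assumptions, since the hunk is truncated here:

    import logging
    from lhotse import CutSet

    # manifests_ihm is assumed loaded via read_manifests_if_cached, as above.
    for split in ["dev", "eval_track1"]:
        logging.info(f"Processing {split} IHM")
        cuts_ihm = (
            CutSet.from_manifests(**manifests_ihm[split])
            .trim_to_supervisions(keep_overlapping=False)  # assumed next step
        )
        cuts_ihm.to_file(f"data/manifests/cuts_{split}_ihm.jsonl.gz")  # assumed output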
@@ -92,7 +92,7 @@ def main(args):

     # Load manifests from cache if they exist (saves time)
     manifests = read_manifests_if_cached(
-        dataset_parts=["train", "dev"],
+        dataset_parts=["train", "dev", "eval_track1"],
         output_dir=manifests_dir,
         prefix="icmcasr-sdm",
         suffix="jsonl.gz",
@@ -103,7 +103,7 @@ def main(args):
     )

     with ThreadPoolExecutor(args.num_jobs) as ex:
-        for part in ["train", "dev",]:
+        for part in ["train", "dev", "eval_track1"]:
             logging.info(f"Processing {part}...")
             supervisions_orig = manifests[part]["supervisions"].filter(
                 lambda s: s.duration >= args.min_segment_duration
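SupervisionSet.filter takes a per-segment predicate and returns the filtered set, so segments shorter than --min-segment-duration are dropped before the thread pool processes each part. A minimal sketch with an assumed threshold:

    # manifests is assumed loaded via read_manifests_if_cached, as above;
    # 0.2 s is an assumed stand-in for args.min_segment_duration.
    min_segment_duration = 0.2
    for part in ["train", "dev", "eval_track1"]:
        supervisions_orig = manifests[part]["supervisions"].filter(
            lambda s: s.duration >= min_segment_duration
        )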
@@ -30,7 +30,7 @@ log() {

 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   log "Stage 1: Prepare cut sets"
-  for part in train dev; do
+  for part in train dev eval_track1; do
     lhotse cut simple \
       -r $DATA_DIR/icmcasr-mdm_recordings_${part}.jsonl.gz \
       -s $DATA_DIR/icmcasr-mdm_supervisions_${part}.jsonl.gz \
@@ -40,7 +40,7 @@ fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   log "Stage 2: Trim cuts to supervisions (1 cut per supervision segment)"
-  for part in train dev; do
+  for part in train dev eval_track1; do
     lhotse cut trim-to-supervisions --discard-overlapping \
       $EXP_DIR/cuts_${part}.jsonl.gz $EXP_DIR/cuts_per_segment_${part}.jsonl.gz
   done
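Stages 1 and 2 use the lhotse CLI; roughly the same steps in lhotse's Python API look like this (a sketch for one part, using the eval_track1 filenames from the script with directory prefixes omitted):

    from lhotse import CutSet, RecordingSet, SupervisionSet

    part = "eval_track1"
    recordings = RecordingSet.from_file(f"icmcasr-mdm_recordings_{part}.jsonl.gz")
    supervisions = SupervisionSet.from_file(f"icmcasr-mdm_supervisions_{part}.jsonl.gz")

    # Stage 1: `lhotse cut simple` -- one cut per recording.
    cuts = CutSet.from_manifests(recordings=recordings, supervisions=supervisions)
    cuts.to_file(f"cuts_{part}.jsonl.gz")

    # Stage 2: `lhotse cut trim-to-supervisions --discard-overlapping`.
    cuts_per_segment = cuts.trim_to_supervisions(keep_overlapping=False)
    cuts_per_segment.to_file(f"cuts_per_segment_{part}.jsonl.gz")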
@@ -48,7 +48,7 @@ fi

 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Split manifests for multi-GPU processing (optional)"
-  for part in train dev; do
+  for part in train dev eval_track1; do
     gss utils split $nj $EXP_DIR/cuts_per_segment_${part}.jsonl.gz \
       $EXP_DIR/cuts_per_segment_${part}_split$nj
   done
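Stage 3 shards the per-segment cuts into $nj pieces so Stage 5 can enhance them on multiple GPUs in parallel. A functionally similar split, sketched with lhotse's CutSet.split rather than the gss CLI:

    from lhotse import CutSet

    nj = 15  # matches $nj in the script
    cuts = CutSet.from_file("cuts_per_segment_eval_track1.jsonl.gz")
    for i, subset in enumerate(cuts.split(num_splits=nj), start=1):
        subset.to_file(f"cuts_per_segment_eval_track1.{i}.jsonl.gz")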
@@ -75,7 +75,7 @@ fi
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Enhance eval/test segments using GSS (using GPU)"
   # for eval/test, we use larger context and smaller batches to get better quality
-  for part in dev; do
+  for part in dev eval_track1; do
     for JOB in $(seq $nj); do
       gss enhance cuts $EXP_DIR/cuts_${part}.jsonl.gz \
         $EXP_DIR/cuts_per_segment_${part}_split$nj/cuts_per_segment_${part}.$JOB.jsonl.gz \
@@ -6,8 +6,8 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail

 nj=15
-stage=3
-stop_stage=3
+stage=4
+stop_stage=4

 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
@@ -83,19 +83,19 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
 fi

 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 3: Compute fbank for icmcasr"
+  log "Stage 4: Compute fbank for icmcasr"
   if [ ! -f data/fbank/.icmcasr.done ]; then
     mkdir -p data/fbank
     ./local/compute_fbank_icmcasr.py --perturb-speed True
     echo "Combining manifests"
     lhotse combine data/manifests/cuts_train_{ihm,ihm_rvb,sdm,gss}.jsonl.gz - | shuf |\
       gzip -c > data/manifests/cuts_train_all.jsonl.gz
     touch data/fbank/.icmcasr.done
   fi
 fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 4: Compute fbank for musan"
+  log "Stage 5: Compute fbank for musan"
   if [ ! -f data/fbank/.msuan.done ]; then
     mkdir -p data/fbank
     ./local/compute_fbank_musan.py
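Stage 4 combines the IHM, reverberated-IHM, SDM, and GSS training cuts and shuffles them into one manifest. The shell pipeline above maps onto lhotse's Python API roughly as:

    from lhotse import CutSet, combine

    variants = ["ihm", "ihm_rvb", "sdm", "gss"]
    cut_sets = [
        CutSet.from_file(f"data/manifests/cuts_train_{v}.jsonl.gz") for v in variants
    ]
    # `lhotse combine ... | shuf | gzip -c` in one expression; .to_file()
    # gzips automatically for .jsonl.gz paths.
    combine(*cut_sets).shuffle().to_file("data/manifests/cuts_train_all.jsonl.gz")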
@@ -105,7 +105,7 @@ fi

 lang_phone_dir=data/lang_phone
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 5: Prepare phone based lang"
+  log "Stage 6: Prepare G.fst"
   mkdir -p $lang_phone_dir

   (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
@@ -119,57 +119,3 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
   fi
 fi

-lang_char_dir=data/lang_char
-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-  log "Stage 6: Prepare char based lang"
-  mkdir -p $lang_char_dir
-  # We reuse words.txt from phone based lexicon
-  # so that the two can share G.pt later.
-
-  # The transcripts in training set, generated in stage 5
-  cp $lang_phone_dir/transcript_words.txt $lang_char_dir/transcript_words.txt
-
-  cat $dl_dir/icmcasr/data_icmcasr/transcript/icmcasr_transcript_v0.8.txt |
-    cut -d " " -f 2- > $lang_char_dir/text
-
-  (echo '<eps> 0'; echo '!SIL 1'; echo '<SPOKEN_NOISE> 2'; echo '<UNK> 3';) \
-    > $lang_char_dir/words.txt
-
-  cat $lang_char_dir/text | sed 's/ /\n/g' | sort -u | sed '/^$/d' \
-    | awk '{print $1" "NR+3}' >> $lang_char_dir/words.txt
-
-  num_lines=$(< $lang_char_dir/words.txt wc -l)
-  (echo "#0 $num_lines"; echo "<s> $(($num_lines + 1))"; echo "</s> $(($num_lines + 2))";) \
-    >> $lang_char_dir/words.txt
-
-  if [ ! -f $lang_char_dir/L_disambig.pt ]; then
-    ./local/prepare_char.py --lang-dir $lang_char_dir
-  fi
-
-  if [ ! -f $lang_char_dir/HLG.fst ]; then
-    ./local/prepare_lang_fst.py --lang-dir $lang_phone_dir --ngram-G ./data/lm/G_3_gram.fst.txt
-  fi
-fi
-
-if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-  log "Stage 7: Prepare Byte BPE based lang"
-
-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bbpe_${vocab_size}
-    mkdir -p $lang_dir
-
-    cp $lang_char_dir/words.txt $lang_dir
-    cp $lang_char_dir/text $lang_dir
-
-    if [ ! -f $lang_dir/bbpe.model ]; then
-      ./local/train_bbpe_model.py \
-        --lang-dir $lang_dir \
-        --vocab-size $vocab_size \
-        --transcript $lang_dir/text
-    fi
-
-    if [ ! -f $lang_dir/L_disambig.pt ]; then
-      ./local/prepare_lang_bbpe.py --lang-dir $lang_dir
-    fi
-  done
-fi
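For reference, the deleted char-lang block built words.txt by reserving ids 0-3 for the special symbols, numbering each unique transcript token from 4 upward (the awk NR+3), and appending #0, <s>, </s> with the next three ids. The same id scheme in Python, as a sketch:

    specials = ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>"]  # ids 0..3
    with open("data/lang_char/text") as f:
        tokens = sorted({tok for line in f for tok in line.split()})

    with open("data/lang_char/words.txt", "w") as f:
        for i, sym in enumerate(specials):
            f.write(f"{sym} {i}\n")
        for i, tok in enumerate(tokens, start=len(specials)):
            f.write(f"{tok} {i}\n")
        # #0, <s>, </s> take the next three ids, as in the echo lines.
        n = len(specials) + len(tokens)
        for i, sym in enumerate(["#0", "<s>", "</s>"]):
            f.write(f"{sym} {n + i}\n")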