mirror of https://github.com/k2-fsa/icefall.git
synced 2025-08-08 17:42:21 +00:00
Full libri fix manifest (#804)
* Modify the name of the directory of the VQ manifests
* Fix the missing manifest in full-libri training
This commit is contained in:
parent 2fd970b682
commit 80cce141b4
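The rename keys the extracted codebook artifacts by the full distillation setup (teacher model, embedding layer, number of codebooks) instead of the teacher model id alone. A minimal sketch of the paths this produces, assuming the values that appear in the diff below (embedding_layer=36, num_codebooks=8, the hubert_xtralarge teacher); exp_dir here is only a placeholder:

# Illustrative only: directory names implied by this commit.
exp_dir=./exp  # placeholder; the script defines its own exp_dir
teacher_model_id=hubert_xtralarge_ll60k_finetune_ls960
embedding_layer=36
num_codebooks=8

# old layout: $exp_dir/vq/$teacher_model_id and data/vq_fbank/
# new layout:
codebook_dir=$exp_dir/vq/${teacher_model_id}_layer${embedding_layer}_cb${num_codebooks}
vq_fbank=data/vq_fbank_layer${embedding_layer}_cb${num_codebooks}/
echo $codebook_dir   # .../vq/hubert_xtralarge_ll60k_finetune_ls960_layer36_cb8
echo $vq_fbank       # data/vq_fbank_layer36_cb8/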
@@ -43,7 +43,7 @@ mkdir -p $exp_dir
 # full_libri can be "True" or "False"
 # "True" -> use full librispeech dataset for distillation
 # "False" -> use train-clean-100 subset for distillation
-full_libri=False
+full_libri=True
 
 # use_extracted_codebook can be "True" or "False"
 # "True" -> stage 0 and stage 1 would be skipped,
@@ -145,8 +145,12 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
       log "Currently we only uploaded codebook indexes from teacher model hubert_xtralarge_ll60k_finetune_ls960"
       exit 1
     fi
+    # The codebook indexes to be downloaded are generated using the following setup:
+    embedding_layer=36
+    num_codebooks=8
+
     mkdir -p $exp_dir/vq
-    codebook_dir=$exp_dir/vq/$teacher_model_id
+    codebook_dir=$exp_dir/vq/${teacher_model_id}_layer${embedding_layer}_cb${num_codebooks}
     mkdir -p codebook_dir
     codebook_download_dir=$exp_dir/download_codebook
     if [ -d $codebook_download_dir ]; then
@@ -164,8 +168,9 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
     git lfs install
     git clone https://huggingface.co/marcoyang/pruned_transducer_stateless6_hubert_xtralarge_ll60k_finetune_ls960 $codebook_download_dir
 
-    mkdir -p data/vq_fbank
-    mv $codebook_download_dir/*.jsonl.gz data/vq_fbank/
+    vq_fbank=data/vq_fbank_layer${embedding_layer}_cb${num_codebooks}/
+    mkdir -p $vq_fbank
+    mv $codebook_download_dir/*.jsonl.gz $vq_fbank
     mkdir -p $codebook_dir/splits4
     mv $codebook_download_dir/*.h5 $codebook_dir/splits4/
     log "Remove $codebook_download_dir"
@@ -181,6 +186,15 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
     --max-duration 100 \
     --teacher-model-id $teacher_model_id \
     --use-extracted-codebook $use_extracted_codebook
+
+  if [ "$full_libri" == "True" ]; then
+    # Merge the 3 subsets and create a full one
+    rm ${vq_fbank}/librispeech_cuts_train-all-shuf.jsonl.gz
+    cat <(gunzip -c ${vq_fbank}/librispeech_cuts_train-clean-100.jsonl.gz) \
+      <(gunzip -c ${vq_fbank}/librispeech_cuts_train-clean-360.jsonl.gz) \
+      <(gunzip -c ${vq_fbank}/librispeech_cuts_train-other-500.jsonl.gz) | \
+      shuf | gzip -c > ${vq_fbank}/librispeech_cuts_train-all-shuf.jsonl.gz
+  fi
 fi
 
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
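The added block rebuilds librispeech_cuts_train-all-shuf.jsonl.gz from the three VQ-annotated subsets, which is the manifest that full-libri training was previously missing. A quick sanity check after stage 2 (an illustrative sketch, not part of the commit; the path assumes embedding_layer=36 and num_codebooks=8 as above):

# Hypothetical check: confirm the merged full-libri manifest exists and
# count its cuts (one cut per line in the gzipped jsonl manifest).
vq_fbank=data/vq_fbank_layer36_cb8
ls -lh ${vq_fbank}/librispeech_cuts_train-all-shuf.jsonl.gz
gunzip -c ${vq_fbank}/librispeech_cuts_train-all-shuf.jsonl.gz | wc -l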
@@ -68,7 +68,10 @@ class CodebookIndexExtractor:
     def init_dirs(self):
         # vq_dir is the root dir for quantization, containing:
         # training data, trained quantizer, and extracted codebook indexes
-        self.vq_dir = self.params.exp_dir / f"vq/{self.params.teacher_model_id}/"
+        self.vq_dir = (
+            self.params.exp_dir
+            / f"vq/{self.params.teacher_model_id}_layer{self.params.embedding_layer}_cb{self.params.num_codebooks}/"
+        )
         self.vq_dir.mkdir(parents=True, exist_ok=True)
 
         # manifest_dir contains:
@@ -79,7 +82,10 @@ class CodebookIndexExtractor:
         # It's doesn't matter whether ori_manifest_dir is str or Path.
         # Set it to Path to be consistent.
         self.ori_manifest_dir = Path("./data/fbank/")
-        self.dst_manifest_dir = Path("./data/vq_fbank/")
+        self.dst_manifest_dir = Path(
+            f"./data/vq_fbank_layer"
+            + f"{self.params.embedding_layer}_cb{self.params.num_codebooks}/"
+        )
 
         self.dst_manifest_dir.mkdir(parents=True, exist_ok=True)
 
@@ -284,7 +290,10 @@ class CodebookIndexExtractor:
         Merge generated vq included manfiests and storage to self.dst_manifest_dir.
         """
         for subset in self.params.subsets:
-            vq_manifests = f"{self.manifest_dir}/with_codebook_indexes-librispeech-cuts_train-{subset}*.jsonl.gz"
+            vq_manifests = (
+                f"{self.manifest_dir}/"
+                + f"with_codebook_indexes-librispeech-cuts_train-{subset}*.jsonl.gz"
+            )
             dst_vq_manifest = (
                 self.dst_manifest_dir / f"librispeech_cuts_train-{subset}-vq.jsonl.gz"
             )