Full libri fix manifest (#804)

* modify the name of the directory of vq manifest

* fix missing manifest in full libri training
This commit is contained in:
marcoyang1998 2023-01-03 15:40:53 +08:00 committed by GitHub
parent 2fd970b682
commit 80cce141b4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 30 additions and 7 deletions

View File

@ -43,7 +43,7 @@ mkdir -p $exp_dir
# full_libri can be "True" or "False"
# "True" -> use full librispeech dataset for distillation
# "False" -> use train-clean-100 subset for distillation
full_libri=False
full_libri=True
# use_extracted_codebook can be "True" or "False"
# "True" -> stage 0 and stage 1 would be skipped,
@ -145,8 +145,12 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Currently we only uploaded codebook indexes from teacher model hubert_xtralarge_ll60k_finetune_ls960"
exit 1
fi
# The codebook indexes to be downloaded are generated using the following setup:
embedding_layer=36
num_codebooks=8
mkdir -p $exp_dir/vq
codebook_dir=$exp_dir/vq/$teacher_model_id
codebook_dir=$exp_dir/vq/${teacher_model_id}_layer${embedding_layer}_cb${num_codebooks}
mkdir -p $codebook_dir
codebook_download_dir=$exp_dir/download_codebook
if [ -d $codebook_download_dir ]; then
@ -164,8 +168,9 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
git lfs install
git clone https://huggingface.co/marcoyang/pruned_transducer_stateless6_hubert_xtralarge_ll60k_finetune_ls960 $codebook_download_dir
mkdir -p data/vq_fbank
mv $codebook_download_dir/*.jsonl.gz data/vq_fbank/
vq_fbank=data/vq_fbank_layer${embedding_layer}_cb${num_codebooks}/
mkdir -p $vq_fbank
mv $codebook_download_dir/*.jsonl.gz $vq_fbank
mkdir -p $codebook_dir/splits4
mv $codebook_download_dir/*.h5 $codebook_dir/splits4/
log "Remove $codebook_download_dir"
@ -181,6 +186,15 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
--max-duration 100 \
--teacher-model-id $teacher_model_id \
--use-extracted-codebook $use_extracted_codebook
if [ "$full_libri" == "True" ]; then
# Merge the 3 subsets and create a full one
rm ${vq_fbank}/librispeech_cuts_train-all-shuf.jsonl.gz
cat <(gunzip -c ${vq_fbank}/librispeech_cuts_train-clean-100.jsonl.gz) \
<(gunzip -c ${vq_fbank}/librispeech_cuts_train-clean-360.jsonl.gz) \
<(gunzip -c ${vq_fbank}/librispeech_cuts_train-other-500.jsonl.gz) | \
shuf | gzip -c > ${vq_fbank}/librispeech_cuts_train-all-shuf.jsonl.gz
fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then

View File

@ -68,7 +68,10 @@ class CodebookIndexExtractor:
def init_dirs(self):
# vq_dir is the root dir for quantization, containing:
# training data, trained quantizer, and extracted codebook indexes
self.vq_dir = self.params.exp_dir / f"vq/{self.params.teacher_model_id}/"
self.vq_dir = (
self.params.exp_dir
/ f"vq/{self.params.teacher_model_id}_layer{self.params.embedding_layer}_cb{self.params.num_codebooks}/"
)
self.vq_dir.mkdir(parents=True, exist_ok=True)
# manifest_dir contains:
@ -79,7 +82,10 @@ class CodebookIndexExtractor:
# It doesn't matter whether ori_manifest_dir is str or Path.
# Set it to Path to be consistent.
self.ori_manifest_dir = Path("./data/fbank/")
self.dst_manifest_dir = Path("./data/vq_fbank/")
self.dst_manifest_dir = Path(
f"./data/vq_fbank_layer"
+ f"{self.params.embedding_layer}_cb{self.params.num_codebooks}/"
)
self.dst_manifest_dir.mkdir(parents=True, exist_ok=True)
@ -284,7 +290,10 @@ class CodebookIndexExtractor:
Merge the generated vq-included manifests and store them in self.dst_manifest_dir.
"""
for subset in self.params.subsets:
vq_manifests = f"{self.manifest_dir}/with_codebook_indexes-librispeech-cuts_train-{subset}*.jsonl.gz"
vq_manifests = (
f"{self.manifest_dir}/"
+ f"with_codebook_indexes-librispeech-cuts_train-{subset}*.jsonl.gz"
)
dst_vq_manifest = (
self.dst_manifest_dir / f"librispeech_cuts_train-{subset}-vq.jsonl.gz"
)