From cd14b3fe1cbb27dcf6e8a1f9fc4155d6b7826bb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Thu, 16 Jun 2022 16:53:31 +0000 Subject: [PATCH] Update file names in Hubert VQ recipe --- .../ASR/distillation_with_hubert.sh | 6 +++--- .../pruned_transducer_stateless6/vq_utils.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/egs/librispeech/ASR/distillation_with_hubert.sh b/egs/librispeech/ASR/distillation_with_hubert.sh index e18ba8f55..94e06e111 100644 --- a/egs/librispeech/ASR/distillation_with_hubert.sh +++ b/egs/librispeech/ASR/distillation_with_hubert.sh @@ -30,10 +30,10 @@ stage=$1 # even you only have ONE GPU. It needed by CodebookIndexExtractor to determine numbert of jobs to extract codebook indexes parallelly. # Suppose only one GPU exists: -# export CUDA_VISIBLE_DEVICES="0" +export CUDA_VISIBLE_DEVICES="0" # # Suppose GPU 2,3,4,5 are available. -export CUDA_VISIBLE_DEVICES="2,3,4,5" +# export CUDA_VISIBLE_DEVICES="2,3,4,5" if [ $stage -eq 0 ]; then @@ -71,7 +71,7 @@ if [ $stage -eq 0 ]; then if [ -f ${hubert_model} ]; then echo "hubert model alread exists." 
else - wget -c https://dl.fbaipublicfiles.com/hubert/${model_id} -P ${hubert_model} + wget -c https://dl.fbaipublicfiles.com/hubert/${model_id}.pt -P ${hubert_model_dir} wget -c wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt -P ${hubert_model_dir} fi fi diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py b/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py index c4935f921..d11696bb2 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py @@ -36,7 +36,7 @@ from icefall.utils import ( AttributeDict, setup_logger, ) -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy from lhotse.features.io import NumpyHdf5Writer @@ -222,7 +222,7 @@ class CodebookIndexExtractor: """ for subset in self.params.subsets: logging.info(f"About to split {subset}.") - ori_manifest = f"./data/fbank/cuts_train-{subset}.json.gz" + ori_manifest = f"./data/fbank/librispeech_cuts_train-{subset}.jsonl.gz" split_cmd = f"lhotse split {self.params.world_size} {ori_manifest} {self.manifest_dir}" os.system(f"{split_cmd}") @@ -231,9 +231,9 @@ class CodebookIndexExtractor: Merge generated vq included manfiests and storage to self.dst_manifest_dir. 
""" for subset in self.params.subsets: - vq_manifests = f"{self.manifest_dir}/with_codebook_indexes-cuts_train-{subset}*.json.gz" + vq_manifests = f"{self.manifest_dir}/with_codebook_indexes-cuts_train-{subset}*.jsonl.gz" dst_vq_manifest = ( - self.dst_manifest_dir / f"cuts_train-{subset}.json.gz" + self.dst_manifest_dir / f"cuts_train-{subset}.jsonl.gz" ) if 1 == self.params.world_size: merge_cmd = f"cp {vq_manifests} {dst_vq_manifest}" @@ -294,14 +294,14 @@ class CodebookIndexExtractor: def load_ori_dl(self, subset): if self.params.world_size == 1: - ori_manifest_path = f"./data/fbank/cuts_train-{subset}.json.gz" + ori_manifest_path = f"./data/fbank/librispeech_cuts_train-{subset}.jsonl.gz" else: ori_manifest_path = ( self.manifest_dir - / f"cuts_train-{subset}.{self.params.manifest_index}.json.gz" + / f"librispeech_cuts_train-{subset}.{self.params.manifest_index}.jsonl.gz" ) - cuts = load_manifest(ori_manifest_path) + cuts = load_manifest_lazy(ori_manifest_path) dl = LibriSpeechAsrDataModule(self.params).train_dataloaders(cuts) return dl @@ -373,9 +373,9 @@ class CodebookIndexExtractor: json_file_path = ( self.manifest_dir - / f"with_codebook_indexes-cuts_train-{manifest_file_id}.json.gz" + / f"with_codebook_indexes-cuts_train-{manifest_file_id}.jsonl.gz" ) - CutSet.from_cuts(cuts).to_json(json_file_path) + CutSet.from_cuts(cuts).to_file(json_file_path) @torch.no_grad()