From cd14b3fe1cbb27dcf6e8a1f9fc4155d6b7826bb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Thu, 16 Jun 2022 16:53:31 +0000 Subject: [PATCH] Update file names in Hubert VQ recipe --- .../ASR/distillation_with_hubert.sh | 6 +++--- .../pruned_transducer_stateless6/vq_utils.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/egs/librispeech/ASR/distillation_with_hubert.sh b/egs/librispeech/ASR/distillation_with_hubert.sh index e18ba8f55..94e06e111 100644 --- a/egs/librispeech/ASR/distillation_with_hubert.sh +++ b/egs/librispeech/ASR/distillation_with_hubert.sh @@ -30,10 +30,10 @@ stage=$1 # even you only have ONE GPU. It needed by CodebookIndexExtractor to determine numbert of jobs to extract codebook indexes parallelly. # Suppose only one GPU exists: -# export CUDA_VISIBLE_DEVICES="0" +export CUDA_VISIBLE_DEVICES="0" # # Suppose GPU 2,3,4,5 are available. -export CUDA_VISIBLE_DEVICES="2,3,4,5" +# export CUDA_VISIBLE_DEVICES="2,3,4,5" if [ $stage -eq 0 ]; then @@ -71,7 +71,7 @@ if [ $stage -eq 0 ]; then if [ -f ${hubert_model} ]; then echo "hubert model alread exists." 
else - wget -c https://dl.fbaipublicfiles.com/hubert/${model_id} -P ${hubert_model} + wget -c https://dl.fbaipublicfiles.com/hubert/${model_id}.pt -P ${hubert_model_dir} wget -c wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt -P ${hubert_model_dir} fi fi diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py b/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py index c4935f921..d11696bb2 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py @@ -36,7 +36,7 @@ from icefall.utils import ( AttributeDict, setup_logger, ) -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy from lhotse.features.io import NumpyHdf5Writer @@ -222,7 +222,7 @@ class CodebookIndexExtractor: """ for subset in self.params.subsets: logging.info(f"About to split {subset}.") - ori_manifest = f"./data/fbank/cuts_train-{subset}.json.gz" + ori_manifest = f"./data/fbank/librispeech_cuts_train-{subset}.jsonl.gz" split_cmd = f"lhotse split {self.params.world_size} {ori_manifest} {self.manifest_dir}" os.system(f"{split_cmd}") @@ -231,9 +231,9 @@ class CodebookIndexExtractor: Merge generated vq included manfiests and storage to self.dst_manifest_dir. 
""" for subset in self.params.subsets: - vq_manifests = f"{self.manifest_dir}/with_codebook_indexes-cuts_train-{subset}*.json.gz" + vq_manifests = f"{self.manifest_dir}/with_codebook_indexes-cuts_train-{subset}*.jsonl.gz" dst_vq_manifest = ( - self.dst_manifest_dir / f"cuts_train-{subset}.json.gz" + self.dst_manifest_dir / f"cuts_train-{subset}.jsonl.gz" ) if 1 == self.params.world_size: merge_cmd = f"cp {vq_manifests} {dst_vq_manifest}" @@ -294,14 +294,14 @@ class CodebookIndexExtractor: def load_ori_dl(self, subset): if self.params.world_size == 1: - ori_manifest_path = f"./data/fbank/cuts_train-{subset}.json.gz" + ori_manifest_path = f"./data/fbank/librispeech_cuts_train-{subset}.jsonl.gz" else: ori_manifest_path = ( self.manifest_dir - / f"cuts_train-{subset}.{self.params.manifest_index}.json.gz" + / f"librispeech_cuts_train-{subset}.{self.params.manifest_index}.jsonl.gz" ) - cuts = load_manifest(ori_manifest_path) + cuts = load_manifest_lazy(ori_manifest_path) dl = LibriSpeechAsrDataModule(self.params).train_dataloaders(cuts) return dl @@ -373,9 +373,9 @@ class CodebookIndexExtractor: json_file_path = ( self.manifest_dir - / f"with_codebook_indexes-cuts_train-{manifest_file_id}.json.gz" + / f"with_codebook_indexes-cuts_train-{manifest_file_id}.jsonl.gz" ) - CutSet.from_cuts(cuts).to_json(json_file_path) + CutSet.from_cuts(cuts).to_file(json_file_path) @torch.no_grad()