Minor fixes to tedlium3 to make ./prepare.sh work. (#258)

2025-08-08 09:32:20 +00:00 · 2022-03-20 20:26:03 +08:00 · 2022-03-20 20:26:03 +08:00 · 910e6c9306
commit 910e6c9306
parent ad28c8c5eb
5 changed files with 31 additions and 21 deletions
--- a/egs/tedlium3/ASR/local/compute_fbank_tedlium.py
+++ b/egs/tedlium3/ASR/local/compute_fbank_tedlium.py
@@ -15,8 +15,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
 """
 This file computes fbank features of the TedLium3 dataset.
 It looks for manifests in the directory data/manifests.
@@ -77,11 +75,14 @@ def compute_fbank_tedlium():
                    + cut_set.perturb_speed(0.9)
                    + cut_set.perturb_speed(1.1)
                )
+            cur_num_jobs = num_jobs if ex is None else 80
+            cur_num_jobs = min(cur_num_jobs, len(cut_set))
+
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # when an executor is specified, make more partitions
-                num_jobs=num_jobs if ex is None else 80,
+                num_jobs=cur_num_jobs,
                executor=ex,
                storage_type=ChunkedLilcomHdf5Writer,
            )
--- a/egs/tedlium3/ASR/local/prepare_lexicon.py
+++ b/egs/tedlium3/ASR/local/prepare_lexicon.py
--- a/egs/tedlium3/ASR/prepare.sh
+++ b/egs/tedlium3/ASR/prepare.sh
@@ -71,31 +71,44 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
 fi

 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Prepare tedlium3 manifest"
-  # We assume that you have downloaded the tedlium3 corpus
-  # to $dl_dir/tedlium3
-  mkdir -p data/manifests
-  lhotse prepare tedlium $dl_dir/tedlium3 data/manifests
+  log "Stage 1: Prepare tedlium3 manifests"
+  if [ ! -f data/manifests/.tedlium3.done ]; then
+    # We assume that you have downloaded the tedlium3 corpus
+    # to $dl_dir/tedlium3
+    mkdir -p data/manifests
+    lhotse prepare tedlium $dl_dir/tedlium3 data/manifests
+    touch data/manifests/.tedlium3.done
+  fi
 fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Prepare musan manifest"
+  log "Stage 2: Prepare musan manifests"
  # We assume that you have downloaded the musan corpus
  # to $dl_dir/musan
-  mkdir -p data/manifests
-  lhotse prepare musan $dl_dir/musan data/manifests
+  if [ ! -e data/manifests/.musan.done ]; then
+    mkdir -p data/manifests
+    lhotse prepare musan $dl_dir/musan data/manifests
+    touch data/manifests/.musan.done
+  fi
 fi

 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Compute fbank for tedlium3"
-  mkdir -p data/fbank
-  ./local/compute_fbank_tedlium.py
+
+  if [ ! -e data/fbank/.tedlium3.done ]; then
+    mkdir -p data/fbank
+    python3 ./local/compute_fbank_tedlium.py
+    touch data/fbank/.tedlium3.done
+  fi
 fi

 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for musan"
-  mkdir -p data/fbank
-  ./local/compute_fbank_musan.py
+  if [ ! -e data/fbank/.musan.done ]; then
+    mkdir -p data/fbank
+    python3 ./local/compute_fbank_musan.py
+    touch data/fbank/.musan.done
+  fi
 fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
--- a/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py
+++ b/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py
@@ -278,7 +278,6 @@ class TedLiumAsrDataModule:
                shuffle=self.args.shuffle,
            )
        logging.info("About to create train dataloader")
-        # print(train)
        train_dl = DataLoader(
            train,
            sampler=train_sampler,
@@ -351,7 +350,6 @@ class TedLiumAsrDataModule:
    @lru_cache()
    def train_cuts(self) -> CutSet:
        logging.info("About to get train cuts")
-        print(self.args.manifest_dir)
        return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")

    @lru_cache()
--- a/egs/tedlium3/ASR/transducer_stateless/train.py
+++ b/egs/tedlium3/ASR/transducer_stateless/train.py
@@ -624,10 +624,8 @@ def run(rank, world_size, args):
    train_cuts = tedlium.train_cuts()

    def remove_short_and_long_utt(c: Cut):
-        # Keep only utterances with duration between 1 second and max seconds
-        # Here, we set max as 20.0.
-        # If you want to use a big max-duration, you can set it as 17.0.
-        return 1.0 <= c.duration <= 20.0
+        # Keep only utterances with duration between 1 second and 17 seconds
+        return 1.0 <= c.duration <= 17.0

    num_in_total = len(train_cuts)