remove unwanted changes

2025-12-11 06:55:27 +00:00 · 2023-03-09 17:23:21 -05:00 · 2023-03-09 17:23:21 -05:00 · f2d8bf632f
commit f2d8bf632f
parent 8a8e827317
3 changed files with 42 additions and 19 deletions
--- a/egs/librispeech/ASR/generate-lm.sh
+++ b/egs/librispeech/ASR/generate-lm.sh
@@ -2,7 +2,7 @@

 lang_dir=data/lang_bpe_500

-for ngram in 2 3 5; do
+for ngram in 2 3 4 5; do
  if [ ! -f $lang_dir/${ngram}gram.arpa ]; then
    ./shared/make_kn_lm.py \
      -ngram-order ${ngram} \
--- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py
+++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
@@ -54,10 +54,20 @@ def get_args():
        help="""Path to the bpe.model. If not None, we will remove short and
        long utterances before extracting features""",
    )
+
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        help="""Dataset parts to compute fbank. If None, we will use all""",
+    )
+
    return parser.parse_args()


-def compute_fbank_librispeech(bpe_model: Optional[str] = None):
+def compute_fbank_librispeech(
+    bpe_model: Optional[str] = None,
+    dataset: Optional[str] = None,
+):
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    num_jobs = min(15, os.cpu_count())
@@ -68,15 +78,19 @@ def compute_fbank_librispeech(bpe_model: Optional[str] = None):
        sp = spm.SentencePieceProcessor()
        sp.load(bpe_model)

-    dataset_parts = (
-        "dev-clean",
-        "dev-other",
-        "test-clean",
-        "test-other",
-        "train-clean-100",
-        "train-clean-360",
-        "train-other-500",
-    )
+    if dataset is None:
+        dataset_parts = (
+            "dev-clean",
+            "dev-other",
+            "test-clean",
+            "test-other",
+            "train-clean-100",
+            "train-clean-360",
+            "train-other-500",
+        )
+    else:
+        dataset_parts = dataset.split(" ", -1)
+
    prefix = "librispeech"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
@@ -131,4 +145,4 @@ if __name__ == "__main__":
    logging.basicConfig(format=formatter, level=logging.INFO)
    args = get_args()
    logging.info(vars(args))
-    compute_fbank_librispeech(bpe_model=args.bpe_model)
+    compute_fbank_librispeech(bpe_model=args.bpe_model, dataset=args.dataset)
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -123,10 +123,12 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
    touch data/fbank/.librispeech.done
  fi

-  cat <(gunzip -c data/fbank/librispeech_cuts_train-clean-100.jsonl.gz) \
-    <(gunzip -c data/fbank/librispeech_cuts_train-clean-360.jsonl.gz) \
-    <(gunzip -c data/fbank/librispeech_cuts_train-other-500.jsonl.gz) | \
-    shuf | gzip -c > data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz
+  if [ ! -f data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz ]; then
+    cat <(gunzip -c data/fbank/librispeech_cuts_train-clean-100.jsonl.gz) \
+      <(gunzip -c data/fbank/librispeech_cuts_train-clean-360.jsonl.gz) \
+      <(gunzip -c data/fbank/librispeech_cuts_train-other-500.jsonl.gz) | \
+      shuf | gzip -c > data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz
+  fi

  if [ ! -e data/fbank/.librispeech-validated.done ]; then
    log "Validating data/fbank for LibriSpeech"
@@ -244,7 +246,7 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
 fi

 if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-  log "Stage 7: Prepare bigram P"
+  log "Stage 7: Prepare bigram token-level P for MMI training"

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}
@@ -302,13 +304,20 @@ fi
 if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
  log "Stage 9: Compile HLG"
  ./local/compile_hlg.py --lang-dir data/lang_phone
-  ./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone
+
+  # Note: if ./local/compile_hlg.py throws OOM,
+  # please switch to the following command:
+  #
+  # ./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}
    ./local/compile_hlg.py --lang-dir $lang_dir

-    ./local/compile_hlg_using_openfst.py --lang-dir $lang_dir
+    # Note: if ./local/compile_hlg.py throws OOM,
+    # please switch to the following command:
+    #
+    # ./local/compile_hlg_using_openfst.py --lang-dir $lang_dir
  done
 fi