fixes

2025-12-11 06:55:27 +00:00 · 2022-01-21 19:20:57 +00:00 · 2022-01-21 19:20:57 +00:00 · cb329d1342
commit cb329d1342
parent e76de3ba59
2 changed files with 8 additions and 19 deletions
--- a/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py
+++ b/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py
@ -17,21 +17,10 @@


 """
-This script takes as input a lexicon file "data/lang_phone/lexicon.txt"
-consisting of words and tokens (i.e., phones) and does the following:
-
-1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt
-
-2. Generate tokens.txt, the token table mapping a token to a unique integer.
-
-3. Generate words.txt, the word table mapping a word to a unique integer.
-
-4. Generate L.pt, in k2 format. It can be loaded by
-
-        d = torch.load("L.pt")
-        lexicon = k2.Fsa.from_dict(d)
-
-5. Generate L_disambig.pt, in k2 format.
+This script takes as input a words.txt file "data/lang_phone/words.txt"
+consisting of words and their IDs and creates a lexicon with the g2p_en Python package
+(which is CMUdict-based). It also creates the rest of the files typically expected in a lang
+dir, including L.pt and Linv.pt.
 """
 import argparse
 import math
--- a/egs/fisher_swbd/ASR/prepare.sh
+++ b/egs/fisher_swbd/ASR/prepare.sh
@ -103,10 +103,10 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then

  # Combine Fisher and SWBD recordings and supervisions
  if $swbd_only; then
-    cp data/manifests/swbd/swbd_recordings.jsonl \
-      data/manifests/fisher-swbd_recordings.jsonl.gz
-    cp data/manifests/swbd/swbd_supervisions.jsonl \
-      data/manifests/fisher-swbd_supervisions.jsonl.gz
+    gunzip -c data/manifests/swbd/swbd_recordings.jsonl \
+      > data/manifests/fisher-swbd_recordings.jsonl.gz
+    gunzip -c data/manifests/swbd/swbd_supervisions.jsonl \
+      > data/manifests/fisher-swbd_supervisions.jsonl.gz
  else
    lhotse combine \
      data/manifests/fisher/recordings.jsonl.gz \