fixes

2025-12-11 06:55:27 +00:00 · 2022-01-21 19:20:57 +00:00 · 2022-01-21 19:20:57 +00:00 · cb329d1342
commit cb329d1342
parent e76de3ba59
2 changed files with 8 additions and 19 deletions
--- a/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py
+++ b/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py
@ -17,21 +17,10 @@
 """
-This script takes as input a lexicon file "data/lang_phone/lexicon.txt"
+This script takes as input a words.txt file "data/lang_phone/words.txt"
-consisting of words and tokens (i.e., phones) and does the following:
+consisting of words and their IDs and creates a lexicon with g2p_en python package
-
+(it's CMUdict-based). It also creates the rest of the files typically expected in a lang
-1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt
+dir, including L.pt and Linv.pt.
 2. Generate tokens.txt, the token table mapping a token to a unique integer.
 3. Generate words.txt, the word table mapping a word to a unique integer.
 4. Generate L.pt, in k2 format. It can be loaded by
        d = torch.load("L.pt")
        lexicon = k2.Fsa.from_dict(d)
 5. Generate L_disambig.pt, in k2 format.
 """
 import argparse
 import math
--- a/egs/fisher_swbd/ASR/prepare.sh
+++ b/egs/fisher_swbd/ASR/prepare.sh
@ -103,10 +103,10 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  # Combine Fisher and SWBD recordings and supervisions
  if $swbd_only; then
-    cp data/manifests/swbd/swbd_recordings.jsonl \
+    gunzip -c data/manifests/swbd/swbd_recordings.jsonl \
-      data/manifests/fisher-swbd_recordings.jsonl.gz
+      > data/manifests/fisher-swbd_recordings.jsonl.gz
-    cp data/manifests/swbd/swbd_supervisions.jsonl \
+    gunzip -c data/manifests/swbd/swbd_supervisions.jsonl \
-      data/manifests/fisher-swbd_supervisions.jsonl.gz
+      > data/manifests/fisher-swbd_supervisions.jsonl.gz
  else
    lhotse combine \
      data/manifests/fisher/recordings.jsonl.gz \