From cb329d1342c67331b31042446db7d5e6ba0129b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <petezor@gmail.com>
Date: Fri, 21 Jan 2022 19:20:57 +0000
Subject: [PATCH] fixes

---
 .../ASR/local/prepare_lang_g2pen.py           | 19 ++++---------------
 egs/fisher_swbd/ASR/prepare.sh                |  8 ++++----
 2 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py b/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py
index 4768e1dc0..0549d7306 100755
--- a/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py
+++ b/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py
@@ -17,21 +17,10 @@
 
 
 """
-This script takes as input a lexicon file "data/lang_phone/lexicon.txt"
-consisting of words and tokens (i.e., phones) and does the following:
-
-1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt
-
-2. Generate tokens.txt, the token table mapping a token to a unique integer.
-
-3. Generate words.txt, the word table mapping a word to a unique integer.
-
-4. Generate L.pt, in k2 format. It can be loaded by
-
-        d = torch.load("L.pt")
-        lexicon = k2.Fsa.from_dict(d)
-
-5. Generate L_disambig.pt, in k2 format.
+This script takes as input a words.txt file "data/lang_phone/words.txt"
+consisting of words and their IDs and creates a lexicon with the g2p_en Python
+package (it is CMUdict-based). It also creates the rest of the files typically
+expected in a lang dir, including L.pt and Linv.pt.
 """
 import argparse
 import math
diff --git a/egs/fisher_swbd/ASR/prepare.sh b/egs/fisher_swbd/ASR/prepare.sh
index 4a23bff5c..0f2562507 100755
--- a/egs/fisher_swbd/ASR/prepare.sh
+++ b/egs/fisher_swbd/ASR/prepare.sh
@@ -103,10 +103,10 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 
   # Combine Fisher and SWBD recordings and supervisions
   if $swbd_only; then
-    cp data/manifests/swbd/swbd_recordings.jsonl \
-      data/manifests/fisher-swbd_recordings.jsonl.gz
-    cp data/manifests/swbd/swbd_supervisions.jsonl \
-      data/manifests/fisher-swbd_supervisions.jsonl.gz
+    gzip -c data/manifests/swbd/swbd_recordings.jsonl \
+      > data/manifests/fisher-swbd_recordings.jsonl.gz  # compress so content matches the .gz name
+    gzip -c data/manifests/swbd/swbd_supervisions.jsonl \
+      > data/manifests/fisher-swbd_supervisions.jsonl.gz  # compress so content matches the .gz name
   else
     lhotse combine \
       data/manifests/fisher/recordings.jsonl.gz \