diff --git a/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py b/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py index 4768e1dc0..0549d7306 100755 --- a/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py +++ b/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py @@ -17,21 +17,10 @@ """ -This script takes as input a lexicon file "data/lang_phone/lexicon.txt" -consisting of words and tokens (i.e., phones) and does the following: - -1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt - -2. Generate tokens.txt, the token table mapping a token to a unique integer. - -3. Generate words.txt, the word table mapping a word to a unique integer. - -4. Generate L.pt, in k2 format. It can be loaded by - - d = torch.load("L.pt") - lexicon = k2.Fsa.from_dict(d) - -5. Generate L_disambig.pt, in k2 format. +This script takes as input a words.txt file "data/lang_phone/words.txt" +consisting of words and their IDs and creates a lexicon with the g2p_en Python package +(it's CMUdict based). It also creates the rest of the files typically expected in a lang +dir, including L.pt and Linv.pt. """ import argparse import math diff --git a/egs/fisher_swbd/ASR/prepare.sh b/egs/fisher_swbd/ASR/prepare.sh index 4a23bff5c..0f2562507 100755 --- a/egs/fisher_swbd/ASR/prepare.sh +++ b/egs/fisher_swbd/ASR/prepare.sh @@ -103,10 +103,10 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then # Combine Fisher and SWBD recordings and supervisions if $swbd_only; then - cp data/manifests/swbd/swbd_recordings.jsonl \ - data/manifests/fisher-swbd_recordings.jsonl.gz - cp data/manifests/swbd/swbd_supervisions.jsonl \ - data/manifests/fisher-swbd_supervisions.jsonl.gz + gunzip -c data/manifests/swbd/swbd_recordings.jsonl \ + > data/manifests/fisher-swbd_recordings.jsonl.gz + gunzip -c data/manifests/swbd/swbd_supervisions.jsonl \ + > data/manifests/fisher-swbd_supervisions.jsonl.gz else lhotse combine \ data/manifests/fisher/recordings.jsonl.gz \