From cb329d1342c67331b31042446db7d5e6ba0129b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Fri, 21 Jan 2022 19:20:57 +0000 Subject: [PATCH] fixes --- .../ASR/local/prepare_lang_g2pen.py | 19 ++++--------------- egs/fisher_swbd/ASR/prepare.sh | 8 ++++---- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py b/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py index 4768e1dc0..0549d7306 100755 --- a/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py +++ b/egs/fisher_swbd/ASR/local/prepare_lang_g2pen.py @@ -17,21 +17,10 @@ """ -This script takes as input a lexicon file "data/lang_phone/lexicon.txt" -consisting of words and tokens (i.e., phones) and does the following: - -1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt - -2. Generate tokens.txt, the token table mapping a token to a unique integer. - -3. Generate words.txt, the word table mapping a word to a unique integer. - -4. Generate L.pt, in k2 format. It can be loaded by - - d = torch.load("L.pt") - lexicon = k2.Fsa.from_dict(d) - -5. Generate L_disambig.pt, in k2 format. +This script takes as input a words.txt file "data/lang_phone/words.txt" +consisting of words and their IDs and creates a lexicon with the g2p_en Python package +(it's CMUdict based). It also creates the rest of the files typically expected in a lang +dir, including L.pt and Linv.pt. 
""" import argparse import math diff --git a/egs/fisher_swbd/ASR/prepare.sh b/egs/fisher_swbd/ASR/prepare.sh index 4a23bff5c..0f2562507 100755 --- a/egs/fisher_swbd/ASR/prepare.sh +++ b/egs/fisher_swbd/ASR/prepare.sh @@ -103,10 +103,10 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then # Combine Fisher and SWBD recordings and supervisions if $swbd_only; then - cp data/manifests/swbd/swbd_recordings.jsonl \ - data/manifests/fisher-swbd_recordings.jsonl.gz - cp data/manifests/swbd/swbd_supervisions.jsonl \ - data/manifests/fisher-swbd_supervisions.jsonl.gz + gunzip -c data/manifests/swbd/swbd_recordings.jsonl \ + > data/manifests/fisher-swbd_recordings.jsonl.gz + gunzip -c data/manifests/swbd/swbd_supervisions.jsonl \ + > data/manifests/fisher-swbd_supervisions.jsonl.gz else lhotse combine \ data/manifests/fisher/recordings.jsonl.gz \