This commit is contained in:
Piotr Żelasko 2022-01-21 19:20:57 +00:00
parent e76de3ba59
commit cb329d1342
2 changed files with 8 additions and 19 deletions

View File

@ -17,21 +17,10 @@
""" """
This script takes as input a lexicon file "data/lang_phone/lexicon.txt" This script takes as input a words.txt file "data/lang_phone/words.txt"
consisting of words and tokens (i.e., phones) and does the following: consisting of words and their IDs and creates a lexicon with g2p_en python package
(it's CMUdict based). It also creates the rest of the files typically expected in a lang
1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt dir, including L.pt and Linv.pt.
2. Generate tokens.txt, the token table mapping a token to a unique integer.
3. Generate words.txt, the word table mapping a word to a unique integer.
4. Generate L.pt, in k2 format. It can be loaded by
d = torch.load("L.pt")
lexicon = k2.Fsa.from_dict(d)
5. Generate L_disambig.pt, in k2 format.
""" """
import argparse import argparse
import math import math

View File

@ -103,10 +103,10 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
# Combine Fisher and SWBD recordings and supervisions # Combine Fisher and SWBD recordings and supervisions
if $swbd_only; then if $swbd_only; then
cp data/manifests/swbd/swbd_recordings.jsonl \ gunzip -c data/manifests/swbd/swbd_recordings.jsonl \
data/manifests/fisher-swbd_recordings.jsonl.gz > data/manifests/fisher-swbd_recordings.jsonl.gz
cp data/manifests/swbd/swbd_supervisions.jsonl \ gunzip -c data/manifests/swbd/swbd_supervisions.jsonl \
data/manifests/fisher-swbd_supervisions.jsonl.gz > data/manifests/fisher-swbd_supervisions.jsonl.gz
else else
lhotse combine \ lhotse combine \
data/manifests/fisher/recordings.jsonl.gz \ data/manifests/fisher/recordings.jsonl.gz \