mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-12 11:32:19 +00:00
Missing steps in prepare.sh
This commit is contained in:
parent
ccab93e8e2
commit
a7666d864c
@ -17,15 +17,34 @@ def get_args():
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
# Note: the functions "normalize" and "keep" implement the logic similar to
|
||||
# Kaldi's data prep scripts for Fisher:
|
||||
# https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
|
||||
# and for SWBD:
|
||||
# https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
|
||||
class FisherSwbdNormalizer:
|
||||
"""
|
||||
Note: the functions "normalize" and "keep" implement logic similar to
|
||||
Kaldi's data prep scripts for Fisher:
|
||||
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
|
||||
and for SWBD:
|
||||
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
|
||||
|
||||
One notable difference is that we don't change [cough], [lipsmack], etc. to [noise].
|
||||
We also don't implement all the edge cases of normalization from Kaldi
|
||||
(hopefully won't make too much difference).
|
||||
"""
|
||||
|
||||
|
||||
class Normalizer:
|
||||
def __init__(self) -> None:
|
||||
|
||||
self.remove_regexp_before = re.compile(
|
||||
r"|".join([
|
||||
# special symbols
|
||||
r"\[\[SKIP.*\]\]",
|
||||
r"\[SKIP.*\]",
|
||||
r"\[PAUSE.*\]",
|
||||
r"\[SILENCE\]",
|
||||
r"<B_ASIDE>",
|
||||
r"<E_ASIDE>",
|
||||
])
|
||||
)
|
||||
|
||||
# tuples of (pattern, replacement)
|
||||
# note: Kaldi replaces sighs, coughs, etc with [noise].
|
||||
# We don't do that here.
|
||||
@ -63,19 +82,12 @@ class Normalizer:
|
||||
(re.compile(r"\s-\s"), r" "),
|
||||
(re.compile(r"\s-\s"), r" "),
|
||||
# special symbol with trailing dash
|
||||
(re.compile(r"(\[\w+\])-"), r"\1"),
|
||||
(re.compile(r"(\[.*?\])-"), r"\1"),
|
||||
]
|
||||
|
||||
# unwanted symbols in the transcripts
|
||||
self.remove_regexp = re.compile(
|
||||
self.remove_regexp_after = re.compile(
|
||||
r"|".join([
|
||||
# special symbols
|
||||
r"\[\[SKIP.*\]\]",
|
||||
r"\[SKIP.*\]",
|
||||
r"\[PAUSE.*\]",
|
||||
r"\[SILENCE\]",
|
||||
r"<B_ASIDE>",
|
||||
r"<E_ASIDE>",
|
||||
# remaining punctuation
|
||||
r"\.",
|
||||
r",",
|
||||
@ -92,12 +104,15 @@ class Normalizer:
|
||||
def normalize(self, text: str) -> str:
|
||||
text = text.upper()
|
||||
|
||||
# first replace
|
||||
# first remove
|
||||
text = self.remove_regexp_before.sub("", text)
|
||||
|
||||
# then replace
|
||||
for pattern, sub in self.replace_regexps:
|
||||
text = pattern.sub(sub, text)
|
||||
|
||||
# then remove
|
||||
text = self.remove_regexp.sub("", text)
|
||||
text = self.remove_regexp_after.sub("", text)
|
||||
|
||||
# then clean up whitespace
|
||||
text = self.whitespace_regexp.sub(" ", text).strip()
|
||||
@ -159,6 +174,8 @@ def test():
|
||||
"-[ADV]AN[TAGE]",
|
||||
"-[ADV]AN[TAGE]-",
|
||||
"[WEA[SONABLE]-/REASONABLE]",
|
||||
"[VOCALIZED-NOISE]-",
|
||||
"~BULL",
|
||||
]:
|
||||
print(text)
|
||||
print(normalizer.normalize(text))
|
||||
|
@ -56,12 +56,6 @@ log() {
|
||||
|
||||
log "dl_dir: $dl_dir"
|
||||
|
||||
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
|
||||
log "Stage -1: Download LM"
|
||||
#[ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
|
||||
#./local/download_lm.py --out-dir=$dl_dir/lm
|
||||
fi
|
||||
|
||||
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
||||
log "Stage 0: Download data"
|
||||
|
||||
@ -116,35 +110,60 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
||||
|
||||
set -x
|
||||
|
||||
# Combine Fisher and SWBD recordings and supervisions
|
||||
lhotse combine \
|
||||
data/manifests/fisher/recordings.jsonl.gz \
|
||||
data/manifests/swbd/swbd_recordings.jsonl \
|
||||
data/manifests/fisher-swbd_recordings.jsonl.gz
|
||||
|
||||
lhotse combine \
|
||||
data/manifests/fisher/supervisions.jsonl.gz \
|
||||
data/manifests/swbd/swbd_supervisions.jsonl \
|
||||
data/manifests/fisher-swbd_supervisions.jsonl.gz
|
||||
|
||||
# Normalize text and remove supervisions that are not useful / hard to handle.
|
||||
python local/normalize_and_filter_supervisions.py \
|
||||
data/manifests/fisher-swbd_supervisions.jsonl.gz \
|
||||
data/manifests/fisher-swbd_supervisions_norm.jsonl.gz \
|
||||
|
||||
# Create cuts that span whole recording sessions.
|
||||
lhotse cut simple \
|
||||
-r data/manifests/fisher-swbd_recordings.jsonl.gz \
|
||||
-s data/manifests/fisher-swbd_supervisions_norm.jsonl.gz \
|
||||
data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz
|
||||
|
||||
# Shuffle the cuts (pure bash pipes are fast).
|
||||
# We could technically skip this step but this helps ensure
|
||||
# SWBD is not only seen towards the end of training.
|
||||
gunzip -c data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz \
|
||||
| shuf \
|
||||
| gzip -c \
|
||||
> data/manifests/fisher-swbd_cuts.jsonl.gz
|
||||
|
||||
# Create train/dev split -- 20 sessions for dev is roughly 2h, which should be enough.
|
||||
num_cuts="$(gunzip -c data/manifests/fisher-swbd_cuts.jsonl.gz | wc -l)"
|
||||
num_dev_sessions=20
|
||||
lhotse subset --first $num_dev_sessions \
|
||||
data/manifests/fisher-swbd_cuts.jsonl.gz \
|
||||
data/manifests/dev_fisher-swbd_cuts.jsonl.gz
|
||||
lhotse subset --last $((num_cuts-num_dev_sessions)) \
|
||||
data/manifests/fisher-swbd_cuts.jsonl.gz \
|
||||
data/manifests/train_fisher-swbd_cuts.jsonl.gz
|
||||
|
||||
# Finally, split the full-session cuts into one cut per supervision segment.
|
||||
# In case any segments are overlapping we would discard the info about overlaps.
|
||||
# (overlaps are unlikely for this dataset because each cut sees only one channel).
|
||||
lhotse cut trim-to-supervisions \
|
||||
--discard-overlapping \
|
||||
data/manifests/train_fisher-swbd_cuts.jsonl.gz \
|
||||
data/manifests/train_utterances_fisher-swbd_cuts.jsonl.gz
|
||||
lhotse cut trim-to-supervisions \
|
||||
--discard-overlapping \
|
||||
data/manifests/dev_fisher-swbd_cuts.jsonl.gz \
|
||||
data/manifests/dev_utterances_fisher-swbd_cuts.jsonl.gz
|
||||
|
||||
set +x
|
||||
fi
|
||||
|
||||
# TODO: optional stage 5, compute features
|
||||
|
||||
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
||||
log "Stage 6: Dump transcripts for LM training"
|
||||
mkdir -p data/lm
|
||||
@ -154,18 +173,6 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
||||
> data/lm/transcript_words.txt
|
||||
fi
|
||||
|
||||
#if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
# log "Stage 3: Compute fbank for librispeech"
|
||||
# mkdir -p data/fbank
|
||||
# ./local/compute_fbank_librispeech.py
|
||||
#fi
|
||||
#
|
||||
#if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
||||
# log "Stage 4: Compute fbank for musan"
|
||||
# mkdir -p data/fbank
|
||||
# ./local/compute_fbank_musan.py
|
||||
#fi
|
||||
|
||||
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
|
||||
log "Stage 7: Prepare lexicon using g2p_en"
|
||||
lang_dir=data/lang_phone
|
||||
@ -186,6 +193,12 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
|
||||
| awk '{print $0,NR+2}' \
|
||||
>> $lang_dir/words.txt
|
||||
|
||||
# Add remaining special word symbols expected by LM scripts.
# Feed the file through stdin so that `wc -l` prints only the line count;
# with a file argument it prints "<count> <filename>", which is not a valid
# operand for the arithmetic expansions below.
# NOTE(review): assumes ids in words.txt are contiguous starting at 0, so the
# current line count is the next free id -- confirm against the lines above.
num_words=$(wc -l < $lang_dir/words.txt)
# Append to words.txt; a bare `echo` would only print the symbols to stdout
# and they would never reach the symbol table.
echo "<s> $((num_words))" >> $lang_dir/words.txt
echo "</s> $((num_words+1))" >> $lang_dir/words.txt
echo "#0 $((num_words+2))" >> $lang_dir/words.txt
|
||||
|
||||
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
||||
pip install g2p_en
|
||||
./local/prepare_lang_g2pen.py --lang-dir $lang_dir
|
||||
|
Loading…
x
Reference in New Issue
Block a user