mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-12 11:32:19 +00:00
Missing steps in prepare.sh
This commit is contained in:
parent
ccab93e8e2
commit
a7666d864c
@ -17,15 +17,34 @@ def get_args():
|
|||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
# Note: the functions "normalize" and "keep" implement the logic similar to
|
class FisherSwbdNormalizer:
|
||||||
# Kaldi's data prep scripts for Fisher:
|
"""
|
||||||
# https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
|
Note: the functions "normalize" and "keep" implement logic similar to
|
||||||
# and for SWBD:
|
Kaldi's data prep scripts for Fisher:
|
||||||
# https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
|
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
|
||||||
|
and for SWBD:
|
||||||
|
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
|
||||||
|
|
||||||
|
One notable difference is that we don't change [cough], [lipsmack], etc. to [noise].
|
||||||
|
We also don't implement all the edge cases of normalization from Kaldi
|
||||||
|
(hopefully this won't make too much of a difference).
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class Normalizer:
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
|
|
||||||
|
self.remove_regexp_before = re.compile(
|
||||||
|
r"|".join([
|
||||||
|
# special symbols
|
||||||
|
r"\[\[SKIP.*\]\]",
|
||||||
|
r"\[SKIP.*\]",
|
||||||
|
r"\[PAUSE.*\]",
|
||||||
|
r"\[SILENCE\]",
|
||||||
|
r"<B_ASIDE>",
|
||||||
|
r"<E_ASIDE>",
|
||||||
|
])
|
||||||
|
)
|
||||||
|
|
||||||
# tuples of (pattern, replacement)
|
# tuples of (pattern, replacement)
|
||||||
# note: Kaldi replaces sighs, coughs, etc with [noise].
|
# note: Kaldi replaces sighs, coughs, etc with [noise].
|
||||||
# We don't do that here.
|
# We don't do that here.
|
||||||
@ -63,19 +82,12 @@ class Normalizer:
|
|||||||
(re.compile(r"\s-\s"), r" "),
|
(re.compile(r"\s-\s"), r" "),
|
||||||
(re.compile(r"\s-\s"), r" "),
|
(re.compile(r"\s-\s"), r" "),
|
||||||
# special symbol with trailing dash
|
# special symbol with trailing dash
|
||||||
(re.compile(r"(\[\w+\])-"), r"\1"),
|
(re.compile(r"(\[.*?\])-"), r"\1"),
|
||||||
]
|
]
|
||||||
|
|
||||||
# unwanted symbols in the transcripts
|
# unwanted symbols in the transcripts
|
||||||
self.remove_regexp = re.compile(
|
self.remove_regexp_after = re.compile(
|
||||||
r"|".join([
|
r"|".join([
|
||||||
# special symbols
|
|
||||||
r"\[\[SKIP.*\]\]",
|
|
||||||
r"\[SKIP.*\]",
|
|
||||||
r"\[PAUSE.*\]",
|
|
||||||
r"\[SILENCE\]",
|
|
||||||
r"<B_ASIDE>",
|
|
||||||
r"<E_ASIDE>",
|
|
||||||
# remaining punctuation
|
# remaining punctuation
|
||||||
r"\.",
|
r"\.",
|
||||||
r",",
|
r",",
|
||||||
@ -92,12 +104,15 @@ class Normalizer:
|
|||||||
def normalize(self, text: str) -> str:
|
def normalize(self, text: str) -> str:
|
||||||
text = text.upper()
|
text = text.upper()
|
||||||
|
|
||||||
# first replace
|
# first remove
|
||||||
|
text = self.remove_regexp_before.sub("", text)
|
||||||
|
|
||||||
|
# then replace
|
||||||
for pattern, sub in self.replace_regexps:
|
for pattern, sub in self.replace_regexps:
|
||||||
text = pattern.sub(sub, text)
|
text = pattern.sub(sub, text)
|
||||||
|
|
||||||
# then remove
|
# then remove
|
||||||
text = self.remove_regexp.sub("", text)
|
text = self.remove_regexp_after.sub("", text)
|
||||||
|
|
||||||
# then clean up whitespace
|
# then clean up whitespace
|
||||||
text = self.whitespace_regexp.sub(" ", text).strip()
|
text = self.whitespace_regexp.sub(" ", text).strip()
|
||||||
@ -159,6 +174,8 @@ def test():
|
|||||||
"-[ADV]AN[TAGE]",
|
"-[ADV]AN[TAGE]",
|
||||||
"-[ADV]AN[TAGE]-",
|
"-[ADV]AN[TAGE]-",
|
||||||
"[WEA[SONABLE]-/REASONABLE]",
|
"[WEA[SONABLE]-/REASONABLE]",
|
||||||
|
"[VOCALIZED-NOISE]-",
|
||||||
|
"~BULL",
|
||||||
]:
|
]:
|
||||||
print(text)
|
print(text)
|
||||||
print(normalizer.normalize(text))
|
print(normalizer.normalize(text))
|
||||||
|
@ -56,12 +56,6 @@ log() {
|
|||||||
|
|
||||||
log "dl_dir: $dl_dir"
|
log "dl_dir: $dl_dir"
|
||||||
|
|
||||||
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
|
|
||||||
log "Stage -1: Download LM"
|
|
||||||
#[ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
|
|
||||||
#./local/download_lm.py --out-dir=$dl_dir/lm
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
||||||
log "Stage 0: Download data"
|
log "Stage 0: Download data"
|
||||||
|
|
||||||
@ -116,35 +110,60 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
|||||||
|
|
||||||
set -x
|
set -x
|
||||||
|
|
||||||
|
# Combine Fisher and SWBD recordings and supervisions
|
||||||
lhotse combine \
|
lhotse combine \
|
||||||
data/manifests/fisher/recordings.jsonl.gz \
|
data/manifests/fisher/recordings.jsonl.gz \
|
||||||
data/manifests/swbd/swbd_recordings.jsonl \
|
data/manifests/swbd/swbd_recordings.jsonl \
|
||||||
data/manifests/fisher-swbd_recordings.jsonl.gz
|
data/manifests/fisher-swbd_recordings.jsonl.gz
|
||||||
|
|
||||||
lhotse combine \
|
lhotse combine \
|
||||||
data/manifests/fisher/supervisions.jsonl.gz \
|
data/manifests/fisher/supervisions.jsonl.gz \
|
||||||
data/manifests/swbd/swbd_supervisions.jsonl \
|
data/manifests/swbd/swbd_supervisions.jsonl \
|
||||||
data/manifests/fisher-swbd_supervisions.jsonl.gz
|
data/manifests/fisher-swbd_supervisions.jsonl.gz
|
||||||
|
|
||||||
|
# Normalize text and remove supervisions that are not useful / hard to handle.
|
||||||
python local/normalize_and_filter_supervisions.py \
|
python local/normalize_and_filter_supervisions.py \
|
||||||
data/manifests/fisher-swbd_supervisions.jsonl.gz \
|
data/manifests/fisher-swbd_supervisions.jsonl.gz \
|
||||||
data/manifests/fisher-swbd_supervisions_norm.jsonl.gz \
|
data/manifests/fisher-swbd_supervisions_norm.jsonl.gz \
|
||||||
|
|
||||||
|
# Create cuts that span whole recording sessions.
|
||||||
lhotse cut simple \
|
lhotse cut simple \
|
||||||
-r data/manifests/fisher-swbd_recordings.jsonl.gz \
|
-r data/manifests/fisher-swbd_recordings.jsonl.gz \
|
||||||
-s data/manifests/fisher-swbd_supervisions_norm.jsonl.gz \
|
-s data/manifests/fisher-swbd_supervisions_norm.jsonl.gz \
|
||||||
data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz
|
data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz
|
||||||
|
|
||||||
|
# Shuffle the cuts (pure bash pipes are fast).
|
||||||
|
# We could technically skip this step but this helps ensure
|
||||||
|
# SWBD is not only seen towards the end of training.
|
||||||
gunzip -c data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz \
|
gunzip -c data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz \
|
||||||
| shuf \
|
| shuf \
|
||||||
| gzip -c \
|
| gzip -c \
|
||||||
> data/manifests/fisher-swbd_cuts.jsonl.gz
|
> data/manifests/fisher-swbd_cuts.jsonl.gz
|
||||||
|
|
||||||
|
# Create train/dev split -- 20 sessions for dev is roughly 2h, which should be enough.
|
||||||
|
num_cuts="$(gunzip -c data/manifests/fisher-swbd_cuts.jsonl.gz | wc -l)"
|
||||||
|
num_dev_sessions=20
|
||||||
|
lhotse subset --first $num_dev_sessions \
|
||||||
|
data/manifests/fisher-swbd_cuts.jsonl.gz \
|
||||||
|
data/manifests/dev_fisher-swbd_cuts.jsonl.gz
|
||||||
|
lhotse subset --last $((num_cuts-num_dev_sessions)) \
|
||||||
|
data/manifests/fisher-swbd_cuts.jsonl.gz \
|
||||||
|
data/manifests/train_fisher-swbd_cuts.jsonl.gz
|
||||||
|
|
||||||
|
# Finally, split the full-session cuts into one cut per supervision segment.
|
||||||
|
# If any segments overlap, the information about the overlaps is discarded.
|
||||||
|
# (overlaps are unlikely for this dataset because each cut sees only one channel).
|
||||||
|
lhotse cut trim-to-supervisions \
|
||||||
|
--discard-overlapping \
|
||||||
|
data/manifests/train_fisher-swbd_cuts.jsonl.gz \
|
||||||
|
data/manifests/train_utterances_fisher-swbd_cuts.jsonl.gz
|
||||||
|
lhotse cut trim-to-supervisions \
|
||||||
|
--discard-overlapping \
|
||||||
|
data/manifests/dev_fisher-swbd_cuts.jsonl.gz \
|
||||||
|
data/manifests/dev_utterances_fisher-swbd_cuts.jsonl.gz
|
||||||
|
|
||||||
set +x
|
set +x
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# TODO: optional stage 5, compute features
|
|
||||||
|
|
||||||
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
||||||
log "Stage 6: Dump transcripts for LM training"
|
log "Stage 6: Dump transcripts for LM training"
|
||||||
mkdir -p data/lm
|
mkdir -p data/lm
|
||||||
@ -154,18 +173,6 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
|||||||
> data/lm/transcript_words.txt
|
> data/lm/transcript_words.txt
|
||||||
fi
|
fi
|
||||||
|
|
||||||
#if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
|
||||||
# log "Stage 3: Compute fbank for librispeech"
|
|
||||||
# mkdir -p data/fbank
|
|
||||||
# ./local/compute_fbank_librispeech.py
|
|
||||||
#fi
|
|
||||||
#
|
|
||||||
#if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
|
||||||
# log "Stage 4: Compute fbank for musan"
|
|
||||||
# mkdir -p data/fbank
|
|
||||||
# ./local/compute_fbank_musan.py
|
|
||||||
#fi
|
|
||||||
|
|
||||||
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
|
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
|
||||||
log "Stage 7: Prepare lexicon using g2p_en"
|
log "Stage 7: Prepare lexicon using g2p_en"
|
||||||
lang_dir=data/lang_phone
|
lang_dir=data/lang_phone
|
||||||
@ -186,6 +193,12 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
|
|||||||
| awk '{print $0,NR+2}' \
|
| awk '{print $0,NR+2}' \
|
||||||
>> $lang_dir/words.txt
|
>> $lang_dir/words.txt
|
||||||
|
|
||||||
|
# Add remaining special word symbols expected by LM scripts.
|
||||||
|
num_words=$(wc -l $lang_dir/words.txt)
|
||||||
|
echo "<s> $((num_words))"
|
||||||
|
echo "</s> $((num_words+1))"
|
||||||
|
echo "#0 $((num_words+2))"
|
||||||
|
|
||||||
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
||||||
pip install g2p_en
|
pip install g2p_en
|
||||||
./local/prepare_lang_g2pen.py --lang-dir $lang_dir
|
./local/prepare_lang_g2pen.py --lang-dir $lang_dir
|
||||||
|
Loading…
x
Reference in New Issue
Block a user