This commit is contained in:
JinZr 2023-08-08 19:27:18 +08:00
parent e38afc407d
commit 5533c6278d
6 changed files with 47 additions and 41 deletions

View File

@ -225,6 +225,8 @@ class SwitchBoardAsrDataModule:
else: else:
logging.info("Disable MUSAN") logging.info("Disable MUSAN")
cuts_train = cuts_train.trim_to_supervisions(keep_overlapping=False)
if self.args.concatenate_cuts: if self.args.concatenate_cuts:
logging.info( logging.info(
f"Using cut concatenation with duration factor " f"Using cut concatenation with duration factor "
@ -392,25 +394,23 @@ class SwitchBoardAsrDataModule:
def train_all_cuts(self) -> CutSet: def train_all_cuts(self) -> CutSet:
logging.info("switchboard: About to get train cuts") logging.info("switchboard: About to get train cuts")
return ( return (
load_manifest_lazy(self.args.manifest_dir / "swbd_cuts_all.jsonl.gz") load_manifest_lazy(self.args.manifest_dir / "swbd" / "swbd_cuts_all.jsonl.gz")
.subset(last=2388) .subset(last=2388)
.trim_to_supervisions(keep_all_channels=True)
) )
@lru_cache() @lru_cache()
def dev_cuts(self) -> CutSet: def dev_cuts(self) -> CutSet:
logging.info("switchboard: About to get dev cuts") logging.info("switchboard: About to get dev cuts")
return ( return (
load_manifest_lazy(self.args.manifest_dir / "swbd_cuts_all.jsonl.gz") load_manifest_lazy(self.args.manifest_dir / "swbd" / "swbd_cuts_all.jsonl.gz")
.subset(first=50) .subset(first=50)
.trim_to_supervisions(keep_all_channels=True)
) )
@lru_cache() @lru_cache()
def test_eval2000_cuts(self) -> CutSet: def test_eval2000_cuts(self) -> CutSet:
logging.info("switchboard: About to get eval2000 cuts") logging.info("switchboard: About to get eval2000 cuts")
return load_manifest_lazy( return load_manifest_lazy(
self.args.manifest_dir / "swbd_cuts_eval2000.jsonl.gz" self.args.manifest_dir / "eval2000" / "eval2000_cuts_all.jsonl.gz"
) )
@lru_cache() @lru_cache()

View File

@ -66,7 +66,7 @@ def get_args():
parser.add_argument( parser.add_argument(
"--perturb-speed", "--perturb-speed",
type=str2bool, type=str2bool,
default=True, default=False,
help="""Perturb speed with factor 0.9 and 1.1 on train subset.""", help="""Perturb speed with factor 0.9 and 1.1 on train subset.""",
) )
@ -90,7 +90,7 @@ def compute_fbank_switchboard(
sp.load(bpe_model) sp.load(bpe_model)
if dataset is None: if dataset is None:
dataset_parts = ("all") dataset_parts = ("all",)
else: else:
dataset_parts = dataset.split(" ", -1) dataset_parts = dataset.split(" ", -1)
@ -152,7 +152,7 @@ if __name__ == "__main__":
logging.basicConfig(format=formatter, level=logging.INFO) logging.basicConfig(format=formatter, level=logging.INFO)
args = get_args() args = get_args()
logging.info(vars(args)) logging.info(vars(args))
for dir_name in ["swbd"]: for dir_name in ["swbd", "eval2000"]:
compute_fbank_switchboard( compute_fbank_switchboard(
dir_name=dir_name, dir_name=dir_name,
bpe_model=args.bpe_model, bpe_model=args.bpe_model,

10
egs/swbd/ASR/local/normalize_and_filter_supervisions.py Normal file → Executable file
View File

@ -119,6 +119,9 @@ class FisherSwbdNormalizer:
(re.compile(r"(\[.*?\])-"), r"\1"), (re.compile(r"(\[.*?\])-"), r"\1"),
# Just remove all dashes # Just remove all dashes
(re.compile(r"-"), r" "), (re.compile(r"-"), r" "),
# Fix an issue related to [vocalized-noise]
(re.compile(r"\[vocalized noise\]"), r"\[vocalized-noise\]"),
] ]
# unwanted symbols in the transcripts # unwanted symbols in the transcripts
@ -153,7 +156,7 @@ class FisherSwbdNormalizer:
# then clean up whitespace # then clean up whitespace
text = self.whitespace_regexp.sub(" ", text).strip() text = self.whitespace_regexp.sub(" ", text).strip()
return text return text.upper()
# fmt: on # fmt: on
@ -189,6 +192,7 @@ def main():
continue continue
writer.write(sup) writer.write(sup)
print(f"tot: {tot}, skip: {skip}")
def test(): def test():
@ -224,5 +228,5 @@ def test():
if __name__ == "__main__": if __name__ == "__main__":
test() # test(); exit()
# main() main()

View File

@ -249,7 +249,7 @@ def lexicon_to_fst(
lexicon: Lexicon, lexicon: Lexicon,
token2id: Dict[str, int], token2id: Dict[str, int],
word2id: Dict[str, int], word2id: Dict[str, int],
sil_token: str = "sil", sil_token: str = "SIL",
sil_prob: float = 0.5, sil_prob: float = 0.5,
need_self_loops: bool = False, need_self_loops: bool = False,
) -> k2.Fsa: ) -> k2.Fsa:
@ -346,7 +346,7 @@ def main():
args = get_args() args = get_args()
lang_dir = Path(args.lang_dir) lang_dir = Path(args.lang_dir)
lexicon_filename = lang_dir / "lexicon.txt" lexicon_filename = lang_dir / "lexicon.txt"
sil_token = "sil" sil_token = "SIL"
sil_prob = 0.5 sil_prob = 0.5
lexicon = read_lexicon(lexicon_filename) lexicon = read_lexicon(lexicon_filename)

View File

@ -178,7 +178,7 @@ def get_args():
parser.add_argument( parser.add_argument(
"--oov", "--oov",
type=str, type=str,
default="<unk>", default="<UNK>",
help="The out of vocabulary word in lexicon.", help="The out of vocabulary word in lexicon.",
) )

View File

@ -23,7 +23,8 @@ stop_stage=100
# - speech # - speech
dl_dir=./download dl_dir=./download
swbd1_dir="/export/corpora3/LDC/LDC97S62" # swbd1_dir="/export/corpora3/LDC/LDC97S62"
swbd1_dir=./download/LDC97S62/
# eval2000_dir contains the following files and directories # eval2000_dir contains the following files and directories
# downloaded from LDC website: # downloaded from LDC website:
@ -70,15 +71,14 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
if [ ! -e data/manifests/.swbd.done ]; then if [ ! -e data/manifests/.swbd.done ]; then
lhotse prepare switchboard --absolute-paths 1 --omit-silence $swbd1_dir data/manifests/swbd lhotse prepare switchboard --absolute-paths 1 --omit-silence $swbd1_dir data/manifests/swbd
./local/normalize_and_filter_supervisions.py \ ./local/normalize_and_filter_supervisions.py \
data/manifests/swbd/swbd_supervisions.jsonl \ data/manifests/swbd/swbd_supervisions_all.jsonl.gz \
data/manifests/swbd/swbd_supervisions_norm.jsonl data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz
cp data/manifests/swbd/swbd_recordings.jsonl data/manifests/recordings_swbd.jsonl mv data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz data/manifests/swbd/swbd_supervisions_all.jsonl.gz
lhotse prepare $eval2000_dir data/manifests_eval2000 lhotse prepare eval2000 --absolute-paths 1 $eval2000_dir data/manifests/eval2000
./local/normalize_eval2000.py \ ./local/normalize_eval2000.py \
data/manifests_eval2000/eval2000_supervisions_unnorm.jsonl.gz \ data/manifests/eval2000/eval2000_supervisions_unnorm.jsonl.gz \
data/manifests_eval2000/eval2000_supervisions_norm.jsonl.gz data/manifests/eval2000/eval2000_supervisions_all.jsonl.gz
./local/rt03_data_prep.sh $rt03_dir ./local/rt03_data_prep.sh $rt03_dir
@ -96,20 +96,6 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
rm data/local/${x}/text.org rm data/local/${x}/text.org
done done
python ./local/filter_empty_text.py --kaldi-data-dir data/local/eval2000
./utils/fix_data_dir.sh data/local/eval2000
lhotse kaldi import data/local/eval2000 8000 data/manifests_eval2000
mv data/manifests_eval2000/recordings.jsonl.gz data/manifests_eval2000/swbd_recordings_eval2000.jsonl.gz
mv data/manifests_eval2000/supervisions.jsonl.gz data/manifests_eval2000/swbd_supervisions_eval2000.jsonl.gz
python ./local/filter_empty_text.py --kaldi-data-dir data/local/rt03
./utils/fix_data_dir.sh data/local/rt03
lhotse kaldi import data/local/rt03 8000 data/manifests_rt03
mv data/manifests_rt03/recordings.jsonl.gz data/manifests_rt03/swbd_recordings_rt03.jsonl.gz
mv data/manifests_rt03/supervisions.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz
lhotse fix data/manifests_train/swbd_recordings_all.jsonl.gz data/manifests_train/swbd_supervisions_all.jsonl.gz data/manifests
lhotse fix data/manifests_eval2000/swbd_recordings_eval2000.jsonl.gz data/manifests_eval2000/swbd_supervisions_eval2000.jsonl.gz data/manifests
lhotse fix data/manifests_rt03/swbd_recordings_rt03.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz data/manifests lhotse fix data/manifests_rt03/swbd_recordings_rt03.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz data/manifests
touch data/manifests/.swbd.done touch data/manifests/.swbd.done
@ -128,7 +114,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
fi fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Compute fbank for switchboard" log "Stage 3: Compute fbank for SwitchBoard"
mkdir -p data/fbank mkdir -p data/fbank
if [ ! -e data/fbank/.swbd.done ]; then if [ ! -e data/fbank/.swbd.done ]; then
./local/compute_fbank_swbd.py ./local/compute_fbank_swbd.py
@ -150,13 +136,29 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
lang_dir=data/lang_phone lang_dir=data/lang_phone
mkdir -p $lang_dir mkdir -p $lang_dir
if ! which jq; then
echo "This script is intended to be used with jq but you have not installed jq
Note: in Linux, you can install jq with the following command:
1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
2. chmod +x ./jq
3. cp jq /usr/bin" && exit 1
fi
if [ ! -f $lang_dir/text ] || [ ! -s $lang_dir/text ]; then
log "Prepare text."
gunzip -c data/manifests/swbd/swbd_supervisions_all.jsonl.gz \
| jq '.text' | sed 's/"//g' > $lang_dir/text
fi
log "prepare dict" log "prepare dict"
cut -f 2- -d" " data/local/train/text >${lang_dir}/input.txt ./local/swbd1_prepare_dict.sh $swbd1_dir
cut -f 2- -d" " $lang_dir/text >${lang_dir}/input.txt
# [noise] nsn # [noise] nsn
# !sil sil # !sil sil
# <unk> spn # <unk> spn
cat data/local/dict_nosp/lexicon.txt | cat data/local/dict_nosp/lexicon.txt |
sort | uniq >$lang_dir/lexicon.txt sort | uniq >$lang_dir/lexicon_lower.txt
cat $lang_dir/lexicon_lower.txt | tr a-z A-Z > $lang_dir/lexicon.txt
if [ ! -f $lang_dir/L_disambig.pt ]; then if [ ! -f $lang_dir/L_disambig.pt ]; then
./local/prepare_lang.py --lang-dir $lang_dir ./local/prepare_lang.py --lang-dir $lang_dir
@ -192,7 +194,7 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
if [ ! -f $lang_dir/transcript_words.txt ]; then if [ ! -f $lang_dir/transcript_words.txt ]; then
log "Generate data for BPE training" log "Generate data for BPE training"
cat ./data/local/train/text | cut -d " " -f 2- >$lang_dir/transcript_words.txt cat data/lang_phone/text | cut -d " " -f 2- >$lang_dir/transcript_words.txt
fi fi
if [ ! -f $lang_dir/bpe.model ]; then if [ ! -f $lang_dir/bpe.model ]; then
@ -239,7 +241,7 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
./local/convert_transcript_words_to_tokens.py \ ./local/convert_transcript_words_to_tokens.py \
--lexicon $lang_dir/lexicon.txt \ --lexicon $lang_dir/lexicon.txt \
--transcript $lang_dir/transcript_words.txt \ --transcript $lang_dir/transcript_words.txt \
--oov "<unk>" \ --oov "<UNK>" \
>$lang_dir/transcript_tokens.txt >$lang_dir/transcript_tokens.txt
fi fi