From 5533c6278d146e59557bc15f142d523a75203058 Mon Sep 17 00:00:00 2001 From: JinZr <60612200+JinZr@users.noreply.github.com> Date: Tue, 8 Aug 2023 19:27:18 +0800 Subject: [PATCH] updated --- egs/swbd/ASR/conformer_ctc/asr_datamodule.py | 10 ++-- egs/swbd/ASR/local/compute_fbank_swbd.py | 6 +- .../normalize_and_filter_supervisions.py | 10 +++- egs/swbd/ASR/local/prepare_lang.py | 4 +- egs/swbd/ASR/local/prepare_lang_bpe.py | 2 +- egs/swbd/ASR/prepare.sh | 56 ++++++++++--------- 6 files changed, 47 insertions(+), 41 deletions(-) mode change 100644 => 100755 egs/swbd/ASR/local/normalize_and_filter_supervisions.py diff --git a/egs/swbd/ASR/conformer_ctc/asr_datamodule.py b/egs/swbd/ASR/conformer_ctc/asr_datamodule.py index c45f2cbd0..41da5ab3d 100644 --- a/egs/swbd/ASR/conformer_ctc/asr_datamodule.py +++ b/egs/swbd/ASR/conformer_ctc/asr_datamodule.py @@ -225,6 +225,8 @@ class SwitchBoardAsrDataModule: else: logging.info("Disable MUSAN") + cuts_train = cuts_train.trim_to_supervisions(keep_overlapping=False) + if self.args.concatenate_cuts: logging.info( f"Using cut concatenation with duration factor " @@ -392,25 +394,23 @@ class SwitchBoardAsrDataModule: def train_all_cuts(self) -> CutSet: logging.info("switchboard: About to get train cuts") return ( - load_manifest_lazy(self.args.manifest_dir / "swbd_cuts_all.jsonl.gz") + load_manifest_lazy(self.args.manifest_dir / "swbd" / "swbd_cuts_all.jsonl.gz") .subset(last=2388) - .trim_to_supervisions(keep_all_channels=True) ) @lru_cache() def dev_cuts(self) -> CutSet: logging.info("switchboard: About to get dev cuts") return ( - load_manifest_lazy(self.args.manifest_dir / "swbd_cuts_all.jsonl.gz") + load_manifest_lazy(self.args.manifest_dir / "swbd" / "swbd_cuts_all.jsonl.gz") .subset(first=50) - .trim_to_supervisions(keep_all_channels=True) ) @lru_cache() def test_eval2000_cuts(self) -> CutSet: logging.info("switchboard: About to get eval2000 cuts") return load_manifest_lazy( - self.args.manifest_dir / 
"swbd_cuts_eval2000.jsonl.gz" + self.args.manifest_dir / "eval2000" / "eval2000_cuts_all.jsonl.gz" ) @lru_cache() diff --git a/egs/swbd/ASR/local/compute_fbank_swbd.py b/egs/swbd/ASR/local/compute_fbank_swbd.py index 556ef9e5a..cab516446 100755 --- a/egs/swbd/ASR/local/compute_fbank_swbd.py +++ b/egs/swbd/ASR/local/compute_fbank_swbd.py @@ -66,7 +66,7 @@ def get_args(): parser.add_argument( "--perturb-speed", type=str2bool, - default=True, + default=False, help="""Perturb speed with factor 0.9 and 1.1 on train subset.""", ) @@ -90,7 +90,7 @@ def compute_fbank_switchboard( sp.load(bpe_model) if dataset is None: - dataset_parts = ("all") + dataset_parts = ("all",) else: dataset_parts = dataset.split(" ", -1) @@ -152,7 +152,7 @@ if __name__ == "__main__": logging.basicConfig(format=formatter, level=logging.INFO) args = get_args() logging.info(vars(args)) - for dir_name in ["swbd"]: + for dir_name in ["swbd", "eval2000"]: compute_fbank_switchboard( dir_name=dir_name, bpe_model=args.bpe_model, diff --git a/egs/swbd/ASR/local/normalize_and_filter_supervisions.py b/egs/swbd/ASR/local/normalize_and_filter_supervisions.py old mode 100644 new mode 100755 index 62f7efc68..9970e112a --- a/egs/swbd/ASR/local/normalize_and_filter_supervisions.py +++ b/egs/swbd/ASR/local/normalize_and_filter_supervisions.py @@ -119,6 +119,9 @@ class FisherSwbdNormalizer: (re.compile(r"(\[.*?\])-"), r"\1"), # Just remove all dashes (re.compile(r"-"), r" "), + + # Fix an issue related to [vocalized-noise] + (re.compile(r"\[vocalized noise\]"), r"\[vocalized-noise\]"), ] # unwanted symbols in the transcripts @@ -153,7 +156,7 @@ class FisherSwbdNormalizer: # then clean up whitespace text = self.whitespace_regexp.sub(" ", text).strip() - return text + return text.upper() # fmt: on @@ -189,6 +192,7 @@ def main(): continue writer.write(sup) + print(f"tot: {tot}, skip: {skip}") def test(): @@ -224,5 +228,5 @@ def test(): if __name__ == "__main__": - test() -# main() + # test(); exit() + main() diff --git 
a/egs/swbd/ASR/local/prepare_lang.py b/egs/swbd/ASR/local/prepare_lang.py index f7d4609a9..d913756a1 100755 --- a/egs/swbd/ASR/local/prepare_lang.py +++ b/egs/swbd/ASR/local/prepare_lang.py @@ -249,7 +249,7 @@ def lexicon_to_fst( lexicon: Lexicon, token2id: Dict[str, int], word2id: Dict[str, int], - sil_token: str = "sil", + sil_token: str = "SIL", sil_prob: float = 0.5, need_self_loops: bool = False, ) -> k2.Fsa: @@ -346,7 +346,7 @@ def main(): args = get_args() lang_dir = Path(args.lang_dir) lexicon_filename = lang_dir / "lexicon.txt" - sil_token = "sil" + sil_token = "SIL" sil_prob = 0.5 lexicon = read_lexicon(lexicon_filename) diff --git a/egs/swbd/ASR/local/prepare_lang_bpe.py b/egs/swbd/ASR/local/prepare_lang_bpe.py index f5b3a25b3..db5e42b05 100755 --- a/egs/swbd/ASR/local/prepare_lang_bpe.py +++ b/egs/swbd/ASR/local/prepare_lang_bpe.py @@ -178,7 +178,7 @@ def get_args(): parser.add_argument( "--oov", type=str, - default="<unk>", + default="<UNK>", help="The out of vocabulary word in lexicon.", ) diff --git a/egs/swbd/ASR/prepare.sh b/egs/swbd/ASR/prepare.sh index d099e34ba..0d64e6814 100755 --- a/egs/swbd/ASR/prepare.sh +++ b/egs/swbd/ASR/prepare.sh @@ -23,7 +23,8 @@ stop_stage=100 # - speech dl_dir=./download -swbd1_dir="/export/corpora3/LDC/LDC97S62" +# swbd1_dir="/export/corpora3/LDC/LDC97S62" +swbd1_dir=./download/LDC97S62/ # eval2000_dir contains the following files and directories # downloaded from LDC website: @@ -70,15 +71,14 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then if [ !
-e data/manifests/.swbd.done ]; then lhotse prepare switchboard --absolute-paths 1 --omit-silence $swbd1_dir data/manifests/swbd ./local/normalize_and_filter_supervisions.py \ - data/manifests/swbd/swbd_supervisions.jsonl \ - data/manifests/swbd/swbd_supervisions_norm.jsonl - cp data/manifests/swbd/swbd_recordings.jsonl data/manifests/recordings_swbd.jsonl - + data/manifests/swbd/swbd_supervisions_all.jsonl.gz \ + data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz + mv data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz data/manifests/swbd/swbd_supervisions_all.jsonl.gz - lhotse prepare $eval2000_dir data/manifests_eval2000 + lhotse prepare eval2000 --absolute-paths 1 $eval2000_dir data/manifests/eval2000 ./local/normalize_eval2000.py \ - data/manifests_eval2000/eval2000_supervisions_unnorm.jsonl.gz \ - data/manifests_eval2000/eval2000_supervisions_norm.jsonl.gz + data/manifests/eval2000/eval2000_supervisions_unnorm.jsonl.gz \ + data/manifests/eval2000/eval2000_supervisions_all.jsonl.gz ./local/rt03_data_prep.sh $rt03_dir @@ -96,20 +96,6 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then rm data/local/${x}/text.org done - python ./local/filter_empty_text.py --kaldi-data-dir data/local/eval2000 - ./utils/fix_data_dir.sh data/local/eval2000 - lhotse kaldi import data/local/eval2000 8000 data/manifests_eval2000 - mv data/manifests_eval2000/recordings.jsonl.gz data/manifests_eval2000/swbd_recordings_eval2000.jsonl.gz - mv data/manifests_eval2000/supervisions.jsonl.gz data/manifests_eval2000/swbd_supervisions_eval2000.jsonl.gz - - python ./local/filter_empty_text.py --kaldi-data-dir data/local/rt03 - ./utils/fix_data_dir.sh data/local/rt03 - lhotse kaldi import data/local/rt03 8000 data/manifests_rt03 - mv data/manifests_rt03/recordings.jsonl.gz data/manifests_rt03/swbd_recordings_rt03.jsonl.gz - mv data/manifests_rt03/supervisions.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz - - lhotse fix data/manifests_train/swbd_recordings_all.jsonl.gz 
data/manifests_train/swbd_supervisions_all.jsonl.gz data/manifests - lhotse fix data/manifests_eval2000/swbd_recordings_eval2000.jsonl.gz data/manifests_eval2000/swbd_supervisions_eval2000.jsonl.gz data/manifests lhotse fix data/manifests_rt03/swbd_recordings_rt03.jsonl.gz data/manifests_rt03/swbd_supervisions_rt03.jsonl.gz data/manifests touch data/manifests/.swbd.done @@ -128,7 +114,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then fi if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 3: Compute fbank for switchboard" + log "Stage 3: Compute fbank for SwitchBoard" mkdir -p data/fbank if [ ! -e data/fbank/.swbd.done ]; then ./local/compute_fbank_swbd.py @@ -150,13 +136,29 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then lang_dir=data/lang_phone mkdir -p $lang_dir + if ! which jq; then + echo "This script is intended to be used with jq but you have not installed jq + Note: in Linux, you can install jq with the following command: + 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 + 2. chmod +x ./jq + 3. cp jq /usr/bin" && exit 1 + fi + if [ ! -f $lang_dir/text ] || [ ! -s $lang_dir/text ]; then + log "Prepare text." + gunzip -c data/manifests/swbd/swbd_supervisions_all.jsonl.gz \ + | jq '.text' | sed 's/"//g' > $lang_dir/text + fi + log "prepare dict" - cut -f 2- -d" " data/local/train/text >${lang_dir}/input.txt + ./local/swbd1_prepare_dict.sh $swbd1_dir + cut -f 2- -d" " $lang_dir/text >${lang_dir}/input.txt # [noise] nsn # !sil sil # spn cat data/local/dict_nosp/lexicon.txt | - sort | uniq >$lang_dir/lexicon.txt + sort | uniq >$lang_dir/lexicon_lower.txt + + cat $lang_dir/lexicon_lower.txt | tr a-z A-Z > $lang_dir/lexicon.txt if [ ! -f $lang_dir/L_disambig.pt ]; then ./local/prepare_lang.py --lang-dir $lang_dir @@ -192,7 +194,7 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then if [ ! 
-f $lang_dir/transcript_words.txt ]; then log "Generate data for BPE training" - cat ./data/local/train/text | cut -d " " -f 2- >$lang_dir/transcript_words.txt + cat data/lang_phone/text | cut -d " " -f 2- >$lang_dir/transcript_words.txt fi if [ ! -f $lang_dir/bpe.model ]; then @@ -239,7 +241,7 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then ./local/convert_transcript_words_to_tokens.py \ --lexicon $lang_dir/lexicon.txt \ --transcript $lang_dir/transcript_words.txt \ - --oov "<unk>" \ + --oov "<UNK>" \ >$lang_dir/transcript_tokens.txt fi