diff --git a/egs/multi_zh_en/ASR/local/prepare_char.py b/egs/multi_zh_en/ASR/local/prepare_char.py
new file mode 100755
index 000000000..8cc0502c2
--- /dev/null
+++ b/egs/multi_zh_en/ASR/local/prepare_char.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                  Wei Kang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+
+This script takes as input `lang_dir`, which should contain::
+
+    - lang_dir/text,
+    - lang_dir/words.txt
+
+and generates the following files in the directory `lang_dir`:
+
+    - lexicon.txt
+    - lexicon_disambig.txt
+    - L.pt
+    - L_disambig.pt
+    - tokens.txt
+"""
+
+import argparse
+import re
+from pathlib import Path
+from typing import Dict, List
+
+import k2
+import torch
+from prepare_lang import (
+    Lexicon,
+    add_disambig_symbols,
+    add_self_loops,
+    write_lexicon,
+    write_mapping,
+)
+
+
+def lexicon_to_fst_no_sil(
+    lexicon: Lexicon,
+    token2id: Dict[str, int],
+    word2id: Dict[str, int],
+    need_self_loops: bool = False,
+) -> k2.Fsa:
+    """Convert a lexicon to an FST (in k2 format).
+
+    Args:
+      lexicon:
+        The input lexicon. See also :func:`read_lexicon`
+      token2id:
+        A dict mapping tokens to IDs.
+      word2id:
+        A dict mapping words to IDs.
+      need_self_loops:
+        If True, add self-loop to states with non-epsilon output symbols
+        on at least one arc out of the state. The input label for this
+        self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
+    Returns:
+      Return an instance of `k2.Fsa` representing the given lexicon.
+    """
+ """ + loop_state = 0 # words enter and leave from here + next_state = 1 # the next un-allocated state, will be incremented as we go + + arcs = [] + + # The blank symbol is defined in local/train_bpe_model.py + assert token2id[""] == 0 + assert word2id[""] == 0 + + eps = 0 + + for word, pieces in lexicon: + assert len(pieces) > 0, f"{word} has no pronunciations" + cur_state = loop_state + + word = word2id[word] + pieces = [token2id[i] if i in token2id else token2id[""] for i in pieces] + + for i in range(len(pieces) - 1): + w = word if i == 0 else eps + arcs.append([cur_state, next_state, pieces[i], w, 0]) + + cur_state = next_state + next_state += 1 + + # now for the last piece of this word + i = len(pieces) - 1 + w = word if i == 0 else eps + arcs.append([cur_state, loop_state, pieces[i], w, 0]) + + if need_self_loops: + disambig_token = token2id["#0"] + disambig_word = word2id["#0"] + arcs = add_self_loops( + arcs, + disambig_token=disambig_token, + disambig_word=disambig_word, + ) + + final_state = next_state + arcs.append([loop_state, final_state, -1, -1, 0]) + arcs.append([final_state]) + + arcs = sorted(arcs, key=lambda arc: arc[0]) + arcs = [[str(i) for i in arc] for arc in arcs] + arcs = [" ".join(arc) for arc in arcs] + arcs = "\n".join(arcs) + + fsa = k2.Fsa.from_str(arcs, acceptor=False) + return fsa + + +def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: + """Check if all the given tokens are in token symbol table. + + Args: + token_sym_table: + Token symbol table that contains all the valid tokens. + tokens: + A list of tokens. + Returns: + Return True if there is any token not in the token_sym_table, + otherwise False. + """ + for tok in tokens: + if tok not in token_sym_table: + return True + return False + + +def generate_lexicon(token_sym_table: Dict[str, int], words: List[str]) -> Lexicon: + """Generate a lexicon from a word list and token_sym_table. + + Args: + token_sym_table: + Token symbol table that mapping token to token ids. + words: + A list of strings representing words. + Returns: + Return a dict whose keys are words and values are the corresponding + tokens. + """ + lexicon = [] + for word in words: + chars = list(word.strip(" \t")) + if contain_oov(token_sym_table, chars): + continue + lexicon.append((word, chars)) + + # The OOV word is + lexicon.append(("", [""])) + return lexicon + + +def generate_tokens(text_file: str) -> Dict[str, int]: + """Generate tokens from the given text file. + + Args: + text_file: + A file that contains text lines to generate tokens. + Returns: + Return a dict whose keys are tokens and values are token ids ranged + from 0 to len(keys) - 1. + """ + tokens: Dict[str, int] = dict() + tokens[""] = 0 + tokens[""] = 1 + tokens[""] = 2 + whitespace = re.compile(r"([ \t\r\n]+)") + with open(text_file, "r", encoding="utf-8") as f: + for line in f: + line = re.sub(whitespace, "", line) + chars = list(line) + for char in chars: + if char not in tokens: + tokens[char] = len(tokens) + return tokens + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. 
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        It should contain the bpe.model and words.txt
+        """,
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
+    text_file = lang_dir / "text"
+
+    word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
+
+    words = word_sym_table.symbols
+
+    excluded = ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>"]
+    for w in excluded:
+        if w in words:
+            words.remove(w)
+
+    token_sym_table = generate_tokens(text_file)
+
+    lexicon = generate_lexicon(token_sym_table, words)
+
+    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
+
+    next_token_id = max(token_sym_table.values()) + 1
+    for i in range(max_disambig + 1):
+        disambig = f"#{i}"
+        assert disambig not in token_sym_table
+        token_sym_table[disambig] = next_token_id
+        next_token_id += 1
+
+    word_sym_table.add("#0")
+    word_sym_table.add("<s>")
+    word_sym_table.add("</s>")
+
+    write_mapping(lang_dir / "tokens.txt", token_sym_table)
+
+    write_lexicon(lang_dir / "lexicon.txt", lexicon)
+    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
+
+    L = lexicon_to_fst_no_sil(
+        lexicon,
+        token2id=token_sym_table,
+        word2id=word_sym_table,
+    )
+
+    L_disambig = lexicon_to_fst_no_sil(
+        lexicon_disambig,
+        token2id=token_sym_table,
+        word2id=word_sym_table,
+        need_self_loops=True,
+    )
+    torch.save(L.as_dict(), lang_dir / "L.pt")
+    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
+
+
+if __name__ == "__main__":
+    main()
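For context, the character-level token and lexicon construction added above can be illustrated on a toy transcript. The snippet below is a minimal, self-contained sketch that mirrors the logic of `generate_tokens` and `generate_lexicon`; it does not import the new script, and the helper names and sample strings are invented purely for illustration.

import re
from typing import Dict, List, Tuple


def toy_generate_tokens(lines: List[str]) -> Dict[str, int]:
    # Mirrors generate_tokens(): reserve the special symbols first,
    # then assign one id per distinct character seen in the text.
    tokens = {"<blk>": 0, "<sos/eos>": 1, "<unk>": 2}
    whitespace = re.compile(r"([ \t\r\n]+)")
    for line in lines:
        for char in re.sub(whitespace, "", line):
            if char not in tokens:
                tokens[char] = len(tokens)
    return tokens


def toy_generate_lexicon(
    tokens: Dict[str, int], words: List[str]
) -> List[Tuple[str, List[str]]]:
    # Mirrors generate_lexicon(): a word maps to its characters;
    # words containing out-of-vocabulary characters are skipped.
    lexicon = []
    for word in words:
        chars = list(word.strip(" \t"))
        if all(c in tokens for c in chars):
            lexicon.append((word, chars))
    lexicon.append(("<UNK>", ["<unk>"]))  # the OOV word
    return lexicon


if __name__ == "__main__":
    token_table = toy_generate_tokens(["你好世界", "再见"])
    print(toy_generate_lexicon(token_table, ["你好", "世界", "没见过"]))
    # [('你好', ['你', '好']), ('世界', ['世', '界']), ('<UNK>', ['<unk>'])]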
diff --git a/egs/multi_zh_en/ASR/prepare.sh b/egs/multi_zh_en/ASR/prepare.sh
index 2c4fd371b..4945b8bc4 100755
--- a/egs/multi_zh_en/ASR/prepare.sh
+++ b/egs/multi_zh_en/ASR/prepare.sh
@@ -74,7 +74,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
 fi
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: preparation for training BPE model"
+  log "Stage 4: Prepare Byte BPE based lang"
   mkdir -p data/fbank
   if [ ! -d ../../aishell2/ASR/data/lang_char ]; then
     log "Abort! Please run ../../aishell2/ASR/prepare.sh --stage 3 --stop-stage 3"
@@ -107,40 +107,40 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
     ./local/prepare_for_bpe_model.py \
       --lang_dir ./$lang_dir \
       --text $lang_dir/text
+
+    if [ ! -f $lang_dir/text_words_segmentation ]; then
+      python3 ./local/text2segments.py \
+        --input-file ./data/lang_char/text \
+        --output-file $lang_dir/text_words_segmentation
+
+      cat ./data/lang_bpe_500/transcript_words.txt \
+        >> $lang_dir/text_words_segmentation
+    fi
+
+    cat $lang_dir/text_words_segmentation | sed 's/ /\n/g' \
+      | sort -u | sed '/^$/d' | uniq > $lang_dir/words_no_ids.txt
+
+    if [ ! -f $lang_dir/words.txt ]; then
+      python3 ./local/prepare_words.py \
+        --input-file $lang_dir/words_no_ids.txt \
+        --output-file $lang_dir/words.txt
+    fi
+
+    if [ ! -f $lang_dir/bbpe.model ]; then
+      ./local/train_bbpe_model.py \
+        --lang-dir $lang_dir \
+        --vocab-size $vocab_size \
+        --transcript $lang_dir/transcript_chars.txt
+    fi
+
+    if [ ! -f $lang_dir/L_disambig.pt ]; then
+      ./local/prepare_lang_bbpe.py --lang-dir $lang_dir
+
+      log "Validating $lang_dir/lexicon.txt"
+      ./local/validate_bpe_lexicon.py \
+        --lexicon $lang_dir/lexicon.txt \
+        --bpe-model $lang_dir/bbpe.model
+    fi
   done
 fi
 
-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: training BPE model"
-  if [ ! -d data/lang_char ]; then
-    log "Abort! Please run stage 4 first"
-    exit 1
-  fi
-
-  if [ ! -d data/lang_phone ]; then
-    log "Abort! Please run stage 4 first"
-    exit 1
-  fi
-
-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bbpe_${vocab_size}
-    mkdir -p $lang_dir
-
-    if [ ! -f $lang_dir/bpe.model ]; then
-      log "Training BPE model with vocab size ${vocab_size}"
-      python3 -m k2_cli.prepares.prep_bpe \
-        --input-dir data/lang_char \
-        --output-dir data/lang_char \
-        --vocab-size ${vocab_size}
-    fi
-
-    if [ ! -f data/lang_phone/bpe${vocab_size}.model ]; then
-      log "Training BPE model with vocab size ${vocab_size}"
-      python3 -m k2_cli.prepares.prep_bpe \
-        --input-dir data/lang_phone \
-        --output-dir data/lang_phone \
-        --vocab-size ${vocab_size}
-    fi
-  done
-fi
-
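Once the reworked stage 4 has run, each lang dir should hold tokens.txt, words.txt, lexicon.txt, and the compiled L/L_disambig FSTs. Below is a rough sketch of how one might sanity-check those artifacts with k2; the directory name assumes a single vocab size of 2000 and is only an example, adjust it to whatever `vocab_sizes` is set to in prepare.sh.

import k2
import torch

# Example path only; the actual name depends on the vocab_sizes array in prepare.sh.
lang_dir = "data/lang_bbpe_2000"

token_table = k2.SymbolTable.from_file(f"{lang_dir}/tokens.txt")
word_table = k2.SymbolTable.from_file(f"{lang_dir}/words.txt")
L_disambig = k2.Fsa.from_dict(
    torch.load(f"{lang_dir}/L_disambig.pt", map_location="cpu")
)

print("num tokens:", len(token_table.symbols))
print("num words:", len(word_table.symbols))
print("L_disambig arcs:", L_disambig.num_arcs)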
"kespeech-asr_cuts_train_phase2.jsonl.gz" - ) - return CutSet.mux( - thchs_30_cuts, - aishell_cuts, aishell_2_cuts, - aishell_4_L_cuts, - aishell_4_M_cuts, - aishell_4_S_cuts, - stcmds_cuts, - primewords_cuts, - magicdata_cuts, - aidatatang_200zh_cuts, - alimeeting_cuts, - wenetspeech_L_cuts, - kespeech_1_cuts, - kespeech_2_cuts, weights=[ - len(thchs_30_cuts), - len(aishell_cuts), len(aishell_2_cuts), - len(aishell_4_L_cuts), - len(aishell_4_M_cuts), - len(aishell_4_S_cuts), - len(stcmds_cuts), - len(primewords_cuts), - len(magicdata_cuts), - len(aidatatang_200zh_cuts), - len(alimeeting_cuts), - len(wenetspeech_L_cuts), - len(kespeech_1_cuts), - len(kespeech_2_cuts), ], ) - def dev_cuts(self) -> CutSet: + def dev_cuts(self) -> List[CutSet]: logging.info("About to get multidataset dev cuts") - # Aidatatang_200zh - logging.info("Loading Aidatatang_200zh DEV set in lazy mode") - aidatatang_dev_cuts = load_manifest_lazy( - self.fbank_dir / "aidatatang_cuts_dev.jsonl.gz" - ) - - # AISHELL - logging.info("Loading Aishell DEV set in lazy mode") - aishell_dev_cuts = load_manifest_lazy( - self.fbank_dir / "aishell_cuts_dev.jsonl.gz" - ) - # AISHELL-2 logging.info("Loading Aishell-2 DEV set in lazy mode") aishell2_dev_cuts = load_manifest_lazy( self.fbank_dir / "aishell2_cuts_dev.jsonl.gz" ) - # Ali-Meeting - logging.info("Loading Ali-Meeting DEV set in lazy mode") - alimeeting_dev_cuts = load_manifest_lazy( - self.fbank_dir / "alimeeting-far_cuts_eval.jsonl.gz" - ) - - # MagicData - logging.info("Loading MagicData DEV set in lazy mode") - magicdata_dev_cuts = load_manifest_lazy( - self.fbank_dir / "magicdata_cuts_dev.jsonl.gz" - ) - - # KeSpeech - logging.info("Loading KeSpeech DEV set in lazy mode") - kespeech_dev_phase1_cuts = load_manifest_lazy( - self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase1.jsonl.gz" - ) - kespeech_dev_phase2_cuts = load_manifest_lazy( - self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase2.jsonl.gz" - ) - - # WeNetSpeech - logging.info("Loading WeNetSpeech DEV set in lazy mode") - wenetspeech_dev_cuts = load_manifest_lazy( - self.fbank_dir / "wenetspeech" / "cuts_DEV.jsonl.gz" - ) - - return wenetspeech_dev_cuts - # return [ - # aidatatang_dev_cuts, - # aishell_dev_cuts, - # aishell2_dev_cuts, - # alimeeting_dev_cuts, - # magicdata_dev_cuts, - # kespeech_dev_phase1_cuts, - # kespeech_dev_phase2_cuts, - # wenetspeech_dev_cuts, - # ] + return [ + aishell2_dev_cuts, + ] def test_cuts(self) -> Dict[str, CutSet]: logging.info("About to get multidataset test cuts") - # Aidatatang_200zh - logging.info("Loading Aidatatang_200zh set in lazy mode") - aidatatang_test_cuts = load_manifest_lazy( - self.fbank_dir / "aidatatang_cuts_test.jsonl.gz" - ) - aidatatang_dev_cuts = load_manifest_lazy( - self.fbank_dir / "aidatatang_cuts_dev.jsonl.gz" - ) - - # AISHELL - logging.info("Loading Aishell set in lazy mode") - aishell_test_cuts = load_manifest_lazy( - self.fbank_dir / "aishell_cuts_test.jsonl.gz" - ) - aishell_dev_cuts = load_manifest_lazy( - self.fbank_dir / "aishell_cuts_dev.jsonl.gz" - ) - # AISHELL-2 logging.info("Loading Aishell-2 set in lazy mode") aishell2_test_cuts = load_manifest_lazy( @@ -247,70 +76,7 @@ class MultiDataset: self.fbank_dir / "aishell2_cuts_dev.jsonl.gz" ) - # AISHELL-4 - logging.info("Loading Aishell-4 TEST set in lazy mode") - aishell4_test_cuts = load_manifest_lazy( - self.fbank_dir / "aishell4_cuts_test.jsonl.gz" - ) - - # Ali-Meeting - logging.info("Loading Ali-Meeting set in lazy mode") - alimeeting_test_cuts = load_manifest_lazy( - 
self.fbank_dir / "alimeeting-far_cuts_test.jsonl.gz" - ) - alimeeting_eval_cuts = load_manifest_lazy( - self.fbank_dir / "alimeeting-far_cuts_eval.jsonl.gz" - ) - - # MagicData - logging.info("Loading MagicData set in lazy mode") - magicdata_test_cuts = load_manifest_lazy( - self.fbank_dir / "magicdata_cuts_test.jsonl.gz" - ) - magicdata_dev_cuts = load_manifest_lazy( - self.fbank_dir / "magicdata_cuts_dev.jsonl.gz" - ) - - # KeSpeech - logging.info("Loading KeSpeech set in lazy mode") - kespeech_test_cuts = load_manifest_lazy( - self.fbank_dir / "kespeech" / "kespeech-asr_cuts_test.jsonl.gz" - ) - kespeech_dev_phase1_cuts = load_manifest_lazy( - self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase1.jsonl.gz" - ) - kespeech_dev_phase2_cuts = load_manifest_lazy( - self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase2.jsonl.gz" - ) - - # WeNetSpeech - logging.info("Loading WeNetSpeech set in lazy mode") - wenetspeech_test_meeting_cuts = load_manifest_lazy( - self.fbank_dir / "wenetspeech" / "cuts_TEST_MEETING.jsonl.gz" - ) - wenetspeech_test_net_cuts = load_manifest_lazy( - self.fbank_dir / "wenetspeech" / "cuts_TEST_NET.jsonl.gz" - ) - wenetspeech_dev_cuts = load_manifest_lazy( - self.fbank_dir / "wenetspeech" / "cuts_DEV.jsonl.gz" - ) - return { - "aidatatang_test": aidatatang_test_cuts, - "aidatatang_dev": aidatatang_dev_cuts, - "alimeeting_test": alimeeting_test_cuts, - "alimeeting_eval": alimeeting_eval_cuts, - "aishell_test": aishell_test_cuts, - "aishell_dev": aishell_dev_cuts, "aishell-2_test": aishell2_test_cuts, "aishell-2_dev": aishell2_dev_cuts, - "aishell-4": aishell4_test_cuts, - "magicdata_test": magicdata_test_cuts, - "magicdata_dev": magicdata_dev_cuts, - "kespeech-asr_test": kespeech_test_cuts, - "kespeech-asr_dev_phase1": kespeech_dev_phase1_cuts, - "kespeech-asr_dev_phase2": kespeech_dev_phase2_cuts, - "wenetspeech-meeting_test": wenetspeech_test_meeting_cuts, - "wenetspeech-net_test": wenetspeech_test_net_cuts, - "wenetspeech_dev": wenetspeech_dev_cuts, }