This commit is contained in:
JinZr 2023-09-02 00:14:05 +08:00
parent 75e87dd6e3
commit cec73bd28b
9 changed files with 1248 additions and 403 deletions

File diff suppressed because it is too large

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""
This script takes `bpe.model` as input and generates a file `tokens.txt`
from it.
Usage:
./bpe_model_to_tokens.py /path/to/input/bpe.model > tokens.txt
"""
import argparse
import sentencepiece as spm
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"bpe_model",
type=str,
help="Path to the input bpe.model",
)
return parser.parse_args()
def main():
args = get_args()
sp = spm.SentencePieceProcessor()
sp.load(args.bpe_model)
for i in range(sp.vocab_size()):
print(sp.id_to_piece(i), i)
if __name__ == "__main__":
main()
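As a rough, hedged illustration (not part of this commit), the `<piece> <id>` lines printed above can be read back into a dict; `read_tokens` is a hypothetical helper name used only for this sketch:

# A minimal sketch, assuming tokens.txt contains "<piece> <id>" lines as
# produced by the script above; read_tokens is a made-up helper name.
def read_tokens(filename: str) -> dict:
    token2id = {}
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            piece, idx = line.split()
            token2id[piece] = int(idx)
    return token2id

For example, `read_tokens("tokens.txt")["<blk>"]` should be 0, matching the assertion in the prepare_lang scripts below.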

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/compile_lg.py

View File

@@ -0,0 +1,243 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang,
# Wei Kang,
# Mingshuang Luo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script takes as input `lang_dir`, which should contain::
- lang_dir/text,
- lang_dir/words.txt
and generates the following files in the directory `lang_dir`:
- lexicon.txt
- lexicon_disambig.txt
- L.pt
- L_disambig.pt
- tokens.txt
"""
import argparse
import re
from pathlib import Path
from typing import Dict, List
import k2
import torch
from prepare_lang import (
Lexicon,
add_disambig_symbols,
add_self_loops,
write_lexicon,
write_mapping,
)
def lexicon_to_fst_no_sil(
lexicon: Lexicon,
token2id: Dict[str, int],
word2id: Dict[str, int],
need_self_loops: bool = False,
) -> k2.Fsa:
"""Convert a lexicon to an FST (in k2 format).
Args:
lexicon:
The input lexicon. See also :func:`read_lexicon`
token2id:
A dict mapping tokens to IDs.
word2id:
A dict mapping words to IDs.
need_self_loops:
If True, add a self-loop to each state that has a non-epsilon output symbol
on at least one outgoing arc. The input label for this
self-loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
Returns:
Return an instance of `k2.Fsa` representing the given lexicon.
"""
loop_state = 0 # words enter and leave from here
next_state = 1 # the next un-allocated state, will be incremented as we go
arcs = []
# The blank symbol <blk> is defined in local/train_bpe_model.py
assert token2id["<blk>"] == 0
assert word2id["<eps>"] == 0
eps = 0
for word, pieces in lexicon:
assert len(pieces) > 0, f"{word} has no pronunciations"
cur_state = loop_state
word = word2id[word]
pieces = [token2id[i] if i in token2id else token2id["<unk>"] for i in pieces]
for i in range(len(pieces) - 1):
w = word if i == 0 else eps
arcs.append([cur_state, next_state, pieces[i], w, 0])
cur_state = next_state
next_state += 1
# now for the last piece of this word
i = len(pieces) - 1
w = word if i == 0 else eps
arcs.append([cur_state, loop_state, pieces[i], w, 0])
if need_self_loops:
disambig_token = token2id["#0"]
disambig_word = word2id["#0"]
arcs = add_self_loops(
arcs,
disambig_token=disambig_token,
disambig_word=disambig_word,
)
final_state = next_state
arcs.append([loop_state, final_state, -1, -1, 0])
arcs.append([final_state])
arcs = sorted(arcs, key=lambda arc: arc[0])
arcs = [[str(i) for i in arc] for arc in arcs]
arcs = [" ".join(arc) for arc in arcs]
arcs = "\n".join(arcs)
fsa = k2.Fsa.from_str(arcs, acceptor=False)
return fsa
def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool:
"""Check if all the given tokens are in token symbol table.
Args:
token_sym_table:
Token symbol table that contains all the valid tokens.
tokens:
A list of tokens.
Returns:
Return True if there is any token not in the token_sym_table,
otherwise False.
"""
for tok in tokens:
if tok not in token_sym_table:
return True
return False
def generate_lexicon(token_sym_table: Dict[str, int], words: List[str]) -> Lexicon:
"""Generate a lexicon from a word list and token_sym_table.
Args:
token_sym_table:
Token symbol table that maps tokens to token IDs.
words:
A list of strings representing words.
Returns:
Return a list of (word, tokens) pairs, i.e. a lexicon, where the tokens
of a word are its characters.
"""
lexicon = []
for word in words:
chars = list(word.strip(" \t"))
if contain_oov(token_sym_table, chars):
continue
lexicon.append((word, chars))
# The OOV word is <UNK>
lexicon.append(("<UNK>", ["<unk>"]))
return lexicon
def generate_tokens(text_file: str) -> Dict[str, int]:
"""Generate tokens from the given text file.
Args:
text_file:
A file that contains text lines to generate tokens.
Returns:
Return a dict whose keys are tokens and values are token IDs ranging
from 0 to len(keys) - 1.
"""
tokens: Dict[str, int] = dict()
tokens["<blk>"] = 0
tokens["<sos/eos>"] = 1
tokens["<unk>"] = 2
whitespace = re.compile(r"([ \t\r\n]+)")
with open(text_file, "r", encoding="utf-8") as f:
for line in f:
line = re.sub(whitespace, "", line)
tokens_list = list(line)
for token in tokens_list:
if token not in tokens:
tokens[token] = len(tokens)
return tokens
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--lang-dir", type=str, help="The lang directory.")
args = parser.parse_args()
lang_dir = Path(args.lang_dir)
text_file = lang_dir / "text"
word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
words = word_sym_table.symbols
excluded = ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>"]
for w in excluded:
if w in words:
words.remove(w)
token_sym_table = generate_tokens(text_file)
lexicon = generate_lexicon(token_sym_table, words)
lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
next_token_id = max(token_sym_table.values()) + 1
for i in range(max_disambig + 1):
disambig = f"#{i}"
assert disambig not in token_sym_table
token_sym_table[disambig] = next_token_id
next_token_id += 1
word_sym_table.add("#0")
word_sym_table.add("<s>")
word_sym_table.add("</s>")
write_mapping(lang_dir / "tokens.txt", token_sym_table)
write_lexicon(lang_dir / "lexicon.txt", lexicon)
write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
L = lexicon_to_fst_no_sil(
lexicon,
token2id=token_sym_table,
word2id=word_sym_table,
)
L_disambig = lexicon_to_fst_no_sil(
lexicon_disambig,
token2id=token_sym_table,
word2id=word_sym_table,
need_self_loops=True,
)
torch.save(L.as_dict(), lang_dir / "L.pt")
torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
if __name__ == "__main__":
main()
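To make the arc layout built by `lexicon_to_fst_no_sil` above concrete, here is a minimal hedged sketch (not from this commit) that feeds the same `src dst token_id word_id score` text format, including the final-arc convention with label -1, to `k2.Fsa.from_str`; the toy token/word IDs are made up for illustration:

import k2

# Toy tables (assumptions): tokens {<blk>: 0, A: 1, B: 2}, words {<eps>: 0, AB: 1}.
# The word "AB" emits its word ID on the first piece and epsilon afterwards;
# state 0 is the loop state and state 2 is the final state (entered via label -1).
arcs = "\n".join([
    "0 1 1 1 0",
    "0 2 -1 -1 0",
    "1 0 2 0 0",
    "2",
])
L = k2.Fsa.from_str(arcs, acceptor=False)
print(L)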

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/prepare_lang.py

View File

@@ -0,0 +1,266 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
"""
This script takes as input `lang_dir`, which should contain::
- lang_dir/bpe.model,
- lang_dir/words.txt
and generates the following files in the directory `lang_dir`:
- lexicon.txt
- lexicon_disambig.txt
- L.pt
- L_disambig.pt
- tokens.txt
"""
import argparse
from pathlib import Path
from typing import Dict, List, Tuple
import k2
import sentencepiece as spm
import torch
from prepare_lang import (
Lexicon,
add_disambig_symbols,
add_self_loops,
write_lexicon,
write_mapping,
)
from icefall.utils import str2bool
def lexicon_to_fst_no_sil(
lexicon: Lexicon,
token2id: Dict[str, int],
word2id: Dict[str, int],
need_self_loops: bool = False,
) -> k2.Fsa:
"""Convert a lexicon to an FST (in k2 format).
Args:
lexicon:
The input lexicon. See also :func:`read_lexicon`
token2id:
A dict mapping tokens to IDs.
word2id:
A dict mapping words to IDs.
need_self_loops:
If True, add a self-loop to each state that has a non-epsilon output symbol
on at least one outgoing arc. The input label for this
self-loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
Returns:
Return an instance of `k2.Fsa` representing the given lexicon.
"""
loop_state = 0 # words enter and leave from here
next_state = 1 # the next un-allocated state, will be incremented as we go
arcs = []
# The blank symbol <blk> is defined in local/train_bpe_model.py
assert token2id["<blk>"] == 0
assert word2id["<eps>"] == 0
eps = 0
for word, pieces in lexicon:
assert len(pieces) > 0, f"{word} has no pronunciations"
cur_state = loop_state
word = word2id[word]
pieces = [token2id[i] for i in pieces]
for i in range(len(pieces) - 1):
w = word if i == 0 else eps
arcs.append([cur_state, next_state, pieces[i], w, 0])
cur_state = next_state
next_state += 1
# now for the last piece of this word
i = len(pieces) - 1
w = word if i == 0 else eps
arcs.append([cur_state, loop_state, pieces[i], w, 0])
if need_self_loops:
disambig_token = token2id["#0"]
disambig_word = word2id["#0"]
arcs = add_self_loops(
arcs,
disambig_token=disambig_token,
disambig_word=disambig_word,
)
final_state = next_state
arcs.append([loop_state, final_state, -1, -1, 0])
arcs.append([final_state])
arcs = sorted(arcs, key=lambda arc: arc[0])
arcs = [[str(i) for i in arc] for arc in arcs]
arcs = [" ".join(arc) for arc in arcs]
arcs = "\n".join(arcs)
fsa = k2.Fsa.from_str(arcs, acceptor=False)
return fsa
def generate_lexicon(
model_file: str, words: List[str], oov: str
) -> Tuple[Lexicon, Dict[str, int]]:
"""Generate a lexicon from a BPE model.
Args:
model_file:
Path to a sentencepiece model.
words:
A list of strings representing words.
oov:
The out-of-vocabulary word in the lexicon.
Returns:
Return a tuple with two elements:
- A lexicon, i.e. a list of (word, word-pieces) pairs.
- A dict representing the token symbol table, mapping tokens to IDs.
"""
sp = spm.SentencePieceProcessor()
sp.load(str(model_file))
# Convert word to word piece IDs instead of word piece strings
# to avoid OOV tokens.
words_pieces_ids: List[List[int]] = sp.encode(words, out_type=int)
# Now convert word piece IDs back to word piece strings.
words_pieces: List[List[str]] = [sp.id_to_piece(ids) for ids in words_pieces_ids]
lexicon = []
for word, pieces in zip(words, words_pieces):
lexicon.append((word, pieces))
lexicon.append((oov, ["▁", sp.id_to_piece(sp.unk_id())]))  # "▁" is the SentencePiece word-boundary marker
token2id: Dict[str, int] = {sp.id_to_piece(i): i for i in range(sp.vocab_size())}
return lexicon, token2id
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--lang-dir",
type=str,
help="""Input and output directory.
It should contain the bpe.model and words.txt
""",
)
parser.add_argument(
"--oov",
type=str,
default="<UNK>",
help="The out of vocabulary word in lexicon.",
)
parser.add_argument(
"--debug",
type=str2bool,
default=False,
help="""True for debugging, which will generate
a visualization of the lexicon FST.
Caution: If your lexicon contains hundreds of thousands
of lines, please set it to False!
See "test/test_bpe_lexicon.py" for usage.
""",
)
return parser.parse_args()
def main():
args = get_args()
lang_dir = Path(args.lang_dir)
model_file = lang_dir / "bpe.model"
word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
words = word_sym_table.symbols
excluded = ["<eps>", "!SIL", "<SPOKEN_NOISE>", args.oov, "#0", "<s>", "</s>"]
for w in excluded:
if w in words:
words.remove(w)
lexicon, token_sym_table = generate_lexicon(model_file, words, args.oov)
lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
next_token_id = max(token_sym_table.values()) + 1
for i in range(max_disambig + 1):
disambig = f"#{i}"
assert disambig not in token_sym_table
token_sym_table[disambig] = next_token_id
next_token_id += 1
word_sym_table.add("#0")
word_sym_table.add("<s>")
word_sym_table.add("</s>")
write_mapping(lang_dir / "tokens.txt", token_sym_table)
write_lexicon(lang_dir / "lexicon.txt", lexicon)
write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
L = lexicon_to_fst_no_sil(
lexicon,
token2id=token_sym_table,
word2id=word_sym_table,
)
L_disambig = lexicon_to_fst_no_sil(
lexicon_disambig,
token2id=token_sym_table,
word2id=word_sym_table,
need_self_loops=True,
)
torch.save(L.as_dict(), lang_dir / "L.pt")
torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
if args.debug:
labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
L.labels_sym = labels_sym
L.aux_labels_sym = aux_labels_sym
L.draw(f"{lang_dir / 'L.svg'}", title="L.pt")
L_disambig.labels_sym = labels_sym
L_disambig.aux_labels_sym = aux_labels_sym
L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", title="L_disambig.pt")
if __name__ == "__main__":
main()
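The word-to-pieces conversion in `generate_lexicon` above encodes to IDs first and then maps the IDs back to pieces, so every word receives some token sequence. A rough sketch of that round trip (the model path and the sample word are placeholders; the resulting pieces depend on the trained model):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/lang_bpe_2000/bpe.model")  # assumed path, produced by prepare.sh below

words_pieces_ids = sp.encode(["你好"], out_type=int)  # e.g. [[id1, id2]]
words_pieces = [sp.id_to_piece(ids) for ids in words_pieces_ids]
print(words_pieces)  # the actual pieces depend on the trained BPE model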

View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script checks that there are no OOV tokens in the BPE-based lexicon.
Usage example:
python3 ./local/validate_bpe_lexicon.py \
--lexicon /path/to/lexicon.txt \
--bpe-model /path/to/bpe.model
"""
import argparse
from pathlib import Path
from typing import List, Tuple
import sentencepiece as spm
from icefall.lexicon import read_lexicon
# Map word to word pieces
Lexicon = List[Tuple[str, List[str]]]
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--lexicon",
required=True,
type=Path,
help="Path to lexicon.txt",
)
parser.add_argument(
"--bpe-model",
required=True,
type=Path,
help="Path to bpe.model",
)
return parser.parse_args()
def main():
args = get_args()
assert args.lexicon.is_file(), args.lexicon
assert args.bpe_model.is_file(), args.bpe_model
lexicon = read_lexicon(args.lexicon)
sp = spm.SentencePieceProcessor()
sp.load(str(args.bpe_model))
word_pieces = set(sp.id_to_piece(list(range(sp.vocab_size()))))
for word, pieces in lexicon:
for p in pieces:
if p not in word_pieces:
raise ValueError(f"The word {word} contains an OOV token {p}")
if __name__ == "__main__":
main()
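`read_lexicon` above yields (word, pieces) pairs parsed from whitespace-separated lexicon.txt lines, matching the `Lexicon` alias defined in this script. A minimal sketch of that shape, with made-up entries used only for illustration:

# A minimal sketch of the structure read_lexicon() returns; the words and
# pieces below are placeholders, not taken from this commit.
lexicon = [
    ("HELLO", ["▁HE", "LLO"]),
    ("WORLD", ["▁WOR", "LD"]),
    ("<UNK>", ["▁", "<unk>"]),
]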

View File

@@ -18,16 +18,6 @@ vocab_sizes=(
2000
)
# multidataset list.
# LibriSpeech and musan are required.
# The others are optional.
multidataset=(
"gigaspeech",
"commonvoice",
"librilight",
)
# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
@@ -318,11 +308,63 @@ if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
lang_dir=data/lang_bpe_${vocab_size}
mkdir -p $lang_dir
./local/train_bpe_model.py \
--lang-dir $lang_dir \
--transcript ./data/lang_char/transcript_chars.txt \
--vocab-size $vocab_size
if [ ! -f $lang_dir/bpe.model ]; then
./local/train_bpe_model.py \
--lang-dir $lang_dir \
--transcript ./data/lang_char/transcript_chars.txt \
--vocab-size $vocab_size
./local/bpe_model_to_tokens.py $lang_dir/bpe.model > $lang_dir/tokens.txt
fi
if [ ! -f $lang_dir/L_disambig.pt ]; then
cp data/lang_char/words.txt $lang_dir
./local/prepare_lang_bpe.py --lang-dir $lang_dir
log "Validating $lang_dir/lexicon.txt"
./local/validate_bpe_lexicon.py \
--lexicon $lang_dir/lexicon.txt \
--bpe-model $lang_dir/bpe.model
fi
if [ ! -f $lang_dir/L.fst ]; then
log "Converting L.pt to L.fst"
./shared/convert-k2-to-openfst.py \
--olabels aux_labels \
$lang_dir/L.pt \
$lang_dir/L.fst
fi
if [ ! -f $lang_dir/L_disambig.fst ]; then
log "Converting L_disambig.pt to L_disambig.fst"
./shared/convert-k2-to-openfst.py \
--olabels aux_labels \
$lang_dir/L_disambig.pt \
$lang_dir/L_disambig.fst
fi
done
./local/train_bpe_model.py --lang-dir ./data/lang_bpe_${vocab_size}
fi
if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
log "Stage 14: Prepare G"
if [ -d ../../wenetspeech/ASR/data/lang_char/ ]; then
cd data
cp -r ../../../../wenetspeech/ASR/data/lm .
cd ..
else
log "Abort! Please run ../../wenetspeech/ASR/prepare.sh"
exit 1
fi
fi
if [ $stage -le 15 ] && [ $stop_stage -ge 15 ]; then
log "Stage 15: Compile LG"
for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/lang_bpe_${vocab_size}
python ./local/compile_lg.py --lang-dir $lang_dir
done
fi
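Stage 13 above leaves `L.pt` and `L_disambig.pt` in each lang dir before converting them to OpenFST format. A quick hedged sketch for inspecting one of them from Python (the path is an assumption based on the vocab sizes above):

import k2
import torch

# Assumed path; adjust to the vocab size actually prepared by stage 13.
L = k2.Fsa.from_dict(torch.load("data/lang_bpe_2000/L_disambig.pt", map_location="cpu"))
print(L.labels.shape[0])  # one label per arc, i.e. the number of arcs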

View File

@@ -33,9 +33,9 @@ dataset, you should change the argument values according to your dataset.
./zipformer/export.py \
--exp-dir ./zipformer/exp \
--tokens data/lang_bpe_500/tokens.txt \
--epoch 30 \
--avg 9 \
--tokens data/lang_bpe_2000/tokens.txt \
--epoch 23 \
--avg 1 \
--jit 1
It will generate a file `jit_script.pt` in the given `exp_dir`. You can later
@@ -53,9 +53,9 @@ for how to use the exported models outside of icefall.
--causal 1 \
--chunk-size 16 \
--left-context-frames 128 \
--tokens data/lang_bpe_500/tokens.txt \
--epoch 30 \
--avg 9 \
--tokens data/lang_bpe_2000/tokens.txt \
--epoch 23 \
--avg 1 \
--jit 1
It will generate a file `jit_script_chunk_16_left_128.pt` in the given `exp_dir`.
@@ -72,18 +72,18 @@ for how to use the exported models outside of icefall.
./zipformer/export.py \
--exp-dir ./zipformer/exp \
--tokens data/lang_bpe_500/tokens.txt \
--epoch 30 \
--avg 9
--tokens data/lang_bpe_2000/tokens.txt \
--epoch 23 \
--avg 1
- For streaming model:
./zipformer/export.py \
--exp-dir ./zipformer/exp \
--causal 1 \
--tokens data/lang_bpe_500/tokens.txt \
--epoch 30 \
--avg 9
--tokens data/lang_bpe_2000/tokens.txt \
--epoch 23 \
--avg 1
It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
load it by `icefall.checkpoint.load_checkpoint()`.
@@ -103,7 +103,7 @@ you can do:
--avg 1 \
--max-duration 600 \
--decoding-method greedy_search \
--bpe-model data/lang_bpe_500/bpe.model
--bpe-model data/lang_bpe_2000/bpe.model
- For streaming model:
@@ -124,7 +124,7 @@ To use the generated file with `zipformer/decode.py` and `zipformer/streaming_de
--chunk-size 16 \
--left-context-frames 128 \
--decoding-method greedy_search \
--bpe-model data/lang_bpe_500/bpe.model
--bpe-model data/lang_bpe_2000/bpe.model
# chunk-wise streaming decoding
./zipformer/streaming_decode.py \
@@ -136,7 +136,7 @@ To use the generated file with `zipformer/decode.py` and `zipformer/streaming_de
--chunk-size 16 \
--left-context-frames 128 \
--decoding-method greedy_search \
--bpe-model data/lang_bpe_500/bpe.model
--bpe-model data/lang_bpe_2000/bpe.model
Check ./pretrained.py for its usage.
@@ -207,7 +207,7 @@ def get_parser():
parser.add_argument(
"--epoch",
type=int,
default=30,
default=23,
help="""It specifies the checkpoint to use for decoding.
Note: Epoch counts from 1.
You can specify --avg to use more checkpoints for model averaging.""",
@@ -226,7 +226,7 @@ def get_parser():
parser.add_argument(
"--avg",
type=int,
default=9,
default=1,
help="Number of checkpoints to average. Automatically select "
"consecutive checkpoints before the checkpoint specified by "
"'--epoch' and '--iter'",
@@ -255,7 +255,7 @@ def get_parser():
parser.add_argument(
"--tokens",
type=str,
default="data/lang_bpe_500/tokens.txt",
default="data/lang_bpe_2000/tokens.txt",
help="Path to the tokens.txt",
)
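As a rough sketch of using the exported model outside of icefall (the file name is taken from the docs above; the printout is only a structural check, not a decoding example):

import torch

model = torch.jit.load("jit_script.pt")  # produced by ./zipformer/export.py with --jit 1
model.eval()
print(model)  # for transducer exports this typically lists encoder/decoder/joiner submodules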