mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-10 01:24:19 +00:00
updated
This commit is contained in:
parent
75e87dd6e3
commit
cec73bd28b
File diff suppressed because it is too large
Load Diff
37
egs/multi_zh-hans/ASR/local/bpe_model_to_tokens.py
Executable file
37
egs/multi_zh-hans/ASR/local/bpe_model_to_tokens.py
Executable file
@ -0,0 +1,37 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
This script takes `bpe.model` as input and generates a file `tokens.txt`
|
||||||
|
from it.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
./bpe_model_to_tokens.py /path/to/input/bpe.model > tokens.txt
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import sentencepiece as spm
|
||||||
|
|
||||||
|
|
||||||
|
def get_args():
    """Parse the command line of this script.

    Returns:
      The namespace produced by argparse. Its `bpe_model` attribute is the
      single positional argument: the path to the input bpe.model.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("bpe_model", type=str, help="Path to the input bpe.model")
    namespace = cli.parse_args()
    return namespace
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print every piece of the given sentencepiece model with its ID.

    One `<piece> <id>` pair is printed per line to stdout, for the IDs
    0 .. vocab_size - 1 in increasing order.
    """
    bpe_model_path = get_args().bpe_model

    processor = spm.SentencePieceProcessor()
    processor.load(bpe_model_path)

    num_pieces = processor.vocab_size()
    for piece_id in range(num_pieces):
        print(processor.id_to_piece(piece_id), piece_id)


if __name__ == "__main__":
    main()
|
1
egs/multi_zh-hans/ASR/local/compile_lg.py
Symbolic link
1
egs/multi_zh-hans/ASR/local/compile_lg.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/local/compile_lg.py
|
243
egs/multi_zh-hans/ASR/local/prepare_char.py
Executable file
243
egs/multi_zh-hans/ASR/local/prepare_char.py
Executable file
@ -0,0 +1,243 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang,
|
||||||
|
# Wei Kang,
|
||||||
|
# Mingshuang Luo)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
This script takes as input `lang_dir`, which should contain::
|
||||||
|
- lang_dir/text,
|
||||||
|
- lang_dir/words.txt
|
||||||
|
and generates the following files in the directory `lang_dir`:
|
||||||
|
- lexicon.txt
|
||||||
|
- lexicon_disambig.txt
|
||||||
|
- L.pt
|
||||||
|
- L_disambig.pt
|
||||||
|
- tokens.txt
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
|
import k2
|
||||||
|
import torch
|
||||||
|
from prepare_lang import (
|
||||||
|
Lexicon,
|
||||||
|
add_disambig_symbols,
|
||||||
|
add_self_loops,
|
||||||
|
write_lexicon,
|
||||||
|
write_mapping,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def lexicon_to_fst_no_sil(
    lexicon: Lexicon,
    token2id: Dict[str, int],
    word2id: Dict[str, int],
    need_self_loops: bool = False,
) -> k2.Fsa:
    """Build the lexicon FST (in k2 format) from a list of lexicon entries.

    Every entry becomes a chain of arcs that leaves state 0 and returns to
    state 0, one arc per token. The word ID is emitted on the first arc of
    the chain; all later arcs emit epsilon (ID 0).

    Args:
      lexicon:
        The input lexicon, an iterable of `(word, tokens)` pairs.
      token2id:
        A mapping from tokens to IDs. Tokens missing from it are mapped to
        the ID of `<unk>`.
      word2id:
        A mapping from words to IDs.
      need_self_loops:
        If True, the arcs are passed through `add_self_loops` with
        `token2id["#0"]` as the input label and `word2id["#0"]` as the
        output label of the self-loops.
    Returns:
      Return an instance of `k2.Fsa` representing the given lexicon.
    """
    loop_state = 0  # every word starts and ends here
    next_state = 1  # lowest state number that has not been handed out yet
    eps = 0

    # ID 0 has to be the blank token on the input side and epsilon on the
    # output side.
    assert token2id["<blk>"] == 0
    assert word2id["<eps>"] == 0

    arcs = []
    for word, pieces in lexicon:
        assert len(pieces) > 0, f"{word} has no pronunciations"

        word_id = word2id[word]
        piece_ids = [
            token2id[p] if p in token2id else token2id["<unk>"] for p in pieces
        ]

        src = loop_state
        last_pos = len(piece_ids) - 1
        for pos, piece_id in enumerate(piece_ids):
            # Only the first arc of a word carries the word label.
            olabel = word_id if pos == 0 else eps
            if pos == last_pos:
                # The last piece closes the chain back at the loop state.
                dst = loop_state
            else:
                dst = next_state
                next_state += 1
            arcs.append([src, dst, piece_id, olabel, 0])
            src = dst

    if need_self_loops:
        arcs = add_self_loops(
            arcs,
            disambig_token=token2id["#0"],
            disambig_word=word2id["#0"],
        )

    final_state = next_state
    arcs.append([loop_state, final_state, -1, -1, 0])
    arcs.append([final_state])

    # One arc per line, fields separated by spaces, ordered by source state.
    fsa_text = "\n".join(
        " ".join(str(field) for field in arc)
        for arc in sorted(arcs, key=lambda arc: arc[0])
    )
    return k2.Fsa.from_str(fsa_text, acceptor=False)
|
||||||
|
|
||||||
|
|
||||||
|
def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool:
    """Tell whether any of the given tokens is out of vocabulary.

    Args:
      token_sym_table:
        Token symbol table; its keys are the valid tokens.
      tokens:
        A list of tokens to look up.
    Returns:
      Return True if at least one token is not in `token_sym_table`,
      otherwise False (which includes an empty `tokens`).
    """
    return any(tok not in token_sym_table for tok in tokens)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_lexicon(token_sym_table: Dict[str, int], words: List[str]) -> Lexicon:
    """Build a character-level lexicon for the given words.

    Each word is spelled as the list of its characters, after leading and
    trailing spaces and tabs have been stripped. A word is left out when
    `contain_oov` reports that one of its characters is not in
    `token_sym_table`.

    Args:
      token_sym_table:
        Token symbol table that maps tokens to token IDs.
      words:
        A list of strings representing words.
    Returns:
      Return a list of `(word, chars)` tuples in the order of `words`,
      followed by one final `("<UNK>", ["<unk>"])` entry.
    """
    spelled = ((word, list(word.strip(" \t"))) for word in words)
    lexicon = [
        (word, chars)
        for word, chars in spelled
        if not contain_oov(token_sym_table, chars)
    ]

    # The OOV word is <UNK>
    lexicon.append(("<UNK>", ["<unk>"]))
    return lexicon
|
||||||
|
|
||||||
|
|
||||||
|
def generate_tokens(text_file: str) -> Dict[str, int]:
    """Collect the character vocabulary of a text file.

    Args:
      text_file:
        Path to a UTF-8 text file whose characters become tokens.
    Returns:
      Return a dict mapping tokens to token IDs. `<blk>`, `<sos/eos>` and
      `<unk>` take IDs 0, 1 and 2; every other non-whitespace character
      gets the next free ID in order of first appearance, so the IDs range
      from 0 to len(keys) - 1.
    """
    tokens: Dict[str, int] = {"<blk>": 0, "<sos/eos>": 1, "<unk>": 2}
    whitespace = re.compile(r"([ \t\r\n]+)")
    with open(text_file, "r", encoding="utf-8") as f:
        for line in f:
            for char in whitespace.sub("", line):
                # len(tokens) is read before a possible insertion, and
                # setdefault leaves characters seen earlier untouched.
                tokens.setdefault(char, len(tokens))
    return tokens
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Generate the char-based lang files inside the directory `--lang-dir`.

    Reads `text` and `words.txt` from the lang directory and writes
    `tokens.txt`, `lexicon.txt`, `lexicon_disambig.txt`, `L.pt` and
    `L_disambig.pt` into the same directory.
    """
    parser = argparse.ArgumentParser()
    # --lang-dir is mandatory: without it Path(None) below would fail with
    # an opaque TypeError instead of a usage message.
    parser.add_argument(
        "--lang-dir", type=str, required=True, help="The lang directory."
    )
    args = parser.parse_args()

    lang_dir = Path(args.lang_dir)
    text_file = lang_dir / "text"

    word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")

    words = word_sym_table.symbols

    # These symbols get no character spelling; <UNK> is appended again by
    # generate_lexicon().
    excluded = ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>"]
    for w in excluded:
        if w in words:
            words.remove(w)

    token_sym_table = generate_tokens(text_file)

    lexicon = generate_lexicon(token_sym_table, words)

    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)

    # Append the tokens #0 .. #max_disambig right after the largest
    # existing token ID.
    next_token_id = max(token_sym_table.values()) + 1
    for i in range(max_disambig + 1):
        disambig = f"#{i}"
        assert disambig not in token_sym_table
        token_sym_table[disambig] = next_token_id
        next_token_id += 1

    word_sym_table.add("#0")
    word_sym_table.add("<s>")
    word_sym_table.add("</s>")

    write_mapping(lang_dir / "tokens.txt", token_sym_table)

    write_lexicon(lang_dir / "lexicon.txt", lexicon)
    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)

    L = lexicon_to_fst_no_sil(
        lexicon,
        token2id=token_sym_table,
        word2id=word_sym_table,
    )

    L_disambig = lexicon_to_fst_no_sil(
        lexicon_disambig,
        token2id=token_sym_table,
        word2id=word_sym_table,
        need_self_loops=True,
    )
    torch.save(L.as_dict(), lang_dir / "L.pt")
    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")


if __name__ == "__main__":
    main()
|
||||||
|
|
1
egs/multi_zh-hans/ASR/local/prepare_lang.py
Symbolic link
1
egs/multi_zh-hans/ASR/local/prepare_lang.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/local/prepare_lang.py
|
266
egs/multi_zh-hans/ASR/local/prepare_lang_bpe.py
Executable file
266
egs/multi_zh-hans/ASR/local/prepare_lang_bpe.py
Executable file
@ -0,0 +1,266 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
This script takes as input `lang_dir`, which should contain::
|
||||||
|
|
||||||
|
- lang_dir/bpe.model,
|
||||||
|
- lang_dir/words.txt
|
||||||
|
|
||||||
|
and generates the following files in the directory `lang_dir`:
|
||||||
|
|
||||||
|
- lexicon.txt
|
||||||
|
- lexicon_disambig.txt
|
||||||
|
- L.pt
|
||||||
|
- L_disambig.pt
|
||||||
|
- tokens.txt
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
|
import k2
|
||||||
|
import sentencepiece as spm
|
||||||
|
import torch
|
||||||
|
from prepare_lang import (
|
||||||
|
Lexicon,
|
||||||
|
add_disambig_symbols,
|
||||||
|
add_self_loops,
|
||||||
|
write_lexicon,
|
||||||
|
write_mapping,
|
||||||
|
)
|
||||||
|
|
||||||
|
from icefall.utils import str2bool
|
||||||
|
|
||||||
|
|
||||||
|
def lexicon_to_fst_no_sil(
    lexicon: Lexicon,
    token2id: Dict[str, int],
    word2id: Dict[str, int],
    need_self_loops: bool = False,
) -> k2.Fsa:
    """Build the lexicon FST (in k2 format) from a list of lexicon entries.

    Every entry becomes a chain of arcs that leaves state 0 and returns to
    state 0, one arc per word piece. The word ID is emitted on the first
    arc of the chain; all later arcs emit epsilon (ID 0).

    Args:
      lexicon:
        The input lexicon, an iterable of `(word, pieces)` pairs.
      token2id:
        A mapping from tokens to IDs. Every piece must be a key of it.
      word2id:
        A mapping from words to IDs.
      need_self_loops:
        If True, the arcs are passed through `add_self_loops` with
        `token2id["#0"]` as the input label and `word2id["#0"]` as the
        output label of the self-loops.
    Returns:
      Return an instance of `k2.Fsa` representing the given lexicon.
    """
    loop_state = 0  # every word starts and ends here
    next_state = 1  # lowest state number that has not been handed out yet
    eps = 0

    # ID 0 has to be the blank token on the input side and epsilon on the
    # output side.
    assert token2id["<blk>"] == 0
    assert word2id["<eps>"] == 0

    arcs = []
    for word, pieces in lexicon:
        assert len(pieces) > 0, f"{word} has no pronunciations"

        word_id = word2id[word]
        piece_ids = [token2id[p] for p in pieces]

        src = loop_state
        last_pos = len(piece_ids) - 1
        for pos, piece_id in enumerate(piece_ids):
            # Only the first arc of a word carries the word label.
            olabel = word_id if pos == 0 else eps
            if pos == last_pos:
                # The last piece closes the chain back at the loop state.
                dst = loop_state
            else:
                dst = next_state
                next_state += 1
            arcs.append([src, dst, piece_id, olabel, 0])
            src = dst

    if need_self_loops:
        arcs = add_self_loops(
            arcs,
            disambig_token=token2id["#0"],
            disambig_word=word2id["#0"],
        )

    final_state = next_state
    arcs.append([loop_state, final_state, -1, -1, 0])
    arcs.append([final_state])

    # One arc per line, fields separated by spaces, ordered by source state.
    fsa_text = "\n".join(
        " ".join(str(field) for field in arc)
        for arc in sorted(arcs, key=lambda arc: arc[0])
    )
    return k2.Fsa.from_str(fsa_text, acceptor=False)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_lexicon(
    model_file: str, words: List[str], oov: str
) -> Tuple[Lexicon, Dict[str, int]]:
    """Generate a lexicon and a token table from a BPE model.

    Args:
      model_file:
        Path to a sentencepiece model.
      words:
        A list of strings representing words.
      oov:
        The out of vocabulary word in lexicon.
    Returns:
      Return a tuple with two elements:
        - A list of `(word, pieces)` tuples, one per entry of `words` and
          in the same order, followed by a final entry for `oov` spelled
          as "▁" plus the model's unknown piece.
        - A dict representing the token symbol table, mapping every piece
          of the model to its ID.
    """
    processor = spm.SentencePieceProcessor()
    processor.load(str(model_file))

    # Encode to piece IDs first and map the IDs back to piece strings
    # afterwards, to avoid OOV tokens.
    piece_ids_per_word: List[List[int]] = processor.encode(words, out_type=int)

    lexicon = [
        (word, processor.id_to_piece(piece_ids))
        for word, piece_ids in zip(words, piece_ids_per_word)
    ]

    unk_piece = processor.id_to_piece(processor.unk_id())
    lexicon.append((oov, ["▁", unk_piece]))

    token2id: Dict[str, int] = {}
    for piece_id in range(processor.vocab_size()):
        token2id[processor.id_to_piece(piece_id)] = piece_id

    return lexicon, token2id
|
||||||
|
|
||||||
|
|
||||||
|
def get_args():
    """Parse the command line of this script.

    Returns:
      The namespace produced by argparse, with the attributes `lang_dir`
      (required), `oov` (default "<UNK>") and `debug` (default False).
    """
    parser = argparse.ArgumentParser()
    # --lang-dir is mandatory: main() passes it straight to Path(), which
    # fails with an opaque TypeError when the value is None.
    parser.add_argument(
        "--lang-dir",
        type=str,
        required=True,
        help="""Input and output directory.
        It should contain the bpe.model and words.txt
        """,
    )

    parser.add_argument(
        "--oov",
        type=str,
        default="<UNK>",
        help="The out of vocabulary word in lexicon.",
    )

    parser.add_argument(
        "--debug",
        type=str2bool,
        default=False,
        help="""True for debugging, which will generate
        a visualization of the lexicon FST.

        Caution: If your lexicon contains hundreds of thousands
        of lines, please set it to False!

        See "test/test_bpe_lexicon.py" for usage.
        """,
    )

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Generate the BPE-based lang files inside the directory `--lang-dir`.

    Reads `bpe.model` and `words.txt` from the lang directory and writes
    `tokens.txt`, `lexicon.txt`, `lexicon_disambig.txt`, `L.pt` and
    `L_disambig.pt` into it. With `--debug` it also draws `L.svg` and
    `L_disambig.svg`.
    """
    args = get_args()
    lang_dir = Path(args.lang_dir)

    word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
    words = word_sym_table.symbols

    # These symbols are not encoded with the BPE model; the OOV word is
    # appended again by generate_lexicon().
    for symbol in ("<eps>", "!SIL", "<SPOKEN_NOISE>", args.oov, "#0", "<s>", "</s>"):
        if symbol in words:
            words.remove(symbol)

    lexicon, token_sym_table = generate_lexicon(
        lang_dir / "bpe.model", words, args.oov
    )
    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)

    # The tokens #0 .. #max_disambig take the IDs right after the largest
    # existing token ID.
    first_free_id = max(token_sym_table.values()) + 1
    for offset in range(max_disambig + 1):
        disambig = f"#{offset}"
        assert disambig not in token_sym_table
        token_sym_table[disambig] = first_free_id + offset

    for extra_word in ("#0", "<s>", "</s>"):
        word_sym_table.add(extra_word)

    write_mapping(lang_dir / "tokens.txt", token_sym_table)
    write_lexicon(lang_dir / "lexicon.txt", lexicon)
    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)

    symbol_tables = dict(token2id=token_sym_table, word2id=word_sym_table)
    L = lexicon_to_fst_no_sil(lexicon, **symbol_tables)
    L_disambig = lexicon_to_fst_no_sil(
        lexicon_disambig, need_self_loops=True, **symbol_tables
    )
    torch.save(L.as_dict(), lang_dir / "L.pt")
    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")

    if not args.debug:
        return

    # The symbol tables used for drawing are re-read from the files on disk.
    labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
    aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")

    for fst, stem in ((L, "L"), (L_disambig, "L_disambig")):
        fst.labels_sym = labels_sym
        fst.aux_labels_sym = aux_labels_sym
        fst.draw(f"{lang_dir / (stem + '.svg')}", title=f"{stem}.pt")


if __name__ == "__main__":
    main()
|
77
egs/multi_zh-hans/ASR/local/validate_bpe_lexicon.py
Executable file
77
egs/multi_zh-hans/ASR/local/validate_bpe_lexicon.py
Executable file
@ -0,0 +1,77 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
This script checks that there are no OOV tokens in the BPE-based lexicon.
|
||||||
|
|
||||||
|
Usage example:
|
||||||
|
|
||||||
|
python3 ./local/validate_bpe_lexicon.py \
|
||||||
|
--lexicon /path/to/lexicon.txt \
|
||||||
|
--bpe-model /path/to/bpe.model
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
import sentencepiece as spm
|
||||||
|
|
||||||
|
from icefall.lexicon import read_lexicon
|
||||||
|
|
||||||
|
# Map word to word pieces
|
||||||
|
Lexicon = List[Tuple[str, List[str]]]
|
||||||
|
|
||||||
|
|
||||||
|
def get_args():
    """Parse the command line of this script.

    Returns:
      The namespace produced by argparse. `lexicon` and `bpe_model` are
      both required and are converted to `pathlib.Path`.
    """
    parser = argparse.ArgumentParser()

    required_paths = (
        ("--lexicon", "Path to lexicon.txt"),
        ("--bpe-model", "Path to bpe.model"),
    )
    for flag, help_text in required_paths:
        parser.add_argument(flag, required=True, type=Path, help=help_text)

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Check that every piece used in the lexicon exists in the BPE model.

    Raises:
      AssertionError: if `--lexicon` or `--bpe-model` is not an existing file.
      ValueError: for the first lexicon entry that uses a piece which is not
        in the model's vocabulary.
    """
    args = get_args()
    lexicon_path, model_path = args.lexicon, args.bpe_model
    assert lexicon_path.is_file(), lexicon_path
    assert model_path.is_file(), model_path

    entries = read_lexicon(lexicon_path)

    processor = spm.SentencePieceProcessor()
    processor.load(str(model_path))

    # All pieces of the model, looked up with one batched call.
    all_ids = list(range(processor.vocab_size()))
    known_pieces = set(processor.id_to_piece(all_ids))

    for word, pieces in entries:
        for piece in pieces:
            if piece in known_pieces:
                continue
            raise ValueError(f"The word {word} contains an OOV token {piece}")


if __name__ == "__main__":
    main()
|
@ -18,16 +18,6 @@ vocab_sizes=(
|
|||||||
2000
|
2000
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# multidataset list.
|
|
||||||
# LibriSpeech and musan are required.
|
|
||||||
# The others are optional.
|
|
||||||
multidataset=(
|
|
||||||
"gigaspeech",
|
|
||||||
"commonvoice",
|
|
||||||
"librilight",
|
|
||||||
)
|
|
||||||
|
|
||||||
# All files generated by this script are saved in "data".
|
# All files generated by this script are saved in "data".
|
||||||
# You can safely remove "data" and rerun this script to regenerate it.
|
# You can safely remove "data" and rerun this script to regenerate it.
|
||||||
mkdir -p data
|
mkdir -p data
|
||||||
@ -318,11 +308,63 @@ if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
|
|||||||
lang_dir=data/lang_bpe_${vocab_size}
|
lang_dir=data/lang_bpe_${vocab_size}
|
||||||
|
|
||||||
mkdir -p $lang_dir
|
mkdir -p $lang_dir
|
||||||
|
if [ ! -f $lang_dir/bpe.model ]; then
|
||||||
./local/train_bpe_model.py \
|
./local/train_bpe_model.py \
|
||||||
--lang-dir $lang_dir \
|
--lang-dir $lang_dir \
|
||||||
--transcript ./data/lang_char/transcript_chars.txt \
|
--transcript ./data/lang_char/transcript_chars.txt \
|
||||||
--vocab-size $vocab_size
|
--vocab-size $vocab_size
|
||||||
done
|
|
||||||
|
|
||||||
./local/train_bpe_model.py --lang-dir ./data/lang_bpe_${vocab_size}
|
./local/bpe_model_to_tokens.py $lang_dir/bpe.model > $lang_dir/tokens.txt
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
||||||
|
cp data/lang_char/words.txt $lang_dir
|
||||||
|
|
||||||
|
./local/prepare_lang_bpe.py --lang-dir $lang_dir
|
||||||
|
log "Validating $lang_dir/lexicon.txt"
|
||||||
|
./local/validate_bpe_lexicon.py \
|
||||||
|
--lexicon $lang_dir/lexicon.txt \
|
||||||
|
--bpe-model $lang_dir/bpe.model
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f $lang_dir/L.fst ]; then
|
||||||
|
log "Converting L.pt to L.fst"
|
||||||
|
./shared/convert-k2-to-openfst.py \
|
||||||
|
--olabels aux_labels \
|
||||||
|
$lang_dir/L.pt \
|
||||||
|
$lang_dir/L.fst
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f $lang_dir/L_disambig.fst ]; then
|
||||||
|
log "Converting L_disambig.pt to L_disambig.fst"
|
||||||
|
./shared/convert-k2-to-openfst.py \
|
||||||
|
--olabels aux_labels \
|
||||||
|
$lang_dir/L_disambig.pt \
|
||||||
|
$lang_dir/L_disambig.fst
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
|
||||||
|
log "Stage 14: Prepare G"
|
||||||
|
|
||||||
|
if [ -d ../../wenetspeech/ASR/data/lang_char/ ]; then
|
||||||
|
cd data
|
||||||
|
cp -r ../../../../wenetspeech/ASR/data/lm .
|
||||||
|
cd ..
|
||||||
|
else
|
||||||
|
log "Abort! Please run ../../wenetspeech/ASR/prepare.sh"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 15 ] && [ $stop_stage -ge 15 ]; then
|
||||||
|
log "Stage 15: Compile LG"
|
||||||
|
for vocab_size in ${vocab_sizes[@]}; do
|
||||||
|
lang_dir=data/lang_bpe_${vocab_size}
|
||||||
|
|
||||||
|
python ./local/compile_lg.py --lang-dir $lang_dir
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
@ -33,9 +33,9 @@ dataset, you should change the argument values according to your dataset.
|
|||||||
|
|
||||||
./zipformer/export.py \
|
./zipformer/export.py \
|
||||||
--exp-dir ./zipformer/exp \
|
--exp-dir ./zipformer/exp \
|
||||||
--tokens data/lang_bpe_500/tokens.txt \
|
--tokens data/lang_bpe_2000/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 23 \
|
||||||
--avg 9 \
|
--avg 1 \
|
||||||
--jit 1
|
--jit 1
|
||||||
|
|
||||||
It will generate a file `jit_script.pt` in the given `exp_dir`. You can later
|
It will generate a file `jit_script.pt` in the given `exp_dir`. You can later
|
||||||
@ -53,9 +53,9 @@ for how to use the exported models outside of icefall.
|
|||||||
--causal 1 \
|
--causal 1 \
|
||||||
--chunk-size 16 \
|
--chunk-size 16 \
|
||||||
--left-context-frames 128 \
|
--left-context-frames 128 \
|
||||||
--tokens data/lang_bpe_500/tokens.txt \
|
--tokens data/lang_bpe_2000/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 23 \
|
||||||
--avg 9 \
|
--avg 1 \
|
||||||
--jit 1
|
--jit 1
|
||||||
|
|
||||||
It will generate a file `jit_script_chunk_16_left_128.pt` in the given `exp_dir`.
|
It will generate a file `jit_script_chunk_16_left_128.pt` in the given `exp_dir`.
|
||||||
@ -72,18 +72,18 @@ for how to use the exported models outside of icefall.
|
|||||||
|
|
||||||
./zipformer/export.py \
|
./zipformer/export.py \
|
||||||
--exp-dir ./zipformer/exp \
|
--exp-dir ./zipformer/exp \
|
||||||
--tokens data/lang_bpe_500/tokens.txt \
|
--tokens data/lang_bpe_2000/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 23 \
|
||||||
--avg 9
|
--avg 1
|
||||||
|
|
||||||
- For streaming model:
|
- For streaming model:
|
||||||
|
|
||||||
./zipformer/export.py \
|
./zipformer/export.py \
|
||||||
--exp-dir ./zipformer/exp \
|
--exp-dir ./zipformer/exp \
|
||||||
--causal 1 \
|
--causal 1 \
|
||||||
--tokens data/lang_bpe_500/tokens.txt \
|
--tokens data/lang_bpe_2000/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 23 \
|
||||||
--avg 9
|
--avg 1
|
||||||
|
|
||||||
It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
|
It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
|
||||||
load it by `icefall.checkpoint.load_checkpoint()`.
|
load it by `icefall.checkpoint.load_checkpoint()`.
|
||||||
@ -103,7 +103,7 @@ you can do:
|
|||||||
--avg 1 \
|
--avg 1 \
|
||||||
--max-duration 600 \
|
--max-duration 600 \
|
||||||
--decoding-method greedy_search \
|
--decoding-method greedy_search \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model
|
--bpe-model data/lang_bpe_2000/bpe.model
|
||||||
|
|
||||||
- For streaming model:
|
- For streaming model:
|
||||||
|
|
||||||
@ -124,7 +124,7 @@ To use the generated file with `zipformer/decode.py` and `zipformer/streaming_de
|
|||||||
--chunk-size 16 \
|
--chunk-size 16 \
|
||||||
--left-context-frames 128 \
|
--left-context-frames 128 \
|
||||||
--decoding-method greedy_search \
|
--decoding-method greedy_search \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model
|
--bpe-model data/lang_bpe_2000/bpe.model
|
||||||
|
|
||||||
# chunk-wise streaming decoding
|
# chunk-wise streaming decoding
|
||||||
./zipformer/streaming_decode.py \
|
./zipformer/streaming_decode.py \
|
||||||
@ -136,7 +136,7 @@ To use the generated file with `zipformer/decode.py` and `zipformer/streaming_de
|
|||||||
--chunk-size 16 \
|
--chunk-size 16 \
|
||||||
--left-context-frames 128 \
|
--left-context-frames 128 \
|
||||||
--decoding-method greedy_search \
|
--decoding-method greedy_search \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model
|
--bpe-model data/lang_bpe_2000/bpe.model
|
||||||
|
|
||||||
Check ./pretrained.py for its usage.
|
Check ./pretrained.py for its usage.
|
||||||
|
|
||||||
@ -207,7 +207,7 @@ def get_parser():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--epoch",
|
"--epoch",
|
||||||
type=int,
|
type=int,
|
||||||
default=30,
|
default=23,
|
||||||
help="""It specifies the checkpoint to use for decoding.
|
help="""It specifies the checkpoint to use for decoding.
|
||||||
Note: Epoch counts from 1.
|
Note: Epoch counts from 1.
|
||||||
You can specify --avg to use more checkpoints for model averaging.""",
|
You can specify --avg to use more checkpoints for model averaging.""",
|
||||||
@ -226,7 +226,7 @@ def get_parser():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--avg",
|
"--avg",
|
||||||
type=int,
|
type=int,
|
||||||
default=9,
|
default=1,
|
||||||
help="Number of checkpoints to average. Automatically select "
|
help="Number of checkpoints to average. Automatically select "
|
||||||
"consecutive checkpoints before the checkpoint specified by "
|
"consecutive checkpoints before the checkpoint specified by "
|
||||||
"'--epoch' and '--iter'",
|
"'--epoch' and '--iter'",
|
||||||
@ -255,7 +255,7 @@ def get_parser():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--tokens",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/tokens.txt",
|
default="data/lang_bpe_2000/tokens.txt",
|
||||||
help="Path to the tokens.txt",
|
help="Path to the tokens.txt",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user