mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-04 06:34:20 +00:00
Create convert_transcript_words_to_tokens.py
This commit is contained in:
parent
ecfbd090af
commit
2a1877486e
104
egs/multi_zh-hans/ASR/local/convert_transcript_words_to_tokens.py
Executable file
104
egs/multi_zh-hans/ASR/local/convert_transcript_words_to_tokens.py
Executable file
@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
|
||||
"""
|
||||
Convert a transcript file containing words to a corpus file containing tokens
|
||||
for LM training with the help of a lexicon.
|
||||
|
||||
If the lexicon contains phones, the resulting LM will be a phone LM; if the
|
||||
lexicon contains word pieces, the resulting LM will be a word piece LM.
|
||||
|
||||
If a word has multiple pronunciations, the one that appears first in the lexicon
|
||||
is kept; others are removed.
|
||||
|
||||
If the input transcript is:
|
||||
|
||||
hello zoo world hello
|
||||
world zoo
|
||||
foo zoo world hellO
|
||||
|
||||
and if the lexicon is
|
||||
|
||||
<UNK> SPN
|
||||
hello h e l l o 2
|
||||
hello h e l l o
|
||||
world w o r l d
|
||||
zoo z o o
|
||||
|
||||
Then the output is
|
||||
|
||||
h e l l o 2 z o o w o r l d h e l l o 2
|
||||
w o r l d z o o
|
||||
SPN z o o w o r l d SPN
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
from generate_unique_lexicon import filter_multiple_pronunications
|
||||
|
||||
from icefall.lexicon import read_lexicon
|
||||
from icefall.utils import tokenize_by_CJK_char
|
||||
|
||||
|
||||
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      An argparse.Namespace with the attributes `transcript`, `lexicon`
      and `oov`. `transcript` and `lexicon` are None when they are not
      given on the command line; `oov` defaults to "<UNK>".
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--transcript",
        type=str,
        # Adjacent string literals are concatenated verbatim, so the first
        # sentence needs its own trailing space to avoid "file.We assume".
        help="The input transcript file. "
        "We assume that the transcript file consists of "
        "lines. Each line consists of space separated words.",
    )
    parser.add_argument("--lexicon", type=str, help="The input lexicon file.")
    parser.add_argument("--oov", type=str, default="<UNK>", help="The OOV word.")

    return parser.parse_args()
|
||||
|
||||
|
||||
def process_line(
    lexicon: Dict[str, List[str]], line: str, oov_token: List[str]
) -> None:
    """Convert one line of transcript to tokens and print it to stdout.

    Args:
      lexicon:
        A dict containing pronunciations. Its keys are words and values
        are pronunciations (i.e., lists of tokens).
      line:
        A line of transcript. It is passed through `tokenize_by_CJK_char`
        and the result is split on whitespace to obtain the words
        (presumably so that every CJK character becomes its own word;
        see `icefall.utils.tokenize_by_CJK_char` to confirm).
      oov_token:
        The pronunciation (a list of tokens) used for every word in `line`
        that is not present in the lexicon. It must be a list rather than
        a plain string: a string would be joined character by character.
    Returns:
      Return None. The space-separated tokens are printed as one line.
    """
    words = tokenize_by_CJK_char(line).strip().split()
    # One space-joined pronunciation per word; unknown words fall back to
    # the OOV pronunciation.
    pronunciations = (" ".join(lexicon.get(w, oov_token)) for w in words)
    print(" ".join(pronunciations).strip())
|
||||
|
||||
|
||||
def main():
    """Read the lexicon and the transcript, then print the token corpus.

    Every line of the transcript file is converted with `process_line`,
    which writes the resulting token sequence to stdout.

    Raises:
      AssertionError: if the lexicon or the transcript is not an existing
        file, if the OOV word is empty, or if the OOV word has no entry in
        the lexicon.
    """
    args = get_args()
    assert Path(args.lexicon).is_file(), f"{args.lexicon} is not a file"
    assert Path(args.transcript).is_file(), f"{args.transcript} is not a file"
    assert len(args.oov) > 0, "--oov must not be empty"

    # Only the first pronunciation of a word is kept
    lexicon = filter_multiple_pronunications(read_lexicon(args.lexicon))

    lexicon = dict(lexicon)

    assert args.oov in lexicon, f"OOV word {args.oov} is not in {args.lexicon}"

    # The pronunciation substituted for every out-of-vocabulary word.
    oov_token = lexicon[args.oov]

    # Decode explicitly as UTF-8: the words are tokenized by CJK character,
    # so the transcript must not depend on the platform's locale encoding.
    with open(args.transcript, encoding="utf-8") as f:
        for line in f:
            process_line(lexicon=lexicon, line=line, oov_token=oov_token)
|
||||
|
||||
|
||||
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|
Loading…
x
Reference in New Issue
Block a user