fix typos in icefall/utils.py (#1319)

This commit is contained in:
Surav Shrestha 2023-10-19 09:00:18 +05:45 committed by GitHub
parent 98c5286404
commit 36c60b0cf6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1447,7 +1447,7 @@ def get_parameter_groups_with_lrs(
This is for use with the ScaledAdam optimizers (more recent versions that accept lists of This is for use with the ScaledAdam optimizers (more recent versions that accept lists of
named-parameters; we can, if needed, create a version without the names). named-parameters; we can, if needed, create a version without the names).
It provides a way to specifiy learning-rate scales inside the module, so that if It provides a way to specify learning-rate scales inside the module, so that if
any nn.Module in the hierarchy has a floating-point parameter 'lr_scale', it will any nn.Module in the hierarchy has a floating-point parameter 'lr_scale', it will
scale the LR of any parameters inside that module or its submodules. Note: you scale the LR of any parameters inside that module or its submodules. Note: you
can set module parameters outside the __init__ function, e.g.: can set module parameters outside the __init__ function, e.g.:
@ -1607,10 +1607,10 @@ def tokenize_by_bpe_model(
chars = pattern.split(txt.upper()) chars = pattern.split(txt.upper())
mix_chars = [w for w in chars if len(w.strip()) > 0] mix_chars = [w for w in chars if len(w.strip()) > 0]
for ch_or_w in mix_chars: for ch_or_w in mix_chars:
# ch_or_w is a single CJK charater(i.e., "你"), do nothing. # ch_or_w is a single CJK character(i.e., "你"), do nothing.
if pattern.fullmatch(ch_or_w) is not None: if pattern.fullmatch(ch_or_w) is not None:
tokens.append(ch_or_w) tokens.append(ch_or_w)
# ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), # ch_or_w contains non-CJK characters(i.e., " IT'S OKAY "),
# encode ch_or_w using bpe_model. # encode ch_or_w using bpe_model.
else: else:
for p in sp.encode_as_pieces(ch_or_w): for p in sp.encode_as_pieces(ch_or_w):
@ -1624,7 +1624,7 @@ def tokenize_by_CJK_char(line: str) -> str:
""" """
Tokenize a line of text with CJK char. Tokenize a line of text with CJK char.
Note: All return charaters will be upper case. Note: All return characters will be upper case.
Example: Example:
input = "你好世界是 hello world 的中文" input = "你好世界是 hello world 的中文"
@ -1917,7 +1917,7 @@ def parse_bpe_timestamps_and_texts(
A k2.Fsa with best_paths.arcs.num_axes() == 3, i.e. A k2.Fsa with best_paths.arcs.num_axes() == 3, i.e.
containing multiple FSAs, which is expected to be the result containing multiple FSAs, which is expected to be the result
of k2.shortest_path (otherwise the returned values won't of k2.shortest_path (otherwise the returned values won't
be meaningful). Its attribtutes `labels` and `aux_labels` be meaningful). Its attributes `labels` and `aux_labels`
are both BPE tokens. are both BPE tokens.
sp: sp:
The BPE model. The BPE model.
@ -2045,7 +2045,7 @@ def parse_fsa_timestamps_and_texts(
) -> Tuple[List[Tuple[float, float]], List[List[str]]]: ) -> Tuple[List[Tuple[float, float]], List[List[str]]]:
"""Parse timestamps (in seconds) and texts for given decoded fsa paths. """Parse timestamps (in seconds) and texts for given decoded fsa paths.
Currently it supports two cases: Currently it supports two cases:
(1) ctc-decoding, the attribtutes `labels` and `aux_labels` (1) ctc-decoding, the attributes `labels` and `aux_labels`
are both BPE tokens. In this case, sp should be provided. are both BPE tokens. In this case, sp should be provided.
(2) HLG-based 1best, the attribute `labels` is the prediction unit, (2) HLG-based 1best, the attribute `labels` is the prediction unit,
e.g., phone or BPE tokens; attribute `aux_labels` is the word index. e.g., phone or BPE tokens; attribute `aux_labels` is the word index.