update tokenizer comments

commit 3feef0a7d0
parent 07a8f050b7
The diff updates the signature and docstring of `WakeupWordTokenizer.texts_to_token_ids`, which now returns three values instead of two:

@@ -64,18 +64,23 @@ class WakeupWordTokenizer(object):
         self.negative_word_tokens = [1]
         self.negative_number_tokens = 1
 
-    def texts_to_token_ids(self, texts: List[str]) -> Tuple[torch.Tensor, int]:
+    def texts_to_token_ids(self, texts: List[str]) -> Tuple[torch.Tensor, torch.Tensor, int]:
         """Convert a list of texts to a list of k2.Fsa based texts.
 
         Args:
           texts:
-            It is a list of strings.
+            It is a list of strings,
+            each element is a reference text for an audio.
         Returns:
-          Return a list of k2.Fsa, one for an element in texts.
-          If the element is `wakeup_word`, a graph for positive samples is appneded
-          into resulting graph_vec, otherwise, a graph for negative samples is appended.
+          Return a tuple of 3 elements.
+          The first one is torch.Tensor(List[List[int]]),
+          each List[int] is the token sequence for a reference text.
 
-          Number of positive samples is also returned to track its proportion.
+          The second one is the number of tokens for each sample,
+          mainly used by CTC loss.
+
+          The last one is number_positive_samples,
+          used to track the proportion of positive samples in each batch.
         """
         batch_token_ids = []
         target_lengths = []
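
For context, here is a minimal sketch of what a method with the new signature could look like. The constructor arguments, the wake-word equality test, and the zero-padding of the token rows are all assumptions for illustration, not the repository's actual implementation:

from typing import List, Tuple

import torch


class WakeupWordTokenizer(object):
    """Sketch of the tokenizer; only what the diff above touches."""

    def __init__(self, wakeup_word: str, wakeup_word_tokens: List[int]):
        # Hypothetical constructor: the wake-up phrase and its token ids.
        # Any other text collapses to a single negative token, matching the
        # negative_word_tokens / negative_number_tokens fields in the diff.
        self.wakeup_word = wakeup_word
        self.wakeup_word_tokens = wakeup_word_tokens
        self.negative_word_tokens = [1]
        self.negative_number_tokens = 1

    def texts_to_token_ids(
        self, texts: List[str]
    ) -> Tuple[torch.Tensor, torch.Tensor, int]:
        batch_token_ids: List[List[int]] = []
        target_lengths: List[int] = []
        number_positive_samples = 0
        for text in texts:
            if text == self.wakeup_word:
                batch_token_ids.append(self.wakeup_word_tokens)
                target_lengths.append(len(self.wakeup_word_tokens))
                number_positive_samples += 1
            else:
                batch_token_ids.append(self.negative_word_tokens)
                target_lengths.append(self.negative_number_tokens)
        # Pad every row to the longest one so the batch forms a single
        # 2-D tensor; padding with 0 is an assumption, not from the diff.
        max_len = max(target_lengths)
        padded = [row + [0] * (max_len - len(row)) for row in batch_token_ids]
        token_ids = torch.tensor(padded, dtype=torch.int64)
        target_lengths_tensor = torch.tensor(target_lengths, dtype=torch.int64)
        return token_ids, target_lengths_tensor, number_positive_samples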

The same commit also removes an import in a second file (this view does not show its path):

@@ -37,7 +37,6 @@ import torch.nn as nn
 from asr_datamodule import HiMiaWuwDataModule
 from tdnn import Tdnn
 
-from lhotse.cut import Cut
 from lhotse.utils import fix_random_seed
 from torch import Tensor
 from torch.nn.parallel import DistributedDataParallel as DDP
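
The docstring says the second return value is "mainly used by CTC loss". A usage sketch, reusing the hypothetical tokenizer above (the wake word string, token ids, and tensor shapes below are made up):

import torch
import torch.nn.functional as F

tokenizer = WakeupWordTokenizer(wakeup_word="hi mia", wakeup_word_tokens=[2, 3, 4, 5])
texts = ["hi mia", "turn off the light", "hi mia", "hello"]
token_ids, target_lengths, num_positive = tokenizer.texts_to_token_ids(texts)

# Fake network output: (T, N, C) log-probabilities, as F.ctc_loss expects.
T, N, C = 50, len(texts), 10
log_probs = torch.randn(T, N, C).log_softmax(dim=-1)
input_lengths = torch.full((N,), T, dtype=torch.int64)

loss = F.ctc_loss(
    log_probs,
    token_ids,        # first return value: (N, S) padded targets
    input_lengths,
    target_lengths,   # second return value: one target length per sample
    blank=0,
    reduction="sum",
)
positive_ratio = num_positive / len(texts)  # third return value: batch statistic
print(loss.item(), positive_ratio)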