mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
check for utterance len (#795)
Co-authored-by: behnam <basefisaray@roku.com>
This commit is contained in:
parent
d167aad4ab
commit
a54b748a02
@ -1086,7 +1086,33 @@ def run(rank, world_size, args):
|
|||||||
# You should use ../local/display_manifest_statistics.py to get
|
# You should use ../local/display_manifest_statistics.py to get
|
||||||
# an utterance duration distribution for your dataset to select
|
# an utterance duration distribution for your dataset to select
|
||||||
# the threshold
|
# the threshold
|
||||||
return 1.0 <= c.duration <= 20.0
|
if c.duration < 1.0 or c.duration > 20.0:
|
||||||
|
logging.warning(
|
||||||
|
f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
# In pruned RNN-T, we require that T >= S
|
||||||
|
# where T is the number of feature frames after subsampling
|
||||||
|
# and S is the number of tokens in the utterance
|
||||||
|
|
||||||
|
# In ./zipformer.py, the conv module uses the following expression
|
||||||
|
# for subsampling
|
||||||
|
T = ((c.num_frames - 7) // 2 + 1) // 2
|
||||||
|
tokens = sp.encode(c.supervisions[0].text, out_type=str)
|
||||||
|
|
||||||
|
if T < len(tokens):
|
||||||
|
logging.warning(
|
||||||
|
f"Exclude cut with ID {c.id} from training. "
|
||||||
|
f"Number of frames (before subsampling): {c.num_frames}. "
|
||||||
|
f"Number of frames (after subsampling): {T}. "
|
||||||
|
f"Text: {c.supervisions[0].text}. "
|
||||||
|
f"Tokens: {tokens}. "
|
||||||
|
f"Number of tokens: {len(tokens)}"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
train_cuts = train_cuts.filter(remove_short_and_long_utt)
|
train_cuts = train_cuts.filter(remove_short_and_long_utt)
|
||||||
|
|
||||||
|
@ -1077,7 +1077,33 @@ def run(rank, world_size, args):
|
|||||||
# You should use ../local/display_manifest_statistics.py to get
|
# You should use ../local/display_manifest_statistics.py to get
|
||||||
# an utterance duration distribution for your dataset to select
|
# an utterance duration distribution for your dataset to select
|
||||||
# the threshold
|
# the threshold
|
||||||
return 1.0 <= c.duration <= 20.0
|
if c.duration < 1.0 or c.duration > 20.0:
|
||||||
|
logging.warning(
|
||||||
|
f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
# In pruned RNN-T, we require that T >= S
|
||||||
|
# where T is the number of feature frames after subsampling
|
||||||
|
# and S is the number of tokens in the utterance
|
||||||
|
|
||||||
|
# In ./zipformer.py, the conv module uses the following expression
|
||||||
|
# for subsampling
|
||||||
|
T = ((c.num_frames - 7) // 2 + 1) // 2
|
||||||
|
tokens = sp.encode(c.supervisions[0].text, out_type=str)
|
||||||
|
|
||||||
|
if T < len(tokens):
|
||||||
|
logging.warning(
|
||||||
|
f"Exclude cut with ID {c.id} from training. "
|
||||||
|
f"Number of frames (before subsampling): {c.num_frames}. "
|
||||||
|
f"Number of frames (after subsampling): {T}. "
|
||||||
|
f"Text: {c.supervisions[0].text}. "
|
||||||
|
f"Tokens: {tokens}. "
|
||||||
|
f"Number of tokens: {len(tokens)}"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
train_cuts = train_cuts.filter(remove_short_and_long_utt)
|
train_cuts = train_cuts.filter(remove_short_and_long_utt)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user