mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
remove ineffective normalize_text_alimeeting
This commit is contained in:
parent
26aef4a926
commit
d1c336f589
@ -68,12 +68,7 @@ from whisper_encoder_forward_monkey_patch import replace_whisper_encoder_forward
|
|||||||
|
|
||||||
from icefall.dist import get_rank, get_world_size
|
from icefall.dist import get_rank, get_world_size
|
||||||
from icefall.env import get_env_info
|
from icefall.env import get_env_info
|
||||||
from icefall.utils import (
|
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
|
||||||
AttributeDict,
|
|
||||||
MetricsTracker,
|
|
||||||
setup_logger,
|
|
||||||
str2bool,
|
|
||||||
)
|
|
||||||
|
|
||||||
DEFAULT_SPEECH_TOKEN = "<speech>"
|
DEFAULT_SPEECH_TOKEN = "<speech>"
|
||||||
|
|
||||||
@ -331,42 +326,6 @@ def compute_loss(
|
|||||||
|
|
||||||
return input_ids, attention_mask, target_ids
|
return input_ids, attention_mask, target_ids
|
||||||
|
|
||||||
# Marker tokens stripped in "m2met" mode. They are removed one after another in
# this order, so a tag exposed by an earlier removal is still caught later.
_M2MET_TAGS = ("<sil>", "<%>", "<->", "<$>", "<#>", "<_>", "<space>")

# Single-character rewrites applied in one pass after the tags are gone.
# Both the ASCII and the full-width spellings are covered: a replacement such
# as "A" -> "A" is a no-op, and lowercase ASCII letters cannot survive the
# upper-casing step, so the letter rules only make sense for the full-width
# forms (NOTE(review): confirm against the upstream M2MeT script).
_M2MET_CHAR_TABLE = str.maketrans(
    {
        "`": None,
        "&": None,
        ",": None,
        "?": None,
        "\uff0c": None,  # full-width comma
        "\uff1f": None,  # full-width question mark
        "丶": None,
        "。": None,
        "、": None,
        "\uff21": "A",  # full-width A
        "\uff41": "A",  # full-width a
        "\uff22": "B",  # full-width B
        "\uff42": "B",  # full-width b
        "\uff23": "C",  # full-width C
        "\uff43": "C",  # full-width c
        "\uff2b": "K",  # full-width K
        "\uff4b": "K",  # full-width k
        "\uff34": "T",  # full-width T
        "\uff54": "T",  # full-width t
    }
)


def normalize_text_alimeeting(text: str, normalize: str = "m2met") -> str:
    """Normalize a transcript in the style of the M2MeT challenge baseline.

    See: https://github.com/yufan-aslp/AliMeeting/blob/main/asr/local/text_normalize.pl

    Args:
      text:
        The transcript to normalize.
      normalize:
        ``"none"`` returns ``text`` unchanged. ``"m2met"`` removes spaces,
        the marker tokens in ``_M2MET_TAGS`` and a fixed set of punctuation,
        upper-cases the text when it contains an ASCII letter, and maps the
        full-width letters A/B/C/K/T to their ASCII upper-case form.

    Returns:
      The normalized transcript.

    Raises:
      ValueError: If ``normalize`` is not one of the supported modes
        (previously this silently returned ``None``).
    """
    if normalize == "none":
        return text
    if normalize != "m2met":
        raise ValueError(
            f"Unsupported normalize mode: {normalize!r} "
            "(expected 'none' or 'm2met')"
        )

    import re

    # Spaces go first so that tags written with inner spaces are recognised.
    text = text.replace(" ", "")
    for tag in _M2MET_TAGS:
        text = text.replace(tag, "")

    # Upper-case only when there is at least one ASCII letter.
    if re.search("[a-zA-Z]", text):
        text = text.upper()

    return text.translate(_M2MET_CHAR_TABLE)
|
|
||||||
|
|
||||||
device = next(model.parameters()).device
|
device = next(model.parameters()).device
|
||||||
feature = batch["inputs"]
|
feature = batch["inputs"]
|
||||||
|
|
||||||
@ -377,8 +336,6 @@ def compute_loss(
|
|||||||
batch_idx_train = params.batch_idx_train
|
batch_idx_train = params.batch_idx_train
|
||||||
supervisions = batch["supervisions"]
|
supervisions = batch["supervisions"]
|
||||||
texts = batch["supervisions"]["text"]
|
texts = batch["supervisions"]["text"]
|
||||||
# remove spaces in texts
|
|
||||||
texts = [normalize_text_alimeeting(text) for text in texts]
|
|
||||||
|
|
||||||
messages = []
|
messages = []
|
||||||
for i, text in enumerate(texts):
|
for i, text in enumerate(texts):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user