mirror of https://github.com/k2-fsa/icefall.git
remove ineffective normalize_text_alimeeting

parent 46b9be31cc
commit 26aef4a926
@@ -357,43 +357,6 @@ def decode_dataset(
     Returns:
       Return a dict, whose key may be "beam-search".
     """
-
-    def normalize_text_alimeeting(text: str, normalize: str = "m2met") -> str:
-        """
-        Text normalization similar to M2MeT challenge baseline.
-        See: https://github.com/yufan-aslp/AliMeeting/blob/main/asr/local/text_normalize.pl
-        """
-        if normalize == "none":
-            return text
-        elif normalize == "m2met":
-            import re
-
-            text = text.replace(" ", "")
-            text = text.replace("<sil>", "")
-            text = text.replace("<%>", "")
-            text = text.replace("<->", "")
-            text = text.replace("<$>", "")
-            text = text.replace("<#>", "")
-            text = text.replace("<_>", "")
-            text = text.replace("<space>", "")
-            text = text.replace("`", "")
-            text = text.replace("&", "")
-            text = text.replace(",", "")
-            if re.search("[a-zA-Z]", text):
-                text = text.upper()
-            text = text.replace("Ａ", "A")
-            text = text.replace("ａ", "A")
-            text = text.replace("ｂ", "B")
-            text = text.replace("ｃ", "C")
-            text = text.replace("ｋ", "K")
-            text = text.replace("ｔ", "T")
-            text = text.replace("，", "")
-            text = text.replace("丶", "")
-            text = text.replace("。", "")
-            text = text.replace("、", "")
-            text = text.replace("？", "")
-            return text
-
 
     results = []
     num_cuts = 0
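For reference, a minimal sketch (not part of the commit) of what the removed helper did to an M2MeT-style reference string; the sample input and output below are invented for illustration:

import re

# Strip spaces and a filler tag, drop fullwidth punctuation, then
# upper-case Latin letters, as the removed helper did.
text = "<sil>晚上，吃 火锅？ok"
for tok in (" ", "<sil>", "，", "？"):
    text = text.replace(tok, "")
if re.search("[a-zA-Z]", text):
    text = text.upper()
print(text)  # -> 晚上吃火锅OK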
@@ -406,6 +369,7 @@ def decode_dataset(
     results = defaultdict(list)
     for batch_idx, batch in enumerate(dl):
         texts = batch["supervisions"]["text"]
+        texts = [list("".join(text.split())) for text in texts]
         cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
 
         hyps_dict = decode_one_batch(
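The added line replaces the per-string normalization with a character-level split of each reference, presumably so downstream scoring compares references and hypotheses character by character. A minimal illustration with an invented sample:

texts = ["甚至 出现 交易 几乎 停滞 的 情况"]
# Join the space-separated tokens, then split the result into characters.
texts = [list("".join(text.split())) for text in texts]
print(texts[0][:4])  # -> ['甚', '至', '出', '现']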
@@ -419,11 +383,9 @@ def decode_dataset(
         this_batch = []
         assert len(hyps) == len(texts)
         for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
-            ref_text = normalize_text_alimeeting(ref_text)
-            ref_words = ref_text.split()
-            print(f"ref: {ref_text}")
+            print(f"ref: {''.join(ref_text)}")
             print(f"hyp: {''.join(hyp_words)}")
-            this_batch.append((cut_id, ref_words, hyp_words))
+            this_batch.append((cut_id, ref_text, hyp_words))
 
         results[lm_scale].extend(this_batch)
 
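After the change, ref_text arrives as a character list, so the loop joins it only for printing and appends the list unchanged. A sketch of one pass through the new loop body, with invented values:

cut_id = "cut-0"
ref_text = ["晚", "上", "好"]   # reference, already a character list
hyp_words = ["晚", "上", "好"]  # hypothesis tokens from decode_one_batch
print(f"ref: {''.join(ref_text)}")   # ref: 晚上好
print(f"hyp: {''.join(hyp_words)}")  # hyp: 晚上好
this_batch = [(cut_id, ref_text, hyp_words)]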
|