Mirror of https://github.com/k2-fsa/icefall.git

Commit 73fa1651f0 (parent 2091bb5f25): minor updates to utils.py
@@ -35,36 +35,50 @@ def get_facebook_biasing_list(
     return output


-def get_rare_words():
-    txt_path = f"data/lang_bpe_500/transcript_words_{subset}.txt"
+def get_rare_words(subset: str, min_count: int):
+    txt_path = f"data/tmp/transcript_words_{subset}.txt"
     rare_word_file = f"data/context_biasing/{subset}_rare_words_{min_count}.txt"

     if os.path.exists(rare_word_file):
         print("File exists, do not proceed!")
         return
-    with open(txt_path, "r") as file:
-        words = file.read().upper().split()
-        word_count = {}
-        for word in words:
-            word = remove_non_alphabetic(word, strict=False)
-            if word not in word_count:
-                word_count[word] = 1
-            else:
-                word_count[word] += 1
+    print(f"Finding rare words in the manifest.")
+    count_file = f"data/tmp/transcript_words_{subset}_count.txt"
+    if not os.path.exists(count_file):
+        with open(txt_path, "r") as file:
+            words = file.read().upper().split()
+            word_count = {}
+            for word in words:
+                word = remove_non_alphabetic(word, strict=False)
+                word = word.split()
+                for w in word:
+                    if w not in word_count:
+                        word_count[w] = 1
+                    else:
+                        word_count[w] += 1
+
+        with open(count_file, 'w') as fout:
+            for w in word_count:
+                fout.write(f"{w}\t{word_count[w]}")
+    else:
+        word_count = {}
+        with open(count_file, 'r') as fin:
+            word_count = fin.read().split('\n')
+            word_count = [pair.split() for pair in word_count]

     print(f"A total of {len(word_count)} words appeared!")
     rare_words = []
     for k in word_count:
-        if word_count[k] <= min_count:
+        if int(word_count[k]) <= min_count:
             rare_words.append(k+"\n")
-    print(f"A total of {len(rare_words)} appeared <= 10 times")
+    print(f"A total of {len(rare_words)} appeared <= {min_count} times")

     with open(rare_word_file, 'w') as f:
         f.writelines(rare_words)

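Two details of this hunk are worth flagging. The write branch emits "word<TAB>count" pairs without a trailing newline, so the read branch's fin.read().split('\n') would see a single line; and the read branch leaves word_count as a list of [word, count] pairs, while the loop below it indexes word_count[k] as if it were a dict. A minimal sketch (hypothetical helpers, not the committed code) of a round trip that keeps both branches consistent:

# Hypothetical helpers, not part of this commit: write/read the word-count
# cache so that word_count stays a dict on both paths.
def write_counts(count_file: str, word_count: dict) -> None:
    with open(count_file, "w") as fout:
        for w, n in word_count.items():
            fout.write(f"{w}\t{n}\n")  # trailing newline so split("\n") works


def read_counts(count_file: str) -> dict:
    with open(count_file, "r") as fin:
        lines = [ln for ln in fin.read().split("\n") if ln]  # drop empty tail
    # Dict keyed by word keeps the later word_count[k] lookups valid.
    return {w: int(n) for w, n in (ln.split("\t") for ln in lines)}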
 def add_context_list_to_manifest(subset: str, min_count: int):
     rare_words_file = f"data/context_biasing/{subset}_rare_words_{min_count}.txt"
-    manifest_dir = f"data/fbank/librilight_cuts_train_{subset}.jsonl.gz"
+    manifest_dir = f"data/fbank/libriheavy_cuts_{subset}.jsonl.gz"

     target_manifest_dir = manifest_dir.replace(".jsonl.gz", f"_with_context_list_min_count_{min_count}.jsonl.gz")
     if os.path.exists(target_manifest_dir):
@@ -82,7 +96,7 @@ def add_context_list_to_manifest(subset: str, min_count: int):
     print(f"Loaded manifest from {manifest_dir}")

     def _add_context(c: Cut):
-        splits = remove_non_alphabetic(c.supervisions[0].text).upper().split()
+        splits = remove_non_alphabetic(c.supervisions[0].texts[0], strict=False).upper().split()
         found = []
         for w in splits:
             if w in rare_words:
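The hunk cuts off before _add_context stores its result, but since check() below reads c.supervisions[0].context_list as a space-separated string, the function presumably finishes by joining the matched rare words onto the supervision. A sketch under that assumption (rare_words and remove_non_alphabetic come from the enclosing scope, as in the diff):

# Sketch only; the actual body continues past the displayed hunk.
def _add_context(c):
    splits = remove_non_alphabetic(
        c.supervisions[0].texts[0], strict=False
    ).upper().split()
    # Collect the rare words that occur in this cut's transcript.
    found = [w for w in splits if w in rare_words]
    # Assumed storage format, inferred from check(): space-separated string.
    c.supervisions[0].context_list = " ".join(found)
    return c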
@@ -97,17 +111,21 @@ def add_context_list_to_manifest(subset: str, min_count: int):


 def check(subset: str, min_count: int):
-    manifest_dir = f"data/fbank/librilight_cuts_train_{subset}_with_context_list_min_count_{min_count}.jsonl.gz"
+    #manifest_dir = f"data/fbank/libriheavy_cuts_{subset}_with_context_list_min_count_{min_count}.jsonl.gz"
+    print("Calculating the stats over the manifest")
+    manifest_dir = f"data/fbank/libriheavy_cuts_{subset}_with_context_list_min_count_{min_count}.jsonl.gz"
     cuts = load_manifest_lazy(manifest_dir)
     total_cuts = len(cuts)
     has_context_list = [c.supervisions[0].context_list != "" for c in cuts]
+    context_list_len = [len(c.supervisions[0].context_list.split()) for c in cuts]
     print(f"{sum(has_context_list)}/{total_cuts} cuts have context list! ")
+    print(f"Average length of non-empty context list is {sum(context_list_len)/sum(has_context_list)}")

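One caveat in the new stats: if no cut has a non-empty context list, sum(has_context_list) is 0 and the average-length print raises ZeroDivisionError. A guarded variant (a sketch, not the committed code):

# Sketch: guard the average so an empty manifest does not crash the stats.
n_with_context = sum(has_context_list)
if n_with_context > 0:
    avg_len = sum(context_list_len) / n_with_context
    print(f"Average length of non-empty context list is {avg_len}")
else:
    print("No cut has a non-empty context list.")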
 if __name__=="__main__":
     #test_set = "test-clean"
     #get_facebook_biasing_list(test_set)
-    #get_rare_words()
-    subset = "small"
-    add_context_list_to_manifest(subset=subset)
+    subset = "medium"
+    min_count = 10
+    #get_rare_words(subset, min_count)
+    #add_context_list_to_manifest(subset=subset, min_count=min_count)
+    check(subset=subset, min_count=min_count)
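For reference, the commented-out calls in the new __main__ spell out the intended pipeline order. On a fresh setup all three stages would run in sequence (a sketch, assuming the data/ layout used above already exists):

# Sketch: the full pipeline, with every stage enabled.
if __name__ == "__main__":
    subset = "medium"
    min_count = 10
    get_rare_words(subset, min_count)  # 1) build the rare-word list
    add_context_list_to_manifest(subset=subset, min_count=min_count)  # 2) attach context lists
    check(subset=subset, min_count=min_count)  # 3) sanity stats over the manifest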