use multi process

This commit is contained in:
yifanyeung 2024-10-30 21:11:42 -07:00
parent d3e3de8395
commit 258e106904
2 changed files with 24 additions and 19 deletions

View File

@@ -19,20 +19,23 @@ import gzip
import json import json
import re import re
import sys import sys
from concurrent.futures.process import ProcessPoolExecutor
from pathlib import Path from pathlib import Path
from tn.english.normalizer import Normalizer as EnNormalizer from tn.english.normalizer import Normalizer as EnNormalizer
from tqdm import tqdm
from icefall.utils import str2bool from icefall.utils import str2bool
class TextNormlizer: class TextNormlizer:
def __init__(self): def __init__(self):
self.en_tn_model = EnNormalizer() self.en_tn_model = EnNormalizer(cache_dir="/tmp/tn", overwrite_cache=False)
def __call__(self, text): def __call__(self, cut):
# brackets text = cut["supervisions"][0]["custom"]["texts"][0]
# Always text inside brackets with numbers in them. Usually corresponds to "(Sam 23:17)"
# Process brackets
text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text) text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
text = re.sub(r"\([^\)]*\)", " ", text) text = re.sub(r"\([^\)]*\)", " ", text)
@@ -44,29 +47,31 @@ class TextNormlizer:
text = re.sub(r"\s+", " ", text).strip() text = re.sub(r"\s+", " ", text).strip()
text = self.en_tn_model.normalize(text) text = self.en_tn_model.normalize(text)
return text.strip()
cut["supervisions"][0]["text"] = text
del cut["supervisions"][0]["custom"]
del cut["custom"]
return cut
# Assign text of the supervisions and remove unnecessary entries. # Assign text of the supervisions and remove unnecessary entries.
def main(): def main():
assert ( assert len(sys.argv) == 3, "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR"
len(sys.argv) == 4
), "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR KEEP_CUSTOM_FIELDS"
fname = Path(sys.argv[1]).name fname = Path(sys.argv[1]).name
oname = Path(sys.argv[2]) / fname oname = Path(sys.argv[2]) / fname
keep_custom_fields = str2bool(sys.argv[3])
tn = TextNormlizer() tn = TextNormlizer()
with gzip.open(sys.argv[1], "r") as fin, ProcessPoolExecutor() as ex:
with gzip.open(sys.argv[1], "r") as fin, gzip.open(oname, "w") as fout: futures = []
for line in fin: cuts = []
for line in tqdm(fin, desc="Distributing tasks"):
cut = json.loads(line) cut = json.loads(line)
cut["supervisions"][0]["text"] = tn( futures.append(ex.submit(tn, cut))
cut["supervisions"][0]["custom"]["texts"][0]
) with gzip.open(oname, "w") as fout:
if not keep_custom_fields: for future in tqdm(futures, desc="Processing"):
del cut["supervisions"][0]["custom"] cut = future.result()
del cut["custom"]
fout.write((json.dumps(cut) + "\n").encode()) fout.write((json.dumps(cut) + "\n").encode())

View File

@@ -87,7 +87,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
for subset in small medium large dev test_clean test_other; do for subset in small medium large dev test_clean test_other; do
if [ ! -e $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then if [ ! -e $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
log "Prepare manifest for subset : ${subset}" log "Prepare manifest for subset : ${subset}"
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir False ./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir
fi fi
done done
fi fi