mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-14 04:22:21 +00:00
use multi process
This commit is contained in:
parent
d3e3de8395
commit
258e106904
@ -19,20 +19,23 @@ import gzip
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from concurrent.futures.process import ProcessPoolExecutor
|
||||
from pathlib import Path
|
||||
|
||||
from tn.english.normalizer import Normalizer as EnNormalizer
|
||||
from tqdm import tqdm
|
||||
|
||||
from icefall.utils import str2bool
|
||||
|
||||
|
||||
class TextNormlizer:
|
||||
def __init__(self):
|
||||
self.en_tn_model = EnNormalizer()
|
||||
self.en_tn_model = EnNormalizer(cache_dir="/tmp/tn", overwrite_cache=False)
|
||||
|
||||
def __call__(self, text):
|
||||
# brackets
|
||||
# Always remove text inside brackets that contains numbers; it usually corresponds to a reference such as "(Sam 23:17)".
|
||||
def __call__(self, cut):
|
||||
text = cut["supervisions"][0]["custom"]["texts"][0]
|
||||
|
||||
# Process brackets
|
||||
text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
|
||||
text = re.sub(r"\([^\)]*\)", " ", text)
|
||||
|
||||
@ -44,29 +47,31 @@ class TextNormlizer:
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
text = self.en_tn_model.normalize(text)
|
||||
return text.strip()
|
||||
|
||||
cut["supervisions"][0]["text"] = text
|
||||
del cut["supervisions"][0]["custom"]
|
||||
del cut["custom"]
|
||||
|
||||
return cut
|
||||
|
||||
|
||||
# Assign text of the supervisions and remove unnecessary entries.
|
||||
def main():
|
||||
assert (
|
||||
len(sys.argv) == 4
|
||||
), "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR KEEP_CUSTOM_FIELDS"
|
||||
assert len(sys.argv) == 3, "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR"
|
||||
fname = Path(sys.argv[1]).name
|
||||
oname = Path(sys.argv[2]) / fname
|
||||
keep_custom_fields = str2bool(sys.argv[3])
|
||||
|
||||
tn = TextNormlizer()
|
||||
|
||||
with gzip.open(sys.argv[1], "r") as fin, gzip.open(oname, "w") as fout:
|
||||
for line in fin:
|
||||
with gzip.open(sys.argv[1], "r") as fin, ProcessPoolExecutor() as ex:
|
||||
futures = []
|
||||
cuts = []
|
||||
for line in tqdm(fin, desc="Distributing tasks"):
|
||||
cut = json.loads(line)
|
||||
cut["supervisions"][0]["text"] = tn(
|
||||
cut["supervisions"][0]["custom"]["texts"][0]
|
||||
)
|
||||
if not keep_custom_fields:
|
||||
del cut["supervisions"][0]["custom"]
|
||||
del cut["custom"]
|
||||
futures.append(ex.submit(tn, cut))
|
||||
|
||||
with gzip.open(oname, "w") as fout:
|
||||
for future in tqdm(futures, desc="Processing"):
|
||||
cut = future.result()
|
||||
fout.write((json.dumps(cut) + "\n").encode())
|
||||
|
||||
|
||||
|
@ -87,7 +87,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
for subset in small medium large dev test_clean test_other; do
|
||||
if [ ! -e $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
|
||||
log "Prepare manifest for subset : ${subset}"
|
||||
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir False
|
||||
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
Loading…
x
Reference in New Issue
Block a user