mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-15 20:22:42 +00:00
use multi process
This commit is contained in:
parent
d3e3de8395
commit
258e106904
@ -19,20 +19,23 @@ import gzip
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
from concurrent.futures.process import ProcessPoolExecutor
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from tn.english.normalizer import Normalizer as EnNormalizer
|
from tn.english.normalizer import Normalizer as EnNormalizer
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import str2bool
|
||||||
|
|
||||||
|
|
||||||
class TextNormlizer:
|
class TextNormlizer:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.en_tn_model = EnNormalizer()
|
self.en_tn_model = EnNormalizer(cache_dir="/tmp/tn", overwrite_cache=False)
|
||||||
|
|
||||||
def __call__(self, text):
|
def __call__(self, cut):
|
||||||
# brackets
|
text = cut["supervisions"][0]["custom"]["texts"][0]
|
||||||
# Always text inside brackets with numbers in them. Usually corresponds to "(Sam 23:17)"
|
|
||||||
|
# Process brackets
|
||||||
text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
|
text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
|
||||||
text = re.sub(r"\([^\)]*\)", " ", text)
|
text = re.sub(r"\([^\)]*\)", " ", text)
|
||||||
|
|
||||||
@ -44,29 +47,31 @@ class TextNormlizer:
|
|||||||
text = re.sub(r"\s+", " ", text).strip()
|
text = re.sub(r"\s+", " ", text).strip()
|
||||||
|
|
||||||
text = self.en_tn_model.normalize(text)
|
text = self.en_tn_model.normalize(text)
|
||||||
return text.strip()
|
|
||||||
|
cut["supervisions"][0]["text"] = text
|
||||||
|
del cut["supervisions"][0]["custom"]
|
||||||
|
del cut["custom"]
|
||||||
|
|
||||||
|
return cut
|
||||||
|
|
||||||
|
|
||||||
# Assign text of the supervisions and remove unnecessary entries.
|
# Assign text of the supervisions and remove unnecessary entries.
|
||||||
def main():
|
def main():
|
||||||
assert (
|
assert len(sys.argv) == 3, "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR"
|
||||||
len(sys.argv) == 4
|
|
||||||
), "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR KEEP_CUSTOM_FIELDS"
|
|
||||||
fname = Path(sys.argv[1]).name
|
fname = Path(sys.argv[1]).name
|
||||||
oname = Path(sys.argv[2]) / fname
|
oname = Path(sys.argv[2]) / fname
|
||||||
keep_custom_fields = str2bool(sys.argv[3])
|
|
||||||
|
|
||||||
tn = TextNormlizer()
|
tn = TextNormlizer()
|
||||||
|
with gzip.open(sys.argv[1], "r") as fin, ProcessPoolExecutor() as ex:
|
||||||
with gzip.open(sys.argv[1], "r") as fin, gzip.open(oname, "w") as fout:
|
futures = []
|
||||||
for line in fin:
|
cuts = []
|
||||||
|
for line in tqdm(fin, desc="Distributing tasks"):
|
||||||
cut = json.loads(line)
|
cut = json.loads(line)
|
||||||
cut["supervisions"][0]["text"] = tn(
|
futures.append(ex.submit(tn, cut))
|
||||||
cut["supervisions"][0]["custom"]["texts"][0]
|
|
||||||
)
|
with gzip.open(oname, "w") as fout:
|
||||||
if not keep_custom_fields:
|
for future in tqdm(futures, desc="Processing"):
|
||||||
del cut["supervisions"][0]["custom"]
|
cut = future.result()
|
||||||
del cut["custom"]
|
|
||||||
fout.write((json.dumps(cut) + "\n").encode())
|
fout.write((json.dumps(cut) + "\n").encode())
|
||||||
|
|
||||||
|
|
||||||
|
@ -87,7 +87,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
|||||||
for subset in small medium large dev test_clean test_other; do
|
for subset in small medium large dev test_clean test_other; do
|
||||||
if [ ! -e $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
|
if [ ! -e $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
|
||||||
log "Prepare manifest for subset : ${subset}"
|
log "Prepare manifest for subset : ${subset}"
|
||||||
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir False
|
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
Loading…
x
Reference in New Issue
Block a user