use multi process

2025-12-11 06:55:27 +00:00 · 2024-10-30 21:11:42 -07:00 · 2024-10-30 21:11:42 -07:00 · 258e106904
commit 258e106904
parent d3e3de8395
2 changed files with 24 additions and 19 deletions
--- a/egs/libriheavy/TTS/local/prepare_manifest.py
+++ b/egs/libriheavy/TTS/local/prepare_manifest.py
@@ -19,20 +19,23 @@ import gzip
 import json
 import re
 import sys
 from concurrent.futures.process import ProcessPoolExecutor
 from pathlib import Path
 from tn.english.normalizer import Normalizer as EnNormalizer
 from tqdm import tqdm
 from icefall.utils import str2bool
 class TextNormlizer:
    def __init__(self):
-        self.en_tn_model = EnNormalizer()
+        self.en_tn_model = EnNormalizer(cache_dir="/tmp/tn", overwrite_cache=False)
-    def __call__(self, text):
+    def __call__(self, cut):
-        # brackets
+        text = cut["supervisions"][0]["custom"]["texts"][0]
-        # Always text inside brackets with numbers in them. Usually corresponds to "(Sam 23:17)"
+
        # Process brackets
        text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
        text = re.sub(r"\([^\)]*\)", " ", text)
@@ -44,29 +47,31 @@ class TextNormlizer:
        text = re.sub(r"\s+", " ", text).strip()
        text = self.en_tn_model.normalize(text)
-        return text.strip()
+
        cut["supervisions"][0]["text"] = text
        del cut["supervisions"][0]["custom"]
        del cut["custom"]
        return cut
 # Assign text of the supervisions and remove unnecessary entries.
 def main():
-    assert (
+    assert len(sys.argv) == 3, "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR"
        len(sys.argv) == 4
    ), "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR KEEP_CUSTOM_FIELDS"
    fname = Path(sys.argv[1]).name
    oname = Path(sys.argv[2]) / fname
    keep_custom_fields = str2bool(sys.argv[3])
    tn = TextNormlizer()
-
+    with gzip.open(sys.argv[1], "r") as fin, ProcessPoolExecutor() as ex:
-    with gzip.open(sys.argv[1], "r") as fin, gzip.open(oname, "w") as fout:
+        futures = []
-        for line in fin:
+        cuts = []
        for line in tqdm(fin, desc="Distributing tasks"):
            cut = json.loads(line)
-            cut["supervisions"][0]["text"] = tn(
+            futures.append(ex.submit(tn, cut))
-                cut["supervisions"][0]["custom"]["texts"][0]
+
-            )
+    with gzip.open(oname, "w") as fout:
-            if not keep_custom_fields:
+        for future in tqdm(futures, desc="Processing"):
-                del cut["supervisions"][0]["custom"]
+            cut = future.result()
                del cut["custom"]
            fout.write((json.dumps(cut) + "\n").encode())
--- a/egs/libriheavy/TTS/prepare.sh
+++ b/egs/libriheavy/TTS/prepare.sh
@@ -87,7 +87,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  for subset in small medium large dev test_clean test_other; do
    if [ ! -e $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
      log "Prepare manifest for subset : ${subset}"
-      ./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir False
+      ./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir
    fi
  done
 fi