diff --git a/egs/libriheavy/TTS/local/prepare_manifest.py b/egs/libriheavy/TTS/local/prepare_manifest.py index 0c01ded2d..3225899bd 100755 --- a/egs/libriheavy/TTS/local/prepare_manifest.py +++ b/egs/libriheavy/TTS/local/prepare_manifest.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# Copyright 2023 Xiaomi Corp. (authors: Wei Kang) +# Copyright 2024 Xiaomi Corp. (authors: Yifan Yang) # # See ../../../../LICENSE for clarification regarding multiple authors # @@ -28,9 +28,10 @@ from tqdm import tqdm from icefall.utils import str2bool -class TextNormlizer: +class TextNormalizer: def __init__(self): self.en_tn_model = EnNormalizer(cache_dir="/tmp/tn", overwrite_cache=False) + self.table = str.maketrans("’‘,。;?!():-《》、“”【】", "'',.;?!(): <>/\"\"[]") def __call__(self, cut): text = cut["supervisions"][0]["custom"]["texts"][0] @@ -40,8 +41,7 @@ class TextNormlizer: text = re.sub(r"\([^\)]*\)", " ", text) # Apply mappings - table = str.maketrans("’‘,。;?!():-《》、“”【】", "'',.;?!(): <>/\"\"[]") - text = text.translate(table) + text = text.translate(self.table) # Remove extra spaces text = re.sub(r"\s+", " ", text).strip() @@ -61,18 +61,15 @@ def main(): fname = Path(sys.argv[1]).name oname = Path(sys.argv[2]) / fname - tn = TextNormlizer() - with gzip.open(sys.argv[1], "r") as fin, ProcessPoolExecutor() as ex: - futures = [] - cuts = [] - for line in tqdm(fin, desc="Distributing tasks"): - cut = json.loads(line) - futures.append(ex.submit(tn, cut)) + tn = TextNormalizer() + with ProcessPoolExecutor() as ex: + with gzip.open(sys.argv[1], "r") as fin: + cuts = (json.loads(line) for line in fin) + results = ex.map(tn, cuts) - with gzip.open(oname, "w") as fout: - for future in tqdm(futures, desc="Processing"): - cut = future.result() - fout.write((json.dumps(cut) + "\n").encode()) + with gzip.open(oname, "w") as fout: + for cut in tqdm(results, desc="Processing"): + fout.write((json.dumps(cut) + "\n").encode()) if __name__ == "__main__":