From 77560cd5e88f92f2003f40383890a2bffd7641d6 Mon Sep 17 00:00:00 2001 From: yfyeung Date: Wed, 6 Nov 2024 05:48:04 -0800 Subject: [PATCH] support resuming --- egs/libriheavy/TTS/local/prepare_manifest.py | 25 +++++++++++++++----- egs/libriheavy/TTS/prepare.sh | 2 +- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/egs/libriheavy/TTS/local/prepare_manifest.py b/egs/libriheavy/TTS/local/prepare_manifest.py index cf32a3396..3d0321cc7 100755 --- a/egs/libriheavy/TTS/local/prepare_manifest.py +++ b/egs/libriheavy/TTS/local/prepare_manifest.py @@ -57,21 +57,34 @@ class TextNormalizer: return cut -# Assign text of the supervisions and remove unnecessary entries. def main(): assert len(sys.argv) == 3, "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR" fname = Path(sys.argv[1]).name oname = Path(sys.argv[2]) / fname tn = TextNormalizer() + + cuts = set() + if oname.exists(): + with gzip.open(oname, "r") as fin: + for line in tqdm(fin, desc="Loading processed"): + cuts.add(json.loads(line)["id"]) + with ProcessPoolExecutor() as ex: with gzip.open(sys.argv[1], "r") as fin: - cuts = (json.loads(line) for line in fin) - results = ex.map(tn, cuts) + futures = [] + for line in tqdm(fin, desc="Distributing"): + parsed_line = json.loads(line) + if parsed_line["id"] not in cuts: + futures.append(ex.submit(tn, parsed_line)) - with gzip.open(oname, "w") as fout: - for cut in tqdm(results, desc="Processing"): - fout.write((json.dumps(cut) + "\n").encode()) + with gzip.open(oname, "a") as fout: + for future in tqdm(futures, desc="Processing"): + try: + result = future.result() + fout.write((json.dumps(result) + "\n").encode()) + except Exception as e: + print(f"Caught exception:\n{e}\n") if __name__ == "__main__": diff --git a/egs/libriheavy/TTS/prepare.sh b/egs/libriheavy/TTS/prepare.sh index 87556c7c5..48820ea35 100755 --- a/egs/libriheavy/TTS/prepare.sh +++ b/egs/libriheavy/TTS/prepare.sh @@ -149,7 +149,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then --batch_size 32 \ --model "speech_tokenizer_v1" cat $output_dir/part* | gzip > $output_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir - touch $output_dir/..extract_completed + touch $output_dir/.extract_completed fi done fi