support resuming

This commit is contained in:
yfyeung 2024-11-06 05:48:04 -08:00
parent 47e1147782
commit 77560cd5e8
2 changed files with 20 additions and 7 deletions

View File

@@ -57,21 +57,34 @@ class TextNormalizer:
return cut return cut
# Assign text of the supervisions and remove unnecessary entries.
def main(): def main():
assert len(sys.argv) == 3, "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR" assert len(sys.argv) == 3, "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR"
fname = Path(sys.argv[1]).name fname = Path(sys.argv[1]).name
oname = Path(sys.argv[2]) / fname oname = Path(sys.argv[2]) / fname
tn = TextNormalizer() tn = TextNormalizer()
cuts = set()
if oname.exists():
with gzip.open(oname, "r") as fin:
for line in tqdm(fin, desc="Loading processed"):
cuts.add(json.loads(line)["id"])
with ProcessPoolExecutor() as ex: with ProcessPoolExecutor() as ex:
with gzip.open(sys.argv[1], "r") as fin: with gzip.open(sys.argv[1], "r") as fin:
cuts = (json.loads(line) for line in fin) futures = []
results = ex.map(tn, cuts) for line in tqdm(fin, desc="Distributing"):
parsed_line = json.loads(line)
if parsed_line["id"] not in cuts:
futures.append(ex.submit(tn, parsed_line))
with gzip.open(oname, "w") as fout: with gzip.open(oname, "a") as fout:
for cut in tqdm(results, desc="Processing"): for future in tqdm(futures, desc="Processing"):
fout.write((json.dumps(cut) + "\n").encode()) try:
result = future.result()
fout.write((json.dumps(result) + "\n").encode())
except Exception as e:
print(f"Caught exception:\n{e}\n")
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -149,7 +149,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
--batch_size 32 \ --batch_size 32 \
--model "speech_tokenizer_v1" --model "speech_tokenizer_v1"
cat $output_dir/part* | gzip > $output_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir cat $output_dir/part* | gzip > $output_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir
touch $output_dir/..extract_completed touch $output_dir/.extract_completed
fi fi
done done
fi fi