support resuming

This commit is contained in:
yfyeung 2024-11-06 05:48:04 -08:00
parent 47e1147782
commit 77560cd5e8
2 changed files with 20 additions and 7 deletions

View File

@ -57,21 +57,34 @@ class TextNormalizer:
return cut
# Assign text of the supervisions and remove unnecessary entries.
def main():
    """Normalize the cuts of one manifest, skipping cuts already written.

    Usage: ``./local/prepare_manifest.py INPUT OUTPUT_DIR``.

    INPUT is read as a gzip-compressed file with one JSON object per line.
    Each object is passed to a ``TextNormalizer`` instance in a process pool
    and the returned object is written, one JSON line each, to
    ``OUTPUT_DIR/<basename of INPUT>``.

    Resuming: if the output file already exists, the ``"id"`` of every line
    in it is collected first; input cuts with one of those ids are not
    resubmitted, and new results are appended to the existing file.

    Raises:
        SystemExit: if the script is not given exactly two arguments.
    """
    if len(sys.argv) != 3:
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        raise SystemExit("Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR")

    in_path = Path(sys.argv[1])
    out_path = Path(sys.argv[2]) / in_path.name

    tn = TextNormalizer()

    # Ids of cuts that a previous run already wrote to the output file.
    processed_ids = set()
    if out_path.exists():
        # NOTE(review): if a previous run was killed mid-write the gzip
        # stream may be truncated and this read may raise EOFError -- confirm
        # whether that case needs to be recovered from.
        with gzip.open(out_path, "r") as fin:
            for line in tqdm(fin, desc="Loading processed"):
                processed_ids.add(json.loads(line)["id"])

    with ProcessPoolExecutor() as ex:
        futures = []
        with gzip.open(in_path, "r") as fin:
            for line in tqdm(fin, desc="Distributing"):
                cut = json.loads(line)
                if cut["id"] not in processed_ids:
                    futures.append(ex.submit(tn, cut))

        # Append mode keeps the results of earlier runs intact.
        with gzip.open(out_path, "a") as fout:
            for future in tqdm(futures, desc="Processing"):
                try:
                    result = future.result()
                except Exception as e:
                    # Best effort: report the failed cut and keep going; it
                    # stays absent from the output, so a rerun retries it.
                    print(f"Caught exception:\n{e}\n")
                    continue
                # Outside the ``try`` so that output I/O errors are not
                # mistaken for per-cut normalization failures.
                fout.write((json.dumps(result) + "\n").encode())
if __name__ == "__main__":

View File

@ -149,7 +149,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
--batch_size 32 \
--model "speech_tokenizer_v1"
cat $output_dir/part* | gzip > $output_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir
touch $output_dir/..extract_completed
touch $output_dir/.extract_completed
fi
done
fi