mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-12 19:42:19 +00:00
support resuming
This commit is contained in:
parent
47e1147782
commit
77560cd5e8
@ -57,21 +57,34 @@ class TextNormalizer:
|
||||
return cut
|
||||
|
||||
|
||||
# Assign text of the supervisions and remove unnecessary entries.
|
||||
def main():
|
||||
assert len(sys.argv) == 3, "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR"
|
||||
fname = Path(sys.argv[1]).name
|
||||
oname = Path(sys.argv[2]) / fname
|
||||
|
||||
tn = TextNormalizer()
|
||||
|
||||
cuts = set()
|
||||
if oname.exists():
|
||||
with gzip.open(oname, "r") as fin:
|
||||
for line in tqdm(fin, desc="Loading processed"):
|
||||
cuts.add(json.loads(line)["id"])
|
||||
|
||||
with ProcessPoolExecutor() as ex:
|
||||
with gzip.open(sys.argv[1], "r") as fin:
|
||||
cuts = (json.loads(line) for line in fin)
|
||||
results = ex.map(tn, cuts)
|
||||
futures = []
|
||||
for line in tqdm(fin, desc="Distributing"):
|
||||
parsed_line = json.loads(line)
|
||||
if parsed_line["id"] not in cuts:
|
||||
futures.append(ex.submit(tn, parsed_line))
|
||||
|
||||
with gzip.open(oname, "w") as fout:
|
||||
for cut in tqdm(results, desc="Processing"):
|
||||
fout.write((json.dumps(cut) + "\n").encode())
|
||||
with gzip.open(oname, "a") as fout:
|
||||
for future in tqdm(futures, desc="Processing"):
|
||||
try:
|
||||
result = future.result()
|
||||
fout.write((json.dumps(result) + "\n").encode())
|
||||
except Exception as e:
|
||||
print(f"Caught exception:\n{e}\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -149,7 +149,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
||||
--batch_size 32 \
|
||||
--model "speech_tokenizer_v1"
|
||||
cat $output_dir/part* | gzip > $output_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir
|
||||
touch $output_dir/..extract_completed
|
||||
touch $output_dir/.extract_completed
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
Loading…
x
Reference in New Issue
Block a user