mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-13 03:52:18 +00:00
support resuming
This commit is contained in:
parent
47e1147782
commit
77560cd5e8
@ -57,21 +57,34 @@ class TextNormalizer:
|
|||||||
return cut
|
return cut
|
||||||
|
|
||||||
|
|
||||||
# Assign text of the supervisions and remove unnecessary entries.
|
|
||||||
def main():
|
def main():
|
||||||
assert len(sys.argv) == 3, "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR"
|
assert len(sys.argv) == 3, "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR"
|
||||||
fname = Path(sys.argv[1]).name
|
fname = Path(sys.argv[1]).name
|
||||||
oname = Path(sys.argv[2]) / fname
|
oname = Path(sys.argv[2]) / fname
|
||||||
|
|
||||||
tn = TextNormalizer()
|
tn = TextNormalizer()
|
||||||
|
|
||||||
|
cuts = set()
|
||||||
|
if oname.exists():
|
||||||
|
with gzip.open(oname, "r") as fin:
|
||||||
|
for line in tqdm(fin, desc="Loading processed"):
|
||||||
|
cuts.add(json.loads(line)["id"])
|
||||||
|
|
||||||
with ProcessPoolExecutor() as ex:
|
with ProcessPoolExecutor() as ex:
|
||||||
with gzip.open(sys.argv[1], "r") as fin:
|
with gzip.open(sys.argv[1], "r") as fin:
|
||||||
cuts = (json.loads(line) for line in fin)
|
futures = []
|
||||||
results = ex.map(tn, cuts)
|
for line in tqdm(fin, desc="Distributing"):
|
||||||
|
parsed_line = json.loads(line)
|
||||||
|
if parsed_line["id"] not in cuts:
|
||||||
|
futures.append(ex.submit(tn, parsed_line))
|
||||||
|
|
||||||
with gzip.open(oname, "w") as fout:
|
with gzip.open(oname, "a") as fout:
|
||||||
for cut in tqdm(results, desc="Processing"):
|
for future in tqdm(futures, desc="Processing"):
|
||||||
fout.write((json.dumps(cut) + "\n").encode())
|
try:
|
||||||
|
result = future.result()
|
||||||
|
fout.write((json.dumps(result) + "\n").encode())
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Caught exception:\n{e}\n")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -149,7 +149,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
|||||||
--batch_size 32 \
|
--batch_size 32 \
|
||||||
--model "speech_tokenizer_v1"
|
--model "speech_tokenizer_v1"
|
||||||
cat $output_dir/part* | gzip > $output_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir
|
cat $output_dir/part* | gzip > $output_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir
|
||||||
touch $output_dir/..extract_completed
|
touch $output_dir/.extract_completed
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
Loading…
x
Reference in New Issue
Block a user