mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 10:02:22 +00:00
Keep the custom fields in libriheavy manifest (#1719)
This commit is contained in:
parent
6ac3343ce5
commit
5952972294
@ -29,17 +29,21 @@ def simple_cleanup(text: str) -> str:
|
||||
|
||||
# Assign text of the supervisions and remove unnecessary entries.
|
||||
def _parse_bool(value: str) -> bool:
    """Interpret a command-line string as a boolean.

    ``bool("False")`` evaluates to ``True`` because every non-empty string is
    truthy, so the flag must be decoded from its text rather than passed
    through ``bool()``.

    Raises:
        ValueError: if ``value`` is not a recognised boolean spelling.
    """
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays falsy, matching what ``bool("")`` returned before.
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False
    raise ValueError(f"Expected a boolean value (e.g. True/False), got {value!r}")


def main():
    """Rewrite a gzipped JSON-lines manifest with cleaned supervision text.

    Command line: ``INPUT OUTPUT_DIR KEEP_CUSTOM_FIELDS``.

    Every line of INPUT is parsed as one JSON cut.  The text of its first
    supervision is set to ``simple_cleanup`` applied to the first entry of
    that supervision's ``custom["texts"]``.  Unless KEEP_CUSTOM_FIELDS is
    true, the ``custom`` entries of the first supervision and of the cut
    itself are removed.  The result is written, one JSON object per line, to
    ``OUTPUT_DIR/<basename of INPUT>``.
    """
    # Explicit check instead of ``assert``: asserts are stripped under
    # ``python -O``, which would turn a usage error into an IndexError.
    if len(sys.argv) != 4:
        sys.exit(
            "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR KEEP_CUSTOM_FIELDS"
        )

    fname = Path(sys.argv[1]).name
    oname = Path(sys.argv[2]) / fname
    # The caller passes the literal strings "True"/"False"; ``bool()`` would
    # treat both as True, so parse the text instead.
    keep_custom_fields = _parse_bool(sys.argv[3])

    with gzip.open(sys.argv[1], "r") as fin, gzip.open(oname, "w") as fout:
        for line in fin:
            cut = json.loads(line)
            cut["supervisions"][0]["text"] = simple_cleanup(
                cut["supervisions"][0]["custom"]["texts"][0]
            )
            if not keep_custom_fields:
                del cut["supervisions"][0]["custom"]
                del cut["custom"]
            fout.write((json.dumps(cut) + "\n").encode())
|
||||
|
||||
|
||||
|
@ -29,6 +29,11 @@ export CUDA_VISIBLE_DEVICES=""
|
||||
# - speech
|
||||
dl_dir=$PWD/download
|
||||
|
||||
# If you want to do PromptASR experiments, please set it to True
|
||||
# as this will keep the texts and pre_text information required for
|
||||
# the training of PromptASR.
|
||||
keep_custom_fields=False
|
||||
|
||||
. shared/parse_options.sh || exit 1
|
||||
|
||||
# vocab size for sentence piece models.
|
||||
@ -134,7 +139,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
for subset in small medium large dev test_clean test_other; do
|
||||
if [ ! -e $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
|
||||
log "Prepare manifest for subset : ${subset}"
|
||||
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir
|
||||
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir $keep_custom_fields
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
Loading…
x
Reference in New Issue
Block a user