mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 18:12:19 +00:00
Keep the custom fields in libriheavy manifest (#1719)
This commit is contained in:
parent
6ac3343ce5
commit
5952972294
@ -29,17 +29,21 @@ def simple_cleanup(text: str) -> str:
|
|||||||
|
|
||||||
# Assign text of the supervisions and remove unnecessary entries.
|
# Assign text of the supervisions and remove unnecessary entries.
|
||||||
def main():
|
def main():
|
||||||
assert len(sys.argv) == 3, "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR"
|
assert (
|
||||||
|
len(sys.argv) == 4
|
||||||
|
), "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR KEEP_CUSTOM_FIELDS"
|
||||||
fname = Path(sys.argv[1]).name
|
fname = Path(sys.argv[1]).name
|
||||||
oname = Path(sys.argv[2]) / fname
|
oname = Path(sys.argv[2]) / fname
|
||||||
|
# bool() of any non-empty string is True, so "False" would enable the flag; parse the text of the argument explicitly instead.
keep_custom_fields = sys.argv[3].strip().lower() in ("true", "1", "yes")
|
||||||
with gzip.open(sys.argv[1], "r") as fin, gzip.open(oname, "w") as fout:
|
with gzip.open(sys.argv[1], "r") as fin, gzip.open(oname, "w") as fout:
|
||||||
for line in fin:
|
for line in fin:
|
||||||
cut = json.loads(line)
|
cut = json.loads(line)
|
||||||
cut["supervisions"][0]["text"] = simple_cleanup(
|
cut["supervisions"][0]["text"] = simple_cleanup(
|
||||||
cut["supervisions"][0]["custom"]["texts"][0]
|
cut["supervisions"][0]["custom"]["texts"][0]
|
||||||
)
|
)
|
||||||
del cut["supervisions"][0]["custom"]
|
if not keep_custom_fields:
|
||||||
del cut["custom"]
|
del cut["supervisions"][0]["custom"]
|
||||||
|
del cut["custom"]
|
||||||
fout.write((json.dumps(cut) + "\n").encode())
|
fout.write((json.dumps(cut) + "\n").encode())
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,6 +29,11 @@ export CUDA_VISIBLE_DEVICES=""
|
|||||||
# - speech
|
# - speech
|
||||||
dl_dir=$PWD/download
|
dl_dir=$PWD/download
|
||||||
|
|
||||||
|
# If you want to do PromptASR experiments, please set it to True
|
||||||
|
# as this will keep the texts and pre_text information required for
|
||||||
|
# the training of PromptASR.
|
||||||
|
keep_custom_fields=False
|
||||||
|
|
||||||
. shared/parse_options.sh || exit 1
|
. shared/parse_options.sh || exit 1
|
||||||
|
|
||||||
# vocab size for sentence piece models.
|
# vocab size for sentence piece models.
|
||||||
@ -134,7 +139,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
|||||||
for subset in small medium large dev test_clean test_other; do
|
for subset in small medium large dev test_clean test_other; do
|
||||||
if [ ! -e $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
|
if [ ! -e $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
|
||||||
log "Prepare manifest for subset : ${subset}"
|
log "Prepare manifest for subset : ${subset}"
|
||||||
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir
|
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir $keep_custom_fields
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
Loading…
x
Reference in New Issue
Block a user