Keep the custom fields in libriheavy manifest (#1719)

This commit is contained in:
Xiaoyu Yang 2024-08-17 13:24:38 +08:00 committed by GitHub
parent 6ac3343ce5
commit 5952972294
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 13 additions and 4 deletions

View File

@ -29,17 +29,21 @@ def simple_cleanup(text: str) -> str:
# Assign text of the supervisions and remove unnecessary entries.
def main():
    """Rewrite a gzipped JSONL cut manifest with cleaned supervision text.

    Command-line arguments (read from ``sys.argv``):
        INPUT: path to the gzipped JSONL manifest to read.
        OUTPUT_DIR: directory that receives a file with the same name as INPUT.
        KEEP_CUSTOM_FIELDS: boolean flag given as text (e.g. ``True``/``False``).
            When false, the ``custom`` entries of the cut and of its first
            supervision are deleted after the text has been extracted.

    For every cut, the text of the first supervision is set to
    ``simple_cleanup`` applied to the first element of that supervision's
    ``custom["texts"]`` list.

    Raises:
        AssertionError: if the number of command-line arguments is wrong.
        ValueError: if KEEP_CUSTOM_FIELDS is not a recognised boolean string.
    """
    assert (
        len(sys.argv) == 4
    ), "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR KEEP_CUSTOM_FIELDS"
    fname = Path(sys.argv[1]).name
    oname = Path(sys.argv[2]) / fname
    # bool("False") is True because any non-empty string is truthy, so the
    # flag text has to be parsed explicitly rather than passed to bool().
    keep_custom_fields = _parse_bool_flag(sys.argv[3])
    with gzip.open(sys.argv[1], "r") as fin, gzip.open(oname, "w") as fout:
        for line in fin:
            cut = json.loads(line)
            supervision = cut["supervisions"][0]
            supervision["text"] = simple_cleanup(supervision["custom"]["texts"][0])
            if not keep_custom_fields:
                del supervision["custom"]
                del cut["custom"]
            # The output stream is opened in binary mode, hence the encode().
            fout.write((json.dumps(cut) + "\n").encode())


def _parse_bool_flag(value: str) -> bool:
    """Convert a command-line boolean string such as "True"/"False" to a bool.

    Matching is case-insensitive and ignores surrounding whitespace. The empty
    string is treated as false. Any unrecognised value raises ``ValueError``
    instead of silently being treated as true.
    """
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False
    raise ValueError(
        f"Invalid boolean value for KEEP_CUSTOM_FIELDS: {value!r} "
        "(expected True or False)"
    )

View File

@ -29,6 +29,11 @@ export CUDA_VISIBLE_DEVICES=""
# - speech
dl_dir=$PWD/download
# If you want to do PromptASR experiments, please set it to True
# as this will keep the texts and pre_text information required for
# the training of PromptASR.
keep_custom_fields=False
. shared/parse_options.sh || exit 1
# vocab size for sentence piece models.
@ -134,7 +139,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
for subset in small medium large dev test_clean test_other; do
if [ ! -e $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
log "Prepare manifest for subset : ${subset}"
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir $keep_custom_fields
fi
done
fi