diff --git a/egs/commonvoice/ASR/local/preprocess_commonvoice.py b/egs/commonvoice/ASR/local/preprocess_commonvoice.py index 4137b2dd1..d41af4015 100755 --- a/egs/commonvoice/ASR/local/preprocess_commonvoice.py +++ b/egs/commonvoice/ASR/local/preprocess_commonvoice.py @@ -170,7 +170,7 @@ def preprocess_commonvoice( The 'validated' partition contains the data of both 'train', 'dev' and 'test' partitions. We filter out the 'dev' and 'test' partition here. - """ + """ ) dev_ids = src_dir / f"cv-{language}_dev_ids" test_ids = src_dir / f"cv-{language}_test_ids" @@ -182,7 +182,9 @@ def preprocess_commonvoice( ), f"{test_ids} does not exist, please check stage 1 of the prepare.sh" dev_ids = dev_ids.read_text().strip().split("\n") test_ids = test_ids.read_text().strip().split("\n") - cut_set = cut_set.filter(lambda x: x.id not in dev_ids + test_ids) + cut_set = cut_set.filter( + lambda x: x.supervisions[0].id not in dev_ids + test_ids + ) # Run data augmentation that needs to be done in the # time domain.