diff --git a/egs/ksponspeech/ASR/local/compute_fbank_ksponspeech.py b/egs/ksponspeech/ASR/local/compute_fbank_ksponspeech.py index 7c3cb7931..b186c2296 100755 --- a/egs/ksponspeech/ASR/local/compute_fbank_ksponspeech.py +++ b/egs/ksponspeech/ASR/local/compute_fbank_ksponspeech.py @@ -63,7 +63,7 @@ def get_args(): parser.add_argument( "--data-dir", type=str, - default='data', + default="data", help="""Path of data directory""", ) @@ -74,10 +74,10 @@ def compute_fbank_speechtools( bpe_model: Optional[str] = None, dataset: Optional[str] = None, perturb_speed: Optional[bool] = False, - data_dir: Optional[str] = 'data', + data_dir: Optional[str] = "data", ): src_dir = Path(data_dir) / "manifests" - output_dir = Path(data_dir ) / "fbank" + output_dir = Path(data_dir) / "fbank" num_jobs = min(4, os.cpu_count()) num_mel_bins = 80 @@ -116,11 +116,11 @@ def compute_fbank_speechtools( if torch.cuda.is_available(): # Use cuda for fbank compute - device = 'cuda' + device = "cuda" else: - device = 'cpu' + device = "cpu" logging.info(f"Device: {device}") - + extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins, device=device)) with get_executor() as ex: # Initialize the executor only once. @@ -135,9 +135,11 @@ def compute_fbank_speechtools( recordings=m["recordings"], supervisions=m["supervisions"], ) - + # Filter duration - cut_set = cut_set.filter(lambda x: x.duration > 1 and x.sampling_rate == 16000) + cut_set = cut_set.filter( + lambda x: x.duration > 1 and x.sampling_rate == 16000 + ) if "train" in partition: if bpe_model: @@ -150,7 +152,7 @@ def compute_fbank_speechtools( + cut_set.perturb_speed(1.1) ) logging.info(f"Compute & Store features...") - if device == 'cuda': + if device == "cuda": cut_set = cut_set.compute_and_store_features_batch( extractor=extractor, storage_path=f"{output_dir}/{prefix}_feats_{partition}", diff --git a/egs/ksponspeech/ASR/local/compute_fbank_musan.py b/egs/ksponspeech/ASR/local/compute_fbank_musan.py index 7afe8e00f..c0bdacfe5 100755 --- a/egs/ksponspeech/ASR/local/compute_fbank_musan.py +++ b/egs/ksponspeech/ASR/local/compute_fbank_musan.py @@ -53,8 +53,8 @@ def is_cut_long(c: MonoCut) -> bool: def compute_fbank_musan( src_dir: str = "data/manifests", num_mel_bins: int = 80, - whisper_fbank: bool = False, - output_dir: str = "data/fbank" + whisper_fbank: bool = False, + output_dir: str = "data/fbank", ): src_dir = Path(src_dir) output_dir = Path(output_dir) diff --git a/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/asr_datamodule.py b/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/asr_datamodule.py index 5b61ccdc7..9a5b3fc52 100644 --- a/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/asr_datamodule.py +++ b/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/asr_datamodule.py @@ -399,14 +399,14 @@ class KsponSpeechAsrDataModule: return load_manifest_lazy( self.args.manifest_dir / "ksponspeech_cuts_dev.jsonl.gz" ) - + @lru_cache() def eval_clean_cuts(self) -> CutSet: logging.info("About to get eval_clean cuts") return load_manifest_lazy( self.args.manifest_dir / "ksponspeech_cuts_eval_clean.jsonl.gz" ) - + @lru_cache() def eval_other_cuts(self) -> CutSet: logging.info("About to get eval_other cuts") diff --git a/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/decode.py b/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/decode.py index 496f0f5b0..0f3f1c1ab 100755 --- a/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/decode.py +++ b/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/decode.py @@ -693,7 +693,11 @@ def save_results( errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt" with open(errs_filename, "w") as f: cer = write_error_stats( - f, f"{test_set_name}-{key}", results, enable_log=True, compute_CER=True, + f, + f"{test_set_name}-{key}", + results, + enable_log=True, + compute_CER=True, ) test_set_cers[key] = cer diff --git a/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/streaming_decode.py b/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/streaming_decode.py index ea08656bb..d777b769c 100755 --- a/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/streaming_decode.py +++ b/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/streaming_decode.py @@ -321,7 +321,6 @@ def decode_dataset( num_mel_bins=80, high_freq=-400.0, ) - log_interval = 50 @@ -426,7 +425,11 @@ def save_results( errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt" with open(errs_filename, "w") as f: cer = write_error_stats( - f, f"{test_set_name}-{key}", results, enable_log=True, compute_CER=True, + f, + f"{test_set_name}-{key}", + results, + enable_log=True, + compute_CER=True, ) test_set_cers[key] = cer diff --git a/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/train.py b/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/train.py index 9e8432ff3..bf50bf5ea 100755 --- a/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/train.py +++ b/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/train.py @@ -1031,7 +1031,7 @@ def run(rank, world_size, args): ksponspeech = KsponSpeechAsrDataModule(args) train_cuts = ksponspeech.train_cuts() - + def remove_short_and_long_utt(c: Cut): # Keep only utterances with duration between 1 second and 20 seconds # @@ -1083,7 +1083,7 @@ def run(rank, world_size, args): ) valid_cuts = ksponspeech.dev_cuts() - + # valid_cuts = valid_cuts.filter(remove_short_and_long_utt) valid_dl = ksponspeech.valid_dataloaders(valid_cuts)