mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-07 08:04:18 +00:00
Feature extraction code for GigaSpeech.
This commit is contained in:
parent
0cc13bc702
commit
4e05213f87
1
egs/librispeech/ASR/.gitignore
vendored
Normal file
1
egs/librispeech/ASR/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
log-*
|
@ -0,0 +1,92 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
|
||||||
|
# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from lhotse import (
|
||||||
|
CutSet,
|
||||||
|
KaldifeatFbank,
|
||||||
|
KaldifeatFbankConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Torch's multithreaded behavior needs to be disabled or
|
||||||
|
# it wastes a lot of CPU and slows things down.
|
||||||
|
# Do this outside of main() in case it needs to take effect
|
||||||
|
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
||||||
|
torch.set_num_threads(1)
|
||||||
|
torch.set_num_interop_threads(1)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_fbank_gigaspeech_dev_test():
    """Extract features for the GigaSpeech DEV and TEST cut manifests.

    For each subset, ``data/fbank/cuts_<subset>_raw.jsonl.gz`` is loaded,
    features are computed with a ``KaldifeatFbank`` extractor and stored
    under ``data/fbank/feats_<subset>``, the cuts are trimmed to their
    supervisions, and the result is written to
    ``data/fbank/cuts_<subset>.jsonl.gz``.

    A subset whose output manifest already exists is skipped, so the
    function can be re-run after an interruption.
    """
    fbank_dir = Path("data/fbank")
    dataloader_workers = 20  # number of workers in dataloader
    seconds_per_batch = 600  # number of seconds in a batch

    # Use the first GPU when one is available, otherwise fall back to CPU.
    device = (
        torch.device("cuda", 0)
        if torch.cuda.is_available()
        else torch.device("cpu")
    )
    fbank_config = KaldifeatFbankConfig(device=device)
    extractor = KaldifeatFbank(fbank_config)

    logging.info(f"device: {device}")

    for subset in ("DEV", "TEST"):
        cuts_path = fbank_dir / f"cuts_{subset}.jsonl.gz"
        if cuts_path.is_file():
            logging.info(f"{cuts_path} exists - skipping")
            continue

        raw_cuts_path = fbank_dir / f"cuts_{subset}_raw.jsonl.gz"

        logging.info(f"Loading {raw_cuts_path}")
        raw_cuts = CutSet.from_file(raw_cuts_path)

        logging.info("Computing features")

        cuts_with_feats = raw_cuts.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=f"{fbank_dir}/feats_{subset}",
            num_workers=dataloader_workers,
            batch_duration=seconds_per_batch,
        )
        trimmed_cuts = cuts_with_feats.trim_to_supervisions(
            keep_overlapping=False,
            min_duration=None,
        )

        logging.info(f"Saving to {cuts_path}")
        trimmed_cuts.to_file(cuts_path)
        logging.info(f"Saved to {cuts_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Set up INFO-level logging and run the DEV/TEST feature extraction."""
    log_format = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(level=logging.INFO, format=log_format)

    compute_fbank_gigaspeech_dev_test()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
168
egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py
Normal file
168
egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
|
||||||
|
# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from lhotse import (
|
||||||
|
CutSet,
|
||||||
|
KaldifeatFbank,
|
||||||
|
KaldifeatFbankConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Torch's multithreaded behavior needs to be disabled or
|
||||||
|
# it wastes a lot of CPU and slows things down.
|
||||||
|
# Do this outside of main() in case it needs to take effect
|
||||||
|
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
||||||
|
torch.set_num_threads(1)
|
||||||
|
torch.set_num_interop_threads(1)
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser():
    """Build the command-line parser for the XL feature-extraction script.

    Options:
      --num-workers     int, default 20.
      --batch-duration  float, default 600.0.
      --num-splits      int, required.
      --start           int, default 0 (inclusive).
      --stop            int, default -1 (exclusive).

    Returns:
      The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--num-workers",
        type=int,
        default=20,
        help="Number of dataloading workers used for reading the audio.",
    )

    parser.add_argument(
        "--batch-duration",
        type=float,
        default=600.0,
        # Note the trailing space: the two literals are concatenated.
        help="The maximum number of audio seconds in a batch. "
        "Determines batch size dynamically.",
    )

    parser.add_argument(
        "--num-splits",
        type=int,
        required=True,
        help="The number of splits of the XL subset",
    )

    parser.add_argument(
        "--start",
        type=int,
        default=0,
        help="Process pieces starting from this number (inclusive).",
    )

    parser.add_argument(
        "--stop",
        type=int,
        default=-1,
        help="Process pieces up to this number (exclusive). "
        "If it is smaller than --start, all remaining pieces are processed.",
    )

    return parser
|
||||||
|
|
||||||
|
|
||||||
|
def compute_fbank_gigaspeech_splits(args):
    """Extract features for a range of pieces of the split XL subset.

    Pieces live in ``data/fbank/XL_split_<num_splits>``. For every index
    ``idx`` in ``[start, stop)``, ``cuts_XL_raw.<idx>.jsonl.gz`` is loaded,
    features are computed with a ``KaldifeatFbank`` extractor and stored
    under ``feats_XL_<idx>``, the cuts are trimmed to their supervisions,
    and the result is saved as ``cuts_XL.<idx>.jsonl.gz``.

    A piece is skipped when its output manifest already exists or its raw
    manifest is missing, so several invocations with different
    ``--start``/``--stop`` ranges can work on the same directory.

    Args:
      args:
        Parsed command-line arguments providing ``num_splits``, ``start``,
        ``stop``, ``num_workers`` and ``batch_duration``.

    Raises:
      AssertionError: If the split directory does not exist.
    """
    num_splits = args.num_splits
    output_dir = Path(f"data/fbank/XL_split_{num_splits}")
    assert output_dir.exists(), f"{output_dir} does not exist!"

    start = args.start
    stop = args.stop
    # A stop smaller than start (e.g. the default -1) means "up to the end".
    if stop < start:
        stop = num_splits

    # Never go past the declared number of splits.
    stop = min(stop, num_splits)

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
    logging.info(f"device: {device}")

    for idx in range(start, stop):
        logging.info(f"Processing {idx}/{num_splits}")

        cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz"
        if cuts_path.is_file():
            logging.info(f"{cuts_path} exists - skipping")
            continue

        raw_cuts_path = output_dir / f"cuts_XL_raw.{idx}.jsonl.gz"
        if not raw_cuts_path.is_file():
            logging.info(f"{raw_cuts_path} does not exist - skipping it")
            continue

        logging.info(f"Loading {raw_cuts_path}")
        cut_set = CutSet.from_file(raw_cuts_path)

        logging.info("Computing features")

        cut_set = cut_set.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=f"{output_dir}/feats_XL_{idx}",
            num_workers=args.num_workers,
            batch_duration=args.batch_duration,
        )

        logging.info("About to split cuts into smaller chunks.")
        cut_set = cut_set.trim_to_supervisions(
            keep_overlapping=False, min_duration=None
        )

        logging.info(f"Saving to {cuts_path}")
        cut_set.to_file(cuts_path)
        logging.info(f"Saved to {cuts_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Configure file and console logging, parse arguments, and run.

    Log records are written to a file named
    ``log-compute_fbank_gigaspeech_splits-<timestamp>`` in the current
    directory and are also sent to a console stream handler.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    log_format = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    log_path = f"log-compute_fbank_gigaspeech_splits-{timestamp}"

    logging.basicConfig(
        filename=log_path,
        filemode="w",
        format=log_format,
        level=logging.INFO,
    )

    # Send the same records to the console, on the root logger.
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(logging.Formatter(log_format))
    logging.getLogger("").addHandler(stream_handler)

    args = get_parser().parse_args()
    logging.info(vars(args))

    compute_fbank_gigaspeech_splits(args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
@ -101,11 +101,6 @@ def preprocess_giga_speech():
|
|||||||
+ cut_set.perturb_speed(0.9)
|
+ cut_set.perturb_speed(0.9)
|
||||||
+ cut_set.perturb_speed(1.1)
|
+ cut_set.perturb_speed(1.1)
|
||||||
)
|
)
|
||||||
|
|
||||||
logging.info("About to split cuts into smaller chunks.")
|
|
||||||
cut_set = cut_set.trim_to_supervisions(
|
|
||||||
keep_overlapping=False, min_duration=None
|
|
||||||
)
|
|
||||||
logging.info(f"Saving to {raw_cuts_path}")
|
logging.info(f"Saving to {raw_cuts_path}")
|
||||||
cut_set.to_file(raw_cuts_path)
|
cut_set.to_file(raw_cuts_path)
|
||||||
|
|
||||||
|
@ -24,6 +24,15 @@ stop_stage=100
|
|||||||
# DEV 12 hours
|
# DEV 12 hours
|
||||||
# Test 40 hours
|
# Test 40 hours
|
||||||
|
|
||||||
|
# Split XL subset to this number of pieces
|
||||||
|
# This is to avoid OOM during feature extraction.
|
||||||
|
num_splits=2000
|
||||||
|
# We use lazy split from lhotse.
|
||||||
|
# The XL subset contains 113916 cuts after speed perturbing with factors
|
||||||
|
# 0.9 and 1.1. We want to split it into 2000 splits, so each split
|
||||||
|
# contains about 113916 / 2000 = 57 cuts. As a result, there will be 1999 splits.
|
||||||
|
chunk_size=57 # number of cuts in each split. The last split may contain fewer cuts.
|
||||||
|
|
||||||
dl_dir=$PWD/download
|
dl_dir=$PWD/download
|
||||||
|
|
||||||
. shared/parse_options.sh || exit 1
|
. shared/parse_options.sh || exit 1
|
||||||
@ -107,3 +116,34 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
|||||||
touch data/fbank/.preprocess_complete
|
touch data/fbank/.preprocess_complete
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Compute features for DEV and TEST subsets of GigaSpeech (may take 2 minutes)"
  python3 ./local/compute_fbank_gigaspeech_dev_test.py
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Split XL subset into ${num_splits} pieces"
  split_dir="data/fbank/XL_split_${num_splits}"
  # The marker file makes this stage a no-op once the split has finished.
  if [ ! -f "$split_dir/.split_completed" ]; then
    lhotse split-lazy ./data/fbank/cuts_XL_raw.jsonl.gz "$split_dir" "$chunk_size"
    touch "$split_dir/.split_completed"
  fi
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Compute features for XL"
  # Note: The script supports --start and --stop options.
  # You can use several machines to compute the features in parallel.
  python3 ./local/compute_fbank_gigaspeech_splits.py \
    --num-workers "$nj" \
    --batch-duration 600 \
    --num-splits "$num_splits"
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Combine features for XL"
  if [ ! -f data/fbank/cuts_XL.jsonl.gz ]; then
    pieces=$(find "data/fbank/XL_split_${num_splits}" -name "cuts_XL.*.jsonl.gz")
    # $pieces is intentionally unquoted: each path found above must be
    # passed to lhotse as a separate argument.
    lhotse combine $pieces data/fbank/cuts_XL.jsonl.gz
  fi
fi
|
||||||
|
Loading…
x
Reference in New Issue
Block a user