Use jsonl for cutsets in the librispeech recipe.

Fangjun Kuang 2022-06-05 13:10:06 +08:00
parent 8a3068ead8
commit d93512344b
16 changed files with 107 additions and 54 deletions

View File

@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -98,7 +98,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -98,7 +98,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -98,7 +98,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -17,6 +17,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Usage:
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+./conformer_ctc/train.py \
+  --exp-dir ./conformer_ctc/exp \
+  --world-size 4 \
+  --full-libri 1 \
+  --max-duration 200 \
+  --num-epochs 20
+"""
 
 import argparse
 import logging
 from pathlib import Path
@@ -29,6 +40,7 @@ import torch.multiprocessing as mp
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
 from conformer import Conformer
+from lhotse.cut import Cut
 from lhotse.utils import fix_random_seed
 from torch import Tensor
 from torch.nn.parallel import DistributedDataParallel as DDP
@@ -676,6 +688,18 @@ def run(rank, world_size, args):
     if params.full_libri:
         train_cuts += librispeech.train_clean_360_cuts()
         train_cuts += librispeech.train_other_500_cuts()
 
+    def remove_short_and_long_utt(c: Cut):
+        # Keep only utterances with duration between 1 second and 20 seconds
+        #
+        # Caution: There is a reason to select 20.0 here. Please see
+        # ../local/display_manifest_statistics.py
+        #
+        # You should use ../local/display_manifest_statistics.py to get
+        # an utterance duration distribution for your dataset to select
+        # the threshold
+        return 1.0 <= c.duration <= 20.0
+
+    train_cuts = train_cuts.filter(remove_short_and_long_utt)
+
     train_dl = librispeech.train_dataloaders(train_cuts)
 
     valid_cuts = librispeech.dev_clean_cuts()

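Note: the ../local/display_manifest_statistics.py script referenced in the comment essentially calls Lhotse's CutSet.describe(). A minimal sketch of inspecting the duration distribution yourself, assuming a manifest path that follows this commit's naming scheme:

from lhotse import load_manifest_lazy

# Illustrative path; adjust to the partition you want to inspect.
cuts = load_manifest_lazy("data/fbank/librispeech_cuts_train-clean-100.jsonl.gz")

# describe() prints cut counts, total duration and duration percentiles --
# the statistics behind the 1.0 <= duration <= 20.0 thresholds above.
cuts.describe()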
View File

@@ -28,7 +28,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -56,8 +56,13 @@ def compute_fbank_librispeech():
         "train-clean-360",
         "train-other-500",
     )
+    prefix = "librispeech"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="librispeech", dataset_parts=dataset_parts, output_dir=src_dir
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
@@ -65,7 +70,8 @@ def compute_fbank_librispeech():
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
+            if (output_dir / cuts_filename).is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -81,13 +87,13 @@ def compute_fbank_librispeech():
             )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / cuts_filename)
if __name__ == "__main__":

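Note: CutSet.to_file picks the serialization format from the file extension, so the .jsonl.gz suffix is what actually switches the output from a single JSON array to gzipped JSON Lines. A small sketch of the round trip, using Lhotse's testing helpers and an illustrative path:

from lhotse import CutSet, load_manifest_lazy
from lhotse.testing.dummies import DummyManifest

# Three dummy cuts, just to have something to serialize.
cuts = DummyManifest(CutSet, begin_id=0, end_id=3)

# Writes one cut per gzipped line instead of one big JSON array.
cuts.to_file("cuts_demo.jsonl.gz")

# JSON Lines manifests can be re-read lazily, one line at a time.
for cut in load_manifest_lazy("cuts_demo.jsonl.gz"):
    print(cut.id, cut.duration)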
View File

@@ -28,7 +28,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -52,12 +52,22 @@ def compute_fbank_musan():
         "speech",
         "noise",
     )
+    prefix = "musan"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
 
-    musan_cuts_path = output_dir / "cuts_musan.json.gz"
+    assert len(manifests) == len(dataset_parts), (
+        len(manifests),
+        len(dataset_parts),
+    )
+
+    musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
 
     if musan_cuts_path.is_file():
         logging.info(f"{musan_cuts_path} already exists - skipping")
@@ -79,13 +89,13 @@ def compute_fbank_musan():
             .filter(lambda c: c.duration > 5)
             .compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_musan",
+                storage_path=f"{output_dir}/musan_feats",
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
         )
-    musan_cuts.to_json(musan_cuts_path)
+    musan_cuts.to_file(musan_cuts_path)
if __name__ == "__main__":

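Note: besides the manifest rename, the feature storage backend moves from HDF5 to Lhotse's chunky lilcom format. A hedged sketch of the new writer in isolation; the array shape and output path are made up:

import numpy as np

from lhotse import LilcomChunkyWriter

# Illustrative feature matrix: 100 frames of 80-dim fbank.
feats = np.random.randn(100, 80).astype(np.float32)

# LilcomChunkyWriter appends lossily compressed (lilcom) chunks to a single
# binary file, with no HDF5 dependency; write() returns the storage key that
# a cut's feature manifest would record for later retrieval.
with LilcomChunkyWriter("feats_demo") as writer:
    storage_key = writer.write("utt-0", feats)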
View File

@@ -25,7 +25,7 @@ We will add more checks later if needed.
 Usage example:
 
     python3 ./local/validate_manifest.py \
-        ./data/fbank/cuts_train-clean-100.json.gz
+        ./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
 """
@@ -33,7 +33,7 @@ import argparse
 import logging
 from pathlib import Path
 
-from lhotse import load_manifest, CutSet
+from lhotse import CutSet, load_manifest
 from lhotse.cut import Cut

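Note: the core of validate_manifest.py is loading the cut set and asserting per-cut invariants. A simplified sketch of that pattern; the real script performs additional checks:

from lhotse import CutSet, load_manifest
from lhotse.cut import Cut

def validate_cuts(path: str) -> None:
    cut_set = load_manifest(path)
    assert isinstance(cut_set, CutSet), type(cut_set)
    for cut in cut_set:
        assert isinstance(cut, Cut), type(cut)
        for sup in cut.supervisions:
            # Every supervision must lie inside its cut.
            assert 0 <= sup.start and sup.end <= cut.duration, (
                cut.id,
                sup.start,
                sup.end,
                cut.duration,
            )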
View File

@@ -40,9 +40,9 @@ dl_dir=$PWD/download
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  5000
-  2000
-  1000
+  # 5000
+  # 2000
+  # 1000
   500
 )
@@ -132,7 +132,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   )
   for part in ${parts[@]}; do
     python3 ./local/validate_manifest.py \
-      data/fbank/cuts_${part}.json.gz
+      data/fbank/librispeech_cuts_${part}.jsonl.gz
   done
   touch data/fbank/.librispeech-validated.done
 fi

View File

@@ -807,28 +807,8 @@ def run(rank, world_size, args):
         # the threshold
         return 1.0 <= c.duration <= 20.0
 
-    num_in_total = len(train_cuts)
-
     train_cuts = train_cuts.filter(remove_short_and_long_utt)
 
-    try:
-        num_left = len(train_cuts)
-        num_removed = num_in_total - num_left
-        removed_percent = num_removed / num_in_total * 100
-
-        logging.info(
-            f"Before removing short and long utterances: {num_in_total}"
-        )
-        logging.info(f"After removing short and long utterances: {num_left}")
-        logging.info(
-            f"Removed {num_removed} utterances ({removed_percent:.5f}%)"
-        )
-    except TypeError as e:
-        # You can ignore this error as previous versions of Lhotse work fine
-        # for the above code. In recent versions of Lhotse, it uses
-        # lazy filter, producing cutsets that don't have the __len__ method
-        logging.info(str(e))
-
     if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
         # We only load the sampler's state dict when it loads a checkpoint
         # saved in the middle of an epoch

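Note: the deleted bookkeeping is exactly what breaks on recent Lhotse, where filter() on a jsonl-backed CutSet is lazy and len() raises TypeError. A sketch of the behavior, with an illustrative path:

from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/librispeech_cuts_train-clean-100.jsonl.gz")
cuts = cuts.filter(lambda c: 1.0 <= c.duration <= 20.0)

try:
    num_left = len(cuts)  # fine for eagerly materialized CutSets
except TypeError:
    # A lazily filtered CutSet cannot know its size without a full pass
    # over the manifest, so it does not support len().
    num_left = sum(1 for _ in cuts)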
View File

@@ -225,7 +225,7 @@ class LibriSpeechAsrDataModule:
             logging.info("Enable MUSAN")
             logging.info("About to get Musan cuts")
             cuts_musan = load_manifest(
-                self.args.manifest_dir / "cuts_musan.json.gz"
+                self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
                 CutMix(
@@ -408,39 +408,47 @@ class LibriSpeechAsrDataModule:
     def train_clean_100_cuts(self) -> CutSet:
         logging.info("About to get train-clean-100 cuts")
         return load_manifest(
-            self.args.manifest_dir / "cuts_train-clean-100.json.gz"
+            self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
         )
 
     @lru_cache()
     def train_clean_360_cuts(self) -> CutSet:
         logging.info("About to get train-clean-360 cuts")
         return load_manifest(
-            self.args.manifest_dir / "cuts_train-clean-360.json.gz"
+            self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
         )
 
     @lru_cache()
     def train_other_500_cuts(self) -> CutSet:
         logging.info("About to get train-other-500 cuts")
         return load_manifest(
-            self.args.manifest_dir / "cuts_train-other-500.json.gz"
+            self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
        )
 
     @lru_cache()
     def dev_clean_cuts(self) -> CutSet:
         logging.info("About to get dev-clean cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz")
+        return load_manifest(
+            self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
+        )
 
     @lru_cache()
     def dev_other_cuts(self) -> CutSet:
         logging.info("About to get dev-other cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz")
+        return load_manifest(
+            self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
+        )
 
     @lru_cache()
     def test_clean_cuts(self) -> CutSet:
         logging.info("About to get test-clean cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz")
+        return load_manifest(
+            self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
+        )
 
     @lru_cache()
     def test_other_cuts(self) -> CutSet:
         logging.info("About to get test-other cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz")
+        return load_manifest(
+            self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
+        )

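Note: the renamed musan_cuts.jsonl.gz manifest feeds Lhotse's CutMix transform, shown truncated in the first hunk above; this is roughly how the data module wires it up (probability and SNR values as in the surrounding recipe code):

from lhotse import load_manifest
from lhotse.dataset import CutMix

cuts_musan = load_manifest("data/fbank/musan_cuts.jsonl.gz")

# Mix MUSAN noise into training cuts with probability 0.5, at an SNR
# drawn uniformly from [10, 20] dB.
transform = CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))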
View File

@@ -16,6 +16,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Usage:
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+./tdnn_lstm_ctc/train.py \
+  --world-size 4 \
+  --full-libri 1 \
+  --max-duration 300 \
+  --num-epochs 20
+"""
 
 import argparse
 import logging
@@ -29,6 +38,7 @@ import torch.multiprocessing as mp
 import torch.nn as nn
 import torch.optim as optim
 from asr_datamodule import LibriSpeechAsrDataModule
+from lhotse.cut import Cut
 from lhotse.utils import fix_random_seed
 from model import TdnnLstm
 from torch import Tensor
@@ -544,10 +554,25 @@ def run(rank, world_size, args):
     if params.full_libri:
         train_cuts += librispeech.train_clean_360_cuts()
         train_cuts += librispeech.train_other_500_cuts()
 
+    def remove_short_and_long_utt(c: Cut):
+        # Keep only utterances with duration between 1 second and 20 seconds
+        #
+        # Caution: There is a reason to select 20.0 here. Please see
+        # ../local/display_manifest_statistics.py
+        #
+        # You should use ../local/display_manifest_statistics.py to get
+        # an utterance duration distribution for your dataset to select
+        # the threshold
+        return 1.0 <= c.duration <= 20.0
+
+    train_cuts = train_cuts.filter(remove_short_and_long_utt)
+
     train_dl = librispeech.train_dataloaders(train_cuts)
 
     valid_cuts = librispeech.dev_clean_cuts()
     valid_cuts += librispeech.dev_other_cuts()
     valid_dl = librispeech.valid_dataloaders(valid_cuts)
 
     for epoch in range(params.start_epoch, params.num_epochs):