mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-10 10:32:17 +00:00
add librilight ssl recipe
update Update ssl_datamodule.py Update pretrain.py Update pretrain.sh Update pretrain.sh Update hubert_ce.py Update pretrain.py
This commit is contained in:
parent
3b257dd5ae
commit
8e296b7047
88
egs/librilight/SSL/local/analyze_codebook.py
Executable file
88
egs/librilight/SSL/local/analyze_codebook.py
Executable file
@ -0,0 +1,88 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2024 Xiaomi Corp. (authors: Yifan Yang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from lhotse import CutSet
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
# Torch's multithreaded behavior needs to be disabled or
|
||||||
|
# it wastes a lot of CPU and slow things down.
|
||||||
|
# Do this outside of main() in case it needs to take effect
|
||||||
|
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
||||||
|
torch.set_num_threads(1)
|
||||||
|
torch.set_num_interop_threads(1)
|
||||||
|
|
||||||
|
|
||||||
|
def get_args():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--cuts-path",
|
||||||
|
type=str,
|
||||||
|
default="data/kmeans/librispeech_cuts_dev-clean.jsonl.gz",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--num-clusters",
|
||||||
|
type=int,
|
||||||
|
default=500,
|
||||||
|
)
|
||||||
|
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_codebook(args):
|
||||||
|
cuts_path = Path(args.cuts_path)
|
||||||
|
assert cuts_path.is_file(), f"{cuts_path} does not exist"
|
||||||
|
|
||||||
|
logging.info(f"Loading {cuts_path}")
|
||||||
|
cut_set = CutSet.from_file(cuts_path)
|
||||||
|
|
||||||
|
cluster_counts = Counter()
|
||||||
|
logging.info("Analyzing codebook")
|
||||||
|
for cut in tqdm(cut_set):
|
||||||
|
kmeans = map(int, cut.custom["kmeans"].split())
|
||||||
|
cluster_counts.update(kmeans)
|
||||||
|
|
||||||
|
utilized_clusters = len(cluster_counts)
|
||||||
|
|
||||||
|
total_count = sum(cluster_counts.values())
|
||||||
|
counts = torch.tensor([cluster_counts[i] for i in range(args.num_clusters)])
|
||||||
|
normalized_counts = (counts / total_count).clamp(min=1e-10)
|
||||||
|
codebook_entropy = (
|
||||||
|
-(normalized_counts * normalized_counts.log()).sum()
|
||||||
|
* torch.log2(torch.tensor(torch.e))
|
||||||
|
).item()
|
||||||
|
|
||||||
|
logging.info(
|
||||||
|
f"Codebook utilization rate: {utilized_clusters / args.num_clusters:%}"
|
||||||
|
)
|
||||||
|
logging.info(f"Codebook entropy: {codebook_entropy}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||||
|
|
||||||
|
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||||
|
args = get_args()
|
||||||
|
logging.info(vars(args))
|
||||||
|
analyze_codebook(args)
|
251
egs/librilight/SSL/local/extract_kmeans_from_hubert_base.py
Executable file
251
egs/librilight/SSL/local/extract_kmeans_from_hubert_base.py
Executable file
@ -0,0 +1,251 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2024 Xiaomi Corp. (authors: Yifan Yang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import fairseq
|
||||||
|
import joblib
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from lhotse import CutSet, SupervisionSegment
|
||||||
|
from lhotse.utils import fastcopy
|
||||||
|
from silero_vad import get_speech_timestamps, load_silero_vad
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
# Torch's multithreaded behavior needs to be disabled or
|
||||||
|
# it wastes a lot of CPU and slow things down.
|
||||||
|
# Do this outside of main() in case it needs to take effect
|
||||||
|
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
||||||
|
torch.set_num_threads(1)
|
||||||
|
torch.set_num_interop_threads(1)
|
||||||
|
|
||||||
|
|
||||||
|
class ApplyKmeans(object):
|
||||||
|
def __init__(self, km_path):
|
||||||
|
self.km_model = joblib.load(km_path)
|
||||||
|
self.C_np = self.km_model.cluster_centers_.transpose()
|
||||||
|
self.Cnorm_np = (self.C_np**2).sum(0, keepdims=True)
|
||||||
|
|
||||||
|
self.C = torch.from_numpy(self.C_np)
|
||||||
|
self.Cnorm = torch.from_numpy(self.Cnorm_np)
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
self.C = self.C.cuda()
|
||||||
|
self.Cnorm = self.Cnorm.cuda()
|
||||||
|
|
||||||
|
def __call__(self, x):
|
||||||
|
if isinstance(x, torch.Tensor):
|
||||||
|
dist = (
|
||||||
|
x.pow(2).sum(1, keepdim=True) - 2 * torch.matmul(x, self.C) + self.Cnorm
|
||||||
|
)
|
||||||
|
return dist.argmin(dim=1).cpu().numpy()
|
||||||
|
else:
|
||||||
|
dist = (
|
||||||
|
(x**2).sum(1, keepdims=True)
|
||||||
|
- 2 * np.matmul(x, self.C_np)
|
||||||
|
+ self.Cnorm_np
|
||||||
|
)
|
||||||
|
return np.argmin(dist, axis=1)
|
||||||
|
|
||||||
|
|
||||||
|
def get_args():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--subset",
|
||||||
|
type=str,
|
||||||
|
default="small",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--model-path",
|
||||||
|
type=str,
|
||||||
|
default="download/hubert_base_ls960.pt",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--kmeans-model-path",
|
||||||
|
type=str,
|
||||||
|
default="download/hubert_base_ls960_L9_km500.model",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--start",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="Process pieces starting from this number (inclusive).",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--stop",
|
||||||
|
type=int,
|
||||||
|
default=-1,
|
||||||
|
help="Stop processing pieces until this number (exclusive).",
|
||||||
|
)
|
||||||
|
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_and_save_one_cuts(
|
||||||
|
raw_cuts_path, cuts_path, model, vad_model, apply_kmeans, do_normalize, device
|
||||||
|
):
|
||||||
|
logging.info(f"Loading {raw_cuts_path}")
|
||||||
|
cut_set = CutSet.from_file(raw_cuts_path)
|
||||||
|
|
||||||
|
logging.info("Extracting kmeans")
|
||||||
|
cuts = []
|
||||||
|
for cut in tqdm(cut_set):
|
||||||
|
assert cut.sampling_rate == 16000, f"{cut.sampling_rate}"
|
||||||
|
audio = cut.load_audio()
|
||||||
|
|
||||||
|
if audio.shape[-1] > 64 * 16000:
|
||||||
|
timestamps = get_speech_timestamps(audio, vad_model)
|
||||||
|
offsets = [i["start"] for i in timestamps]
|
||||||
|
audios = [audio[:, i["start"] : i["end"]] for i in timestamps]
|
||||||
|
logging.info(f"Trim audio {cut.id} into {len(audios)} segments")
|
||||||
|
else:
|
||||||
|
offsets = [0]
|
||||||
|
audios = [audio]
|
||||||
|
|
||||||
|
seq = 0
|
||||||
|
for audio, offset in zip(audios, offsets):
|
||||||
|
x = torch.from_numpy(audio).float().to(device)
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
if do_normalize:
|
||||||
|
x = torch.nn.functional.layer_norm(x, x.shape)
|
||||||
|
|
||||||
|
feature, _ = model.extract_features(
|
||||||
|
source=x,
|
||||||
|
padding_mask=None,
|
||||||
|
mask=False,
|
||||||
|
output_layer=9,
|
||||||
|
)
|
||||||
|
feature = feature.squeeze(0)
|
||||||
|
|
||||||
|
kmeans = " ".join(map(str, apply_kmeans(feature).tolist()))
|
||||||
|
|
||||||
|
supervision_segment = fastcopy(
|
||||||
|
cut.supervisions[0],
|
||||||
|
id=f"{cut.id}-{seq}",
|
||||||
|
start=0.0,
|
||||||
|
duration=audio.shape[-1] / 16000,
|
||||||
|
)
|
||||||
|
cut_with_kmeans = fastcopy(
|
||||||
|
cut,
|
||||||
|
id=f"{cut.id}-{seq}",
|
||||||
|
start=cut.start + offset / 16000,
|
||||||
|
duration=audio.shape[-1] / 16000,
|
||||||
|
supervisions=[supervision_segment],
|
||||||
|
custom={"kmeans": kmeans},
|
||||||
|
)
|
||||||
|
cuts.append(cut_with_kmeans)
|
||||||
|
|
||||||
|
seq += 1
|
||||||
|
|
||||||
|
cuts = CutSet(cuts)
|
||||||
|
|
||||||
|
logging.info(f"Saving to {cuts_path}")
|
||||||
|
cuts.to_file(cuts_path)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_kmeans(args):
|
||||||
|
assert args.subset in ("small", "medium", "large"), f"{args.subset}"
|
||||||
|
|
||||||
|
output_dir = (
|
||||||
|
f"data/kmeans/{args.subset}_split" if args.subset != "small" else "data/kmeans"
|
||||||
|
)
|
||||||
|
output_dir = Path(output_dir)
|
||||||
|
assert output_dir.exists(), f"{output_dir} does not exist!"
|
||||||
|
|
||||||
|
device = torch.device("cpu")
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
device = torch.device("cuda", 0)
|
||||||
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
|
prefix = "librilight"
|
||||||
|
|
||||||
|
vad_model = load_silero_vad()
|
||||||
|
apply_kmeans = ApplyKmeans(args.kmeans_model_path)
|
||||||
|
model, _, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
|
||||||
|
[args.model_path]
|
||||||
|
)
|
||||||
|
model = model[0].eval().to(device)
|
||||||
|
do_normalize = task.cfg.normalize
|
||||||
|
|
||||||
|
if args.subset == "small":
|
||||||
|
cuts_path = output_dir / f"{prefix}_cuts_{args.subset}.jsonl.gz"
|
||||||
|
if cuts_path.is_file():
|
||||||
|
logging.info(f"{cuts_path} exists - skipping")
|
||||||
|
return
|
||||||
|
|
||||||
|
raw_cuts_path = output_dir / f"{prefix}_cuts_{args.subset}_raw.jsonl.gz"
|
||||||
|
if not raw_cuts_path.is_file():
|
||||||
|
logging.info(f"{raw_cuts_path} does not exist - skipping it")
|
||||||
|
return
|
||||||
|
|
||||||
|
extract_and_save_one_cuts(
|
||||||
|
raw_cuts_path,
|
||||||
|
cuts_path,
|
||||||
|
model,
|
||||||
|
vad_model,
|
||||||
|
apply_kmeans,
|
||||||
|
do_normalize,
|
||||||
|
device,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
num_digits = 8 # num_digits is fixed by lhotse split-lazy
|
||||||
|
start = args.start
|
||||||
|
stop = args.stop
|
||||||
|
assert stop > start, "stop must be larger than start!"
|
||||||
|
|
||||||
|
for i in range(start, stop):
|
||||||
|
idx = f"{i}".zfill(num_digits)
|
||||||
|
logging.info(f"Processing {idx}/{stop - 1}")
|
||||||
|
|
||||||
|
cuts_path = output_dir / f"{prefix}_cuts_{args.subset}.{idx}.jsonl.gz"
|
||||||
|
if cuts_path.is_file():
|
||||||
|
logging.info(f"{cuts_path} exists - skipping")
|
||||||
|
continue
|
||||||
|
|
||||||
|
raw_cuts_path = (
|
||||||
|
output_dir / f"{prefix}_cuts_{args.subset}_raw.{idx}.jsonl.gz"
|
||||||
|
)
|
||||||
|
if not raw_cuts_path.is_file():
|
||||||
|
logging.info(f"{raw_cuts_path} does not exist - skipping it")
|
||||||
|
continue
|
||||||
|
|
||||||
|
extract_and_save_one_cuts(
|
||||||
|
raw_cuts_path,
|
||||||
|
cuts_path,
|
||||||
|
model,
|
||||||
|
vad_model,
|
||||||
|
apply_kmeans,
|
||||||
|
do_normalize,
|
||||||
|
device,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||||
|
|
||||||
|
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||||
|
args = get_args()
|
||||||
|
logging.info(vars(args))
|
||||||
|
extract_kmeans(args)
|
107
egs/librilight/SSL/local/preprocess_librilight.py
Executable file
107
egs/librilight/SSL/local/preprocess_librilight.py
Executable file
@ -0,0 +1,107 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2024 Xiaomi Corp. (authors: Yifan Yang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from lhotse import CutSet
|
||||||
|
from lhotse.recipes.utils import read_manifests_if_cached
|
||||||
|
|
||||||
|
from icefall.utils import str2bool
|
||||||
|
|
||||||
|
# Torch's multithreaded behavior needs to be disabled or
|
||||||
|
# it wastes a lot of CPU and slow things down.
|
||||||
|
# Do this outside of main() in case it needs to take effect
|
||||||
|
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
||||||
|
torch.set_num_threads(1)
|
||||||
|
torch.set_num_interop_threads(1)
|
||||||
|
|
||||||
|
|
||||||
|
def get_args():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--dataset",
|
||||||
|
type=str,
|
||||||
|
help="""Dataset parts to compute fbank. If None, we will use all""",
|
||||||
|
)
|
||||||
|
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_librilight(
|
||||||
|
dataset: Optional[str] = None,
|
||||||
|
):
|
||||||
|
src_dir = Path("data/manifests")
|
||||||
|
output_dir = Path("data/kmeans")
|
||||||
|
|
||||||
|
if dataset is None:
|
||||||
|
dataset_parts = (
|
||||||
|
"small",
|
||||||
|
"medium",
|
||||||
|
"large",
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
dataset_parts = dataset.split(" ", -1)
|
||||||
|
|
||||||
|
prefix = "librilight"
|
||||||
|
suffix = "jsonl.gz"
|
||||||
|
manifests = read_manifests_if_cached(
|
||||||
|
dataset_parts=dataset_parts,
|
||||||
|
output_dir=src_dir,
|
||||||
|
prefix=prefix,
|
||||||
|
suffix=suffix,
|
||||||
|
)
|
||||||
|
assert manifests is not None
|
||||||
|
|
||||||
|
assert len(manifests) == len(dataset_parts), (
|
||||||
|
len(manifests),
|
||||||
|
len(dataset_parts),
|
||||||
|
list(manifests.keys()),
|
||||||
|
dataset_parts,
|
||||||
|
)
|
||||||
|
|
||||||
|
for partition, m in manifests.items():
|
||||||
|
cuts_filename = f"{prefix}_cuts_{partition}_raw.{suffix}"
|
||||||
|
if (output_dir / cuts_filename).is_file():
|
||||||
|
logging.info(f"{partition} already exists - skipping.")
|
||||||
|
continue
|
||||||
|
logging.info(f"Processing {partition}")
|
||||||
|
cut_set = CutSet.from_manifests(
|
||||||
|
recordings=m["recordings"],
|
||||||
|
supervisions=m["supervisions"],
|
||||||
|
)
|
||||||
|
cut_set = cut_set.trim_to_supervisions(
|
||||||
|
keep_overlapping=False, min_duration=None
|
||||||
|
)
|
||||||
|
logging.info(f"Saving to {output_dir / cuts_filename}")
|
||||||
|
cut_set.to_file(output_dir / cuts_filename)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||||
|
|
||||||
|
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||||
|
args = get_args()
|
||||||
|
logging.info(vars(args))
|
||||||
|
preprocess_librilight(
|
||||||
|
dataset=args.dataset,
|
||||||
|
)
|
87
egs/librilight/SSL/prepare.sh
Executable file
87
egs/librilight/SSL/prepare.sh
Executable file
@ -0,0 +1,87 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
|
||||||
|
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
|
||||||
|
|
||||||
|
set -eou pipefail
|
||||||
|
|
||||||
|
nj=15
|
||||||
|
# run step 0 to step 5 by default
|
||||||
|
stage=0
|
||||||
|
stop_stage=5
|
||||||
|
|
||||||
|
# We assume dl_dir (download dir) contains the following
|
||||||
|
# directories and files. If not, they will be downloaded
|
||||||
|
# by this script automatically.
|
||||||
|
#
|
||||||
|
# - $dl_dir/LibriLight
|
||||||
|
# - small
|
||||||
|
# - medium
|
||||||
|
# - large
|
||||||
|
#
|
||||||
|
# You can download them from
|
||||||
|
# - https://dl.fbaipublicfiles.com/librilight/data/small.tar
|
||||||
|
# - https://dl.fbaipublicfiles.com/librilight/data/medium.tar
|
||||||
|
# - https://dl.fbaipublicfiles.com/librilight/data/large.tar
|
||||||
|
|
||||||
|
dl_dir=$PWD/download
|
||||||
|
|
||||||
|
. shared/parse_options.sh || exit 1
|
||||||
|
|
||||||
|
# All files generated by this script are saved in "data".
|
||||||
|
# You can safely remove "data" and rerun this script to regenerate it.
|
||||||
|
mkdir -p data
|
||||||
|
|
||||||
|
log() {
|
||||||
|
# This function is from espnet
|
||||||
|
local fname=${BASH_SOURCE[1]##*/}
|
||||||
|
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||||
|
}
|
||||||
|
|
||||||
|
log "Running prepare.sh"
|
||||||
|
|
||||||
|
log "dl_dir: $dl_dir"
|
||||||
|
|
||||||
|
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
||||||
|
log "Stage 1: Prepare Libri-Light manifest"
|
||||||
|
# We assume that you have downloaded the Libri-Light corpus
|
||||||
|
# to $dl_dir/LibriLight
|
||||||
|
mkdir -p data/manifests
|
||||||
|
if [ ! -e data/manifests/.librilight.done ]; then
|
||||||
|
lhotse prepare librilight -j $nj $dl_dir/LibriLight data/manifests
|
||||||
|
touch data/manifests/.librilight.done
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||||
|
log "Stage 2: Preprocess Libri-Light manifest"
|
||||||
|
mkdir -p data/kmeans
|
||||||
|
if [ ! -f data/kmeans/.preprocess_complete ]; then
|
||||||
|
python3 ./local/preprocess_librilight.py
|
||||||
|
touch data/fbank/.preprocess_complete
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||||
|
log "Stage 3: Split medium and large subset into pieces"
|
||||||
|
num_per_split=200000
|
||||||
|
split_dir=data/kmeans/medium_split
|
||||||
|
if [ ! -f $split_dir/.split_completed ]; then
|
||||||
|
lhotse split-lazy ./data/kmeans/librilight_cuts_medium_raw.jsonl.gz $split_dir $num_per_split
|
||||||
|
touch $split_dir/.split_completed
|
||||||
|
fi
|
||||||
|
split_dir=data/kmeans/large_split
|
||||||
|
if [ ! -f $split_dir/.split_completed ]; then
|
||||||
|
lhotse split-lazy ./data/kmeans/librilight_cuts_large_raw.jsonl.gz $split_dir $num_per_split
|
||||||
|
touch $split_dir/.split_completed
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
||||||
|
log "Stage 4: Extract SSL target for librilight"
|
||||||
|
mkdir -p data/fbank
|
||||||
|
if [ ! -e data/fbank/.librispeech.done ]; then
|
||||||
|
./local/compute_fbank_librispeech.py
|
||||||
|
touch data/fbank/.librispeech.done
|
||||||
|
fi
|
||||||
|
fi
|
22
egs/librilight/SSL/pretrain.sh
Executable file
22
egs/librilight/SSL/pretrain.sh
Executable file
@ -0,0 +1,22 @@
|
|||||||
|
export PYTHONPATH=$(pwd)/../../..
|
||||||
|
|
||||||
|
./zipformer/pretrain.py \
|
||||||
|
--world-size 8 \
|
||||||
|
--num-epochs 30 \
|
||||||
|
--start-epoch 1 \
|
||||||
|
--use-fp16 1 \
|
||||||
|
--exp-dir zipformer/exp_pretrain \
|
||||||
|
--max-duration 650 \
|
||||||
|
--quadratic-duration 512 \
|
||||||
|
--accum-grad 1 \
|
||||||
|
--do-normalize 1 \
|
||||||
|
--mask-prob 0.8 \
|
||||||
|
--extractor-mode "layer_norm" \
|
||||||
|
--dropout-input 0.0 \
|
||||||
|
--dropout-features 0.0 \
|
||||||
|
--feature-grad-mult 1.0 \
|
||||||
|
--num-encoder-layers 2,2,3,4,3,2 \
|
||||||
|
--feedforward-dim 512,768,1024,1536,1024,768 \
|
||||||
|
--encoder-dim 192,256,448,768,448,192 \
|
||||||
|
--encoder-unmasked-dim 192,192,256,256,256,192 \
|
||||||
|
--base-lr 0.045
|
1
egs/librilight/SSL/shared
Symbolic link
1
egs/librilight/SSL/shared
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../icefall/shared/
|
2
egs/librilight/SSL/zipformer/decode.py
Normal file → Executable file
2
egs/librilight/SSL/zipformer/decode.py
Normal file → Executable file
@ -1015,8 +1015,6 @@ def main():
|
|||||||
|
|
||||||
test_sets = ["dev-clean", "dev-other", "test-clean", "test-other"]
|
test_sets = ["dev-clean", "dev-other", "test-clean", "test-other"]
|
||||||
test_dl = [dev_clean_dl, dev_other_dl, test_clean_dl, test_other_dl]
|
test_dl = [dev_clean_dl, dev_other_dl, test_clean_dl, test_other_dl]
|
||||||
# test_sets = ["dev-clean", "dev-other"]
|
|
||||||
# test_dl = [dev_clean_dl, dev_other_dl]
|
|
||||||
|
|
||||||
for test_set, test_dl in zip(test_sets, test_dl):
|
for test_set, test_dl in zip(test_sets, test_dl):
|
||||||
results_dict = decode_dataset(
|
results_dict = decode_dataset(
|
||||||
|
19
egs/librilight/SSL/zipformer/finetune.py
Normal file → Executable file
19
egs/librilight/SSL/zipformer/finetune.py
Normal file → Executable file
@ -1,11 +1,10 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# Copyright 2021-2024 Xiaomi Corp. (authors: Fangjun Kuang,
|
# Copyright 2021-2024 Xiaomi Corp. (authors: Fangjun Kuang,
|
||||||
# Wei Kang,
|
# Wei Kang,
|
||||||
# Mingshuang Luo,
|
# Mingshuang Luo,
|
||||||
# Zengwei Yao,
|
# Zengwei Yao,
|
||||||
# Yifan Yang,
|
# Yifan Yang,
|
||||||
# Daniel Povey)
|
# Daniel Povey)
|
||||||
#
|
|
||||||
# Copyright 2024 Shanghai Jiao Tong University (authors: Jianheng Zhuo)
|
# Copyright 2024 Shanghai Jiao Tong University (authors: Jianheng Zhuo)
|
||||||
#
|
#
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
@ -1246,7 +1245,7 @@ def train_one_epoch(
|
|||||||
tb_writer, "train/valid_", params.batch_idx_train
|
tb_writer, "train/valid_", params.batch_idx_train
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_idx % params.accum_grad != params.accum_grad - 1:
|
if sub_batch_idx % params.accum_grad != params.accum_grad - 1:
|
||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
loss_value = tot_loss["loss"] / tot_loss["frames"]
|
loss_value = tot_loss["loss"] / tot_loss["frames"]
|
||||||
params.train_loss = loss_value
|
params.train_loss = loss_value
|
||||||
@ -1388,6 +1387,8 @@ def run(rank, world_size, args):
|
|||||||
train_cuts,
|
train_cuts,
|
||||||
do_normalize=params.do_normalize,
|
do_normalize=params.do_normalize,
|
||||||
sampler_state_dict=sampler_state_dict,
|
sampler_state_dict=sampler_state_dict,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
valid_cuts = librispeech.dev_clean_cuts()
|
valid_cuts = librispeech.dev_clean_cuts()
|
||||||
@ -1396,6 +1397,8 @@ def run(rank, world_size, args):
|
|||||||
valid_dl = librispeech.valid_dataloaders(
|
valid_dl = librispeech.valid_dataloaders(
|
||||||
valid_cuts,
|
valid_cuts,
|
||||||
do_normalize=params.do_normalize,
|
do_normalize=params.do_normalize,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
if params.sanity_check and not params.print_diagnostics:
|
if params.sanity_check and not params.print_diagnostics:
|
||||||
|
71
egs/librilight/SSL/zipformer/pretrain.py
Normal file → Executable file
71
egs/librilight/SSL/zipformer/pretrain.py
Normal file → Executable file
@ -1,11 +1,10 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# Copyright 2021-2024 Xiaomi Corp. (authors: Fangjun Kuang,
|
# Copyright 2021-2024 Xiaomi Corp. (authors: Fangjun Kuang,
|
||||||
# Wei Kang,
|
# Wei Kang,
|
||||||
# Mingshuang Luo,
|
# Mingshuang Luo,
|
||||||
# Zengwei Yao,
|
# Zengwei Yao,
|
||||||
# Yifan Yang,
|
# Yifan Yang,
|
||||||
# Daniel Povey)
|
# Daniel Povey)
|
||||||
#
|
|
||||||
# Copyright 2024 Shanghai Jiao Tong University (authors: Jianheng Zhuo)
|
# Copyright 2024 Shanghai Jiao Tong University (authors: Jianheng Zhuo)
|
||||||
#
|
#
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
@ -32,7 +31,8 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
|
|||||||
--num-epochs 400 \
|
--num-epochs 400 \
|
||||||
--start-epoch 1 \
|
--start-epoch 1 \
|
||||||
--use-fp16 1 \
|
--use-fp16 1 \
|
||||||
--exp-dir zipformer/exp \
|
--exp-dir hubert/exp \
|
||||||
|
--full-libri 1 \
|
||||||
--max-duration 87.5 \
|
--max-duration 87.5 \
|
||||||
--accum-grad 4
|
--accum-grad 4
|
||||||
"""
|
"""
|
||||||
@ -46,6 +46,7 @@ from pathlib import Path
|
|||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
from typing import Any, Dict, Optional, Tuple, Union
|
from typing import Any, Dict, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import lhotse
|
||||||
import optim
|
import optim
|
||||||
import torch
|
import torch
|
||||||
import torch.multiprocessing as mp
|
import torch.multiprocessing as mp
|
||||||
@ -398,13 +399,6 @@ def add_model_arguments(parser: argparse.ArgumentParser):
|
|||||||
and the value should be the multiple of 4, for faster computation""",
|
and the value should be the multiple of 4, for faster computation""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--untie-final-proj",
|
|
||||||
type=bool,
|
|
||||||
default=False,
|
|
||||||
help="use separate projection for each target",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
@ -483,7 +477,7 @@ def get_parser():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--lr-epochs",
|
"--lr-epochs",
|
||||||
type=float,
|
type=float,
|
||||||
default=10.5,
|
default=0.2,
|
||||||
help="""Number of epochs that affects how rapidly the learning rate decreases.
|
help="""Number of epochs that affects how rapidly the learning rate decreases.
|
||||||
""",
|
""",
|
||||||
)
|
)
|
||||||
@ -541,7 +535,7 @@ def get_parser():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--save-every-n",
|
"--save-every-n",
|
||||||
type=int,
|
type=int,
|
||||||
default=100000,
|
default=10000,
|
||||||
help="""Save checkpoint after processing this number of batches"
|
help="""Save checkpoint after processing this number of batches"
|
||||||
periodically. We save checkpoint to exp-dir/ whenever
|
periodically. We save checkpoint to exp-dir/ whenever
|
||||||
params.batch_idx_train % save_every_n == 0. The checkpoint filename
|
params.batch_idx_train % save_every_n == 0. The checkpoint filename
|
||||||
@ -554,7 +548,7 @@ def get_parser():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--keep-last-k",
|
"--keep-last-k",
|
||||||
type=int,
|
type=int,
|
||||||
default=30,
|
default=100000,
|
||||||
help="""Only keep this number of checkpoints on disk.
|
help="""Only keep this number of checkpoints on disk.
|
||||||
For instance, if it is 3, there are only 3 checkpoints
|
For instance, if it is 3, there are only 3 checkpoints
|
||||||
in the exp-dir with filenames `checkpoint-xxx.pt`.
|
in the exp-dir with filenames `checkpoint-xxx.pt`.
|
||||||
@ -591,17 +585,24 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--max-sample-size",
|
"--max-keep-size",
|
||||||
type=float,
|
type=int,
|
||||||
default=250000,
|
default=1024000,
|
||||||
help="max sample size",
|
help="exclude sample longer than this.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--min-sample-size",
|
"--min-keep-size",
|
||||||
type=float,
|
type=float,
|
||||||
default=32000,
|
default=32000,
|
||||||
help="min sample size",
|
help="exclude sample longer less than this.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-sample-size",
|
||||||
|
type=float,
|
||||||
|
default=1024000,
|
||||||
|
help="max sample size to crop to for batching.",
|
||||||
)
|
)
|
||||||
|
|
||||||
add_model_arguments(parser)
|
add_model_arguments(parser)
|
||||||
@ -960,10 +961,10 @@ def train_one_epoch(
|
|||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
except: # noqa
|
except Exception as e: # noqa
|
||||||
save_bad_model()
|
save_bad_model()
|
||||||
display_and_save_batch(batch, params=params)
|
display_and_save_batch(batch, params=params)
|
||||||
raise
|
raise e
|
||||||
|
|
||||||
if params.print_diagnostics and batch_idx == 5:
|
if params.print_diagnostics and batch_idx == 5:
|
||||||
return
|
return
|
||||||
@ -1064,7 +1065,7 @@ def train_one_epoch(
|
|||||||
tb_writer, "train/valid_", params.batch_idx_train
|
tb_writer, "train/valid_", params.batch_idx_train
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_idx % params.accum_grad != params.accum_grad - 1:
|
if sub_batch_idx % params.accum_grad != params.accum_grad - 1:
|
||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
loss_value = tot_loss["loss"] / tot_loss["frames"]
|
loss_value = tot_loss["loss"] / tot_loss["frames"]
|
||||||
params.train_loss = loss_value
|
params.train_loss = loss_value
|
||||||
@ -1165,7 +1166,7 @@ def run(rank, world_size, args):
|
|||||||
|
|
||||||
librilight = LibriLightDataModule(args)
|
librilight = LibriLightDataModule(args)
|
||||||
|
|
||||||
train_cuts = librilight.train_all_shuf_cuts()
|
train_cuts = librilight.all_shuf_cuts()
|
||||||
|
|
||||||
def remove_short_and_long_utt(c: Cut):
|
def remove_short_and_long_utt(c: Cut):
|
||||||
# Keep only utterances with duration between 1 second and 20 seconds
|
# Keep only utterances with duration between 1 second and 20 seconds
|
||||||
@ -1177,11 +1178,11 @@ def run(rank, world_size, args):
|
|||||||
# an utterance duration distribution for your dataset to select
|
# an utterance duration distribution for your dataset to select
|
||||||
# the threshold
|
# the threshold
|
||||||
if (
|
if (
|
||||||
c.duration < params.min_sample_size / params.sample_rate
|
c.duration < params.min_keep_size / params.sample_rate
|
||||||
or c.duration > params.max_sample_size / params.sample_rate
|
or c.duration > params.max_keep_size / params.sample_rate
|
||||||
):
|
):
|
||||||
# logging.warning(
|
# logging.warning(
|
||||||
# f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
|
# f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
|
||||||
# )
|
# )
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -1198,6 +1199,7 @@ def run(rank, world_size, args):
|
|||||||
|
|
||||||
train_dl = librilight.train_dataloaders(
|
train_dl = librilight.train_dataloaders(
|
||||||
train_cuts,
|
train_cuts,
|
||||||
|
max_sample_size=params.max_sample_size,
|
||||||
sample_rate=params.sample_rate,
|
sample_rate=params.sample_rate,
|
||||||
label_rate=params.label_rate,
|
label_rate=params.label_rate,
|
||||||
random_crop=params.random_crop,
|
random_crop=params.random_crop,
|
||||||
@ -1205,6 +1207,8 @@ def run(rank, world_size, args):
|
|||||||
num_classes=params.num_classes,
|
num_classes=params.num_classes,
|
||||||
do_normalize=params.do_normalize,
|
do_normalize=params.do_normalize,
|
||||||
sampler_state_dict=sampler_state_dict,
|
sampler_state_dict=sampler_state_dict,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
valid_cuts = librilight.dev_clean_cuts()
|
valid_cuts = librilight.dev_clean_cuts()
|
||||||
@ -1213,12 +1217,15 @@ def run(rank, world_size, args):
|
|||||||
|
|
||||||
valid_dl = librilight.valid_dataloaders(
|
valid_dl = librilight.valid_dataloaders(
|
||||||
valid_cuts,
|
valid_cuts,
|
||||||
|
max_sample_size=params.max_sample_size,
|
||||||
sample_rate=params.sample_rate,
|
sample_rate=params.sample_rate,
|
||||||
label_rate=params.label_rate,
|
label_rate=params.label_rate,
|
||||||
random_crop=params.random_crop,
|
random_crop=params.random_crop,
|
||||||
pad_audio=False,
|
pad_audio=False,
|
||||||
num_classes=params.num_classes,
|
num_classes=params.num_classes,
|
||||||
do_normalize=params.do_normalize,
|
do_normalize=params.do_normalize,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
if params.sanity_check and not params.print_diagnostics:
|
if params.sanity_check and not params.print_diagnostics:
|
||||||
@ -1339,7 +1346,7 @@ def scan_pessimistic_batches_for_oom(
|
|||||||
f"(={crit_values[criterion]}) ..."
|
f"(={crit_values[criterion]}) ..."
|
||||||
)
|
)
|
||||||
display_and_save_batch(batch, params=params)
|
display_and_save_batch(batch, params=params)
|
||||||
raise
|
raise e
|
||||||
logging.info(
|
logging.info(
|
||||||
f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
|
f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
|
||||||
)
|
)
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
# Copyright 2021 Piotr Żelasko
|
# Copyright 2024 Xiaomi Corporation (Author: Yifan Yang)
|
||||||
# Copyright 2023 Xiaomi Corporation (Author: Yifan Yang)
|
|
||||||
#
|
#
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
#
|
#
|
||||||
@ -25,8 +24,9 @@ from pathlib import Path
|
|||||||
from typing import Any, Dict, Optional
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
import lhotse
|
||||||
from dataset import HubertDataset
|
from dataset import HubertDataset
|
||||||
from lhotse import CutSet, combine, load_manifest_lazy
|
from lhotse import CutSet, load_manifest_lazy
|
||||||
from lhotse.dataset import DynamicBucketingSampler, SimpleCutSampler
|
from lhotse.dataset import DynamicBucketingSampler, SimpleCutSampler
|
||||||
from lhotse.utils import fix_random_seed
|
from lhotse.utils import fix_random_seed
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
@ -46,7 +46,7 @@ class LibriLightDataModule:
|
|||||||
"""
|
"""
|
||||||
DataModule for SSL experiments.
|
DataModule for SSL experiments.
|
||||||
It assumes there is always one train and valid dataloader,
|
It assumes there is always one train and valid dataloader,
|
||||||
but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
|
but there can be multiple test dataloaders (e.g. LibriLight test-clean
|
||||||
and test-other).
|
and test-other).
|
||||||
|
|
||||||
It contains all the common data pipeline modules used in SSL
|
It contains all the common data pipeline modules used in SSL
|
||||||
@ -63,7 +63,7 @@ class LibriLightDataModule:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def add_arguments(cls, parser: argparse.ArgumentParser):
|
def add_arguments(cls, parser: argparse.ArgumentParser):
|
||||||
group = parser.add_argument_group(
|
group = parser.add_argument_group(
|
||||||
title="ASR SSL related options",
|
title="SSL data related options",
|
||||||
description="These options are used for the preparation of "
|
description="These options are used for the preparation of "
|
||||||
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
|
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
|
||||||
"effective batch sizes, sampling strategies.",
|
"effective batch sizes, sampling strategies.",
|
||||||
@ -92,10 +92,29 @@ class LibriLightDataModule:
|
|||||||
group.add_argument(
|
group.add_argument(
|
||||||
"--num-buckets",
|
"--num-buckets",
|
||||||
type=int,
|
type=int,
|
||||||
default=30,
|
default=1000,
|
||||||
help="The number of buckets for the DynamicBucketingSampler"
|
help="The number of buckets for the DynamicBucketingSampler"
|
||||||
"(you might want to increase it for larger datasets).",
|
"(you might want to increase it for larger datasets).",
|
||||||
)
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--num-cuts-for-bins-estimate",
|
||||||
|
type=float,
|
||||||
|
default=1000000,
|
||||||
|
help="We will draw this many cuts to estimate the duration"
|
||||||
|
"bins for creating similar-duration buckets. Larger number"
|
||||||
|
"means a better estimate to the data distribution, possibly"
|
||||||
|
"at a longer init cost."
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--quadratic-duration",
|
||||||
|
type=float,
|
||||||
|
default=None,
|
||||||
|
help="When set, it adds an extra penalty that's quadratic"
|
||||||
|
"in size w.r.t. a cuts duration. This helps get a more"
|
||||||
|
"even GPU utilization across different input lengths when"
|
||||||
|
"models have quadratic input complexity. Set between 15"
|
||||||
|
"and 40 for transformers.",
|
||||||
|
)
|
||||||
group.add_argument(
|
group.add_argument(
|
||||||
"--shuffle",
|
"--shuffle",
|
||||||
type=str2bool,
|
type=str2bool,
|
||||||
@ -112,7 +131,7 @@ class LibriLightDataModule:
|
|||||||
group.add_argument(
|
group.add_argument(
|
||||||
"--num-workers",
|
"--num-workers",
|
||||||
type=int,
|
type=int,
|
||||||
default=2,
|
default=8,
|
||||||
help="The number of training dataloader workers that "
|
help="The number of training dataloader workers that "
|
||||||
"collect the batches.",
|
"collect the batches.",
|
||||||
)
|
)
|
||||||
@ -126,12 +145,13 @@ class LibriLightDataModule:
|
|||||||
"--random-crop",
|
"--random-crop",
|
||||||
type=str2bool,
|
type=str2bool,
|
||||||
default=True,
|
default=True,
|
||||||
help="audio sample rate",
|
help="always crop from the beginning if false",
|
||||||
)
|
)
|
||||||
|
|
||||||
def train_dataloaders(
|
def train_dataloaders(
|
||||||
self,
|
self,
|
||||||
cuts_train: CutSet,
|
cuts_train: CutSet,
|
||||||
|
max_sample_size: Optional[int] = None,
|
||||||
sample_rate: float = 16000,
|
sample_rate: float = 16000,
|
||||||
label_rate: float = 50,
|
label_rate: float = 50,
|
||||||
random_crop: bool = True,
|
random_crop: bool = True,
|
||||||
@ -139,6 +159,8 @@ class LibriLightDataModule:
|
|||||||
num_classes: list = [504],
|
num_classes: list = [504],
|
||||||
do_normalize: bool = True,
|
do_normalize: bool = True,
|
||||||
sampler_state_dict: Optional[Dict[str, Any]] = None,
|
sampler_state_dict: Optional[Dict[str, Any]] = None,
|
||||||
|
world_size: Optional[int] = None,
|
||||||
|
rank: Optional[int] = None,
|
||||||
) -> DataLoader:
|
) -> DataLoader:
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
@ -149,6 +171,7 @@ class LibriLightDataModule:
|
|||||||
"""
|
"""
|
||||||
logging.info("About to create train dataset")
|
logging.info("About to create train dataset")
|
||||||
train = HubertDataset(
|
train = HubertDataset(
|
||||||
|
max_sample_size=max_sample_size,
|
||||||
sample_rate=sample_rate,
|
sample_rate=sample_rate,
|
||||||
label_rate=label_rate,
|
label_rate=label_rate,
|
||||||
random_crop=random_crop,
|
random_crop=random_crop,
|
||||||
@ -162,9 +185,14 @@ class LibriLightDataModule:
|
|||||||
train_sampler = DynamicBucketingSampler(
|
train_sampler = DynamicBucketingSampler(
|
||||||
cuts_train,
|
cuts_train,
|
||||||
max_duration=self.args.max_duration,
|
max_duration=self.args.max_duration,
|
||||||
|
quadratic_duration=self.args.quadratic_duration,
|
||||||
shuffle=self.args.shuffle,
|
shuffle=self.args.shuffle,
|
||||||
num_buckets=self.args.num_buckets,
|
num_buckets=self.args.num_buckets,
|
||||||
|
buffer_size=self.args.num_buckets * 2000,
|
||||||
|
num_cuts_for_bins_estimate=self.args.num_cuts_for_bins_estimate,
|
||||||
drop_last=self.args.drop_last,
|
drop_last=self.args.drop_last,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logging.info("Using SimpleCutSampler.")
|
logging.info("Using SimpleCutSampler.")
|
||||||
@ -172,6 +200,8 @@ class LibriLightDataModule:
|
|||||||
cuts_train,
|
cuts_train,
|
||||||
max_duration=self.args.max_duration,
|
max_duration=self.args.max_duration,
|
||||||
shuffle=self.args.shuffle,
|
shuffle=self.args.shuffle,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
logging.info("About to create train dataloader")
|
logging.info("About to create train dataloader")
|
||||||
|
|
||||||
@ -198,15 +228,19 @@ class LibriLightDataModule:
|
|||||||
def valid_dataloaders(
|
def valid_dataloaders(
|
||||||
self,
|
self,
|
||||||
cuts_valid: CutSet,
|
cuts_valid: CutSet,
|
||||||
|
max_sample_size: Optional[int] = None,
|
||||||
sample_rate: float = 16000,
|
sample_rate: float = 16000,
|
||||||
label_rate: float = 50,
|
label_rate: float = 50,
|
||||||
random_crop: bool = True,
|
random_crop: bool = True,
|
||||||
pad_audio: bool = False,
|
pad_audio: bool = False,
|
||||||
num_classes: list = [504],
|
num_classes: list = [504],
|
||||||
do_normalize: bool = True,
|
do_normalize: bool = True,
|
||||||
|
world_size: Optional[int] = None,
|
||||||
|
rank: Optional[int] = None,
|
||||||
) -> DataLoader:
|
) -> DataLoader:
|
||||||
logging.info("About to create dev dataset")
|
logging.info("About to create dev dataset")
|
||||||
validate = HubertDataset(
|
validate = HubertDataset(
|
||||||
|
max_sample_size=max_sample_size,
|
||||||
sample_rate=sample_rate,
|
sample_rate=sample_rate,
|
||||||
label_rate=label_rate,
|
label_rate=label_rate,
|
||||||
random_crop=random_crop,
|
random_crop=random_crop,
|
||||||
@ -217,7 +251,10 @@ class LibriLightDataModule:
|
|||||||
valid_sampler = DynamicBucketingSampler(
|
valid_sampler = DynamicBucketingSampler(
|
||||||
cuts_valid,
|
cuts_valid,
|
||||||
max_duration=self.args.max_duration,
|
max_duration=self.args.max_duration,
|
||||||
|
quadratic_duration=self.args.quadratic_duration,
|
||||||
shuffle=False,
|
shuffle=False,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
logging.info("About to create dev dataloader")
|
logging.info("About to create dev dataloader")
|
||||||
valid_dl = DataLoader(
|
valid_dl = DataLoader(
|
||||||
@ -230,81 +267,11 @@ class LibriLightDataModule:
|
|||||||
|
|
||||||
return valid_dl
|
return valid_dl
|
||||||
|
|
||||||
def test_dataloaders(
|
|
||||||
self,
|
|
||||||
cuts: CutSet,
|
|
||||||
sample_rate: float = 16000,
|
|
||||||
label_rate: float = 50,
|
|
||||||
random_crop: bool = True,
|
|
||||||
pad_audio: bool = False,
|
|
||||||
num_classes: list = [504],
|
|
||||||
do_normalize: bool = True,
|
|
||||||
) -> DataLoader:
|
|
||||||
logging.debug("About to create test dataset")
|
|
||||||
test = HubertDataset(
|
|
||||||
sample_rate=sample_rate,
|
|
||||||
label_rate=label_rate,
|
|
||||||
random_crop=random_crop,
|
|
||||||
pad_audio=pad_audio,
|
|
||||||
num_classes=num_classes,
|
|
||||||
do_normalize=do_normalize,
|
|
||||||
)
|
|
||||||
sampler = DynamicBucketingSampler(
|
|
||||||
cuts,
|
|
||||||
max_duration=self.args.max_duration,
|
|
||||||
shuffle=False,
|
|
||||||
)
|
|
||||||
logging.debug("About to create test dataloader")
|
|
||||||
test_dl = DataLoader(
|
|
||||||
test,
|
|
||||||
batch_size=None,
|
|
||||||
sampler=sampler,
|
|
||||||
num_workers=self.args.num_workers,
|
|
||||||
)
|
|
||||||
return test_dl
|
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def small_cuts(self) -> CutSet:
|
def all_shuf_cuts(self) -> CutSet:
|
||||||
logging.info("About to get small cuts")
|
|
||||||
return load_manifest_lazy(
|
|
||||||
self.args.manifest_dir / "librilight_cuts_small.jsonl.gz"
|
|
||||||
)
|
|
||||||
|
|
||||||
@lru_cache()
|
|
||||||
def medium_cuts(self) -> CutSet:
|
|
||||||
logging.info("About to get medium cuts")
|
|
||||||
filenames = glob.glob(
|
|
||||||
f"{self.args.manifest_dir}/medium_splits/librilight_cuts_medium.*.jsonl.gz"
|
|
||||||
)
|
|
||||||
pattern = re.compile(r"librilight_cuts_medium.([0-9]+).jsonl.gz")
|
|
||||||
idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames)
|
|
||||||
idx_filenames = sorted(idx_filenames, key=lambda x: x[0])
|
|
||||||
sorted_filenames = [f[1] for f in idx_filenames]
|
|
||||||
logging.info(
|
logging.info(
|
||||||
f"Loading LibriLight medium {len(sorted_filenames)} splits in lazy mode"
|
"About to get the shuffled librilight small, medium and large cuts"
|
||||||
)
|
)
|
||||||
|
|
||||||
return combine(load_manifest_lazy(p) for p in sorted_filenames)
|
|
||||||
|
|
||||||
@lru_cache()
|
|
||||||
def large_cuts(self) -> CutSet:
|
|
||||||
logging.info("About to get large cuts")
|
|
||||||
filenames = glob.glob(
|
|
||||||
f"{self.args.manifest_dir}/large_splits/librilight_cuts_large.*.jsonl.gz"
|
|
||||||
)
|
|
||||||
pattern = re.compile(r"librilight_cuts_large.([0-9]+).jsonl.gz")
|
|
||||||
idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames)
|
|
||||||
idx_filenames = sorted(idx_filenames, key=lambda x: x[0])
|
|
||||||
sorted_filenames = [f[1] for f in idx_filenames]
|
|
||||||
logging.info(
|
|
||||||
f"Loading LibriLight large {len(sorted_filenames)} splits in lazy mode"
|
|
||||||
)
|
|
||||||
|
|
||||||
return combine(load_manifest_lazy(p) for p in sorted_filenames)
|
|
||||||
|
|
||||||
@lru_cache()
|
|
||||||
def train_all_shuf_cuts(self) -> CutSet:
|
|
||||||
logging.info("About to get the shuffled small, medium and large cuts")
|
|
||||||
small_cuts = self.small_cuts()
|
small_cuts = self.small_cuts()
|
||||||
medium_cuts = self.medium_cuts()
|
medium_cuts = self.medium_cuts()
|
||||||
large_cuts = self.large_cuts()
|
large_cuts = self.large_cuts()
|
||||||
@ -313,22 +280,52 @@ class LibriLightDataModule:
|
|||||||
medium_cuts,
|
medium_cuts,
|
||||||
large_cuts,
|
large_cuts,
|
||||||
weights=[
|
weights=[
|
||||||
122867, # len(small_cuts)
|
229051, # len(small_cuts)
|
||||||
1104071, # len(medium_cuts)
|
2022949, # len(medium_cuts)
|
||||||
11012085, # len(large_cuts)
|
19883414, # len(large_cuts)
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def dev_clean_cuts(self) -> CutSet:
|
def dev_clean_cuts(self) -> CutSet:
|
||||||
logging.info("About to get dev-clean cuts")
|
logging.info("About to get librispeech dev-clean cuts")
|
||||||
return load_manifest_lazy(
|
return load_manifest_lazy(
|
||||||
self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
|
self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
|
||||||
)
|
)
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def dev_other_cuts(self) -> CutSet:
|
def small_cuts(self) -> CutSet:
|
||||||
logging.info("About to get dev-other cuts")
|
logging.info("About to get librilight small cuts")
|
||||||
return load_manifest_lazy(
|
return load_manifest_lazy(
|
||||||
self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
|
self.args.manifest_dir / "librilight_cuts_small.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def medium_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get librilight medium cuts")
|
||||||
|
filenames = glob.glob(
|
||||||
|
str(self.args.manifest_dir / "medium_split" / "librilight_cuts_medium.*.jsonl.gz")
|
||||||
|
)
|
||||||
|
pattern = re.compile(r"librilight_cuts_medium.([0-9]+).jsonl.gz")
|
||||||
|
idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames)
|
||||||
|
idx_filenames = sorted(idx_filenames, key=lambda x: x[0])
|
||||||
|
sorted_filenames = [f[1] for f in idx_filenames]
|
||||||
|
logging.info(f"Loading Libri-Light medium {len(sorted_filenames)} splits in lazy mode")
|
||||||
|
return lhotse.combine(
|
||||||
|
lhotse.load_manifest_lazy(p) for p in sorted_filenames
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def large_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get librilight large cuts")
|
||||||
|
filenames = glob.glob(
|
||||||
|
str(self.args.manifest_dir / "large_split" / "librilight_cuts_large.*.jsonl.gz")
|
||||||
|
)
|
||||||
|
pattern = re.compile(r"librilight_cuts_large.([0-9]+).jsonl.gz")
|
||||||
|
idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames)
|
||||||
|
idx_filenames = sorted(idx_filenames, key=lambda x: x[0])
|
||||||
|
sorted_filenames = [f[1] for f in idx_filenames]
|
||||||
|
logging.info(f"Loading Libri-Light large {len(sorted_filenames)} splits in lazy mode")
|
||||||
|
return lhotse.combine(
|
||||||
|
lhotse.load_manifest_lazy(p) for p in sorted_filenames
|
||||||
)
|
)
|
||||||
|
@ -132,6 +132,8 @@ class LibriSpeechAsrDataModule:
|
|||||||
cuts_train: CutSet,
|
cuts_train: CutSet,
|
||||||
do_normalize: bool,
|
do_normalize: bool,
|
||||||
sampler_state_dict: Optional[Dict[str, Any]] = None,
|
sampler_state_dict: Optional[Dict[str, Any]] = None,
|
||||||
|
world_size: Optional[int] = None,
|
||||||
|
rank: Optional[int] = None,
|
||||||
) -> DataLoader:
|
) -> DataLoader:
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
@ -150,7 +152,10 @@ class LibriSpeechAsrDataModule:
|
|||||||
max_duration=self.args.max_duration,
|
max_duration=self.args.max_duration,
|
||||||
shuffle=self.args.shuffle,
|
shuffle=self.args.shuffle,
|
||||||
num_buckets=self.args.num_buckets,
|
num_buckets=self.args.num_buckets,
|
||||||
|
buffer_size=self.args.num_buckets * 2000,
|
||||||
drop_last=self.args.drop_last,
|
drop_last=self.args.drop_last,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logging.info("Using SimpleCutSampler.")
|
logging.info("Using SimpleCutSampler.")
|
||||||
@ -158,6 +163,8 @@ class LibriSpeechAsrDataModule:
|
|||||||
cuts_train,
|
cuts_train,
|
||||||
max_duration=self.args.max_duration,
|
max_duration=self.args.max_duration,
|
||||||
shuffle=self.args.shuffle,
|
shuffle=self.args.shuffle,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
logging.info("About to create train dataloader")
|
logging.info("About to create train dataloader")
|
||||||
|
|
||||||
@ -181,13 +188,21 @@ class LibriSpeechAsrDataModule:
|
|||||||
|
|
||||||
return train_dl
|
return train_dl
|
||||||
|
|
||||||
def valid_dataloaders(self, cuts_valid: CutSet, do_normalize: bool) -> DataLoader:
|
def valid_dataloaders(
|
||||||
|
self,
|
||||||
|
cuts_valid: CutSet,
|
||||||
|
do_normalize: bool,
|
||||||
|
world_size: Optional[int] = None,
|
||||||
|
rank: Optional[int] = None,
|
||||||
|
) -> DataLoader:
|
||||||
logging.info("About to create dev dataset")
|
logging.info("About to create dev dataset")
|
||||||
validate = HubertAsrDataset(do_normalize=do_normalize)
|
validate = HubertAsrDataset(do_normalize=do_normalize)
|
||||||
valid_sampler = DynamicBucketingSampler(
|
valid_sampler = DynamicBucketingSampler(
|
||||||
cuts_valid,
|
cuts_valid,
|
||||||
max_duration=self.args.max_duration,
|
max_duration=self.args.max_duration,
|
||||||
shuffle=False,
|
shuffle=False,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
logging.info("About to create dev dataloader")
|
logging.info("About to create dev dataloader")
|
||||||
valid_dl = DataLoader(
|
valid_dl = DataLoader(
|
||||||
|
0
egs/librispeech/SSL/hubert/decode.py
Normal file → Executable file
0
egs/librispeech/SSL/hubert/decode.py
Normal file → Executable file
0
egs/librispeech/SSL/hubert/decode_ce.py
Normal file → Executable file
0
egs/librispeech/SSL/hubert/decode_ce.py
Normal file → Executable file
4
egs/librispeech/SSL/hubert/finetune.py
Normal file → Executable file
4
egs/librispeech/SSL/hubert/finetune.py
Normal file → Executable file
@ -1090,6 +1090,8 @@ def run(rank, world_size, args):
|
|||||||
train_cuts,
|
train_cuts,
|
||||||
do_normalize=params.do_normalize,
|
do_normalize=params.do_normalize,
|
||||||
sampler_state_dict=sampler_state_dict,
|
sampler_state_dict=sampler_state_dict,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
valid_cuts = librispeech.dev_clean_cuts()
|
valid_cuts = librispeech.dev_clean_cuts()
|
||||||
@ -1098,6 +1100,8 @@ def run(rank, world_size, args):
|
|||||||
valid_dl = librispeech.valid_dataloaders(
|
valid_dl = librispeech.valid_dataloaders(
|
||||||
valid_cuts,
|
valid_cuts,
|
||||||
do_normalize=params.do_normalize,
|
do_normalize=params.do_normalize,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
if params.sanity_check and not params.print_diagnostics:
|
if params.sanity_check and not params.print_diagnostics:
|
||||||
|
4
egs/librispeech/SSL/hubert/finetune_ce.py
Normal file → Executable file
4
egs/librispeech/SSL/hubert/finetune_ce.py
Normal file → Executable file
@ -1090,6 +1090,8 @@ def run(rank, world_size, args):
|
|||||||
train_cuts,
|
train_cuts,
|
||||||
do_normalize=params.do_normalize,
|
do_normalize=params.do_normalize,
|
||||||
sampler_state_dict=sampler_state_dict,
|
sampler_state_dict=sampler_state_dict,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
valid_cuts = librispeech.dev_clean_cuts()
|
valid_cuts = librispeech.dev_clean_cuts()
|
||||||
@ -1098,6 +1100,8 @@ def run(rank, world_size, args):
|
|||||||
valid_dl = librispeech.valid_dataloaders(
|
valid_dl = librispeech.valid_dataloaders(
|
||||||
valid_cuts,
|
valid_cuts,
|
||||||
do_normalize=params.do_normalize,
|
do_normalize=params.do_normalize,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
if params.sanity_check and not params.print_diagnostics:
|
if params.sanity_check and not params.print_diagnostics:
|
||||||
|
0
egs/librispeech/SSL/hubert/pretrain.py
Normal file → Executable file
0
egs/librispeech/SSL/hubert/pretrain.py
Normal file → Executable file
0
egs/librispeech/SSL/hubert/pretrain_ce.py
Normal file → Executable file
0
egs/librispeech/SSL/hubert/pretrain_ce.py
Normal file → Executable file
@ -144,6 +144,8 @@ class LibriSpeechDataModule:
|
|||||||
num_classes: list = [504],
|
num_classes: list = [504],
|
||||||
do_normalize: bool = True,
|
do_normalize: bool = True,
|
||||||
sampler_state_dict: Optional[Dict[str, Any]] = None,
|
sampler_state_dict: Optional[Dict[str, Any]] = None,
|
||||||
|
world_size: Optional[int] = None,
|
||||||
|
rank: Optional[int] = None,
|
||||||
) -> DataLoader:
|
) -> DataLoader:
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
@ -170,7 +172,10 @@ class LibriSpeechDataModule:
|
|||||||
max_duration=self.args.max_duration,
|
max_duration=self.args.max_duration,
|
||||||
shuffle=self.args.shuffle,
|
shuffle=self.args.shuffle,
|
||||||
num_buckets=self.args.num_buckets,
|
num_buckets=self.args.num_buckets,
|
||||||
|
buffer_size=self.args.num_buckets * 2000,
|
||||||
drop_last=self.args.drop_last,
|
drop_last=self.args.drop_last,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logging.info("Using SimpleCutSampler.")
|
logging.info("Using SimpleCutSampler.")
|
||||||
@ -178,6 +183,8 @@ class LibriSpeechDataModule:
|
|||||||
cuts_train,
|
cuts_train,
|
||||||
max_duration=self.args.max_duration,
|
max_duration=self.args.max_duration,
|
||||||
shuffle=self.args.shuffle,
|
shuffle=self.args.shuffle,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
logging.info("About to create train dataloader")
|
logging.info("About to create train dataloader")
|
||||||
|
|
||||||
@ -211,6 +218,8 @@ class LibriSpeechDataModule:
|
|||||||
pad_audio: bool = False,
|
pad_audio: bool = False,
|
||||||
num_classes: list = [504],
|
num_classes: list = [504],
|
||||||
do_normalize: bool = True,
|
do_normalize: bool = True,
|
||||||
|
world_size: Optional[int] = None,
|
||||||
|
rank: Optional[int] = None,
|
||||||
) -> DataLoader:
|
) -> DataLoader:
|
||||||
logging.info("About to create dev dataset")
|
logging.info("About to create dev dataset")
|
||||||
validate = HubertDataset(
|
validate = HubertDataset(
|
||||||
@ -226,6 +235,8 @@ class LibriSpeechDataModule:
|
|||||||
cuts_valid,
|
cuts_valid,
|
||||||
max_duration=self.args.max_duration,
|
max_duration=self.args.max_duration,
|
||||||
shuffle=False,
|
shuffle=False,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
logging.info("About to create dev dataloader")
|
logging.info("About to create dev dataloader")
|
||||||
valid_dl = DataLoader(
|
valid_dl = DataLoader(
|
||||||
|
19
egs/librispeech/SSL/pretrain.sh
Executable file
19
egs/librispeech/SSL/pretrain.sh
Executable file
@ -0,0 +1,19 @@
|
|||||||
|
./zipformer/pretrain.py \
|
||||||
|
--world-size 8 \
|
||||||
|
--num-epochs 300 \
|
||||||
|
--start-epoch 1 \
|
||||||
|
--use-fp16 1 \
|
||||||
|
--exp-dir zipformer/exp_pretrain \
|
||||||
|
--full-libri 1 \
|
||||||
|
--max-duration 600 \
|
||||||
|
--accum-grad 1 \
|
||||||
|
--do-normalize 0 \
|
||||||
|
--mask-prob 0.8 \
|
||||||
|
--dropout-input 0.1 \
|
||||||
|
--dropout-features 0.1 \
|
||||||
|
--feature-grad-mult 0.1 \
|
||||||
|
--num-encoder-layers 2,2,3,4,3,2 \
|
||||||
|
--feedforward-dim 512,768,1024,1536,1024,768 \
|
||||||
|
--encoder-dim 192,256,448,768,448,192 \
|
||||||
|
--encoder-unmasked-dim 192,192,256,256,256,192 \
|
||||||
|
--base-lr 0.045
|
0
egs/librispeech/SSL/zipformer/decode.py
Normal file → Executable file
0
egs/librispeech/SSL/zipformer/decode.py
Normal file → Executable file
4
egs/librispeech/SSL/zipformer/finetune.py
Normal file → Executable file
4
egs/librispeech/SSL/zipformer/finetune.py
Normal file → Executable file
@ -1387,6 +1387,8 @@ def run(rank, world_size, args):
|
|||||||
train_cuts,
|
train_cuts,
|
||||||
do_normalize=params.do_normalize,
|
do_normalize=params.do_normalize,
|
||||||
sampler_state_dict=sampler_state_dict,
|
sampler_state_dict=sampler_state_dict,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
valid_cuts = librispeech.dev_clean_cuts()
|
valid_cuts = librispeech.dev_clean_cuts()
|
||||||
@ -1395,6 +1397,8 @@ def run(rank, world_size, args):
|
|||||||
valid_dl = librispeech.valid_dataloaders(
|
valid_dl = librispeech.valid_dataloaders(
|
||||||
valid_cuts,
|
valid_cuts,
|
||||||
do_normalize=params.do_normalize,
|
do_normalize=params.do_normalize,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
if params.sanity_check and not params.print_diagnostics:
|
if params.sanity_check and not params.print_diagnostics:
|
||||||
|
@ -296,7 +296,6 @@ class HubertModel(nn.Module):
|
|||||||
|
|
||||||
self.layer_norm = LayerNorm(self.embed)
|
self.layer_norm = LayerNorm(self.embed)
|
||||||
|
|
||||||
self.untie_final_proj = cfg.untie_final_proj
|
|
||||||
self.final_proj = nn.Linear(encoder_output_dim, sum(cfg.num_classes))
|
self.final_proj = nn.Linear(encoder_output_dim, sum(cfg.num_classes))
|
||||||
|
|
||||||
# modules below are not needed during fine-tuning
|
# modules below are not needed during fine-tuning
|
||||||
|
@ -154,9 +154,9 @@ class AsrModel(nn.Module):
|
|||||||
|
|
||||||
ctc_loss = torch.nn.functional.ctc_loss(
|
ctc_loss = torch.nn.functional.ctc_loss(
|
||||||
log_probs=ctc_output.permute(1, 0, 2), # (T, N, C)
|
log_probs=ctc_output.permute(1, 0, 2), # (T, N, C)
|
||||||
targets=targets,
|
targets=targets.cpu(),
|
||||||
input_lengths=encoder_out_lens,
|
input_lengths=encoder_out_lens.cpu(),
|
||||||
target_lengths=target_lengths,
|
target_lengths=target_lengths.cpu(),
|
||||||
reduction="sum",
|
reduction="sum",
|
||||||
)
|
)
|
||||||
return ctc_loss
|
return ctc_loss
|
||||||
|
7
egs/librispeech/SSL/zipformer/pretrain.py
Normal file → Executable file
7
egs/librispeech/SSL/zipformer/pretrain.py
Normal file → Executable file
@ -41,7 +41,6 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
|
|||||||
import argparse
|
import argparse
|
||||||
import copy
|
import copy
|
||||||
import logging
|
import logging
|
||||||
import sys
|
|
||||||
import warnings
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
@ -594,7 +593,7 @@ def get_parser():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--max-keep-size",
|
"--max-keep-size",
|
||||||
type=int,
|
type=int,
|
||||||
default=sys.maxsize,
|
default=320000,
|
||||||
help="exclude sample longer than this.",
|
help="exclude sample longer than this.",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1218,6 +1217,8 @@ def run(rank, world_size, args):
|
|||||||
num_classes=params.num_classes,
|
num_classes=params.num_classes,
|
||||||
do_normalize=params.do_normalize,
|
do_normalize=params.do_normalize,
|
||||||
sampler_state_dict=sampler_state_dict,
|
sampler_state_dict=sampler_state_dict,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
valid_cuts = librispeech.dev_clean_cuts()
|
valid_cuts = librispeech.dev_clean_cuts()
|
||||||
@ -1233,6 +1234,8 @@ def run(rank, world_size, args):
|
|||||||
pad_audio=False,
|
pad_audio=False,
|
||||||
num_classes=params.num_classes,
|
num_classes=params.num_classes,
|
||||||
do_normalize=params.do_normalize,
|
do_normalize=params.do_normalize,
|
||||||
|
world_size=world_size,
|
||||||
|
rank=rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
if params.sanity_check and not params.print_diagnostics:
|
if params.sanity_check and not params.print_diagnostics:
|
||||||
|
1
egs/librispeech/SSL/zipformer_ctc/asr_datamodule.py
Symbolic link
1
egs/librispeech/SSL/zipformer_ctc/asr_datamodule.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../zipformer/asr_datamodule.py
|
1
egs/librispeech/SSL/zipformer_ctc/beam_search.py
Symbolic link
1
egs/librispeech/SSL/zipformer_ctc/beam_search.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../zipformer/beam_search.py
|
823
egs/librispeech/SSL/zipformer_ctc/ctc_decode.py
Executable file
823
egs/librispeech/SSL/zipformer_ctc/ctc_decode.py
Executable file
@ -0,0 +1,823 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang,
|
||||||
|
# Liyong Guo,
|
||||||
|
# Quandong Wang,
|
||||||
|
# Zengwei Yao)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
Usage:
|
||||||
|
|
||||||
|
(1) ctc-decoding
|
||||||
|
./zipformer/ctc_decode.py \
|
||||||
|
--epoch 30 \
|
||||||
|
--avg 15 \
|
||||||
|
--exp-dir ./zipformer/exp \
|
||||||
|
--max-duration 600 \
|
||||||
|
--decoding-method ctc-decoding
|
||||||
|
|
||||||
|
(2) 1best
|
||||||
|
./zipformer/ctc_decode.py \
|
||||||
|
--epoch 30 \
|
||||||
|
--avg 15 \
|
||||||
|
--exp-dir ./zipformer/exp \
|
||||||
|
--max-duration 600 \
|
||||||
|
--hlg-scale 0.6 \
|
||||||
|
--decoding-method 1best
|
||||||
|
|
||||||
|
(3) nbest
|
||||||
|
./zipformer/ctc_decode.py \
|
||||||
|
--epoch 30 \
|
||||||
|
--avg 15 \
|
||||||
|
--exp-dir ./zipformer/exp \
|
||||||
|
--max-duration 600 \
|
||||||
|
--hlg-scale 0.6 \
|
||||||
|
--decoding-method nbest
|
||||||
|
|
||||||
|
(4) nbest-rescoring
|
||||||
|
./zipformer/ctc_decode.py \
|
||||||
|
--epoch 30 \
|
||||||
|
--avg 15 \
|
||||||
|
--exp-dir ./zipformer/exp \
|
||||||
|
--max-duration 600 \
|
||||||
|
--hlg-scale 0.6 \
|
||||||
|
--nbest-scale 1.0 \
|
||||||
|
--lm-dir data/lm \
|
||||||
|
--decoding-method nbest-rescoring
|
||||||
|
|
||||||
|
(5) whole-lattice-rescoring
|
||||||
|
./zipformer/ctc_decode.py \
|
||||||
|
--epoch 30 \
|
||||||
|
--avg 15 \
|
||||||
|
--exp-dir ./zipformer/exp \
|
||||||
|
--max-duration 600 \
|
||||||
|
--hlg-scale 0.6 \
|
||||||
|
--nbest-scale 1.0 \
|
||||||
|
--lm-dir data/lm \
|
||||||
|
--decoding-method whole-lattice-rescoring
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import k2
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from asr_datamodule import LibriSpeechAsrDataModule
|
||||||
|
from finetune_ctc import add_model_arguments, get_model, get_params
|
||||||
|
|
||||||
|
from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
|
||||||
|
from icefall.checkpoint import (
|
||||||
|
average_checkpoints,
|
||||||
|
average_checkpoints_with_averaged_model,
|
||||||
|
find_checkpoints,
|
||||||
|
load_checkpoint,
|
||||||
|
)
|
||||||
|
from icefall.decode import (
|
||||||
|
get_lattice,
|
||||||
|
nbest_decoding,
|
||||||
|
nbest_oracle,
|
||||||
|
one_best_decoding,
|
||||||
|
rescore_with_n_best_list,
|
||||||
|
rescore_with_whole_lattice,
|
||||||
|
)
|
||||||
|
from icefall.lexicon import Lexicon
|
||||||
|
from icefall.utils import (
|
||||||
|
AttributeDict,
|
||||||
|
get_texts,
|
||||||
|
setup_logger,
|
||||||
|
store_transcripts,
|
||||||
|
str2bool,
|
||||||
|
write_error_stats,
|
||||||
|
)
|
||||||
|
|
||||||
|
LOG_EPS = math.log(1e-10)
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--epoch",
|
||||||
|
type=int,
|
||||||
|
default=30,
|
||||||
|
help="""It specifies the checkpoint to use for decoding.
|
||||||
|
Note: Epoch counts from 1.
|
||||||
|
You can specify --avg to use more checkpoints for model averaging.""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--iter",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="""If positive, --epoch is ignored and it
|
||||||
|
will use the checkpoint exp_dir/checkpoint-iter.pt.
|
||||||
|
You can specify --avg to use more checkpoints for model averaging.
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--avg",
|
||||||
|
type=int,
|
||||||
|
default=15,
|
||||||
|
help="Number of checkpoints to average. Automatically select "
|
||||||
|
"consecutive checkpoints before the checkpoint specified by "
|
||||||
|
"'--epoch' and '--iter'",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--use-averaged-model",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="Whether to load averaged model. Currently it only supports "
|
||||||
|
"using --epoch. If True, it would decode with the averaged model "
|
||||||
|
"over the epoch range from `epoch-avg` (excluded) to `epoch`."
|
||||||
|
"Actually only the models with epoch number of `epoch-avg` and "
|
||||||
|
"`epoch` are loaded for averaging. ",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--exp-dir",
|
||||||
|
type=str,
|
||||||
|
default="zipformer/exp",
|
||||||
|
help="The experiment dir",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--lang-dir",
|
||||||
|
type=str,
|
||||||
|
default="data/lang_char",
|
||||||
|
help="""The lang dir
|
||||||
|
It contains language related input files such as
|
||||||
|
"lexicon.txt"
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--context-size",
|
||||||
|
type=int,
|
||||||
|
default=2,
|
||||||
|
help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--decoding-method",
|
||||||
|
type=str,
|
||||||
|
default="ctc-decoding",
|
||||||
|
help="""Decoding method.
|
||||||
|
Supported values are:
|
||||||
|
- (1) ctc-decoding. Use CTC decoding. It uses a sentence piece
|
||||||
|
model, i.e., lang_dir/bpe.model, to convert word pieces to words.
|
||||||
|
It needs neither a lexicon nor an n-gram LM.
|
||||||
|
- (2) 1best. Extract the best path from the decoding lattice as the
|
||||||
|
decoding result.
|
||||||
|
- (3) nbest. Extract n paths from the decoding lattice; the path
|
||||||
|
with the highest score is the decoding result.
|
||||||
|
- (4) nbest-rescoring. Extract n paths from the decoding lattice,
|
||||||
|
rescore them with an n-gram LM (e.g., a 4-gram LM), the path with
|
||||||
|
the highest score is the decoding result.
|
||||||
|
- (5) whole-lattice-rescoring. Rescore the decoding lattice with an
|
||||||
|
n-gram LM (e.g., a 4-gram LM), the best path of rescored lattice
|
||||||
|
is the decoding result.
|
||||||
|
you have trained an RNN LM using ./rnn_lm/train.py
|
||||||
|
- (6) nbest-oracle. Its WER is the lower bound of any n-best
|
||||||
|
rescoring method can achieve. Useful for debugging n-best
|
||||||
|
rescoring method.
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--num-paths",
|
||||||
|
type=int,
|
||||||
|
default=100,
|
||||||
|
help="""Number of paths for n-best based decoding method.
|
||||||
|
Used only when "method" is one of the following values:
|
||||||
|
nbest, nbest-rescoring, and nbest-oracle
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--nbest-scale",
|
||||||
|
type=float,
|
||||||
|
default=1.0,
|
||||||
|
help="""The scale to be applied to `lattice.scores`.
|
||||||
|
It's needed if you use any kinds of n-best based rescoring.
|
||||||
|
Used only when "method" is one of the following values:
|
||||||
|
nbest, nbest-rescoring, and nbest-oracle
|
||||||
|
A smaller value results in more unique paths.
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--hlg-scale",
|
||||||
|
type=float,
|
||||||
|
default=0.6,
|
||||||
|
help="""The scale to be applied to `hlg.scores`.
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--lm-dir",
|
||||||
|
type=str,
|
||||||
|
default="data/lm",
|
||||||
|
help="""The n-gram LM dir.
|
||||||
|
It should contain either G_4_gram.pt or G_4_gram.fst.txt
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
add_model_arguments(parser)
|
||||||
|
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def get_decoding_params() -> AttributeDict:
|
||||||
|
"""Parameters for decoding."""
|
||||||
|
params = AttributeDict(
|
||||||
|
{
|
||||||
|
"frame_shift_ms": 10,
|
||||||
|
"search_beam": 20,
|
||||||
|
"output_beam": 8,
|
||||||
|
"min_active_states": 30,
|
||||||
|
"max_active_states": 10000,
|
||||||
|
"use_double_scores": True,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return params
|
||||||
|
|
||||||
|
|
||||||
|
def decode_one_batch(
|
||||||
|
params: AttributeDict,
|
||||||
|
model: nn.Module,
|
||||||
|
HLG: Optional[k2.Fsa],
|
||||||
|
H: Optional[k2.Fsa],
|
||||||
|
lexicon: Lexicon,
|
||||||
|
graph_compiler: CharCtcTrainingGraphCompiler,
|
||||||
|
batch: dict,
|
||||||
|
word_table: k2.SymbolTable,
|
||||||
|
G: Optional[k2.Fsa] = None,
|
||||||
|
) -> Dict[str, List[List[str]]]:
|
||||||
|
"""Decode one batch and return the result in a dict. The dict has the
|
||||||
|
following format:
|
||||||
|
- key: It indicates the setting used for decoding. For example,
|
||||||
|
if no rescoring is used, the key is the string `no_rescore`.
|
||||||
|
If LM rescoring is used, the key is the string `lm_scale_xxx`,
|
||||||
|
where `xxx` is the value of `lm_scale`. An example key is
|
||||||
|
`lm_scale_0.7`
|
||||||
|
- value: It contains the decoding result. `len(value)` equals to
|
||||||
|
batch size. `value[i]` is the decoding result for the i-th
|
||||||
|
utterance in the given batch.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
params:
|
||||||
|
It's the return value of :func:`get_params`.
|
||||||
|
|
||||||
|
- params.decoding_method is "1best", it uses 1best decoding without LM rescoring.
|
||||||
|
- params.decoding_method is "nbest", it uses nbest decoding without LM rescoring.
|
||||||
|
- params.decoding_method is "nbest-rescoring", it uses nbest LM rescoring.
|
||||||
|
- params.decoding_method is "whole-lattice-rescoring", it uses whole lattice LM
|
||||||
|
rescoring.
|
||||||
|
|
||||||
|
model:
|
||||||
|
The neural model.
|
||||||
|
HLG:
|
||||||
|
The decoding graph. Used only when params.decoding_method is NOT ctc-decoding.
|
||||||
|
H:
|
||||||
|
The ctc topo. Used only when params.decoding_method is ctc-decoding.
|
||||||
|
batch:
|
||||||
|
It is the return value from iterating
|
||||||
|
`lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
|
||||||
|
for the format of the `batch`.
|
||||||
|
word_table:
|
||||||
|
The word symbol table.
|
||||||
|
G:
|
||||||
|
An LM. It is not None when params.decoding_method is "nbest-rescoring"
|
||||||
|
or "whole-lattice-rescoring". In general, the G in HLG
|
||||||
|
is a 3-gram LM, while this G is a 4-gram LM.
|
||||||
|
Returns:
|
||||||
|
Return the decoding result. See above description for the format of
|
||||||
|
the returned dict. Note: If it decodes to nothing, then return None.
|
||||||
|
"""
|
||||||
|
if HLG is not None:
|
||||||
|
device = HLG.device
|
||||||
|
else:
|
||||||
|
device = H.device
|
||||||
|
|
||||||
|
audio = batch["audio"].to(device)
|
||||||
|
padding_mask = batch["padding_mask"].to(device)
|
||||||
|
encoder_out, encoder_out_lens = model.forward_encoder(audio, padding_mask)
|
||||||
|
ctc_output = model.ctc_output(encoder_out)
|
||||||
|
|
||||||
|
num_frames = encoder_out_lens.cpu()
|
||||||
|
|
||||||
|
supervision_segments = torch.stack(
|
||||||
|
(
|
||||||
|
torch.arange(audio.shape[0], dtype=torch.int32),
|
||||||
|
torch.zeros_like(num_frames, dtype=torch.int32),
|
||||||
|
num_frames,
|
||||||
|
),
|
||||||
|
1,
|
||||||
|
).to(torch.int32)
|
||||||
|
|
||||||
|
if H is None:
|
||||||
|
assert HLG is not None
|
||||||
|
decoding_graph = HLG
|
||||||
|
else:
|
||||||
|
assert HLG is None
|
||||||
|
decoding_graph = H
|
||||||
|
|
||||||
|
lattice = get_lattice(
|
||||||
|
nnet_output=ctc_output,
|
||||||
|
decoding_graph=decoding_graph,
|
||||||
|
supervision_segments=supervision_segments,
|
||||||
|
search_beam=params.search_beam,
|
||||||
|
output_beam=params.output_beam,
|
||||||
|
min_active_states=params.min_active_states,
|
||||||
|
max_active_states=params.max_active_states,
|
||||||
|
)
|
||||||
|
|
||||||
|
if params.decoding_method == "ctc-decoding":
|
||||||
|
best_path = one_best_decoding(
|
||||||
|
lattice=lattice, use_double_scores=params.use_double_scores
|
||||||
|
)
|
||||||
|
# Note: `best_path.aux_labels` contains token IDs, not word IDs
|
||||||
|
# since we are using H, not HLG here.
|
||||||
|
#
|
||||||
|
# token_ids is a lit-of-list of IDs
|
||||||
|
token_ids = get_texts(best_path)
|
||||||
|
|
||||||
|
# hyps is a list of str, e.g., ['xxx yyy zzz', ...]
|
||||||
|
hyps = [
|
||||||
|
"".join(lexicon.token_table[idx].replace("|", " ") for idx in token_id)
|
||||||
|
for token_id in token_ids
|
||||||
|
]
|
||||||
|
|
||||||
|
# hyps is a list of list of str, e.g., [['xxx', 'yyy', 'zzz'], ... ]
|
||||||
|
hyps = [s.split() for s in hyps]
|
||||||
|
key = "ctc-decoding"
|
||||||
|
return {key: hyps}
|
||||||
|
|
||||||
|
if params.decoding_method == "nbest-oracle":
|
||||||
|
# Note: You can also pass rescored lattices to it.
|
||||||
|
# We choose the HLG decoded lattice for speed reasons
|
||||||
|
# as HLG decoding is faster and the oracle WER
|
||||||
|
# is only slightly worse than that of rescored lattices.
|
||||||
|
best_path = nbest_oracle(
|
||||||
|
lattice=lattice,
|
||||||
|
num_paths=params.num_paths,
|
||||||
|
ref_texts=batch["supervisions"]["text"],
|
||||||
|
word_table=word_table,
|
||||||
|
nbest_scale=params.nbest_scale,
|
||||||
|
oov="<UNK>",
|
||||||
|
)
|
||||||
|
hyps = get_texts(best_path)
|
||||||
|
hyps = [[word_table[i] for i in ids] for ids in hyps]
|
||||||
|
key = f"oracle_{params.num_paths}_nbest_scale_{params.nbest_scale}" # noqa
|
||||||
|
return {key: hyps}
|
||||||
|
|
||||||
|
if params.decoding_method in ["1best", "nbest"]:
|
||||||
|
if params.decoding_method == "1best":
|
||||||
|
best_path = one_best_decoding(
|
||||||
|
lattice=lattice, use_double_scores=params.use_double_scores
|
||||||
|
)
|
||||||
|
key = "no_rescore"
|
||||||
|
else:
|
||||||
|
best_path = nbest_decoding(
|
||||||
|
lattice=lattice,
|
||||||
|
num_paths=params.num_paths,
|
||||||
|
use_double_scores=params.use_double_scores,
|
||||||
|
nbest_scale=params.nbest_scale,
|
||||||
|
)
|
||||||
|
key = f"no_rescore-nbest-scale-{params.nbest_scale}-{params.num_paths}" # noqa
|
||||||
|
|
||||||
|
hyps = get_texts(best_path)
|
||||||
|
hyps = [[word_table[i] for i in ids] for ids in hyps]
|
||||||
|
return {key: hyps}
|
||||||
|
|
||||||
|
assert params.decoding_method in [
|
||||||
|
"nbest-rescoring",
|
||||||
|
"whole-lattice-rescoring",
|
||||||
|
]
|
||||||
|
|
||||||
|
lm_scale_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
|
||||||
|
lm_scale_list += [0.8, 0.9, 1.0, 1.1, 1.2, 1.3]
|
||||||
|
lm_scale_list += [1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
|
||||||
|
|
||||||
|
if params.decoding_method == "nbest-rescoring":
|
||||||
|
best_path_dict = rescore_with_n_best_list(
|
||||||
|
lattice=lattice,
|
||||||
|
G=G,
|
||||||
|
num_paths=params.num_paths,
|
||||||
|
lm_scale_list=lm_scale_list,
|
||||||
|
nbest_scale=params.nbest_scale,
|
||||||
|
)
|
||||||
|
elif params.decoding_method == "whole-lattice-rescoring":
|
||||||
|
best_path_dict = rescore_with_whole_lattice(
|
||||||
|
lattice=lattice,
|
||||||
|
G_with_epsilon_loops=G,
|
||||||
|
lm_scale_list=lm_scale_list,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
assert False, f"Unsupported decoding method: {params.decoding_method}"
|
||||||
|
|
||||||
|
ans = dict()
|
||||||
|
if best_path_dict is not None:
|
||||||
|
for lm_scale_str, best_path in best_path_dict.items():
|
||||||
|
hyps = get_texts(best_path)
|
||||||
|
hyps = [[word_table[i] for i in ids] for ids in hyps]
|
||||||
|
ans[lm_scale_str] = hyps
|
||||||
|
else:
|
||||||
|
ans = None
|
||||||
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
def decode_dataset(
|
||||||
|
dl: torch.utils.data.DataLoader,
|
||||||
|
params: AttributeDict,
|
||||||
|
model: nn.Module,
|
||||||
|
HLG: Optional[k2.Fsa],
|
||||||
|
H: Optional[k2.Fsa],
|
||||||
|
lexicon: Lexicon,
|
||||||
|
graph_compiler: CharCtcTrainingGraphCompiler,
|
||||||
|
word_table: k2.SymbolTable,
|
||||||
|
G: Optional[k2.Fsa] = None,
|
||||||
|
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
|
||||||
|
"""Decode dataset.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
dl:
|
||||||
|
PyTorch's dataloader containing the dataset to decode.
|
||||||
|
params:
|
||||||
|
It is returned by :func:`get_params`.
|
||||||
|
model:
|
||||||
|
The neural model.
|
||||||
|
HLG:
|
||||||
|
The decoding graph. Used only when params.decoding_method is NOT ctc-decoding.
|
||||||
|
H:
|
||||||
|
The ctc topo. Used only when params.decoding_method is ctc-decoding.
|
||||||
|
word_table:
|
||||||
|
It is the word symbol table.
|
||||||
|
G:
|
||||||
|
An LM. It is not None when params.decoding_method is "nbest-rescoring"
|
||||||
|
or "whole-lattice-rescoring". In general, the G in HLG
|
||||||
|
is a 3-gram LM, while this G is a 4-gram LM.
|
||||||
|
Returns:
|
||||||
|
Return a dict, whose key may be "no-rescore" if no LM rescoring
|
||||||
|
is used, or it may be "lm_scale_0.7" if LM rescoring is used.
|
||||||
|
Its value is a list of tuples. Each tuple contains two elements:
|
||||||
|
The first is the reference transcript, and the second is the
|
||||||
|
predicted result.
|
||||||
|
"""
|
||||||
|
num_cuts = 0
|
||||||
|
|
||||||
|
try:
|
||||||
|
num_batches = len(dl)
|
||||||
|
except TypeError:
|
||||||
|
num_batches = "?"
|
||||||
|
|
||||||
|
results = defaultdict(list)
|
||||||
|
for batch_idx, batch in enumerate(dl):
|
||||||
|
texts = batch["supervisions"]["text"]
|
||||||
|
cut_ids = [cut.id for cut in batch["cuts"]]
|
||||||
|
|
||||||
|
hyps_dict = decode_one_batch(
|
||||||
|
params=params,
|
||||||
|
model=model,
|
||||||
|
HLG=HLG,
|
||||||
|
H=H,
|
||||||
|
lexicon=lexicon,
|
||||||
|
graph_compiler=graph_compiler,
|
||||||
|
batch=batch,
|
||||||
|
word_table=word_table,
|
||||||
|
G=G,
|
||||||
|
)
|
||||||
|
|
||||||
|
for name, hyps in hyps_dict.items():
|
||||||
|
this_batch = []
|
||||||
|
assert len(hyps) == len(texts)
|
||||||
|
for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
|
||||||
|
ref_words = ref_text.split()
|
||||||
|
this_batch.append((cut_id, ref_words, hyp_words))
|
||||||
|
|
||||||
|
results[name].extend(this_batch)
|
||||||
|
|
||||||
|
num_cuts += len(texts)
|
||||||
|
|
||||||
|
if batch_idx % 100 == 0:
|
||||||
|
batch_str = f"{batch_idx}/{num_batches}"
|
||||||
|
|
||||||
|
logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def save_results(
|
||||||
|
params: AttributeDict,
|
||||||
|
test_set_name: str,
|
||||||
|
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
|
||||||
|
):
|
||||||
|
test_set_wers = dict()
|
||||||
|
for key, results in results_dict.items():
|
||||||
|
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
|
||||||
|
results = sorted(results)
|
||||||
|
store_transcripts(filename=recog_path, texts=results)
|
||||||
|
logging.info(f"The transcripts are stored in {recog_path}")
|
||||||
|
|
||||||
|
# The following prints out WERs, per-word error statistics and aligned
|
||||||
|
# ref/hyp pairs.
|
||||||
|
errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
|
||||||
|
with open(errs_filename, "w") as f:
|
||||||
|
wer = write_error_stats(f, f"{test_set_name}-{key}", results)
|
||||||
|
test_set_wers[key] = wer
|
||||||
|
|
||||||
|
logging.info("Wrote detailed error stats to {}".format(errs_filename))
|
||||||
|
|
||||||
|
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
|
||||||
|
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
|
||||||
|
with open(errs_info, "w") as f:
|
||||||
|
print("settings\tWER", file=f)
|
||||||
|
for key, val in test_set_wers:
|
||||||
|
print("{}\t{}".format(key, val), file=f)
|
||||||
|
|
||||||
|
s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
|
||||||
|
note = "\tbest for {}".format(test_set_name)
|
||||||
|
for key, val in test_set_wers:
|
||||||
|
s += "{}\t{}{}\n".format(key, val, note)
|
||||||
|
note = ""
|
||||||
|
logging.info(s)
|
||||||
|
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def main():
|
||||||
|
parser = get_parser()
|
||||||
|
LibriSpeechAsrDataModule.add_arguments(parser)
|
||||||
|
args = parser.parse_args()
|
||||||
|
args.exp_dir = Path(args.exp_dir)
|
||||||
|
args.lang_dir = Path(args.lang_dir)
|
||||||
|
args.lm_dir = Path(args.lm_dir)
|
||||||
|
|
||||||
|
params = get_params()
|
||||||
|
# add decoding params
|
||||||
|
params.update(get_decoding_params())
|
||||||
|
params.update(vars(args))
|
||||||
|
|
||||||
|
assert params.decoding_method in (
|
||||||
|
"ctc-decoding",
|
||||||
|
"1best",
|
||||||
|
"nbest",
|
||||||
|
"nbest-rescoring",
|
||||||
|
"whole-lattice-rescoring",
|
||||||
|
"nbest-oracle",
|
||||||
|
)
|
||||||
|
params.res_dir = params.exp_dir / params.decoding_method
|
||||||
|
|
||||||
|
if params.iter > 0:
|
||||||
|
params.suffix = f"iter-{params.iter}-avg-{params.avg}"
|
||||||
|
else:
|
||||||
|
params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
|
||||||
|
|
||||||
|
if params.use_averaged_model:
|
||||||
|
params.suffix += "-use-averaged-model"
|
||||||
|
|
||||||
|
setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
|
||||||
|
logging.info("Decoding started")
|
||||||
|
|
||||||
|
device = torch.device("cpu")
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
device = torch.device("cuda", 0)
|
||||||
|
|
||||||
|
logging.info(f"Device: {device}")
|
||||||
|
logging.info(params)
|
||||||
|
|
||||||
|
lexicon = Lexicon(params.lang_dir)
|
||||||
|
max_token_id = max(lexicon.tokens)
|
||||||
|
|
||||||
|
graph_compiler = CharCtcTrainingGraphCompiler(
|
||||||
|
lexicon=lexicon,
|
||||||
|
device=device,
|
||||||
|
)
|
||||||
|
|
||||||
|
params.blank_id = lexicon.token_table["<blk>"]
|
||||||
|
params.vocab_size = max(lexicon.tokens) + 1
|
||||||
|
|
||||||
|
if params.decoding_method == "ctc-decoding":
|
||||||
|
HLG = None
|
||||||
|
H = k2.ctc_topo(
|
||||||
|
max_token=max_token_id,
|
||||||
|
modified=False,
|
||||||
|
device=device,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
H = None
|
||||||
|
HLG = k2.Fsa.from_dict(
|
||||||
|
torch.load(f"{params.lang_dir}/HLG.pt", map_location=device)
|
||||||
|
)
|
||||||
|
assert HLG.requires_grad is False
|
||||||
|
|
||||||
|
HLG.scores *= params.hlg_scale
|
||||||
|
if not hasattr(HLG, "lm_scores"):
|
||||||
|
HLG.lm_scores = HLG.scores.clone()
|
||||||
|
|
||||||
|
if params.decoding_method in (
|
||||||
|
"nbest-rescoring",
|
||||||
|
"whole-lattice-rescoring",
|
||||||
|
):
|
||||||
|
if not (params.lm_dir / "G_4_gram.pt").is_file():
|
||||||
|
logging.info("Loading G_4_gram.fst.txt")
|
||||||
|
logging.warning("It may take 8 minutes.")
|
||||||
|
with open(params.lm_dir / "G_4_gram.fst.txt") as f:
|
||||||
|
first_word_disambig_id = lexicon.word_table["#0"]
|
||||||
|
|
||||||
|
G = k2.Fsa.from_openfst(f.read(), acceptor=False)
|
||||||
|
# G.aux_labels is not needed in later computations, so
|
||||||
|
# remove it here.
|
||||||
|
del G.aux_labels
|
||||||
|
# CAUTION: The following line is crucial.
|
||||||
|
# Arcs entering the back-off state have label equal to #0.
|
||||||
|
# We have to change it to 0 here.
|
||||||
|
G.labels[G.labels >= first_word_disambig_id] = 0
|
||||||
|
# See https://github.com/k2-fsa/k2/issues/874
|
||||||
|
# for why we need to set G.properties to None
|
||||||
|
G.__dict__["_properties"] = None
|
||||||
|
G = k2.Fsa.from_fsas([G]).to(device)
|
||||||
|
G = k2.arc_sort(G)
|
||||||
|
# Save a dummy value so that it can be loaded in C++.
|
||||||
|
# See https://github.com/pytorch/pytorch/issues/67902
|
||||||
|
# for why we need to do this.
|
||||||
|
G.dummy = 1
|
||||||
|
|
||||||
|
torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt")
|
||||||
|
else:
|
||||||
|
logging.info("Loading pre-compiled G_4_gram.pt")
|
||||||
|
d = torch.load(params.lm_dir / "G_4_gram.pt", map_location=device)
|
||||||
|
G = k2.Fsa.from_dict(d)
|
||||||
|
|
||||||
|
if params.decoding_method == "whole-lattice-rescoring":
|
||||||
|
# Add epsilon self-loops to G as we will compose
|
||||||
|
# it with the whole lattice later
|
||||||
|
G = k2.add_epsilon_self_loops(G)
|
||||||
|
G = k2.arc_sort(G)
|
||||||
|
G = G.to(device)
|
||||||
|
|
||||||
|
# G.lm_scores is used to replace HLG.lm_scores during
|
||||||
|
# LM rescoring.
|
||||||
|
G.lm_scores = G.scores.clone()
|
||||||
|
else:
|
||||||
|
G = None
|
||||||
|
|
||||||
|
logging.info("About to create model")
|
||||||
|
model = get_model(params)
|
||||||
|
|
||||||
|
if not params.use_averaged_model:
|
||||||
|
if params.iter > 0:
|
||||||
|
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
|
||||||
|
: params.avg
|
||||||
|
]
|
||||||
|
if len(filenames) == 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"No checkpoints found for"
|
||||||
|
f" --iter {params.iter}, --avg {params.avg}"
|
||||||
|
)
|
||||||
|
elif len(filenames) < params.avg:
|
||||||
|
raise ValueError(
|
||||||
|
f"Not enough checkpoints ({len(filenames)}) found for"
|
||||||
|
f" --iter {params.iter}, --avg {params.avg}"
|
||||||
|
)
|
||||||
|
logging.info(f"averaging {filenames}")
|
||||||
|
model.to(device)
|
||||||
|
model.load_state_dict(average_checkpoints(filenames, device=device))
|
||||||
|
elif params.avg == 1:
|
||||||
|
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
|
||||||
|
else:
|
||||||
|
start = params.epoch - params.avg + 1
|
||||||
|
filenames = []
|
||||||
|
for i in range(start, params.epoch + 1):
|
||||||
|
if i >= 1:
|
||||||
|
filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
|
||||||
|
logging.info(f"averaging {filenames}")
|
||||||
|
model.to(device)
|
||||||
|
model.load_state_dict(average_checkpoints(filenames, device=device))
|
||||||
|
else:
|
||||||
|
if params.iter > 0:
|
||||||
|
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
|
||||||
|
: params.avg + 1
|
||||||
|
]
|
||||||
|
if len(filenames) == 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"No checkpoints found for"
|
||||||
|
f" --iter {params.iter}, --avg {params.avg}"
|
||||||
|
)
|
||||||
|
elif len(filenames) < params.avg + 1:
|
||||||
|
raise ValueError(
|
||||||
|
f"Not enough checkpoints ({len(filenames)}) found for"
|
||||||
|
f" --iter {params.iter}, --avg {params.avg}"
|
||||||
|
)
|
||||||
|
filename_start = filenames[-1]
|
||||||
|
filename_end = filenames[0]
|
||||||
|
logging.info(
|
||||||
|
"Calculating the averaged model over iteration checkpoints"
|
||||||
|
f" from {filename_start} (excluded) to {filename_end}"
|
||||||
|
)
|
||||||
|
model.to(device)
|
||||||
|
model.load_state_dict(
|
||||||
|
average_checkpoints_with_averaged_model(
|
||||||
|
filename_start=filename_start,
|
||||||
|
filename_end=filename_end,
|
||||||
|
device=device,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
assert params.avg > 0, params.avg
|
||||||
|
start = params.epoch - params.avg
|
||||||
|
assert start >= 1, start
|
||||||
|
filename_start = f"{params.exp_dir}/epoch-{start}.pt"
|
||||||
|
filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
|
||||||
|
logging.info(
|
||||||
|
f"Calculating the averaged model over epoch range from "
|
||||||
|
f"{start} (excluded) to {params.epoch}"
|
||||||
|
)
|
||||||
|
model.to(device)
|
||||||
|
model.load_state_dict(
|
||||||
|
average_checkpoints_with_averaged_model(
|
||||||
|
filename_start=filename_start,
|
||||||
|
filename_end=filename_end,
|
||||||
|
device=device,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
model.to(device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
num_param = sum([p.numel() for p in model.parameters()])
|
||||||
|
logging.info(f"Number of model parameters: {num_param}")
|
||||||
|
|
||||||
|
# we need cut ids to display recognition results.
|
||||||
|
args.return_cuts = True
|
||||||
|
librispeech = LibriSpeechAsrDataModule(args)
|
||||||
|
|
||||||
|
dev_clean_cuts = librispeech.dev_clean_cuts()
|
||||||
|
dev_other_cuts = librispeech.dev_other_cuts()
|
||||||
|
|
||||||
|
dev_clean_dl = librispeech.test_dataloaders(
|
||||||
|
dev_clean_cuts,
|
||||||
|
do_normalize=params.do_normalize,
|
||||||
|
)
|
||||||
|
dev_other_dl = librispeech.test_dataloaders(
|
||||||
|
dev_other_cuts,
|
||||||
|
do_normalize=params.do_normalize,
|
||||||
|
)
|
||||||
|
|
||||||
|
test_sets = ["dev-clean", "dev-other"]
|
||||||
|
test_dl = [dev_clean_dl, dev_other_dl]
|
||||||
|
|
||||||
|
# test_clean_cuts = librispeech.test_clean_cuts()
|
||||||
|
# test_other_cuts = librispeech.test_other_cuts()
|
||||||
|
|
||||||
|
# test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
|
||||||
|
# test_other_dl = librispeech.test_dataloaders(test_other_cuts)
|
||||||
|
|
||||||
|
# test_sets = ["test-clean", "test-other"]
|
||||||
|
# test_dl = [test_clean_dl, test_other_dl]
|
||||||
|
|
||||||
|
for test_set, test_dl in zip(test_sets, test_dl):
|
||||||
|
results_dict = decode_dataset(
|
||||||
|
dl=test_dl,
|
||||||
|
params=params,
|
||||||
|
model=model,
|
||||||
|
HLG=HLG,
|
||||||
|
H=H,
|
||||||
|
lexicon=lexicon,
|
||||||
|
graph_compiler=graph_compiler,
|
||||||
|
word_table=lexicon.word_table,
|
||||||
|
G=G,
|
||||||
|
)
|
||||||
|
|
||||||
|
save_results(
|
||||||
|
params=params,
|
||||||
|
test_set_name=test_set,
|
||||||
|
results_dict=results_dict,
|
||||||
|
)
|
||||||
|
|
||||||
|
logging.info("Done!")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
1
egs/librispeech/SSL/zipformer_ctc/dataset.py
Symbolic link
1
egs/librispeech/SSL/zipformer_ctc/dataset.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../zipformer/dataset.py
|
1
egs/librispeech/SSL/zipformer_ctc/encoder_interface.py
Symbolic link
1
egs/librispeech/SSL/zipformer_ctc/encoder_interface.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../zipformer/encoder_interface.py
|
1399
egs/librispeech/SSL/zipformer_ctc/finetune_ctc.py
Executable file
1399
egs/librispeech/SSL/zipformer_ctc/finetune_ctc.py
Executable file
File diff suppressed because it is too large
Load Diff
1
egs/librispeech/SSL/zipformer_ctc/hubert_ce.py
Symbolic link
1
egs/librispeech/SSL/zipformer_ctc/hubert_ce.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../zipformer/hubert_ce.py
|
153
egs/librispeech/SSL/zipformer_ctc/model.py
Normal file
153
egs/librispeech/SSL/zipformer_ctc/model.py
Normal file
@ -0,0 +1,153 @@
|
|||||||
|
# Copyright 2021-2024 Xiaomi Corp. (authors: Fangjun Kuang,
|
||||||
|
# Wei Kang,
|
||||||
|
# Zengwei Yao,
|
||||||
|
# Yifan Yang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
import k2
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from scaling import ScaledLinear
|
||||||
|
|
||||||
|
from icefall.utils import add_sos
|
||||||
|
|
||||||
|
|
||||||
|
class AsrModel(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
encoder,
|
||||||
|
encoder_dim: int = 768,
|
||||||
|
vocab_size: int = 500,
|
||||||
|
):
|
||||||
|
"""CTC ASR model.
|
||||||
|
|
||||||
|
- Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks (http://imagine.enpc.fr/~obozinsg/teaching/mva_gm/papers/ctc.pdf)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
encoder:
|
||||||
|
It is the transcription network in the paper. Its accepts
|
||||||
|
inputs: `x` of (N, T, encoder_dim).
|
||||||
|
It returns two tensors: `logits` of shape (N, T, encoder_dim) and
|
||||||
|
`logit_lens` of shape (N,).
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self.encoder = encoder
|
||||||
|
|
||||||
|
# Modules for CTC head
|
||||||
|
self.ctc_output = nn.Sequential(
|
||||||
|
nn.Dropout(p=0.1),
|
||||||
|
nn.Linear(encoder_dim, vocab_size),
|
||||||
|
nn.LogSoftmax(dim=-1),
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward_encoder(
|
||||||
|
self,
|
||||||
|
x: torch.Tensor,
|
||||||
|
padding_mask: Optional[torch.Tensor] = None,
|
||||||
|
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
"""Compute encoder outputs.
|
||||||
|
Args:
|
||||||
|
x:
|
||||||
|
A 2-D tensor of shape (N, T).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
encoder_out:
|
||||||
|
Encoder output, of shape (N, T, C).
|
||||||
|
encoder_out_lens:
|
||||||
|
Encoder output lengths, of shape (N,).
|
||||||
|
"""
|
||||||
|
if padding_mask is None:
|
||||||
|
padding_mask = torch.zeros_like(x, dtype=torch.bool)
|
||||||
|
|
||||||
|
encoder_out, padding_mask = self.encoder.extract_features(
|
||||||
|
source=x,
|
||||||
|
padding_mask=padding_mask,
|
||||||
|
mask=self.encoder.training,
|
||||||
|
)
|
||||||
|
encoder_out_lens = torch.sum(~padding_mask, dim=1)
|
||||||
|
assert torch.all(encoder_out_lens > 0), encoder_out_lens
|
||||||
|
|
||||||
|
return encoder_out, encoder_out_lens
|
||||||
|
|
||||||
|
def forward_ctc(
|
||||||
|
self,
|
||||||
|
encoder_out: torch.Tensor,
|
||||||
|
encoder_out_lens: torch.Tensor,
|
||||||
|
targets: torch.Tensor,
|
||||||
|
target_lengths: torch.Tensor,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
"""Compute CTC loss.
|
||||||
|
Args:
|
||||||
|
encoder_out:
|
||||||
|
Encoder output, of shape (N, T, C).
|
||||||
|
encoder_out_lens:
|
||||||
|
Encoder output lengths, of shape (N,).
|
||||||
|
targets:
|
||||||
|
Target Tensor of shape (sum(target_lengths)). The targets are assumed
|
||||||
|
to be un-padded and concatenated within 1 dimension.
|
||||||
|
"""
|
||||||
|
# Compute CTC log-prob
|
||||||
|
ctc_output = self.ctc_output(encoder_out) # (N, T, C)
|
||||||
|
|
||||||
|
ctc_loss = torch.nn.functional.ctc_loss(
|
||||||
|
log_probs=ctc_output.permute(1, 0, 2), # (T, N, C)
|
||||||
|
targets=targets.cpu(),
|
||||||
|
input_lengths=encoder_out_lens.cpu(),
|
||||||
|
target_lengths=target_lengths.cpu(),
|
||||||
|
reduction="sum",
|
||||||
|
)
|
||||||
|
return ctc_loss
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
x: torch.Tensor,
|
||||||
|
y: k2.RaggedTensor,
|
||||||
|
padding_mask: Optional[torch.Tensor] = None,
|
||||||
|
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
x:
|
||||||
|
A 2-D tensor of shape (N, T).
|
||||||
|
y:
|
||||||
|
A ragged tensor with 2 axes [utt][label]. It contains labels of each
|
||||||
|
utterance.
|
||||||
|
Returns:
|
||||||
|
Return the CTC loss,
|
||||||
|
"""
|
||||||
|
assert x.ndim == 2, x.shape
|
||||||
|
assert y.num_axes == 2, y.num_axes
|
||||||
|
|
||||||
|
assert x.size(0) == y.dim0, (x.shape, y.dim0)
|
||||||
|
|
||||||
|
# Compute encoder outputs
|
||||||
|
encoder_out, encoder_out_lens = self.forward_encoder(x, padding_mask)
|
||||||
|
|
||||||
|
row_splits = y.shape.row_splits(1)
|
||||||
|
y_lens = row_splits[1:] - row_splits[:-1]
|
||||||
|
|
||||||
|
# Compute CTC loss
|
||||||
|
targets = y.values
|
||||||
|
ctc_loss = self.forward_ctc(
|
||||||
|
encoder_out=encoder_out,
|
||||||
|
encoder_out_lens=encoder_out_lens,
|
||||||
|
targets=targets,
|
||||||
|
target_lengths=y_lens,
|
||||||
|
)
|
||||||
|
|
||||||
|
return ctc_loss, encoder_out_lens
|
1
egs/librispeech/SSL/zipformer_ctc/optim.py
Symbolic link
1
egs/librispeech/SSL/zipformer_ctc/optim.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../zipformer/optim.py
|
1
egs/librispeech/SSL/zipformer_ctc/scaling.py
Symbolic link
1
egs/librispeech/SSL/zipformer_ctc/scaling.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../zipformer/scaling.py
|
1
egs/librispeech/SSL/zipformer_ctc/utils.py
Symbolic link
1
egs/librispeech/SSL/zipformer_ctc/utils.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../zipformer/utils.py
|
1
egs/librispeech/SSL/zipformer_ctc/wav2vec2_module.py
Symbolic link
1
egs/librispeech/SSL/zipformer_ctc/wav2vec2_module.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../zipformer/wav2vec2_module.py
|
1
egs/librispeech/SSL/zipformer_ctc/zipformer.py
Symbolic link
1
egs/librispeech/SSL/zipformer_ctc/zipformer.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../zipformer/zipformer.py
|
@ -70,12 +70,15 @@ class CharCtcTrainingGraphCompiler(object):
|
|||||||
Returns:
|
Returns:
|
||||||
Return a list-of-list of token IDs.
|
Return a list-of-list of token IDs.
|
||||||
"""
|
"""
|
||||||
assert sep in ("", "/"), sep
|
assert sep in ("", "/", "|"), sep
|
||||||
ids: List[List[int]] = []
|
ids: List[List[int]] = []
|
||||||
whitespace = re.compile(r"([ \t])")
|
whitespace = re.compile(r"([ \t])")
|
||||||
for text in texts:
|
for text in texts:
|
||||||
if sep == "":
|
if sep == "":
|
||||||
text = re.sub(whitespace, "", text)
|
text = re.sub(whitespace, "", text)
|
||||||
|
elif sep == "|":
|
||||||
|
text = re.sub(r"\s+", " ", text)
|
||||||
|
text = re.sub(" ", "|", text)
|
||||||
else:
|
else:
|
||||||
text = text.split(sep)
|
text = text.split(sep)
|
||||||
sub_ids = [
|
sub_ids = [
|
||||||
|
Loading…
x
Reference in New Issue
Block a user