diff --git a/docs/source/recipes/aishell/conformer_ctc.rst b/docs/source/recipes/aishell/conformer_ctc.rst index 2dcf0c728..75a2a8eca 100644 --- a/docs/source/recipes/aishell/conformer_ctc.rst +++ b/docs/source/recipes/aishell/conformer_ctc.rst @@ -1,4 +1,4 @@ -Confromer CTC +Conformer CTC ============= This tutorial shows you how to run a conformer ctc model diff --git a/egs/aishell/ASR/local/compile_hlg.py b/egs/aishell/ASR/local/compile_hlg.py deleted file mode 100755 index 098d5d6a3..000000000 --- a/egs/aishell/ASR/local/compile_hlg.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script takes as input lang_dir and generates HLG from - - - H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt - - L, the lexicon, built from lang_dir/L_disambig.pt - - Caution: We use a lexicon that contains disambiguation symbols - - - G, the LM, built from data/lm/G_3_gram.fst.txt - -The generated HLG is saved in $lang_dir/HLG.pt -""" -import argparse -import logging -from pathlib import Path - -import k2 -import torch - -from icefall.lexicon import Lexicon - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--lang-dir", - type=str, - help="""Input and output directory. - """, - ) - - return parser.parse_args() - - -def compile_HLG(lang_dir: str) -> k2.Fsa: - """ - Args: - lang_dir: - The language directory, e.g., data/lang_phone or data/lang_bpe_5000. - - Return: - An FSA representing HLG. - """ - lexicon = Lexicon(lang_dir) - max_token_id = max(lexicon.tokens) - logging.info(f"Building ctc_topo. max_token_id: {max_token_id}") - H = k2.ctc_topo(max_token_id) - L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt")) - - if Path("data/lm/G_3_gram.pt").is_file(): - logging.info("Loading pre-compiled G_3_gram") - d = torch.load("data/lm/G_3_gram.pt") - G = k2.Fsa.from_dict(d) - else: - logging.info("Loading G_3_gram.fst.txt") - with open("data/lm/G_3_gram.fst.txt") as f: - G = k2.Fsa.from_openfst(f.read(), acceptor=False) - torch.save(G.as_dict(), "data/lm/G_3_gram.pt") - - first_token_disambig_id = lexicon.token_table["#0"] - first_word_disambig_id = lexicon.word_table["#0"] - - L = k2.arc_sort(L) - G = k2.arc_sort(G) - - logging.info("Intersecting L and G") - LG = k2.compose(L, G) - logging.info(f"LG shape: {LG.shape}") - - logging.info("Connecting LG") - LG = k2.connect(LG) - logging.info(f"LG shape after k2.connect: {LG.shape}") - - logging.info(type(LG.aux_labels)) - logging.info("Determinizing LG") - - LG = k2.determinize(LG) - logging.info(type(LG.aux_labels)) - - logging.info("Connecting LG after k2.determinize") - LG = k2.connect(LG) - - logging.info("Removing disambiguation symbols on LG") - - LG.labels[LG.labels >= first_token_disambig_id] = 0 - - assert isinstance(LG.aux_labels, k2.RaggedTensor) - LG.aux_labels.values[LG.aux_labels.values >= first_word_disambig_id] = 0 - - LG = k2.remove_epsilon(LG) - logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}") - - LG = k2.connect(LG) - LG.aux_labels = LG.aux_labels.remove_values_eq(0) - - logging.info("Arc sorting LG") - LG = k2.arc_sort(LG) - - logging.info("Composing H and LG") - # CAUTION: The name of the inner_labels is fixed - # to `tokens`. If you want to change it, please - # also change other places in icefall that are using - # it. - HLG = k2.compose(H, LG, inner_labels="tokens") - - logging.info("Connecting LG") - HLG = k2.connect(HLG) - - logging.info("Arc sorting LG") - HLG = k2.arc_sort(HLG) - logging.info(f"HLG.shape: {HLG.shape}") - - return HLG - - -def main(): - args = get_args() - lang_dir = Path(args.lang_dir) - - if (lang_dir / "HLG.pt").is_file(): - logging.info(f"{lang_dir}/HLG.pt already exists - skipping") - return - - logging.info(f"Processing {lang_dir}") - - HLG = compile_HLG(lang_dir) - logging.info(f"Saving HLG.pt to {lang_dir}") - torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt") - - -if __name__ == "__main__": - formatter = ( - "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - ) - - logging.basicConfig(format=formatter, level=logging.INFO) - - main() diff --git a/egs/aishell/ASR/local/compile_hlg.py b/egs/aishell/ASR/local/compile_hlg.py new file mode 120000 index 000000000..471aa7fb4 --- /dev/null +++ b/egs/aishell/ASR/local/compile_hlg.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/compile_hlg.py \ No newline at end of file diff --git a/egs/aishell/ASR/local/compute_fbank_musan.py b/egs/aishell/ASR/local/compute_fbank_musan.py deleted file mode 100755 index e79bdafb1..000000000 --- a/egs/aishell/ASR/local/compute_fbank_musan.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This file computes fbank features of the musan dataset. -It looks for manifests in the directory data/manifests. - -The generated fbank features are saved in data/fbank. -""" - -import argparse -import logging -import os -from pathlib import Path - -import torch -from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine -from lhotse.recipes.utils import read_manifests_if_cached - -from icefall.utils import get_executor - -# Torch's multithreaded behavior needs to be disabled or -# it wastes a lot of CPU and slow things down. -# Do this outside of main() in case it needs to take effect -# even when we are not invoking the main (e.g. when spawning subprocesses). -torch.set_num_threads(1) -torch.set_num_interop_threads(1) - - -def compute_fbank_musan(num_mel_bins: int = 80): - src_dir = Path("data/manifests") - output_dir = Path("data/fbank") - num_jobs = min(15, os.cpu_count()) - - dataset_parts = ( - "music", - "speech", - "noise", - ) - manifests = read_manifests_if_cached( - dataset_parts=dataset_parts, output_dir=src_dir - ) - assert manifests is not None - - musan_cuts_path = output_dir / "cuts_musan.json.gz" - - if musan_cuts_path.is_file(): - logging.info(f"{musan_cuts_path} already exists - skipping") - return - - logging.info("Extracting features for Musan") - - extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins)) - - with get_executor() as ex: # Initialize the executor only once. - # create chunks of Musan with duration 5 - 10 seconds - musan_cuts = ( - CutSet.from_manifests( - recordings=combine( - part["recordings"] for part in manifests.values() - ) - ) - .cut_into_windows(10.0) - .filter(lambda c: c.duration > 5) - .compute_and_store_features( - extractor=extractor, - storage_path=f"{output_dir}/feats_musan", - num_jobs=num_jobs if ex is None else 80, - executor=ex, - storage_type=LilcomHdf5Writer, - ) - ) - musan_cuts.to_json(musan_cuts_path) - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--num-mel-bins", - type=int, - default=80, - help="""The number of mel bins for Fbank""", - ) - - return parser.parse_args() - - -if __name__ == "__main__": - formatter = ( - "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - ) - - logging.basicConfig(format=formatter, level=logging.INFO) - args = get_args() - compute_fbank_musan(num_mel_bins=args.num_mel_bins) diff --git a/egs/aishell/ASR/local/compute_fbank_musan.py b/egs/aishell/ASR/local/compute_fbank_musan.py new file mode 120000 index 000000000..5833f2484 --- /dev/null +++ b/egs/aishell/ASR/local/compute_fbank_musan.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/compute_fbank_musan.py \ No newline at end of file diff --git a/egs/aishell/ASR/local/convert_transcript_words_to_tokens.py b/egs/aishell/ASR/local/convert_transcript_words_to_tokens.py deleted file mode 100755 index 133499c8b..000000000 --- a/egs/aishell/ASR/local/convert_transcript_words_to_tokens.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang) -""" -Convert a transcript file containing words to a corpus file containing tokens -for LM training with the help of a lexicon. - -If the lexicon contains phones, the resulting LM will be a phone LM; If the -lexicon contains word pieces, the resulting LM will be a word piece LM. - -If a word has multiple pronunciations, the one that appears first in the lexicon -is kept; others are removed. - -If the input transcript is: - - hello zoo world hello - world zoo - foo zoo world hellO - -and if the lexicon is - - SPN - hello h e l l o 2 - hello h e l l o - world w o r l d - zoo z o o - -Then the output is - - h e l l o 2 z o o w o r l d h e l l o 2 - w o r l d z o o - SPN z o o w o r l d SPN -""" - -import argparse -from pathlib import Path -from typing import Dict, List - -from generate_unique_lexicon import filter_multiple_pronunications - -from icefall.lexicon import read_lexicon - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--transcript", - type=str, - help="The input transcript file." - "We assume that the transcript file consists of " - "lines. Each line consists of space separated words.", - ) - parser.add_argument("--lexicon", type=str, help="The input lexicon file.") - parser.add_argument( - "--oov", type=str, default="", help="The OOV word." - ) - - return parser.parse_args() - - -def process_line( - lexicon: Dict[str, List[str]], line: str, oov_token: str -) -> None: - """ - Args: - lexicon: - A dict containing pronunciations. Its keys are words and values - are pronunciations (i.e., tokens). - line: - A line of transcript consisting of space(s) separated words. - oov_token: - The pronunciation of the oov word if a word in `line` is not present - in the lexicon. - Returns: - Return None. - """ - s = "" - words = line.strip().split() - for i, w in enumerate(words): - tokens = lexicon.get(w, oov_token) - s += " ".join(tokens) - s += " " - print(s.strip()) - - -def main(): - args = get_args() - assert Path(args.lexicon).is_file() - assert Path(args.transcript).is_file() - assert len(args.oov) > 0 - - # Only the first pronunciation of a word is kept - lexicon = filter_multiple_pronunications(read_lexicon(args.lexicon)) - - lexicon = dict(lexicon) - - assert args.oov in lexicon - - oov_token = lexicon[args.oov] - - with open(args.transcript) as f: - for line in f: - process_line(lexicon=lexicon, line=line, oov_token=oov_token) - - -if __name__ == "__main__": - main() diff --git a/egs/aishell/ASR/local/convert_transcript_words_to_tokens.py b/egs/aishell/ASR/local/convert_transcript_words_to_tokens.py new file mode 120000 index 000000000..2ce13fd69 --- /dev/null +++ b/egs/aishell/ASR/local/convert_transcript_words_to_tokens.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/convert_transcript_words_to_tokens.py \ No newline at end of file diff --git a/egs/aishell/ASR/local/generate_unique_lexicon.py b/egs/aishell/ASR/local/generate_unique_lexicon.py deleted file mode 100755 index 566c0743d..000000000 --- a/egs/aishell/ASR/local/generate_unique_lexicon.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This file takes as input a lexicon.txt and output a new lexicon, -in which each word has a unique pronunciation. - -The way to do this is to keep only the first pronunciation of a word -in lexicon.txt. -""" - - -import argparse -import logging -from pathlib import Path -from typing import List, Tuple - -from icefall.lexicon import read_lexicon, write_lexicon - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--lang-dir", - type=str, - help="""Input and output directory. - It should contain a file lexicon.txt. - This file will generate a new file uniq_lexicon.txt - in it. - """, - ) - - return parser.parse_args() - - -def filter_multiple_pronunications( - lexicon: List[Tuple[str, List[str]]] -) -> List[Tuple[str, List[str]]]: - """Remove multiple pronunciations of words from a lexicon. - - If a word has more than one pronunciation in the lexicon, only - the first one is kept, while other pronunciations are removed - from the lexicon. - - Args: - lexicon: - The input lexicon, containing a list of (word, [p1, p2, ..., pn]), - where "p1, p2, ..., pn" are the pronunciations of the "word". - Returns: - Return a new lexicon where each word has a unique pronunciation. - """ - seen = set() - ans = [] - - for word, tokens in lexicon: - if word in seen: - continue - seen.add(word) - ans.append((word, tokens)) - return ans - - -def main(): - args = get_args() - lang_dir = Path(args.lang_dir) - - lexicon_filename = lang_dir / "lexicon.txt" - - in_lexicon = read_lexicon(lexicon_filename) - - out_lexicon = filter_multiple_pronunications(in_lexicon) - - write_lexicon(lang_dir / "uniq_lexicon.txt", out_lexicon) - - logging.info(f"Number of entries in lexicon.txt: {len(in_lexicon)}") - logging.info(f"Number of entries in uniq_lexicon.txt: {len(out_lexicon)}") - - -if __name__ == "__main__": - formatter = ( - "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - ) - - logging.basicConfig(format=formatter, level=logging.INFO) - - main() diff --git a/egs/aishell/ASR/local/generate_unique_lexicon.py b/egs/aishell/ASR/local/generate_unique_lexicon.py new file mode 120000 index 000000000..c0aea1403 --- /dev/null +++ b/egs/aishell/ASR/local/generate_unique_lexicon.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/generate_unique_lexicon.py \ No newline at end of file diff --git a/egs/timit/ASR/local/compute_fbank_musan.py b/egs/timit/ASR/local/compute_fbank_musan.py deleted file mode 100644 index d44524e70..000000000 --- a/egs/timit/ASR/local/compute_fbank_musan.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This file computes fbank features of the musan dataset. -It looks for manifests in the directory data/manifests. - -The generated fbank features are saved in data/fbank. -""" - -import logging -import os -from pathlib import Path - -import torch -from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine -from lhotse.recipes.utils import read_manifests_if_cached - -from icefall.utils import get_executor - -# Torch's multithreaded behavior needs to be disabled or -# it wastes a lot of CPU and slow things down. -# Do this outside of main() in case it needs to take effect -# even when we are not invoking the main (e.g. when spawning subprocesses). -torch.set_num_threads(1) -torch.set_num_interop_threads(1) - - -def compute_fbank_musan(): - src_dir = Path("data/manifests") - output_dir = Path("data/fbank") - num_jobs = min(15, os.cpu_count()) - num_mel_bins = 80 - - dataset_parts = ( - "music", - "speech", - "noise", - ) - manifests = read_manifests_if_cached( - dataset_parts=dataset_parts, output_dir=src_dir - ) - assert manifests is not None - - musan_cuts_path = output_dir / "cuts_musan.json.gz" - - if musan_cuts_path.is_file(): - logging.info(f"{musan_cuts_path} already exists - skipping") - return - - logging.info("Extracting features for Musan") - - extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins)) - - with get_executor() as ex: # Initialize the executor only once. - # create chunks of Musan with duration 5 - 10 seconds - musan_cuts = ( - CutSet.from_manifests( - recordings=combine( - part["recordings"] for part in manifests.values() - ) - ) - .cut_into_windows(10.0) - .filter(lambda c: c.duration > 5) - .compute_and_store_features( - extractor=extractor, - storage_path=f"{output_dir}/feats_musan", - num_jobs=num_jobs if ex is None else 80, - executor=ex, - storage_type=LilcomHdf5Writer, - ) - ) - musan_cuts.to_json(musan_cuts_path) - - -if __name__ == "__main__": - formatter = ( - "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - ) - - logging.basicConfig(format=formatter, level=logging.INFO) - compute_fbank_musan() diff --git a/egs/timit/ASR/local/compute_fbank_musan.py b/egs/timit/ASR/local/compute_fbank_musan.py new file mode 120000 index 000000000..5833f2484 --- /dev/null +++ b/egs/timit/ASR/local/compute_fbank_musan.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/compute_fbank_musan.py \ No newline at end of file diff --git a/egs/timit/ASR/shared b/egs/timit/ASR/shared deleted file mode 100644 index 4c5e91438..000000000 --- a/egs/timit/ASR/shared +++ /dev/null @@ -1 +0,0 @@ -../../../icefall/shared/ \ No newline at end of file diff --git a/egs/timit/ASR/shared b/egs/timit/ASR/shared new file mode 120000 index 000000000..4cbd91a7e --- /dev/null +++ b/egs/timit/ASR/shared @@ -0,0 +1 @@ +../../../icefall/shared \ No newline at end of file