mirror of https://github.com/k2-fsa/icefall.git (synced 2025-09-07 16:14:17 +00:00)
minor updates
This commit is contained in:
parent 1adf38179c
commit 7eb2ba7d0d
egs/multi_zh_en/ASR/local/prepare_char.py (new executable file, 259 lines)
@@ -0,0 +1,259 @@
#!/usr/bin/env python3
# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
#                                                  Wei Kang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
|
||||||
|
|
||||||
|
This script takes as input `lang_dir`, which should contain::
|
||||||
|
|
||||||
|
- lang_dir/text,
|
||||||
|
- lang_dir/words.txt
|
||||||
|
|
||||||
|
and generates the following files in the directory `lang_dir`:
|
||||||
|
|
||||||
|
- lexicon.txt
|
||||||
|
- lexicon_disambig.txt
|
||||||
|
- L.pt
|
||||||
|
- L_disambig.pt
|
||||||
|
- tokens.txt
|
||||||
|
"""
|
||||||
|
|
||||||
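
# Typical invocation (illustrative only; the exact lang_dir path depends on
# the recipe and is an assumption here):
#
#   ./local/prepare_char.py --lang-dir data/lang_char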

import argparse
import re
from pathlib import Path
from typing import Dict, List

import k2
import torch
from prepare_lang import (
    Lexicon,
    add_disambig_symbols,
    add_self_loops,
    write_lexicon,
    write_mapping,
)


def lexicon_to_fst_no_sil(
    lexicon: Lexicon,
    token2id: Dict[str, int],
    word2id: Dict[str, int],
    need_self_loops: bool = False,
) -> k2.Fsa:
    """Convert a lexicon to an FST (in k2 format).

    Args:
      lexicon:
        The input lexicon. See also :func:`read_lexicon`.
      token2id:
        A dict mapping tokens to IDs.
      word2id:
        A dict mapping words to IDs.
      need_self_loops:
        If True, add self-loops to states with non-epsilon output symbols
        on at least one arc out of the state. The input label for this
        self-loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
    Returns:
      Return an instance of `k2.Fsa` representing the given lexicon.
    """
    loop_state = 0  # words enter and leave from here
    next_state = 1  # the next un-allocated state, will be incremented as we go

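    # Each entry appended to `arcs` follows k2's text format for a transducer
    # arc: [src_state, dst_state, input label (token id), output label (word id), score].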
    arcs = []

    # The blank symbol <blk> is assigned ID 0 in generate_tokens() below
    assert token2id["<blk>"] == 0
    assert word2id["<eps>"] == 0

    eps = 0

    for word, pieces in lexicon:
        assert len(pieces) > 0, f"{word} has no pronunciations"
        cur_state = loop_state

        word = word2id[word]
        pieces = [token2id[i] if i in token2id else token2id["<unk>"] for i in pieces]

        for i in range(len(pieces) - 1):
            w = word if i == 0 else eps
            arcs.append([cur_state, next_state, pieces[i], w, 0])

            cur_state = next_state
            next_state += 1

        # now for the last piece of this word
        i = len(pieces) - 1
        w = word if i == 0 else eps
        arcs.append([cur_state, loop_state, pieces[i], w, 0])

    if need_self_loops:
        disambig_token = token2id["#0"]
        disambig_word = word2id["#0"]
        arcs = add_self_loops(
            arcs,
            disambig_token=disambig_token,
            disambig_word=disambig_word,
        )

    final_state = next_state
    arcs.append([loop_state, final_state, -1, -1, 0])
    arcs.append([final_state])

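    # Serialize the arcs in the text layout consumed by k2.Fsa.from_str:
    # one arc per line ("src dst label aux_label score"), sorted by source
    # state, with the final-state line last.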
    arcs = sorted(arcs, key=lambda arc: arc[0])
    arcs = [[str(i) for i in arc] for arc in arcs]
    arcs = [" ".join(arc) for arc in arcs]
    arcs = "\n".join(arcs)

    fsa = k2.Fsa.from_str(arcs, acceptor=False)
    return fsa


def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool:
    """Check if all the given tokens are in the token symbol table.

    Args:
      token_sym_table:
        Token symbol table that contains all the valid tokens.
      tokens:
        A list of tokens.
    Returns:
      Return True if there is any token not in the token_sym_table,
      otherwise False.
    """
    for tok in tokens:
        if tok not in token_sym_table:
            return True
    return False


def generate_lexicon(token_sym_table: Dict[str, int], words: List[str]) -> Lexicon:
    """Generate a lexicon from a word list and token_sym_table.

    Args:
      token_sym_table:
        Token symbol table that maps tokens to token ids.
      words:
        A list of strings representing words.
    Returns:
      Return a lexicon: a list of (word, token list) pairs.
    """
    lexicon = []
    for word in words:
        chars = list(word.strip(" \t"))
        if contain_oov(token_sym_table, chars):
            continue
        lexicon.append((word, chars))

    # The OOV word is <UNK>
    lexicon.append(("<UNK>", ["<unk>"]))
    return lexicon


def generate_tokens(text_file: str) -> Dict[str, int]:
    """Generate tokens from the given text file.

    Args:
      text_file:
        A file that contains text lines to generate tokens.
    Returns:
      Return a dict whose keys are tokens and values are token ids ranging
      from 0 to len(keys) - 1.
    """
    tokens: Dict[str, int] = dict()
    tokens["<blk>"] = 0
    tokens["<sos/eos>"] = 1
    tokens["<unk>"] = 2
    whitespace = re.compile(r"([ \t\r\n]+)")
    with open(text_file, "r", encoding="utf-8") as f:
        for line in f:
            line = re.sub(whitespace, "", line)
            chars = list(line)
            for char in chars:
                if char not in tokens:
                    tokens[char] = len(tokens)
    return tokens


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang-dir",
        type=str,
        help="""Input and output directory.
        It should contain the files text and words.txt.
        """,
    )

    return parser.parse_args()


def main():
    args = get_args()
    lang_dir = Path(args.lang_dir)
    text_file = lang_dir / "text"

    word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")

    words = word_sym_table.symbols

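    # Special symbols are handled separately, so drop them from the word list
    # before building the lexicon.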
    excluded = ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>"]
    for w in excluded:
        if w in words:
            words.remove(w)

    token_sym_table = generate_tokens(text_file)

    lexicon = generate_lexicon(token_sym_table, words)

    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)

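    # Register the disambiguation symbols #0 ... #max_disambig as extra tokens,
    # appending them after all existing token IDs.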
    next_token_id = max(token_sym_table.values()) + 1
    for i in range(max_disambig + 1):
        disambig = f"#{i}"
        assert disambig not in token_sym_table
        token_sym_table[disambig] = next_token_id
        next_token_id += 1

    word_sym_table.add("#0")
    word_sym_table.add("<s>")
    word_sym_table.add("</s>")

    write_mapping(lang_dir / "tokens.txt", token_sym_table)

    write_lexicon(lang_dir / "lexicon.txt", lexicon)
    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)

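    # L maps token sequences to words; L_disambig additionally carries the
    # disambiguation symbols (and self-loops) needed for determinization.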
    L = lexicon_to_fst_no_sil(
        lexicon,
        token2id=token_sym_table,
        word2id=word_sym_table,
    )

    L_disambig = lexicon_to_fst_no_sil(
        lexicon_disambig,
        token2id=token_sym_table,
        word2id=word_sym_table,
        need_self_loops=True,
    )
    torch.save(L.as_dict(), lang_dir / "L.pt")
    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")


if __name__ == "__main__":
    main()
@@ -74,7 +74,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
 fi

 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: preparation for training BPE model"
+  log "Stage 4: Prepare Byte BPE based lang"
   mkdir -p data/fbank
   if [ ! -d ../../aishell2/ASR/data/lang_char ]; then
     log "Abort! Please run ../../aishell2/ASR/prepare.sh --stage 3 --stop-stage 3"
@@ -107,40 +107,40 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
     ./local/prepare_for_bpe_model.py \
       --lang_dir ./$lang_dir \
       --text $lang_dir/text

+    if [ ! -f $lang_dir/text_words_segmentation ]; then
+      python3 ./local/text2segments.py \
+        --input-file ./data/lang_char/text \
+        --output-file $lang_dir/text_words_segmentation
+
+      cat ./data/lang_bpe_500/transcript_words.txt \
+        >> $lang_dir/text_words_segmentation
+    fi
+
+    cat $lang_dir/text_words_segmentation | sed 's/ /\n/g' \
+      | sort -u | sed '/^$/d' | uniq > $lang_dir/words_no_ids.txt
+
+    if [ ! -f $lang_dir/words.txt ]; then
+      python3 ./local/prepare_words.py \
+        --input-file $lang_dir/words_no_ids.txt \
+        --output-file $lang_dir/words.txt
+    fi
+
+    if [ ! -f $lang_dir/bbpe.model ]; then
+      ./local/train_bbpe_model.py \
+        --lang-dir $lang_dir \
+        --vocab-size $vocab_size \
+        --transcript $lang_dir/transcript_chars.txt
+    fi
+
+    if [ ! -f $lang_dir/L_disambig.pt ]; then
+      ./local/prepare_lang_bbpe.py --lang-dir $lang_dir
+
+      log "Validating $lang_dir/lexicon.txt"
+      ./local/validate_bpe_lexicon.py \
+        --lexicon $lang_dir/lexicon.txt \
+        --bpe-model $lang_dir/bbpe.model
+    fi
   done
 fi

-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: training BPE model"
-  if [ ! -d data/lang_char ]; then
-    log "Abort! Please run stage 4 first"
-    exit 1
-  fi
-
-  if [ ! -d data/lang_phone ]; then
-    log "Abort! Please run stage 4 first"
-    exit 1
-  fi
-
-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bbpe_${vocab_size}
-    mkdir -p $lang_dir
-
-    if [ ! -f $lang_dir/bpe.model ]; then
-      log "Training BPE model with vocab size ${vocab_size}"
-      python3 -m k2_cli.prepares.prep_bpe \
-        --input-dir data/lang_char \
-        --output-dir data/lang_char \
-        --vocab-size ${vocab_size}
-    fi
-
-    if [ ! -f data/lang_phone/bpe${vocab_size}.model ]; then
-      log "Training BPE model with vocab size ${vocab_size}"
-      python3 -m k2_cli.prepares.prep_bpe \
-        --input-dir data/lang_phone \
-        --output-dir data/lang_phone \
-        --vocab-size ${vocab_size}
-    fi
-  done
-fi
@@ -31,213 +31,42 @@ class MultiDataset:
         Args:
           manifest_dir:
             It is expected to contain the following files:
-            - aidatatang_cuts_train.jsonl.gz
-            - aishell_cuts_train.jsonl.gz
             - aishell2_cuts_train.jsonl.gz
-            - aishell4_cuts_train_L.jsonl.gz
-            - aishell4_cuts_train_M.jsonl.gz
-            - aishell4_cuts_train_S.jsonl.gz
-            - alimeeting-far_cuts_train.jsonl.gz
-            - magicdata_cuts_train.jsonl.gz
-            - primewords_cuts_train.jsonl.gz
-            - stcmds_cuts_train.jsonl.gz
-            - thchs_30_cuts_train.jsonl.gz
-            - kespeech/kespeech-asr_cuts_train_phase1.jsonl.gz
-            - kespeech/kespeech-asr_cuts_train_phase2.jsonl.gz
-            - wenetspeech/cuts_L.jsonl.gz
         """
         self.fbank_dir = Path(fbank_dir)

     def train_cuts(self) -> CutSet:
         logging.info("About to get multidataset train cuts")

-        # THCHS-30
-        logging.info("Loading THCHS-30 in lazy mode")
-        thchs_30_cuts = load_manifest_lazy(
-            self.fbank_dir / "thchs_30_cuts_train.jsonl.gz"
-        )
-
-        # AISHELL-1
-        logging.info("Loading Aishell-1 in lazy mode")
-        aishell_cuts = load_manifest_lazy(
-            self.fbank_dir / "aishell_cuts_train.jsonl.gz"
-        )
-
         # AISHELL-2
         logging.info("Loading Aishell-2 in lazy mode")
         aishell_2_cuts = load_manifest_lazy(
             self.fbank_dir / "aishell2_cuts_train.jsonl.gz"
         )

-        # AISHELL-4
-        logging.info("Loading Aishell-4 in lazy mode")
-        aishell_4_L_cuts = load_manifest_lazy(
-            self.fbank_dir / "aishell4_cuts_train_L.jsonl.gz"
-        )
-        aishell_4_M_cuts = load_manifest_lazy(
-            self.fbank_dir / "aishell4_cuts_train_M.jsonl.gz"
-        )
-        aishell_4_S_cuts = load_manifest_lazy(
-            self.fbank_dir / "aishell4_cuts_train_S.jsonl.gz"
-        )
-
-        # ST-CMDS
-        logging.info("Loading ST-CMDS in lazy mode")
-        stcmds_cuts = load_manifest_lazy(self.fbank_dir / "stcmds_cuts_train.jsonl.gz")
-
-        # Primewords
-        logging.info("Loading Primewords in lazy mode")
-        primewords_cuts = load_manifest_lazy(
-            self.fbank_dir / "primewords_cuts_train.jsonl.gz"
-        )
-
-        # MagicData
-        logging.info("Loading MagicData in lazy mode")
-        magicdata_cuts = load_manifest_lazy(
-            self.fbank_dir / "magicdata_cuts_train.jsonl.gz"
-        )
-
-        # Aidatatang_200zh
-        logging.info("Loading Aidatatang_200zh in lazy mode")
-        aidatatang_200zh_cuts = load_manifest_lazy(
-            self.fbank_dir / "aidatatang_cuts_train.jsonl.gz"
-        )
-
-        # Ali-Meeting
-        logging.info("Loading Ali-Meeting in lazy mode")
-        alimeeting_cuts = load_manifest_lazy(
-            self.fbank_dir / "alimeeting-far_cuts_train.jsonl.gz"
-        )
-
-        # WeNetSpeech
-        logging.info("Loading WeNetSpeech in lazy mode")
-        wenetspeech_L_cuts = load_manifest_lazy(
-            self.fbank_dir / "wenetspeech" / "cuts_L.jsonl.gz"
-        )
-
-        # KeSpeech
-        logging.info("Loading KeSpeech in lazy mode")
-        kespeech_1_cuts = load_manifest_lazy(
-            self.fbank_dir / "kespeech" / "kespeech-asr_cuts_train_phase1.jsonl.gz"
-        )
-        kespeech_2_cuts = load_manifest_lazy(
-            self.fbank_dir / "kespeech" / "kespeech-asr_cuts_train_phase2.jsonl.gz"
-        )
-
         return CutSet.mux(
-            thchs_30_cuts,
-            aishell_cuts,
             aishell_2_cuts,
-            aishell_4_L_cuts,
-            aishell_4_M_cuts,
-            aishell_4_S_cuts,
-            stcmds_cuts,
-            primewords_cuts,
-            magicdata_cuts,
-            aidatatang_200zh_cuts,
-            alimeeting_cuts,
-            wenetspeech_L_cuts,
-            kespeech_1_cuts,
-            kespeech_2_cuts,
             weights=[
-                len(thchs_30_cuts),
-                len(aishell_cuts),
                 len(aishell_2_cuts),
-                len(aishell_4_L_cuts),
-                len(aishell_4_M_cuts),
-                len(aishell_4_S_cuts),
-                len(stcmds_cuts),
-                len(primewords_cuts),
-                len(magicdata_cuts),
-                len(aidatatang_200zh_cuts),
-                len(alimeeting_cuts),
-                len(wenetspeech_L_cuts),
-                len(kespeech_1_cuts),
-                len(kespeech_2_cuts),
             ],
         )

-    def dev_cuts(self) -> CutSet:
+    def dev_cuts(self) -> List[CutSet]:
         logging.info("About to get multidataset dev cuts")

-        # Aidatatang_200zh
-        logging.info("Loading Aidatatang_200zh DEV set in lazy mode")
-        aidatatang_dev_cuts = load_manifest_lazy(
-            self.fbank_dir / "aidatatang_cuts_dev.jsonl.gz"
-        )
-
-        # AISHELL
-        logging.info("Loading Aishell DEV set in lazy mode")
-        aishell_dev_cuts = load_manifest_lazy(
-            self.fbank_dir / "aishell_cuts_dev.jsonl.gz"
-        )
-
         # AISHELL-2
         logging.info("Loading Aishell-2 DEV set in lazy mode")
         aishell2_dev_cuts = load_manifest_lazy(
             self.fbank_dir / "aishell2_cuts_dev.jsonl.gz"
         )

-        # Ali-Meeting
-        logging.info("Loading Ali-Meeting DEV set in lazy mode")
-        alimeeting_dev_cuts = load_manifest_lazy(
-            self.fbank_dir / "alimeeting-far_cuts_eval.jsonl.gz"
-        )
-
-        # MagicData
-        logging.info("Loading MagicData DEV set in lazy mode")
-        magicdata_dev_cuts = load_manifest_lazy(
-            self.fbank_dir / "magicdata_cuts_dev.jsonl.gz"
-        )
-
-        # KeSpeech
-        logging.info("Loading KeSpeech DEV set in lazy mode")
-        kespeech_dev_phase1_cuts = load_manifest_lazy(
-            self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase1.jsonl.gz"
-        )
-        kespeech_dev_phase2_cuts = load_manifest_lazy(
-            self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase2.jsonl.gz"
-        )
-
-        # WeNetSpeech
-        logging.info("Loading WeNetSpeech DEV set in lazy mode")
-        wenetspeech_dev_cuts = load_manifest_lazy(
-            self.fbank_dir / "wenetspeech" / "cuts_DEV.jsonl.gz"
-        )
-
-        return wenetspeech_dev_cuts
-        # return [
-        #     aidatatang_dev_cuts,
-        #     aishell_dev_cuts,
-        #     aishell2_dev_cuts,
-        #     alimeeting_dev_cuts,
-        #     magicdata_dev_cuts,
-        #     kespeech_dev_phase1_cuts,
-        #     kespeech_dev_phase2_cuts,
-        #     wenetspeech_dev_cuts,
-        # ]
+        return [
+            aishell2_dev_cuts,
+        ]

     def test_cuts(self) -> Dict[str, CutSet]:
         logging.info("About to get multidataset test cuts")

-        # Aidatatang_200zh
-        logging.info("Loading Aidatatang_200zh set in lazy mode")
-        aidatatang_test_cuts = load_manifest_lazy(
-            self.fbank_dir / "aidatatang_cuts_test.jsonl.gz"
-        )
-        aidatatang_dev_cuts = load_manifest_lazy(
-            self.fbank_dir / "aidatatang_cuts_dev.jsonl.gz"
-        )
-
-        # AISHELL
-        logging.info("Loading Aishell set in lazy mode")
-        aishell_test_cuts = load_manifest_lazy(
-            self.fbank_dir / "aishell_cuts_test.jsonl.gz"
-        )
-        aishell_dev_cuts = load_manifest_lazy(
-            self.fbank_dir / "aishell_cuts_dev.jsonl.gz"
-        )
-
         # AISHELL-2
         logging.info("Loading Aishell-2 set in lazy mode")
         aishell2_test_cuts = load_manifest_lazy(
@@ -247,70 +76,7 @@ class MultiDataset:
             self.fbank_dir / "aishell2_cuts_dev.jsonl.gz"
         )

-        # AISHELL-4
-        logging.info("Loading Aishell-4 TEST set in lazy mode")
-        aishell4_test_cuts = load_manifest_lazy(
-            self.fbank_dir / "aishell4_cuts_test.jsonl.gz"
-        )
-
-        # Ali-Meeting
-        logging.info("Loading Ali-Meeting set in lazy mode")
-        alimeeting_test_cuts = load_manifest_lazy(
-            self.fbank_dir / "alimeeting-far_cuts_test.jsonl.gz"
-        )
-        alimeeting_eval_cuts = load_manifest_lazy(
-            self.fbank_dir / "alimeeting-far_cuts_eval.jsonl.gz"
-        )
-
-        # MagicData
-        logging.info("Loading MagicData set in lazy mode")
-        magicdata_test_cuts = load_manifest_lazy(
-            self.fbank_dir / "magicdata_cuts_test.jsonl.gz"
-        )
-        magicdata_dev_cuts = load_manifest_lazy(
-            self.fbank_dir / "magicdata_cuts_dev.jsonl.gz"
-        )
-
-        # KeSpeech
-        logging.info("Loading KeSpeech set in lazy mode")
-        kespeech_test_cuts = load_manifest_lazy(
-            self.fbank_dir / "kespeech" / "kespeech-asr_cuts_test.jsonl.gz"
-        )
-        kespeech_dev_phase1_cuts = load_manifest_lazy(
-            self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase1.jsonl.gz"
-        )
-        kespeech_dev_phase2_cuts = load_manifest_lazy(
-            self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase2.jsonl.gz"
-        )
-
-        # WeNetSpeech
-        logging.info("Loading WeNetSpeech set in lazy mode")
-        wenetspeech_test_meeting_cuts = load_manifest_lazy(
-            self.fbank_dir / "wenetspeech" / "cuts_TEST_MEETING.jsonl.gz"
-        )
-        wenetspeech_test_net_cuts = load_manifest_lazy(
-            self.fbank_dir / "wenetspeech" / "cuts_TEST_NET.jsonl.gz"
-        )
-        wenetspeech_dev_cuts = load_manifest_lazy(
-            self.fbank_dir / "wenetspeech" / "cuts_DEV.jsonl.gz"
-        )
-
         return {
-            "aidatatang_test": aidatatang_test_cuts,
-            "aidatatang_dev": aidatatang_dev_cuts,
-            "alimeeting_test": alimeeting_test_cuts,
-            "alimeeting_eval": alimeeting_eval_cuts,
-            "aishell_test": aishell_test_cuts,
-            "aishell_dev": aishell_dev_cuts,
             "aishell-2_test": aishell2_test_cuts,
             "aishell-2_dev": aishell2_dev_cuts,
-            "aishell-4": aishell4_test_cuts,
-            "magicdata_test": magicdata_test_cuts,
-            "magicdata_dev": magicdata_dev_cuts,
-            "kespeech-asr_test": kespeech_test_cuts,
-            "kespeech-asr_dev_phase1": kespeech_dev_phase1_cuts,
-            "kespeech-asr_dev_phase2": kespeech_dev_phase2_cuts,
-            "wenetspeech-meeting_test": wenetspeech_test_meeting_cuts,
-            "wenetspeech-net_test": wenetspeech_test_net_cuts,
-            "wenetspeech_dev": wenetspeech_dev_cuts,
         }
|
Loading…
x
Reference in New Issue
Block a user