Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-09 01:52:41 +00:00)
Combined updates. Changed BBPE path structure, changed dataset path structure, added script to update cutset paths. WIP
Parent: 1f11ba4d28
Commit: ce894a7ba2
@@ -21,7 +21,7 @@

 This script takes as input `lang_dir`, which should contain::

-    - lang_dir/bbpe.model,
+    - lang_dir/bbpe_2000/bbpe.model
     - lang_dir/words.txt

 and generates the following files in the directory `lang_dir`:
@@ -173,7 +173,8 @@ def get_args():
         "--lang-dir",
         type=str,
         help="""Input and output directory.
-        It should contain the bpe.model and words.txt
+        It should contain the words.txt file and the
+        bbpe model in a subdirectory (e.g., bbpe_2000/bbpe.model).
         """,
     )

@@ -184,6 +185,13 @@ def get_args():
         help="The out of vocabulary word in lexicon.",
     )

+    parser.add_argument(
+        "--vocab-size",
+        type=int,
+        default=2000,  # Add a default value for vocab_size for consistency
+        help="Vocabulary size used for BPE training (determines the bbpe model directory).",
+    )
+
     parser.add_argument(
         "--debug",
         type=str2bool,
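
The new --vocab-size help text says the flag determines the bbpe model directory, while main() below still reads lang_dir/bbpe.model directly (prepare.sh passes the bbpe_${vocab_size} directory itself as --lang-dir). A hypothetical helper sketching the layout described by the docstring and help text; the helper name is an assumption, not part of the commit:

    from pathlib import Path

    def resolve_bbpe_model(lang_dir: str, vocab_size: int = 2000) -> Path:
        # Hypothetical helper: under the new layout the byte-level BPE model
        # lives in a bbpe_<vocab_size> subdirectory, e.g.
        # data/lang/bbpe_2000/bbpe.model for the default vocab size of 2000.
        return Path(lang_dir) / f"bbpe_{vocab_size}" / "bbpe.model"
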
@@ -205,6 +213,9 @@ def main():
     args = get_args()
     lang_dir = Path(args.lang_dir)
     model_file = lang_dir / "bbpe.model"

+    if not model_file.is_file():
+        raise FileNotFoundError(f"BPE model not found at: {model_file}")
+
     word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")

@@ -216,7 +227,7 @@ def main():
         if w in words:
             words.remove(w)

-    lexicon, token_sym_table = generate_lexicon(model_file, words, args.oov)
+    lexicon, token_sym_table = generate_lexicon(str(model_file), words, args.oov)

     lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)

@@ -33,7 +33,7 @@ from pathlib import Path
 import sentencepiece as spm

 from icefall import byte_encode
-from icefall.utils import tokenize_by_ja_char
+from icefall.utils import str2bool, tokenize_by_ja_char


 def get_args():
@@ -41,9 +41,7 @@ def get_args():
     parser.add_argument(
         "--lang-dir",
         type=str,
-        help="""Input and output directory.
-        The generated bpe.model is saved to this directory.
-        """,
+        help="""Input directory.""",
     )

     parser.add_argument(
@@ -58,6 +56,27 @@ def get_args():
         help="Vocabulary size for BPE training",
     )

+    parser.add_argument(
+        "--output-model",
+        type=str,
+        help="Path to save the trained BPE model.",
+        required=True,
+    )
+
+    parser.add_argument(
+        "--input-sentence-size",
+        type=int,
+        default=1000000,  # Added default value
+        help="Maximum number of sentences to load for BPE training.",
+    )
+
+    parser.add_argument(
+        "--shuffle-input-sentence",
+        type=str2bool,
+        default=True,  # Added default value
+        help="Whether to shuffle input sentences.",
+    )
+
     return parser.parse_args()


@@ -71,17 +90,20 @@ def main():
     args = get_args()
     vocab_size = args.vocab_size
     lang_dir = Path(args.lang_dir)
+    output_model = Path(args.output_model)
+    input_sentence_size = args.input_sentence_size
+    shuffle_input_sentence = args.shuffle_input_sentence

     model_type = "unigram"

-    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
-    model_file = Path(model_prefix + ".model")
-    if model_file.is_file():
-        print(f"{model_file} exists - skipping")
+    model_prefix = str(output_model.parent / f"{model_type}_{vocab_size}")
+    temp_model_file = Path(model_prefix + ".model")
+    if output_model.is_file():
+        print(f"{output_model} exists - skipping")
         return

     character_coverage = 1.0
-    input_sentence_size = 100000000

     user_defined_symbols = ["<blk>", "<sos/eos>"]
     unk_id = len(user_defined_symbols)
@@ -100,6 +122,7 @@ def main():
         model_type=model_type,
         model_prefix=model_prefix,
         input_sentence_size=input_sentence_size,
+        shuffle_input_sentence=shuffle_input_sentence,
         character_coverage=character_coverage,
         user_defined_symbols=user_defined_symbols,
         unk_id=unk_id,
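
input_sentence_size and shuffle_input_sentence are standard SentencePiece trainer options: the first caps how many sentences the trainer loads, the second controls whether that subset is sampled randomly rather than taken from the head of the file. A minimal standalone sketch, with an assumed transcript path:

    import sentencepiece as spm

    # Train a small unigram model while loading at most 1,000,000 randomly
    # sampled sentences from the (assumed) transcript file.
    spm.SentencePieceTrainer.train(
        input="data/lang/bbpe_2000/transcript_chars.txt",  # assumed path
        model_prefix="/tmp/unigram_2000",
        model_type="unigram",
        vocab_size=2000,
        input_sentence_size=1_000_000,
        shuffle_input_sentence=True,
    )
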
@@ -107,8 +130,8 @@ def main():
         eos_id=-1,
     )

-    shutil.copyfile(model_file, f"{lang_dir}/bbpe.model")
+    shutil.move(str(temp_model_file), str(output_model))


 if __name__ == "__main__":
     main()
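
The temporary-prefix-then-move pattern reflects how SentencePiece writes its output: the trainer produces <model_prefix>.model and <model_prefix>.vocab, so the script trains under a scratch prefix and then relocates the .model file to the exact path given by --output-model (the .vocab file stays at the temporary prefix unless moved as well). A small sketch with illustrative paths:

    import shutil
    from pathlib import Path

    output_model = Path("data/lang/bbpe_2000/bbpe.model")  # illustrative target
    model_prefix = str(output_model.parent / "unigram_2000")
    temp_model_file = Path(model_prefix + ".model")

    # After SentencePieceTrainer.train(model_prefix=model_prefix, ...) has run:
    if temp_model_file.is_file() and not output_model.is_file():
        shutil.move(str(temp_model_file), str(output_model))
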
egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py  (new file, 103 lines)
@@ -0,0 +1,103 @@
+import logging
+from pathlib import Path
+
+from lhotse import CutSet, load_manifest
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"):
+    """
+    Update the storage_path in a CutSet's features to reflect the structure in multi_ja_en.
+
+    Args:
+        cuts: The Lhotse CutSet to modify.
+        dataset_name: The name of the dataset (e.g., "reazonspeech", "mls_english"),
+            which corresponds to the new subdirectory for features.
+        old_feature_prefix: The prefix that the original feature paths were relative to.
+            This typically corresponds to the root of the manifests dir in the
+            original recipe.
+    """
+    # An earlier version iterated over the cuts and rewrote each
+    # cut.features.storage_path by hand: it took the path relative to
+    # old_feature_prefix and re-rooted it under <old_feature_prefix>/<dataset_name>,
+    # e.g. turning "data/manifests/feats_train/feats-12.lca" into
+    # "data/manifests/reazonspeech/feats_train/feats-12.lca".
+    # That manual loop is superseded by the single call below.
+    return cuts.with_features_path_prefix(old_feature_prefix + "/" + dataset_name)
+
+
+if __name__ == "__main__":
+    # The root where the symlinked manifests are located in the multi_ja_en recipe.
+    multi_recipe_manifests_root = Path("data/manifests")
+
+    # Map each dataset name (which is also the subdirectory name) to the base
+    # filename of its cuts (e.g., "reazonspeech_cuts", "mls_eng_cuts").
+    dataset_manifest_prefixes = {
+        "reazonspeech": "reazonspeech_cuts",
+        "mls_english": "mls_eng_cuts",
+    }
+
+    # Define the splits. The script will append "_train.jsonl.gz", "_dev.jsonl.gz", etc.
+    splits = ["train", "dev", "test"]
+
+    # The path segment *inside* the original recipe's data/manifests that the
+    # features were stored under, e.g. if the original path was
+    # /original/recipe/data/manifests/feats_train/file.lca, then this is
+    # 'data/manifests'.
+    original_feature_base_path = "data/manifests"
+
+    for dataset_name, manifest_prefix in dataset_manifest_prefixes.items():
+        dataset_symlink_dir = multi_recipe_manifests_root / dataset_name
+        if not dataset_symlink_dir.is_dir():
+            logger.warning(f"Dataset symlink directory not found: {dataset_symlink_dir}. Skipping {dataset_name}.")
+            continue
+
+        for split in splits:
+            # Construct the path to the symlinked manifest file.
+            manifest_filename = f"{manifest_prefix}_{split}.jsonl.gz"
+            manifest_path = dataset_symlink_dir / manifest_filename
+
+            if manifest_path.is_file():
+                logger.info(f"Processing {dataset_name} {split} cuts from symlink: {manifest_path}")
+                try:
+                    # Load the manifest (Lhotse will follow the symlink).
+                    cuts = load_manifest(manifest_path)
+
+                    # Update the storage_path within the loaded cuts.
+                    # old_feature_prefix is still 'data/manifests', since that is what
+                    # the paths in the underlying manifest refer to.
+                    updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
+
+                    # Save the updated cuts back to the *symlinked* path;
+                    # Lhotse writes to the target of the symlink.
+                    updated_cuts.to_file(manifest_path)
+                    logger.info(f"Updated {dataset_name} {split} cuts saved to: {manifest_path}")
+                except Exception as e:
+                    logger.error(f"Error processing {manifest_path}: {e}", exc_info=True)  # Print full traceback
+            else:
+                logger.warning(f"Manifest file not found (symlink target might be missing or file name mismatch): {manifest_path}")
+
+    logger.info("CutSet path updating complete.")
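
For clarity, the intended effect of the with_features_path_prefix call above is simply to prepend <old_feature_prefix>/<dataset_name> to each cut's features.storage_path. A minimal pathlib illustration (the example file name is taken from the comments in the script and is illustrative only):

    from pathlib import Path

    def expected_storage_path(storage_path: str, dataset_name: str,
                              old_feature_prefix: str = "data/manifests") -> Path:
        # Illustration only: "feats_train/feats-12.lca" for the reazonspeech
        # dataset becomes "data/manifests/reazonspeech/feats_train/feats-12.lca".
        return Path(old_feature_prefix) / dataset_name / storage_path

    print(expected_storage_path("feats_train/feats-12.lca", "reazonspeech"))
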
@@ -1 +1 @@
-../../../librispeech/ASR/local/validate_bpe_lexicon.py
+/root/icefall/egs/librispeech/ASR/local/validate_bpe_lexicon.py
@@ -140,28 +140,29 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
     bbpe_dir=$lang_dir/bbpe_${vocab_size}
     mkdir -p $bbpe_dir

-    if [ ! -f $lang_dir/transcript_chars.txt ]; then
+    if [ ! -f $bbpe_dir/transcript_chars.txt ]; then
       ./local/prepare_for_bpe_model.py \
-        --lang-dir ./$lang_dir \
+        --lang-dir $bbpe_dir \
         --text $lang_dir/text
     fi

-    if [ ! -f $lang_dir/text_words_segmentation ]; then
+    if [ ! -f $bbpe_dir/text_words_segmentation ]; then
       python3 ./local/text2segments.py \
         --input-file ./data/lang_char/text \
-        --output-file $lang_dir/text_words_segmentation
+        --output-file $bbpe_dir/text_words_segmentation

       cat ../../mls_english/ASR/data/lang/transcript.txt \
-        >> $lang_dir/text_words_segmentation
+        >> $bbpe_dir/text_words_segmentation
     fi

-    cat $lang_dir/text_words_segmentation | sed 's/ /\n/g' \
-      | sort -u | sed '/^$/d' | uniq > $lang_dir/words_no_ids.txt
+    if [ ! -f $bbpe_dir/words_no_ids.txt ]; then
+      cat $bbpe_dir/text_words_segmentation | sed 's/ /\n/g' \
+        | sort -u | sed '/^$/d' | uniq > $bbpe_dir/words_no_ids.txt
+    fi

-    if [ ! -f $lang_dir/words.txt ]; then
+    if [ ! -f $bbpe_dir/words.txt ]; then
       python3 ./local/prepare_words.py \
-        --input-file $lang_dir/words_no_ids.txt \
-        --output-file $lang_dir/words.txt
+        --input-file $bbpe_dir/words_no_ids.txt \
+        --output-file $bbpe_dir/words.txt
     fi

     if [ ! -f $bbpe_dir/bbpe.model ]; then
@@ -169,26 +170,28 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
         --lang-dir $lang_dir \
         --vocab-size $vocab_size \
         --transcript $lang_dir/text \
-        --output-model $bbpe_dir/bbpe.model # Specify output path
+        --output-model $bbpe_dir/bbpe.model \
+        --input-sentence-size 2000000 # Example: limit to 2 million sentences
     fi

-    if [ ! -f $lang_dir/L_disambig.pt ]; then
-      ./local/prepare_lang_bbpe.py --lang-dir $lang_dir
+    if [ ! -f $bbpe_dir/L_disambig.pt ]; then
+      ./local/prepare_lang_bbpe.py --lang-dir $bbpe_dir --vocab-size $vocab_size

-      log "Validating $lang_dir/lexicon.txt"
+      log "Validating $bbpe_dir/lexicon.txt"
       ln -svf $(realpath ../../multi_zh_en/ASR/local/validate_bpe_lexicon.py) local/
       ./local/validate_bpe_lexicon.py \
-        --lexicon $lang_dir/lexicon.txt \
-        --bpe-model $bbpe_dir/bbpe.model # Use the model in the bbpe subdir
+        --lexicon $bbpe_dir/lexicon.txt \
+        --bpe-model $bbpe_dir/bbpe.model
     fi

+    # Remove top-level files (if they were created)
     rm -f $lang_dir/lexicon.txt $lang_dir/L_disambig.pt
   done

-  # Optionally, create a symlink for consistency if other parts of the recipe expect data/lang/bpe_2000
-  # if [ -d $lang_dir/bbpe_2000 ] && [ ! -e $lang_dir/bpe_2000 ]; then
-  #   ln -s bbpe_2000 $lang_dir/bpe_2000
-  # fi
+  # Optional symlink
+  if [ -d $lang_dir/bbpe_2000 ] && [ ! -e $lang_dir/bpe_2000 ]; then
+    ln -s bbpe_2000 $lang_dir/bpe_2000
+  fi
 fi

 log "prepare.sh: PREPARATION DONE"
@@ -29,12 +29,12 @@ class MultiDataset:

         logging.info("Loading Reazonspeech TRAIN set in lazy mode")
         reazonspeech_train_cuts = load_manifest_lazy(
-            self.manifest_dir / "reazonspeech_cuts_train.jsonl.gz"
+            self.manifest_dir / "reazonspeech/reazonspeech_cuts_train.jsonl.gz"
         )

         logging.info("Loading MLS English TRAIN set in lazy mode")
         mls_eng_train_cuts = load_manifest_lazy(
-            self.manifest_dir / "mls_eng_cuts_train.jsonl.gz"
+            self.manifest_dir / "mls_english/mls_eng_cuts_train.jsonl.gz"
         )

         return CutSet.mux(
@@ -51,12 +51,12 @@ class MultiDataset:

         logging.info("Loading Reazonspeech DEV set in lazy mode")
         reazonspeech_dev_cuts = load_manifest_lazy(
-            self.manifest_dir / "reazonspeech_cuts_dev.jsonl.gz"
+            self.manifest_dir / "reazonspeech/reazonspeech_cuts_dev.jsonl.gz"
         )

         logging.info("Loading MLS English DEV set in lazy mode")
         mls_eng_dev_cuts = load_manifest_lazy(
-            self.manifest_dir / "mls_eng_cuts_dev.jsonl.gz"
+            self.manifest_dir / "mls_english/mls_eng_cuts_dev.jsonl.gz"
         )

         return CutSet.mux(
@@ -73,12 +73,12 @@ class MultiDataset:

         logging.info("Loading Reazonspeech TEST set in lazy mode")
         reazonspeech_test_cuts = load_manifest_lazy(
-            self.manifest_dir / "reazonspeech_cuts_test.jsonl.gz"
+            self.manifest_dir / "reazonspeech/reazonspeech_cuts_test.jsonl.gz"
         )

         logging.info("Loading MLS English TEST set in lazy mode")
         mls_eng_test_cuts = load_manifest_lazy(
-            self.manifest_dir / "mls_eng_cuts_test.jsonl.gz"
+            self.manifest_dir / "mls_english/mls_eng_cuts_test.jsonl.gz"
         )

         return CutSet.mux(
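
A quick, optional sanity check that the on-disk layout matches the per-dataset paths these hunks now expect (the manifest_dir value is an assumption):

    from pathlib import Path

    manifest_dir = Path("data/manifests")  # assumed value of self.manifest_dir
    expected = [
        manifest_dir / "reazonspeech" / f"reazonspeech_cuts_{split}.jsonl.gz"
        for split in ("train", "dev", "test")
    ] + [
        manifest_dir / "mls_english" / f"mls_eng_cuts_{split}.jsonl.gz"
        for split in ("train", "dev", "test")
    ]
    missing = [str(p) for p in expected if not p.is_file()]
    print("missing manifests:", missing or "none")
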
@@ -327,7 +327,7 @@ def get_parser():
     parser.add_argument(
         "--bpe-model",
         type=str,
-        default="data/lang_bbpe_2000/bbpe.model",
+        default="data/lang/bbpe_2000/bbpe.model",
        help="Path to the BPE model",
     )

@@ -1120,7 +1120,7 @@ def run(rank, world_size, args):

     # <blk> is defined in local/prepare_lang_char.py
     params.blank_id = sentencepiece_processor.piece_to_id("<blk>")
-    arams.vocab_size = sentencepiece_processor.get_piece_size()
+    params.vocab_size = sentencepiece_processor.get_piece_size()

     if not params.use_transducer:
         params.ctc_loss_scale = 1.0
@@ -1393,6 +1393,7 @@ def main():
     MultiDatasetAsrDataModule.add_arguments(parser)
     args = parser.parse_args()
     args.exp_dir = Path(args.exp_dir)
+    print(args)

     world_size = args.world_size
     assert world_size >= 1