mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-08 00:24:19 +00:00
minor updates
This commit is contained in:
parent
ba480b7b5f
commit
7feaa6185d
@ -796,15 +796,17 @@ def main():
|
||||
test_eval2000_cuts = switchboard.test_eval2000_cuts().trim_to_supervisions(
|
||||
keep_all_channels=True
|
||||
)
|
||||
test_rt03_cuts = switchboard.test_rt03_cuts().trim_to_supervisions(
|
||||
keep_all_channels=True
|
||||
)
|
||||
# test_rt03_cuts = switchboard.test_rt03_cuts().trim_to_supervisions(
|
||||
# keep_all_channels=True
|
||||
# )
|
||||
|
||||
test_eval2000_dl = switchboard.test_dataloaders(test_eval2000_cuts)
|
||||
test_rt03_dl = switchboard.test_dataloaders(test_rt03_cuts)
|
||||
# test_rt03_dl = switchboard.test_dataloaders(test_rt03_cuts)
|
||||
|
||||
test_sets = ["test-eval2000", "test-rt03"]
|
||||
test_dl = [test_eval2000_dl, test_rt03_dl]
|
||||
# test_sets = ["test-eval2000", "test-rt03"]
|
||||
# test_dl = [test_eval2000_dl, test_rt03_dl]
|
||||
test_sets = ["test-eval2000"]
|
||||
test_dl = [test_eval2000_dl]
|
||||
|
||||
for test_set, test_dl in zip(test_sets, test_dl):
|
||||
results_dict = decode_dataset(
|
||||
|
@ -97,7 +97,7 @@ def compute_fbank_switchboard(
|
||||
prefix = dir_name
|
||||
suffix = "jsonl.gz"
|
||||
manifests = {
|
||||
"eval2000": "data/manifests/eval2000/eval2000_cuts_all_trimmed.jsonl.gz",
|
||||
"eval2000": "data/manifests/eval2000/eval2000_cuts_all.jsonl.gz",
|
||||
}
|
||||
assert manifests is not None
|
||||
|
||||
@ -111,7 +111,12 @@ def compute_fbank_switchboard(
|
||||
logging.info(f"{prefix} already exists - skipping.")
|
||||
return
|
||||
logging.info(f"Processing {prefix}")
|
||||
cut_set = CutSet.from_file(manifests[prefix]).resample(16000)
|
||||
cut_set = (
|
||||
CutSet.from_file(manifests[prefix])
|
||||
.resample(16000)
|
||||
.to_eager()
|
||||
.filter(lambda c: c.duration > 0.5)
|
||||
)
|
||||
|
||||
cut_set = cut_set.compute_and_store_features(
|
||||
extractor=extractor,
|
||||
@ -121,6 +126,7 @@ def compute_fbank_switchboard(
|
||||
executor=ex,
|
||||
storage_type=LilcomChunkyWriter,
|
||||
)
|
||||
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
|
||||
cut_set.to_file(output_dir / cuts_filename)
|
||||
|
||||
|
||||
|
@ -30,8 +30,8 @@ from lhotse import load_manifest_lazy
|
||||
|
||||
def main():
|
||||
# path = "./data/fbank/swbd_cuts_rt03.jsonl.gz"
|
||||
# path = "./data/fbank/swbd_cuts_eval2000.jsonl.gz"
|
||||
path = "./data/fbank/swbd_cuts_all.jsonl.gz"
|
||||
path = "./data/fbank/eval2000/eval2000_cuts_all.jsonl.gz"
|
||||
# path = "./data/fbank/swbd_cuts_all.jsonl.gz"
|
||||
|
||||
cuts = load_manifest_lazy(path)
|
||||
cuts.describe()
|
||||
@ -41,7 +41,7 @@ if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
"""
|
||||
Cut statistics:
|
||||
Training Cut statistics:
|
||||
╒═══════════════════════════╤═══════════╕
|
||||
│ Cuts count: │ 167244 │
|
||||
├───────────────────────────┼───────────┤
|
||||
@ -81,4 +81,45 @@ Speech duration statistics:
|
||||
├──────────────────────────────┼───────────┼──────────────────────┤
|
||||
│ Total silence duration │ 00:00:00 │ 0.00% of recording │
|
||||
╘══════════════════════════════╧═══════════╧══════════════════════╛
|
||||
|
||||
Eval2000 Cut statistics:
|
||||
╒═══════════════════════════╤══════════╕
|
||||
│ Cuts count: │ 2709 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Total duration (hh:mm:ss) │ 01:39:19 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ mean │ 2.2 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ std │ 1.8 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ min │ 0.1 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 25% │ 0.7 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 50% │ 1.7 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 75% │ 3.1 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99% │ 8.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99.5% │ 8.3 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99.9% │ 11.3 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ max │ 14.1 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Recordings available: │ 2709 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Features available: │ 0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Supervisions available: │ 2709 │
|
||||
╘═══════════════════════════╧══════════╛
|
||||
Speech duration statistics:
|
||||
╒══════════════════════════════╤══════════╤══════════════════════╕
|
||||
│ Total speech duration │ 01:39:19 │ 100.00% of recording │
|
||||
├──────────────────────────────┼──────────┼──────────────────────┤
|
||||
│ Total speaking time duration │ 01:39:19 │ 100.00% of recording │
|
||||
├──────────────────────────────┼──────────┼──────────────────────┤
|
||||
│ Total silence duration │ 00:00:00 │ 0.00% of recording │
|
||||
╘══════════════════════════════╧══════════╧══════════════════════╛
|
||||
"""
|
||||
|
@ -216,9 +216,6 @@ def main():
|
||||
"#0",
|
||||
"<s>",
|
||||
"</s>",
|
||||
"[VOCALIZED-NOISE]",
|
||||
"[NOISE]",
|
||||
"[LAUGHTER]",
|
||||
]
|
||||
|
||||
for w in excluded:
|
||||
|
@ -75,6 +75,8 @@ def main():
|
||||
# If you change it, you should also change other
|
||||
# places that are using it.
|
||||
|
||||
user_defined_symbols += ["[LAUGHTER]", "[NOISE]", "[VOCALIZED-NOISE]"]
|
||||
|
||||
model_file = Path(model_prefix + ".model")
|
||||
if not model_file.is_file():
|
||||
spm.SentencePieceTrainer.train(
|
||||
|
@ -45,7 +45,7 @@ fisher_dir="/export/corpora3/LDC/LDC2004T19"
|
||||
vocab_sizes=(
|
||||
# 5000
|
||||
# 2000
|
||||
# 1000
|
||||
1000
|
||||
500
|
||||
)
|
||||
|
||||
@ -197,7 +197,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
||||
# [noise] nsn
|
||||
# !sil sil
|
||||
# <unk> spn
|
||||
cat data/local/dict_nosp/lexicon.txt |
|
||||
cat data/local/dict_nosp/lexicon.txt | sed 's/-//g' | sed 's/\[vocalizednoise\]/\[vocalized-noise\]/g' |
|
||||
sort | uniq >$lang_dir/lexicon_lower.txt
|
||||
|
||||
cat $lang_dir/lexicon_lower.txt | tr a-z A-Z > $lang_dir/lexicon.txt
|
||||
|
Loading…
x
Reference in New Issue
Block a user