minor updates

JinZr 2023-08-23 14:45:10 +08:00
parent ba480b7b5f
commit 7feaa6185d
6 changed files with 64 additions and 16 deletions

View File

@@ -796,15 +796,17 @@ def main():
     test_eval2000_cuts = switchboard.test_eval2000_cuts().trim_to_supervisions(
         keep_all_channels=True
     )
-    test_rt03_cuts = switchboard.test_rt03_cuts().trim_to_supervisions(
-        keep_all_channels=True
-    )
+    # test_rt03_cuts = switchboard.test_rt03_cuts().trim_to_supervisions(
+    #     keep_all_channels=True
+    # )
     test_eval2000_dl = switchboard.test_dataloaders(test_eval2000_cuts)
-    test_rt03_dl = switchboard.test_dataloaders(test_rt03_cuts)
+    # test_rt03_dl = switchboard.test_dataloaders(test_rt03_cuts)
-    test_sets = ["test-eval2000", "test-rt03"]
-    test_dl = [test_eval2000_dl, test_rt03_dl]
+    # test_sets = ["test-eval2000", "test-rt03"]
+    # test_dl = [test_eval2000_dl, test_rt03_dl]
+    test_sets = ["test-eval2000"]
+    test_dl = [test_eval2000_dl]
     for test_set, test_dl in zip(test_sets, test_dl):
         results_dict = decode_dataset(

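Note: the hunk above removes RT03 from decoding, so only Eval2000 is scored. A standalone sketch of the underlying pattern, parallel lists of test-set names and dataloaders consumed through zip(); the toggle and the placeholder dataloaders are made up for illustration and are not part of the recipe:

def decode_dataset(dl, name):  # stand-in for the recipe's decode_dataset()
    return {name: f"decoded {len(dl)} batches"}

test_eval2000_dl = [object()] * 3  # placeholder dataloaders
test_rt03_dl = [object()] * 2
use_rt03 = False  # this commit effectively hard-codes False by commenting RT03 out

test_sets = ["test-eval2000"] + (["test-rt03"] if use_rt03 else [])
test_dls = [test_eval2000_dl] + ([test_rt03_dl] if use_rt03 else [])

for test_set, dl in zip(test_sets, test_dls):
    print(decode_dataset(dl, test_set))
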
View File

@@ -97,7 +97,7 @@ def compute_fbank_switchboard(
     prefix = dir_name
     suffix = "jsonl.gz"
     manifests = {
-        "eval2000": "data/manifests/eval2000/eval2000_cuts_all_trimmed.jsonl.gz",
+        "eval2000": "data/manifests/eval2000/eval2000_cuts_all.jsonl.gz",
     }
     assert manifests is not None
@@ -111,7 +111,12 @@ def compute_fbank_switchboard(
             logging.info(f"{prefix} already exists - skipping.")
             return
         logging.info(f"Processing {prefix}")
-        cut_set = CutSet.from_file(manifests[prefix]).resample(16000)
+        cut_set = (
+            CutSet.from_file(manifests[prefix])
+            .resample(16000)
+            .to_eager()
+            .filter(lambda c: c.duration > 0.5)
+        )
         cut_set = cut_set.compute_and_store_features(
             extractor=extractor,
@@ -121,6 +126,7 @@
             executor=ex,
             storage_type=LilcomChunkyWriter,
         )
+        cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
         cut_set.to_file(output_dir / cuts_filename)

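Taken together, the two hunks above change the Eval2000 feature pipeline: load the untrimmed manifest, resample to 16 kHz, materialize it, drop cuts shorter than 0.5 s, and only trim to supervisions after feature extraction. A minimal sketch of that chain, assuming the eval2000 manifest from the earlier preparation stages exists; the 80-bin fbank config and num_jobs value are assumptions, not taken from this hunk:

from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter

cut_set = (
    CutSet.from_file("data/manifests/eval2000/eval2000_cuts_all.jsonl.gz")
    .resample(16000)
    .to_eager()                          # materialize the lazy manifest before filtering
    .filter(lambda c: c.duration > 0.5)  # drop cuts shorter than 0.5 s
)
cut_set = cut_set.compute_and_store_features(
    extractor=Fbank(FbankConfig(num_mel_bins=80)),  # assumed fbank setup
    storage_path="data/fbank/eval2000_feats",
    num_jobs=4,
    storage_type=LilcomChunkyWriter,
)
# Trim to supervision boundaries and discard overlapping segments.
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
cut_set.to_file("data/fbank/eval2000_cuts_all.jsonl.gz")
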
View File

@@ -30,8 +30,8 @@ from lhotse import load_manifest_lazy
 def main():
     # path = "./data/fbank/swbd_cuts_rt03.jsonl.gz"
     # path = "./data/fbank/swbd_cuts_eval2000.jsonl.gz"
-    path = "./data/fbank/swbd_cuts_all.jsonl.gz"
+    path = "./data/fbank/eval2000/eval2000_cuts_all.jsonl.gz"
+    # path = "./data/fbank/swbd_cuts_all.jsonl.gz"
     cuts = load_manifest_lazy(path)
     cuts.describe()
@@ -41,7 +41,7 @@ if __name__ == "__main__":
     main()
 """
-Cut statistics:
+Training Cut statistics:
 Cuts count: 167244
@@ -81,4 +81,45 @@ Speech duration statistics:
 Total silence duration 00:00:00 0.00% of recording
+Eval2000 Cut statistics:
+Cuts count: 2709
+Total duration (hh:mm:ss) 01:39:19
+mean 2.2
+std 1.8
+min 0.1
+25% 0.7
+50% 1.7
+75% 3.1
+99% 8.0
+99.5% 8.3
+99.9% 11.3
+max 14.1
+Recordings available: 2709
+Features available: 0
+Supervisions available: 2709
+Speech duration statistics:
+Total speech duration 01:39:19 100.00% of recording
+Total speaking time duration 01:39:19 100.00% of recording
+Total silence duration 00:00:00 0.00% of recording
 """

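The statistics quoted above come from lhotse's CutSet.describe(). A small sketch that reproduces them and cross-checks the headline numbers by hand, using the path set in the diff:

from lhotse import load_manifest_lazy

path = "./data/fbank/eval2000/eval2000_cuts_all.jsonl.gz"
cuts = load_manifest_lazy(path)
cuts.describe()

# Cross-check the cut count and total duration (durations are in seconds).
n, total = 0, 0.0
for cut in load_manifest_lazy(path):
    n += 1
    total += cut.duration
print(f"{n} cuts, {total / 3600:.2f} hours")
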
View File

@@ -216,9 +216,6 @@ def main():
         "#0",
         "<s>",
         "</s>",
-        "[VOCALIZED-NOISE]",
-        "[NOISE]",
-        "[LAUGHTER]",
     ]
     for w in excluded:

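The hunk above drops the three noise tags from the `excluded` list, so they survive as regular words when the lang directory is built (they become user-defined BPE symbols in the next file). A toy sketch of the exclusion loop; the word list is made up, only the tail of `excluded` is visible in the hunk, and the loop body is an assumption about what follows `for w in excluded:`:

words = ["A", "ABOUT", "[LAUGHTER]", "[NOISE]", "[VOCALIZED-NOISE]", "#0", "<s>", "</s>"]
excluded = ["#0", "<s>", "</s>"]  # per the hunk, the noise tags are no longer excluded

for w in excluded:
    if w in words:
        words.remove(w)

print(words)  # ['A', 'ABOUT', '[LAUGHTER]', '[NOISE]', '[VOCALIZED-NOISE]']
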
View File

@@ -75,6 +75,8 @@ def main():
     # If you change it, you should also change other
     # places that are using it.
+    user_defined_symbols += ["[LAUGHTER]", "[NOISE]", "[VOCALIZED-NOISE]"]
     model_file = Path(model_prefix + ".model")
     if not model_file.is_file():
         spm.SentencePieceTrainer.train(

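The added line registers the noise tags as user-defined symbols so SentencePiece keeps them as single tokens instead of splitting them into subword pieces. A sketch of the training call in the style of the usual icefall train_bpe_model.py; the transcript path, vocab size, and the other training options are assumptions, only the user_defined_symbols line comes from this hunk:

import sentencepiece as spm

user_defined_symbols = ["<blk>", "<sos/eos>"]
user_defined_symbols += ["[LAUGHTER]", "[NOISE]", "[VOCALIZED-NOISE]"]

spm.SentencePieceTrainer.train(
    input="data/lang_bpe_500/transcript_words.txt",  # assumed transcript path
    vocab_size=500,
    model_type="unigram",
    model_prefix="data/lang_bpe_500/unigram_500",
    character_coverage=1.0,
    user_defined_symbols=user_defined_symbols,
    unk_id=2,   # <blk>=0 and <sos/eos>=1 occupy the first two ids
    bos_id=-1,
    eos_id=-1,
)
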
View File

@@ -45,7 +45,7 @@ fisher_dir="/export/corpora3/LDC/LDC2004T19"
 vocab_sizes=(
   # 5000
   # 2000
-  # 1000
+  1000
   500
 )
@@ -197,7 +197,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   # [noise] nsn
   # !sil sil
   # <unk> spn
-  cat data/local/dict_nosp/lexicon.txt |
+  cat data/local/dict_nosp/lexicon.txt | sed 's/-//g' | sed 's/\[vocalizednoise\]/\[vocalized-noise\]/g' |
     sort | uniq >$lang_dir/lexicon_lower.txt
   cat $lang_dir/lexicon_lower.txt | tr a-z A-Z > $lang_dir/lexicon.txt
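The new sed commands strip hyphens from every lexicon line and then restore the hyphen inside the [vocalized-noise] tag that the first substitution breaks. A rough Python equivalent of the added normalization with the sort | uniq step folded in; the output filename is a stand-in for $lang_dir/lexicon_lower.txt and the input file must exist from the earlier stages:

with open("data/local/dict_nosp/lexicon.txt") as f:
    lines = set()
    for line in f:
        line = line.replace("-", "")  # sed 's/-//g'
        # restore the tag mangled by the hyphen removal
        line = line.replace("[vocalizednoise]", "[vocalized-noise]")
        lines.add(line)

with open("lexicon_lower.txt", "w") as f:  # stand-in for $lang_dir/lexicon_lower.txt
    f.writelines(sorted(lines))  # sort | uniq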