mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
* Begin to add RNN-T training for librispeech. * Copy files from conformer_ctc. Will edit it. * Use conformer/transformer model as encoder. * Begin to add training script. * Add training code. * Remove long utterances to avoid OOM when a large max_duraiton is used. * Begin to add decoding script. * Add decoding script. * Minor fixes. * Add beam search. * Use LSTM layers for the encoder. Need more tunings. * Use stateless decoder. * Minor fixes to make it ready for merge. * Fix README. * Update RESULT.md to include RNN-T Conformer. * Minor fixes. * Fix tests. * Minor fixes. * Minor fixes. * Fix tests.
216 lines
3.9 KiB
Python
Executable File
216 lines
3.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
#
|
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
This file displays duration statistics of utterances in a manifest.
|
|
You can use the displayed value to choose minimum/maximum duration
|
|
to remove short and long utterances during the training.
|
|
|
|
See the function `remove_short_and_long_utt()` in transducer/train.py
|
|
for usage.
|
|
"""
|
|
|
|
|
|
from lhotse import load_manifest
|
|
|
|
|
|
def main():
|
|
path = "./data/fbank/cuts_train-clean-100.json.gz"
|
|
path = "./data/fbank/cuts_train-clean-360.json.gz"
|
|
path = "./data/fbank/cuts_train-other-500.json.gz"
|
|
path = "./data/fbank/cuts_dev-clean.json.gz"
|
|
path = "./data/fbank/cuts_dev-other.json.gz"
|
|
path = "./data/fbank/cuts_test-clean.json.gz"
|
|
path = "./data/fbank/cuts_test-other.json.gz"
|
|
|
|
cuts = load_manifest(path)
|
|
cuts.describe()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
|
|
|
"""
|
|
## train-clean-100
|
|
Cuts count: 85617
|
|
Total duration (hours): 303.8
|
|
Speech duration (hours): 303.8 (100.0%)
|
|
***
|
|
Duration statistics (seconds):
|
|
mean 12.8
|
|
std 3.8
|
|
min 1.3
|
|
0.1% 1.9
|
|
0.5% 2.2
|
|
1% 2.5
|
|
5% 4.2
|
|
10% 6.4
|
|
25% 11.4
|
|
50% 13.8
|
|
75% 15.3
|
|
90% 16.7
|
|
95% 17.3
|
|
99% 18.1
|
|
99.5% 18.4
|
|
99.9% 18.8
|
|
max 27.2
|
|
|
|
## train-clean-360
|
|
Cuts count: 312042
|
|
Total duration (hours): 1098.2
|
|
Speech duration (hours): 1098.2 (100.0%)
|
|
***
|
|
Duration statistics (seconds):
|
|
mean 12.7
|
|
std 3.8
|
|
min 1.0
|
|
0.1% 1.8
|
|
0.5% 2.2
|
|
1% 2.5
|
|
5% 4.2
|
|
10% 6.2
|
|
25% 11.2
|
|
50% 13.7
|
|
75% 15.3
|
|
90% 16.6
|
|
95% 17.3
|
|
99% 18.1
|
|
99.5% 18.4
|
|
99.9% 18.8
|
|
max 33.0
|
|
|
|
## train-other 500
|
|
Cuts count: 446064
|
|
Total duration (hours): 1500.6
|
|
Speech duration (hours): 1500.6 (100.0%)
|
|
***
|
|
Duration statistics (seconds):
|
|
mean 12.1
|
|
std 4.2
|
|
min 0.8
|
|
0.1% 1.7
|
|
0.5% 2.1
|
|
1% 2.3
|
|
5% 3.5
|
|
10% 5.0
|
|
25% 9.8
|
|
50% 13.4
|
|
75% 15.1
|
|
90% 16.5
|
|
95% 17.2
|
|
99% 18.1
|
|
99.5% 18.4
|
|
99.9% 18.9
|
|
max 31.0
|
|
|
|
## dev-clean
|
|
Cuts count: 2703
|
|
Total duration (hours): 5.4
|
|
Speech duration (hours): 5.4 (100.0%)
|
|
***
|
|
Duration statistics (seconds):
|
|
mean 7.2
|
|
std 4.7
|
|
min 1.4
|
|
0.1% 1.6
|
|
0.5% 1.8
|
|
1% 1.9
|
|
5% 2.4
|
|
10% 2.7
|
|
25% 3.8
|
|
50% 5.9
|
|
75% 9.3
|
|
90% 13.3
|
|
95% 16.4
|
|
99% 23.8
|
|
99.5% 28.5
|
|
99.9% 32.3
|
|
max 32.6
|
|
|
|
## dev-other
|
|
Cuts count: 2864
|
|
Total duration (hours): 5.1
|
|
Speech duration (hours): 5.1 (100.0%)
|
|
***
|
|
Duration statistics (seconds):
|
|
mean 6.4
|
|
std 4.3
|
|
min 1.1
|
|
0.1% 1.3
|
|
0.5% 1.7
|
|
1% 1.8
|
|
5% 2.2
|
|
10% 2.6
|
|
25% 3.5
|
|
50% 5.3
|
|
75% 7.9
|
|
90% 12.0
|
|
95% 15.0
|
|
99% 22.2
|
|
99.5% 27.1
|
|
99.9% 32.4
|
|
max 35.2
|
|
|
|
## test-clean
|
|
Cuts count: 2620
|
|
Total duration (hours): 5.4
|
|
Speech duration (hours): 5.4 (100.0%)
|
|
***
|
|
Duration statistics (seconds):
|
|
mean 7.4
|
|
std 5.2
|
|
min 1.3
|
|
0.1% 1.6
|
|
0.5% 1.8
|
|
1% 2.0
|
|
5% 2.3
|
|
10% 2.7
|
|
25% 3.7
|
|
50% 5.8
|
|
75% 9.6
|
|
90% 14.6
|
|
95% 17.8
|
|
99% 25.5
|
|
99.5% 28.4
|
|
99.9% 32.8
|
|
max 35.0
|
|
|
|
## test-other
|
|
Cuts count: 2939
|
|
Total duration (hours): 5.3
|
|
Speech duration (hours): 5.3 (100.0%)
|
|
***
|
|
Duration statistics (seconds):
|
|
mean 6.5
|
|
std 4.4
|
|
min 1.2
|
|
0.1% 1.5
|
|
0.5% 1.8
|
|
1% 1.9
|
|
5% 2.3
|
|
10% 2.6
|
|
25% 3.4
|
|
50% 5.2
|
|
75% 8.2
|
|
90% 12.6
|
|
95% 15.8
|
|
99% 21.4
|
|
99.5% 23.8
|
|
99.9% 33.5
|
|
max 34.5
|
|
"""
|