icefall/egs/librispeech/ASR/local/display_manifest_statistics.py

#!/usr/bin/env python3
# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file displays duration statistics of utterances in a manifest.
You can use the displayed value to choose minimum/maximum duration
to remove short and long utterances during the training.

See the function `remove_short_and_long_utt()` in transducer/train.py
for usage.
"""


from lhotse import load_manifest


def main():
    path = "./data/fbank/cuts_train-clean-100.json.gz"
    path = "./data/fbank/cuts_train-clean-360.json.gz"
    path = "./data/fbank/cuts_train-other-500.json.gz"
    path = "./data/fbank/cuts_dev-clean.json.gz"
    path = "./data/fbank/cuts_dev-other.json.gz"
    path = "./data/fbank/cuts_test-clean.json.gz"
    path = "./data/fbank/cuts_test-other.json.gz"

    cuts = load_manifest(path)
    cuts.describe()


if __name__ == "__main__":
    main()

"""
## train-clean-100
Cuts count: 85617
Total duration (hours): 303.8
Speech duration (hours): 303.8 (100.0%)
***
Duration statistics (seconds):
mean    12.8
std     3.8
min     1.3
0.1%    1.9
0.5%    2.2
1%      2.5
5%      4.2
10%     6.4
25%     11.4
50%     13.8
75%     15.3
90%     16.7
95%     17.3
99%     18.1
99.5%   18.4
99.9%   18.8
max     27.2

## train-clean-360
Cuts count: 312042
Total duration (hours): 1098.2
Speech duration (hours): 1098.2 (100.0%)
***
Duration statistics (seconds):
mean    12.7
std     3.8
min     1.0
0.1%    1.8
0.5%    2.2
1%      2.5
5%      4.2
10%     6.2
25%     11.2
50%     13.7
75%     15.3
90%     16.6
95%     17.3
99%     18.1
99.5%   18.4
99.9%   18.8
max     33.0

## train-other 500
Cuts count: 446064
Total duration (hours): 1500.6
Speech duration (hours): 1500.6 (100.0%)
***
Duration statistics (seconds):
mean    12.1
std     4.2
min     0.8
0.1%    1.7
0.5%    2.1
1%      2.3
5%      3.5
10%     5.0
25%     9.8
50%     13.4
75%     15.1
90%     16.5
95%     17.2
99%     18.1
99.5%   18.4
99.9%   18.9
max     31.0

## dev-clean
Cuts count: 2703
Total duration (hours): 5.4
Speech duration (hours): 5.4 (100.0%)
***
Duration statistics (seconds):
mean    7.2
std     4.7
min     1.4
0.1%    1.6
0.5%    1.8
1%      1.9
5%      2.4
10%     2.7
25%     3.8
50%     5.9
75%     9.3
90%     13.3
95%     16.4
99%     23.8
99.5%   28.5
99.9%   32.3
max     32.6

## dev-other
Cuts count: 2864
Total duration (hours): 5.1
Speech duration (hours): 5.1 (100.0%)
***
Duration statistics (seconds):
mean    6.4
std     4.3
min     1.1
0.1%    1.3
0.5%    1.7
1%      1.8
5%      2.2
10%     2.6
25%     3.5
50%     5.3
75%     7.9
90%     12.0
95%     15.0
99%     22.2
99.5%   27.1
99.9%   32.4
max     35.2

## test-clean
Cuts count: 2620
Total duration (hours): 5.4
Speech duration (hours): 5.4 (100.0%)
***
Duration statistics (seconds):
mean    7.4
std     5.2
min     1.3
0.1%    1.6
0.5%    1.8
1%      2.0
5%      2.3
10%     2.7
25%     3.7
50%     5.8
75%     9.6
90%     14.6
95%     17.8
99%     25.5
99.5%   28.4
99.9%   32.8
max     35.0

## test-other
Cuts count: 2939
Total duration (hours): 5.3
Speech duration (hours): 5.3 (100.0%)
***
Duration statistics (seconds):
mean    6.5
std     4.4
min     1.2
0.1%    1.5
0.5%    1.8
1%      1.9
5%      2.3
10%     2.6
25%     3.4
50%     5.2
75%     8.2
90%     12.6
95%     15.8
99%     21.4
99.5%   23.8
99.9%   33.5
max     34.5
"""