mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-06 23:54:17 +00:00
update the PR#1374 (feedback from @csukuangfj)
- fixing .py headers and docstrings
- removing BUT specific parts of `prepare.sh`
- adding assert `num_jobs >= num_workers` to `compute_fbank.py`
- narrowing list of languages (let's limit to ASR sets with transcripts for now)
- added links to `README.md`
- extending `text_from_manifest.py`
This commit is contained in:
parent
4ec48f30b1
commit
07a229ac81
@@ -1,6 +1,8 @@
 # Readme
 
-This recipe contains data preparation for the VoxPopuli dataset.
+This recipe contains data preparation for the
+[VoxPopuli](https://github.com/facebookresearch/voxpopuli) dataset
+[(pdf)](https://aclanthology.org/2021.acl-long.80.pdf).
 At the moment, without model training.
 
 
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
 # 2023 Brno University of Technology (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -36,23 +36,29 @@ located at: `{src_dir}/{prefix}_cuts_{dataset}_raw.jsonl.gz`.
 The generated fbank features are saved in `data/fbank/{prefix}-{dataset}_feats`
 and CutSet manifest stored in `data/fbank/{prefix}_cuts_{dataset}.jsonl.gz`.
 
-The number of workers is smaller than nunber of jobs
+Typically, the number of workers is smaller than number of jobs
 (see --num-jobs 100 --num-workers 25 in the example).
+And, the number of jobs should be at least the number of workers (it's checked).
 """
 
 import argparse
 import logging
 import multiprocessing
 import os
 
 from concurrent.futures import ProcessPoolExecutor
 from pathlib import Path
 
 import sentencepiece as spm
 import torch
 from filter_cuts import filter_cuts
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
-from lhotse import is_caching_enabled, set_caching_enabled
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    is_caching_enabled,
+    set_caching_enabled,
+)
 
 from icefall.utils import str2bool
 
@@ -128,7 +134,6 @@ def get_args():
 
-
 def compute_fbank_features(args: argparse.Namespace):
 
     set_caching_enabled(True)  # lhotse
 
     src_dir = Path(args.src_dir)
@@ -181,6 +186,7 @@ def compute_fbank_features(args: argparse.Namespace):
     # We typically use `num_jobs=100, num_workers=20`
     # - this is helpful for large databases
     # - both values are configurable externally
+    assert num_jobs >= num_workers, (num_jobs, num_workers)
     executor = ProcessPoolExecutor(
         max_workers=num_workers,
         mp_context=multiprocessing.get_context("spawn"),
@@ -202,7 +208,7 @@ def compute_fbank_features(args: argparse.Namespace):
 
     # correct small deviations of duration, caused by speed-perturbation
     for cut in cut_set:
-        assert len(cut.supervisions) == 1
+        assert len(cut.supervisions) == 1, (len(cut.supervisions), cut.id)
         duration_difference = abs(cut.supervisions[0].duration - cut.duration)
         tolerance = 0.02  # 20ms
         if duration_difference == 0.0:
@@ -211,7 +217,7 @@ def compute_fbank_features(args: argparse.Namespace):
             logging.info(
                 "small mismatch of the supervision duration "
                 f"(Δt = {duration_difference*1000}ms), "
-                f"corretcing : cut.duration {cut.duration} -> "
+                f"correcting : cut.duration {cut.duration} -> "
                 f"supervision {cut.supervisions[0].duration}"
            )
            cut.supervisions[0].duration = cut.duration
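The new guard ties the two knobs together: `--num-jobs` controls how many pieces the manifest is split into, `--num-workers` sizes the process pool, and each worker must get at least one job. A minimal standalone sketch of the pattern (the values and the `process_split` placeholder are illustrative, not the recipe's actual extraction code):

```python
import multiprocessing
from concurrent.futures import ProcessPoolExecutor


def process_split(split_idx: int) -> int:
    # placeholder for the per-split fbank extraction work
    return split_idx


if __name__ == "__main__":
    num_jobs = 100  # how many pieces the manifest is split into
    num_workers = 25  # size of the process pool

    # the invariant enforced by the new assert: every worker gets >= 1 job
    assert num_jobs >= num_workers, (num_jobs, num_workers)

    # "spawn" avoids inheriting CUDA/torch state into child processes
    executor = ProcessPoolExecutor(
        max_workers=num_workers,
        mp_context=multiprocessing.get_context("spawn"),
    )
    results = list(executor.map(process_split, range(num_jobs)))
```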
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
 # 2023 Brno University of Technology (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -1,5 +1,5 @@
-#!/bin/env python3
+#!/usr/bin/env python3
 # Copyright 2023 Brno University of Technology (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -25,11 +25,11 @@ Usage example:
 """
 
 import argparse
+import gzip
 import json
 import logging
-import sys
-import gzip
 import re
+import sys
 
 
 def get_args():
@@ -54,7 +54,6 @@ def main():
     total_n_utts = 0
 
     for fname in args.filename:
-
         if fname == "-":
             fd = sys.stdin
         elif re.match(r".*\.jsonl\.gz$", fname):
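The reordered imports above belong to a reader that accepts several input kinds; the `main()` hunk shows the dispatch on the filename. A sketch of that pattern (the `open_text_source` helper name is mine, not the script's; the dispatch logic mirrors the hunk):

```python
import gzip
import re
import sys


def open_text_source(fname: str):
    # '-' means read from a pipe, *.jsonl.gz is opened with transparent
    # decompression, anything else is treated as a plain text file
    if fname == "-":
        return sys.stdin
    if re.match(r".*\.jsonl\.gz$", fname):
        return gzip.open(fname, mode="rt")
    return open(fname, mode="r")
```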
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # Copyright 2023 Xiaomi Corp. (authors: Yifan Yang)
+# 2023 Brno University of Technology (author: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -19,8 +20,8 @@
 Preprocess the database.
 - Convert RecordingSet and SupervisionSet to CutSet.
 - Apply text normalization to the transcripts.
-- We take renormzlized `orig_text` as `text` transcripts.
-- The the text normalization is separating punctuation from words.
+- We take renormalized `orig_text` as `text` transcripts.
+- The text normalization is separating punctuation from words.
 - Also we put capital letter to the beginning of a sentence.
 
 The script is inspired in:
@@ -40,12 +41,12 @@ from typing import Optional
 from lhotse import CutSet
 from lhotse.recipes.utils import read_manifests_if_cached
 
-from icefall.utils import str2bool
-
 # from local/
 from separate_punctuation import separate_punctuation
 from uppercase_begin_of_sentence import UpperCaseBeginOfSentence
 
+from icefall.utils import str2bool
+
 
 def get_args():
     parser = argparse.ArgumentParser()
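For context, the imports above come from a preprocessing script that merges RecordingSet and SupervisionSet manifests into a CutSet before normalizing transcripts. A sketch of that first step with the standard lhotse API (the prefix and dataset part are illustrative, not the recipe's exact values):

```python
from lhotse import CutSet
from lhotse.recipes.utils import read_manifests_if_cached

# loads cached {prefix}_recordings_{part}.jsonl.gz / {prefix}_supervisions_{part}.jsonl.gz
manifests = read_manifests_if_cached(
    dataset_parts=["dev"],
    output_dir="data/manifests",
    prefix="voxpopuli-asr-en",
    suffix="jsonl.gz",
)
for part, m in manifests.items():
    # combine recordings and supervisions into a CutSet
    cut_set = CutSet.from_manifests(
        recordings=m["recordings"],
        supervisions=m["supervisions"],
    )
```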
@@ -1,5 +1,5 @@
-#!/bin/env python3
+#!/usr/bin/env python3
 # Copyright 2023 Brno University of Technology (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -20,7 +20,7 @@ Example:
 input: "This is fine. Yes, you are right."
 output: "This is fine . Yes , you are right ."
 
-The script also handles exceptions in a hard-coded fasion.
+The script also handles exceptions in a hard-coded fashion.
 
 (same functionality could be done with `nltk.tokenize.word_tokenize()`,
 but that would be an extra dependency)
@@ -28,17 +28,18 @@ The script also handles exceptions in a hard-coded fasion.
 It can be used as a module, or as an executable script.
 
 Usage example #1:
-from separate_punctuation import separate_punctuation
+`from separate_punctuation import separate_punctuation`
 
 Usage example #2:
+```
 python3 ./local/separate_punctuation.py \
-  --ignore-columnts 1 \
-  ${kaldi_data}/text
+  --ignore-columns 1 \
+  < ${kaldi_data}/text
+```
 """
 
-import sys
 import re
+import sys
 from argparse import ArgumentParser
 
 
@@ -67,10 +68,8 @@ def separate_punctuation(text: str) -> str:
 
     # re-join the special cases of punctuation
     for ii, tok in enumerate(tokens):
-
         # no rewriting for 1st and last token
         if ii > 0 and ii < len(tokens) - 1:
-
             # **RULES ADDED FOR CZECH COMMON VOICE**
 
             # fix "27 . dubna" -> "27. dubna", but keep punctuation separate,
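Only the special-case re-joining of `separate_punctuation()` appears in the hunk above. A reduced sketch of the base behavior from the docstring example (`split_punctuation` is an illustrative name, and this omits the hard-coded exceptions, e.g. keeping Czech ordinals like "27." attached):

```python
import re


def split_punctuation(text: str) -> str:
    # pad sentence punctuation with spaces, then collapse the whitespace
    padded = re.sub(r"([.,!?])", r" \1 ", text)
    return " ".join(padded.split())


print(split_punctuation("This is fine. Yes, you are right."))
# -> "This is fine . Yes , you are right ."
```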
@@ -1,5 +1,5 @@
-#!/bin/env python3
+#!/usr/bin/env python3
 # Copyright 2023 Brno University of Technology (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -15,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Print the text contained in `supervisions.jsonl.gz`.
+Print the text contained in `supervisions.jsonl.gz` or `cuts.jsonl.gz`.
 
 Usage example:
 python3 ./local/text_from_manifest.py \
@@ -23,8 +23,8 @@ Usage example:
 """
 
 import argparse
-import json
 import gzip
+import json
 
 
 def get_args():
@@ -41,7 +41,13 @@ def main():
     with gzip.open(args.filename, mode="r") as fd:
         for line in fd:
             js = json.loads(line)
-            print(js["text"])
+            if "text" in js:
+                print(js["text"])  # supervisions.jsonl.gz
+            elif "supervisions" in js:
+                for s in js["supervisions"]:
+                    print(s["text"])  # cuts.jsonl.gz
+            else:
+                raise Exception(f"Unknown jsonl format of {args.filename}")
 
 
 if __name__ == "__main__":
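The new branching handles the two manifest layouts lhotse writes: supervision manifests carry `text` at the top level, while cut manifests nest supervisions in a list. A sketch with two illustrative JSONL lines (real manifests carry more fields):

```python
import json

sup_line = '{"id": "utt1", "text": "hello world"}'  # supervisions.jsonl.gz style
cut_line = '{"id": "cut1", "supervisions": [{"text": "hello world"}]}'  # cuts.jsonl.gz style

for line in (sup_line, cut_line):
    js = json.loads(line)
    if "text" in js:
        print(js["text"])
    elif "supervisions" in js:
        for s in js["supervisions"]:
            print(s["text"])
```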
@@ -1,5 +1,5 @@
-#!/bin/env python3
+#!/usr/bin/env python3
 # Copyright 2023 Brno University of Technology (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -19,17 +19,18 @@ This script introduces initial capital letter at the beginning of a sentence.
 It can be used as a module, or as an executable script.
 
 Usage example #1:
-from uppercase_begin_of_sentence import UpperCaseBeginOfSentence
+`from uppercase_begin_of_sentence import UpperCaseBeginOfSentence`
 
 Usage example #2:
+```
 python3 ./local/uppercase_begin_of_sentence.py \
-  --ignore-columnts 1 \
-  ${kaldi_data}/text
+  --ignore-columns 1 \
+  < ${kaldi_data}/text
+```
 """
 
 import re
 import sys
 
 from argparse import ArgumentParser
 
 
@@ -44,7 +45,6 @@ class UpperCaseBeginOfSentence:
     """
 
     def __init__(self):
-
         # The 1st word will have Title-case
         # This variable transfers context from previous line
         self.prev_token_is_punct = True
@@ -59,7 +59,6 @@ class UpperCaseBeginOfSentence:
         punct_set = set([".", "!", "?"])
 
         for ii, w in enumerate(words):
-
             # punctuation ?
             if w in punct_set:
                 self.prev_token_is_punct = True
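`UpperCaseBeginOfSentence` keeps one bit of state, `prev_token_is_punct`, across calls, so a sentence boundary at the end of one line capitalizes the first word of the next. A self-contained sketch of that state machine (class and method names here are illustrative; the diff does not show the real processing method):

```python
class SentenceCapitalizer:
    def __init__(self):
        # the 1st word gets Title-case; the flag survives across lines
        self.prev_token_is_punct = True

    def process_line(self, line: str) -> str:
        words = line.split()
        punct_set = set([".", "!", "?"])
        for ii, w in enumerate(words):
            if w in punct_set:
                self.prev_token_is_punct = True
            elif self.prev_token_is_punct:
                words[ii] = w.capitalize()
                self.prev_token_is_punct = False
        return " ".join(words)


ucb = SentenceCapitalizer()
print(ucb.process_line("hello . how are"))  # "Hello . How are"
print(ucb.process_line("you ? fine ."))     # "you ? Fine ."
```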
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
+# 2023 Brno University of Technology (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -18,7 +19,8 @@
 This script checks the following assumptions of the generated manifest:
 
 - Single supervision per cut
-- Supervision time bounds are within cut time bounds
+- Supervision time bounds are within Cut time bounds
+- Duration of Cut and Supervision are equal
 
 We will add more checks later if needed.
 
@@ -27,14 +29,13 @@ Usage example:
 python3 ./local/validate_manifest.py \
     ./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
 
+(Based on: `librispeech/ASR/local/validate_manifest.py`)
 """
 
 import argparse
 import logging
 from pathlib import Path
 
-from icefall.utils import setup_logger
-
 from lhotse import CutSet, load_manifest_lazy
 from lhotse.cut import Cut
 from lhotse.dataset.speech_recognition import validate_for_asr
@@ -49,14 +50,6 @@ def get_args():
         help="Path to the manifest file",
     )
 
-    parser.add_argument(
-        "--log-file",
-        type=str,
-        default=None,
-        required=True,
-        help="The filename to save the log.",
-    )
-
     return parser.parse_args()
 
 
@@ -101,8 +94,6 @@ def main():
     args = get_args()
 
     manifest = args.cutset_manifest
-
-    setup_logger(log_filename=f"{args.log_file}", log_level="info")
     logging.info(f"Validating {manifest}")
 
     assert manifest.is_file(), f"{manifest} does not exist"
@@ -125,4 +116,8 @@ def main():
 
 
 if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
     main()
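The validator now sets up plain `logging` instead of icefall's `setup_logger`, and its docstring lists the three invariants it enforces. A sketch of those checks with standard lhotse attributes (the manifest path is illustrative; the real script also runs lhotse's `validate_for_asr`):

```python
from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/voxpopuli-asr-en_cuts_dev.jsonl.gz")
for cut in cuts:
    # single supervision per cut
    assert len(cut.supervisions) == 1, cut.id
    s = cut.supervisions[0]
    # supervision time bounds are within Cut time bounds
    assert s.start >= 0 and s.end <= cut.duration, cut.id
    # durations of Cut and Supervision are equal (up to a small tolerance)
    assert abs(s.duration - cut.duration) < 0.02, cut.id
```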
@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 
-. /mnt/matylda5/iveselyk/ASR_TOOLKITS/K2_SHERPA_PYTORCH20/conda-activate.sh
+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 
 set -euxo pipefail
 
@@ -8,28 +9,17 @@ nj=20
 stage=-1
 stop_stage=100
 
-# Split data/${lang}set to this number of pieces
-# This is to avoid OOM during feature extraction.
-num_splits=100
-
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
 # by this script automatically.
 #
-# [TODO update this]
+# - $dl_dir/voxpopuli/raw_audios/$lang/$year
+#   This directory contains *.ogg files with audio downloaded and extracted from archives:
+#   https://dl.fbaipublicfiles.com/voxpopuli/audios/${lang}_${year}.tar
 #
-# - $dl_dir/$release/$lang
-#   This directory contains the following files downloaded from
-#   https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/${release}/${release}-${lang}.tar.gz
-#
-#   - clips
-#   - dev.tsv
-#   - invalidated.tsv
-#   - other.tsv
-#   - reported.tsv
-#   - test.tsv
-#   - train.tsv
-#   - validated.tsv
+# - Note: the voxpopuli transcripts are downloaded to a ${tmp} folder
+#   as part of `lhotse prepare voxpopuli` from:
+#   https://dl.fbaipublicfiles.com/voxpopuli/annotations/asr/asr_${lang}.tsv.gz
 #
 # - $dl_dir/musan
 #   This directory contains the following directories downloaded from
@@ -39,19 +29,19 @@ num_splits=100
 #   - noise
 #   - speech
 
-#dl_dir=$PWD/download
-dl_dir=/mnt/matylda6/szoke/EU-ASR/DATA
+dl_dir=$PWD/download
+#dl_dir=/mnt/matylda6/szoke/EU-ASR/DATA # BUT
 
-#musan_dir=${dl_dir}/musan
-musan_dir=/mnt/matylda2/data/MUSAN
+musan_dir=${dl_dir}/musan
+#musan_dir=/mnt/matylda2/data/MUSAN # BUT
 
-# Choose vlues from:
+# Choose value from ASR_LANGUAGES:
 #
-# "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
-# "sk", "sl", "et", "lt", "pt", "bg", "el", "lv", "mt", "sv", "da",
-# "asr", "10k", "100k", "400k"
+# [ "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
+# "sk", "sl", "et", "lt" ]
 #
-# See: https://github.com/lhotse-speech/lhotse/blob/c5f26afd100885b86e4244eeb33ca1986f3fa923/lhotse/bin/modes/recipes/voxpopuli.py#L77
+# See ASR_LANGUAGES in:
+# https://github.com/lhotse-speech/lhotse/blob/c5f26afd100885b86e4244eeb33ca1986f3fa923/lhotse/recipes/voxpopuli.py#L54C4-L54C4
 lang=en
 
 task=asr
@@ -102,12 +92,6 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   if [ ! -d $musan_dir/musan ]; then
     lhotse download musan $musan_dir
   fi
-
-  # pre-download the transcripts
-  DOWNLOAD_BASE_URL="https://dl.fbaipublicfiles.com/voxpopuli"
-  dir=data/manifests; mkdir -p ${dir}
-  wget --tries=10 --continue --progress=bar --directory-prefix=${dir} \
-    "${DOWNLOAD_BASE_URL}/annotations/asr/${task}_${lang}.tsv.gz"
 fi
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
@@ -115,7 +99,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   # We assume that you have downloaded the VoxPopuli corpus
   # to $dl_dir/voxpopuli
   if [ ! -e data/manifests/.voxpopuli-${task}-${lang}.done ]; then
-    # Warning : it requires Internet connection (it downloads transcripts)
+    # Warning : it requires Internet connection (it downloads transcripts to ${tmpdir})
    lhotse prepare voxpopuli --task asr --lang $lang -j $nj $dl_dir/voxpopuli data/manifests
     touch data/manifests/.voxpopuli-${task}-${lang}.done
   fi
@@ -150,7 +134,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   for dataset in "dev" "test"; do
     if [ ! -e data/fbank/.voxpopuli-${task}-${lang}-${dataset}.done ]; then
       ./local/compute_fbank.py --src-dir data/fbank --output-dir data/fbank \
-        --num-jobs 50 --num-workers 10 \
+        --num-jobs 50 --num-workers ${nj} \
         --prefix "voxpopuli-${task}-${lang}" \
         --dataset ${dataset} \
         --trim-to-supervisions True
@@ -160,10 +144,10 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 6: Compute fbank for train set of VoxPopuli"
+  log "Stage 5: Compute fbank for train set of VoxPopuli"
  if [ ! -e data/fbank/.voxpopuli-${task}-${lang}-train.done ]; then
     ./local/compute_fbank.py --src-dir data/fbank --output-dir data/fbank \
-      --num-jobs 100 --num-workers 25 \
+      --num-jobs 100 --num-workers ${nj} \
       --prefix "voxpopuli-${task}-${lang}" \
       --dataset train \
       --trim-to-supervisions True \
@@ -173,7 +157,17 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
 fi
 
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Compute fbank for musan"
+  log "Stage 6: Validate fbank manifests for VoxPopuli"
+  for dataset in "dev" "test" "train"; do
+    mkdir -p data/fbank/log/
+    ./local/validate_cutset_manifest.py \
+      data/fbank/voxpopuli-asr-en_cuts_${dataset}.jsonl.gz \
+      2>&1 | tee data/fbank/log/validate_voxpopuli-asr-en_cuts_${dataset}.log
+  done
+fi
+
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+  log "Stage 7: Compute fbank for musan"
   mkdir -p data/fbank
   if [ ! -e data/fbank/.musan.done ]; then
     ./local/compute_fbank_musan.py
@@ -181,8 +175,8 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
   fi
 fi
 
-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-  log "Stage 7: Prepare BPE based lang"
+if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+  log "Stage 8: Prepare BPE based lang"
 
  for vocab_size in ${vocab_sizes[@]}; do
     lang_dir=data/lang_bpe_${vocab_size}_${lang}