update PR#1374 (feedback from @csukuangfj)

- fixing .py headers and docstrings
- removing BUT-specific parts of `prepare.sh`
- adding an assert `num_jobs >= num_workers` to `compute_fbank.py`
- narrowing the list of languages
  (let's limit to ASR sets with transcripts for now)
- adding links to `README.md`
- extending `text_from_manifest.py`
Karel Vesely 2023-11-13 17:23:24 +01:00
parent 4ec48f30b1
commit 07a229ac81
10 changed files with 96 additions and 95 deletions

View File

@@ -1,6 +1,8 @@
 # Readme
 
-This recipe contains data preparation for the VoxPopuli dataset.
+This recipe contains data preparation for the
+[VoxPopuli](https://github.com/facebookresearch/voxpopuli) dataset
+[(pdf)](https://aclanthology.org/2021.acl-long.80.pdf).
 
 At the moment, without model training.

View File

@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.  (authors: Fangjun Kuang)
 #              2023  Brno University of Technology  (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -36,23 +36,29 @@ located at: `{src_dir}/{prefix}_cuts_{dataset}_raw.jsonl.gz`.
 The generated fbank features are saved in `data/fbank/{prefix}-{dataset}_feats`
 and CutSet manifest stored in `data/fbank/{prefix}_cuts_{dataset}.jsonl.gz`.
 
-The number of workers is smaller than nunber of jobs
+Typically, the number of workers is smaller than number of jobs
 (see --num-jobs 100 --num-workers 25 in the example).
+And, the number of jobs should be at least the number of workers (it's checked).
 """
 
 import argparse
 import logging
 import multiprocessing
 import os
 from concurrent.futures import ProcessPoolExecutor
 from pathlib import Path
 
 import sentencepiece as spm
 import torch
 from filter_cuts import filter_cuts
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
-from lhotse import is_caching_enabled, set_caching_enabled
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    is_caching_enabled,
+    set_caching_enabled,
+)
 
 from icefall.utils import str2bool
@@ -128,7 +134,6 @@ def get_args():
 def compute_fbank_features(args: argparse.Namespace):
     set_caching_enabled(True)  # lhotse
 
     src_dir = Path(args.src_dir)
@@ -181,6 +186,7 @@ def compute_fbank_features(args: argparse.Namespace):
     # We typically use `num_jobs=100, num_workers=20`
     # - this is helpful for large databases
     # - both values are configurable externally
+    assert num_jobs >= num_workers, (num_jobs, num_workers)
     executor = ProcessPoolExecutor(
         max_workers=num_workers,
         mp_context=multiprocessing.get_context("spawn"),
@@ -202,7 +208,7 @@ def compute_fbank_features(args: argparse.Namespace):
         # correct small deviations of duration, caused by speed-perturbation
         for cut in cut_set:
-            assert len(cut.supervisions) == 1
+            assert len(cut.supervisions) == 1, (len(cut.supervisions), cut.id)
             duration_difference = abs(cut.supervisions[0].duration - cut.duration)
             tolerance = 0.02  # 20ms
             if duration_difference == 0.0:
@@ -211,7 +217,7 @@ def compute_fbank_features(args: argparse.Namespace):
                 logging.info(
                     "small mismatch of the supervision duration "
                     f"(Δt = {duration_difference*1000}ms), "
-                    f"corretcing : cut.duration {cut.duration} -> "
+                    f"correcting : cut.duration {cut.duration} -> "
                     f"supervision {cut.supervisions[0].duration}"
                 )
                 cut.supervisions[0].duration = cut.duration
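
For illustration, a minimal sketch of the jobs/workers pattern that the new assert guards: the cuts are split into `num_jobs` pieces and processed by at most `num_workers` processes (`process_split` is a hypothetical stand-in for the per-split fbank computation, not the recipe code):

```python
import multiprocessing
from concurrent.futures import ProcessPoolExecutor


def process_split(split):
    # hypothetical stand-in for the per-split fbank computation
    return sum(split)


def run(splits, num_workers):
    num_jobs = len(splits)
    # the check added by this commit: at least as many splits as processes
    assert num_jobs >= num_workers, (num_jobs, num_workers)
    # "spawn" avoids forking a parent that already holds torch/CUDA state
    with ProcessPoolExecutor(
        max_workers=num_workers,
        mp_context=multiprocessing.get_context("spawn"),
    ) as executor:
        return list(executor.map(process_split, splits))


if __name__ == "__main__":
    print(run([[1, 2], [3, 4], [5, 6], [7, 8]], num_workers=2))
```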

View File

@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.  (authors: Fangjun Kuang)
 #              2023  Brno University of Technology  (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #

View File

@@ -1,5 +1,5 @@
-#!/bin/env python3
+#!/usr/bin/env python3
 # Copyright    2023  Brno University of Technology  (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -25,11 +25,11 @@ Usage example:
 """
 
 import argparse
+import gzip
 import json
 import logging
-import sys
-import gzip
 import re
+import sys
 
 
 def get_args():
@@ -54,7 +54,6 @@ def main():
     total_n_utts = 0
     for fname in args.filename:
         if fname == "-":
             fd = sys.stdin
         elif re.match(r".*\.jsonl\.gz$", fname):
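
The hunk above only reorders the imports of a small reader utility; for context, the `-`/`.jsonl.gz` dispatch visible in its last hunk works roughly like this (a sketch; the plain-text fallback is an assumption):

```python
import gzip
import re
import sys


def open_input(fname):
    """Return a text stream for '-', *.jsonl.gz, or a plain file."""
    if fname == "-":
        return sys.stdin
    elif re.match(r".*\.jsonl\.gz$", fname):
        return gzip.open(fname, mode="rt")  # 'rt' yields str lines
    else:
        return open(fname)  # assumed fallback for plain-text files
```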

View File

@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # Copyright    2023  Xiaomi Corp.  (authors: Yifan Yang)
+#              2023  Brno University of Technology  (author: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -19,8 +20,8 @@
 Preprocess the database.
 - Convert RecordingSet and SupervisionSet to CutSet.
 - Apply text normalization to the transcripts.
-  - We take renormzlized `orig_text` as `text` transcripts.
-  - The the text normalization is separating punctuation from words.
+  - We take renormalized `orig_text` as `text` transcripts.
+  - The text normalization is separating punctuation from words.
   - Also we put capital letter to the beginning of a sentence.
 
 The script is inspired in:
@@ -40,12 +41,12 @@ from typing import Optional
 from lhotse import CutSet
 from lhotse.recipes.utils import read_manifests_if_cached
-from icefall.utils import str2bool
 
 # from local/
 from separate_punctuation import separate_punctuation
 from uppercase_begin_of_sentence import UpperCaseBeginOfSentence
 
+from icefall.utils import str2bool
 
 
 def get_args():
     parser = argparse.ArgumentParser()

View File

@@ -1,5 +1,5 @@
-#!/bin/env python3
+#!/usr/bin/env python3
 # Copyright    2023  Brno University of Technology  (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -20,7 +20,7 @@ Example:
   input: "This is fine. Yes, you are right."
   output: "This is fine . Yes , you are right ."
 
-The script also handles exceptions in a hard-coded fasion.
+The script also handles exceptions in a hard-coded fashion.
 (same functionality could be done with `nltk.tokenize.word_tokenize()`,
  but that would be an extra dependency)
@@ -28,17 +28,18 @@ The script also handles exceptions in a hard-coded fasion.
 It can be used as a module, or as an executable script.
 
 Usage example #1:
-  from separate_punctuation import separate_punctuation
+  `from separate_punctuation import separate_punctuation`
 
 Usage example #2:
+  ```
   python3 ./local/separate_punctuation.py \
-    --ignore-columnts 1 \
-    ${kaldi_data}/text
+    --ignore-columns 1 \
+    < ${kaldi_data}/text
+  ```
 """
 
-import sys
 import re
+import sys
 from argparse import ArgumentParser
@@ -67,10 +68,8 @@ def separate_punctuation(text: str) -> str:
     # re-join the special cases of punctuation
     for ii, tok in enumerate(tokens):
         # no rewriting for 1st and last token
         if ii > 0 and ii < len(tokens) - 1:
             # **RULES ADDED FOR CZECH COMMON VOICE**
             # fix "27 . dubna" -> "27. dubna", but keep punctuation separate,
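
For intuition, the core of 'separating punctuation from words' can be approximated with a single regex; this sketch deliberately omits the hard-coded exceptions the script exists to handle:

```python
import re


def separate_punctuation_sketch(text: str) -> str:
    # pad sentence punctuation with spaces, then re-normalize whitespace
    padded = re.sub(r"([.,!?])", r" \1 ", text)
    return " ".join(padded.split())


print(separate_punctuation_sketch("This is fine. Yes, you are right."))
# -> This is fine . Yes , you are right .
```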

View File

@@ -1,5 +1,5 @@
-#!/bin/env python3
+#!/usr/bin/env python3
 # Copyright    2023  Brno University of Technology  (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -15,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Print the text contained in `supervisions.jsonl.gz`.
+Print the text contained in `supervisions.jsonl.gz` or `cuts.jsonl.gz`.
 
 Usage example:
   python3 ./local/text_from_manifest.py \
@@ -23,8 +23,8 @@ Usage example:
 """
 
 import argparse
-import json
 import gzip
+import json
 
 
 def get_args():
@@ -41,7 +41,13 @@ def main():
     with gzip.open(args.filename, mode="r") as fd:
         for line in fd:
             js = json.loads(line)
-            print(js["text"])
+            if "text" in js:
+                print(js["text"])  # supervisions.jsonl.gz
+            elif "supervisions" in js:
+                for s in js["supervisions"]:
+                    print(s["text"])  # cuts.jsonl.gz
+            else:
+                raise Exception(f"Unknown jsonl format of {args.filename}")
 
 
 if __name__ == "__main__":
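
The new branching supports both manifest layouts. A self-contained sketch with simplified JSON lines (field sets reduced to what the script reads):

```python
import json

# one line from supervisions.jsonl.gz (simplified)
supervision_line = '{"id": "utt1", "text": "hello world"}'
# one line from cuts.jsonl.gz (simplified)
cut_line = '{"id": "cut1", "supervisions": [{"text": "hello world"}]}'

for line in (supervision_line, cut_line):
    js = json.loads(line)
    if "text" in js:
        print(js["text"])  # supervisions.jsonl.gz
    elif "supervisions" in js:
        for s in js["supervisions"]:
            print(s["text"])  # cuts.jsonl.gz
```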

View File

@@ -1,5 +1,5 @@
-#!/bin/env python3
+#!/usr/bin/env python3
 # Copyright    2023  Brno University of Technology  (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -19,17 +19,18 @@ This script introduces initial capital letter at the beginning of a sentence.
 It can be used as a module, or as an executable script.
 
 Usage example #1:
-  from uppercase_begin_of_sentence import UpperCaseBeginOfSentence
+  `from uppercase_begin_of_sentence import UpperCaseBeginOfSentence`
 
 Usage example #2:
+  ```
   python3 ./local/uppercase_begin_of_sentence.py \
-    --ignore-columnts 1 \
-    ${kaldi_data}/text
+    --ignore-columns 1 \
+    < ${kaldi_data}/text
+  ```
 """
 
 import re
 import sys
 from argparse import ArgumentParser
@@ -44,7 +45,6 @@ class UpperCaseBeginOfSentence:
     """
 
     def __init__(self):
        # The 1st word will have Title-case
        # This variable transfers context from previous line
        self.prev_token_is_punct = True
@@ -59,7 +59,6 @@ class UpperCaseBeginOfSentence:
        punct_set = set([".", "!", "?"])
        for ii, w in enumerate(words):
            # punctuation ?
            if w in punct_set:
                self.prev_token_is_punct = True
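
A standalone sketch of the rule the class implements (the real `UpperCaseBeginOfSentence` carries `prev_token_is_punct` as object state across lines; here it is just an argument):

```python
def uppercase_begin_of_sentence(words, prev_token_is_punct=True):
    """Title-case a word when it follows sentence-final punctuation."""
    punct_set = {".", "!", "?"}
    out = []
    for w in words:
        if w in punct_set:
            prev_token_is_punct = True
        else:
            if prev_token_is_punct:
                w = w[:1].upper() + w[1:]
            prev_token_is_punct = False
        out.append(w)
    return out


print(" ".join(uppercase_begin_of_sentence(
    "this is fine . yes , you are right .".split()
)))
# -> This is fine . Yes , you are right .
```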

View File

@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # Copyright    2022  Xiaomi Corp.  (authors: Fangjun Kuang)
+#              2023  Brno University of Technology  (authors: Karel Veselý)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -18,7 +19,8 @@
 This script checks the following assumptions of the generated manifest:
 - Single supervision per cut
-- Supervision time bounds are within cut time bounds
+- Supervision time bounds are within Cut time bounds
+- Duration of Cut and Supervision are equal
 
 We will add more checks later if needed.
@@ -27,14 +29,13 @@ Usage example:
   python3 ./local/validate_manifest.py \
     ./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
 
+(Based on: `librispeech/ASR/local/validate_manifest.py`)
 """
 
 import argparse
 import logging
 from pathlib import Path
 
-from icefall.utils import setup_logger
 from lhotse import CutSet, load_manifest_lazy
 from lhotse.cut import Cut
 from lhotse.dataset.speech_recognition import validate_for_asr
@@ -49,14 +50,6 @@ def get_args():
         help="Path to the manifest file",
     )
 
-    parser.add_argument(
-        "--log-file",
-        type=str,
-        default=None,
-        required=True,
-        help="The filename to save the log.",
-    )
-
     return parser.parse_args()
@@ -101,8 +94,6 @@ def main():
     args = get_args()
     manifest = args.cutset_manifest
 
-    setup_logger(log_filename=f"{args.log_file}", log_level="info")
-
     logging.info(f"Validating {manifest}")
 
     assert manifest.is_file(), f"{manifest} does not exist"
@@ -125,4 +116,8 @@ def main():
 if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
     main()
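
The three checked assumptions, restated on minimal stand-in classes (hypothetical, not lhotse's real `Cut` and supervision types):

```python
class Supervision:
    def __init__(self, start: float, duration: float):
        self.start = start
        self.duration = duration


class Cut:
    def __init__(self, duration: float, supervisions: list):
        self.duration = duration
        self.supervisions = supervisions


def validate_cut(cut: Cut) -> None:
    # single supervision per cut
    assert len(cut.supervisions) == 1, len(cut.supervisions)
    s = cut.supervisions[0]
    # supervision time bounds are within cut time bounds
    assert 0.0 <= s.start and s.start + s.duration <= cut.duration
    # duration of cut and supervision are equal
    assert s.duration == cut.duration, (s.duration, cut.duration)


validate_cut(Cut(3.2, [Supervision(0.0, 3.2)]))
```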

View File

@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 
-. /mnt/matylda5/iveselyk/ASR_TOOLKITS/K2_SHERPA_PYTORCH20/conda-activate.sh
+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 
 set -euxo pipefail
@@ -8,28 +9,17 @@ nj=20
 stage=-1
 stop_stage=100
 
-# Split data/${lang}set to this number of pieces
-# This is to avoid OOM during feature extraction.
-num_splits=100
-
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
 # by this script automatically.
 #
-# [TODO update this]
-# This directory contains *.ogg files with audio downloaded and extracted from archives:
-# https://dl.fbaipublicfiles.com/voxpopuli/audios/${lang}_${year}.tar
+# - $dl_dir/voxpopuli/raw_audios/$lang/$year
 #
-# - $dl_dir/$release/$lang
-# This directory contains the following files downloaded from
-# https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/${release}/${release}-${lang}.tar.gz
-#
-# - clips
-# - dev.tsv
-# - invalidated.tsv
-# - other.tsv
-# - reported.tsv
-# - test.tsv
-# - train.tsv
-# - validated.tsv
+# - Note: the voxpopuli transcripts are downloaded to a ${tmp} folder
+#   as part of `lhotse prepare voxpopuli` from:
+#   https://dl.fbaipublicfiles.com/voxpopuli/annotations/asr/asr_${lang}.tsv.gz
 #
 # - $dl_dir/musan
 # This directory contains the following directories downloaded from
@@ -39,19 +29,19 @@ num_splits=100
 # - noise
 # - speech
 
-#dl_dir=$PWD/download
-dl_dir=/mnt/matylda6/szoke/EU-ASR/DATA
-#musan_dir=${dl_dir}/musan
-musan_dir=/mnt/matylda2/data/MUSAN
+dl_dir=$PWD/download
+#dl_dir=/mnt/matylda6/szoke/EU-ASR/DATA  # BUT
+musan_dir=${dl_dir}/musan
+#musan_dir=/mnt/matylda2/data/MUSAN  # BUT
 
-# Choose vlues from:
+# Choose value from ASR_LANGUAGES:
 #
-# "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
-# "sk", "sl", "et", "lt", "pt", "bg", "el", "lv", "mt", "sv", "da",
-# "asr", "10k", "100k", "400k"
+# [ "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
+#   "sk", "sl", "et", "lt" ]
 #
-# See: https://github.com/lhotse-speech/lhotse/blob/c5f26afd100885b86e4244eeb33ca1986f3fa923/lhotse/bin/modes/recipes/voxpopuli.py#L77
+# See ASR_LANGUAGES in:
+# https://github.com/lhotse-speech/lhotse/blob/c5f26afd100885b86e4244eeb33ca1986f3fa923/lhotse/recipes/voxpopuli.py#L54C4-L54C4
 
 lang=en
 task=asr
@@ -102,12 +92,6 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   if [ ! -d $musan_dir/musan ]; then
     lhotse download musan $musan_dir
   fi
-
-  # pre-download the transcripts
-  DOWNLOAD_BASE_URL="https://dl.fbaipublicfiles.com/voxpopuli"
-  dir=data/manifests; mkdir -p ${dir}
-  wget --tries=10 --continue --progress=bar --directory-prefix=${dir} \
-    "${DOWNLOAD_BASE_URL}/annotations/asr/${task}_${lang}.tsv.gz"
 fi
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
@@ -115,7 +99,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   # We assume that you have downloaded the VoxPopuli corpus
   # to $dl_dir/voxpopuli
   if [ ! -e data/manifests/.voxpopuli-${task}-${lang}.done ]; then
-    # Warning : it requires Internet connection (it downloads transcripts)
+    # Warning : it requires Internet connection (it downloads transcripts to ${tmpdir})
     lhotse prepare voxpopuli --task asr --lang $lang -j $nj $dl_dir/voxpopuli data/manifests
     touch data/manifests/.voxpopuli-${task}-${lang}.done
   fi
@@ -150,7 +134,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   for dataset in "dev" "test"; do
     if [ ! -e data/fbank/.voxpopuli-${task}-${lang}-${dataset}.done ]; then
       ./local/compute_fbank.py --src-dir data/fbank --output-dir data/fbank \
-        --num-jobs 50 --num-workers 10 \
+        --num-jobs 50 --num-workers ${nj} \
         --prefix "voxpopuli-${task}-${lang}" \
         --dataset ${dataset} \
         --trim-to-supervisions True
@@ -160,10 +144,10 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 6: Compute fbank for train set of VoxPopuli"
+  log "Stage 5: Compute fbank for train set of VoxPopuli"
   if [ ! -e data/fbank/.voxpopuli-${task}-${lang}-train.done ]; then
     ./local/compute_fbank.py --src-dir data/fbank --output-dir data/fbank \
-      --num-jobs 100 --num-workers 25 \
+      --num-jobs 100 --num-workers ${nj} \
       --prefix "voxpopuli-${task}-${lang}" \
      --dataset train \
      --trim-to-supervisions True \
@@ -173,7 +157,17 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
 fi
 
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Compute fbank for musan"
+  log "Stage 6: Validate fbank manifests for VoxPopuli"
+  for dataset in "dev" "test" "train"; do
+    mkdir -p data/fbank/log/
+    ./local/validate_cutset_manifest.py \
+      data/fbank/voxpopuli-asr-en_cuts_${dataset}.jsonl.gz \
+      2>&1 | tee data/fbank/log/validate_voxpopuli-asr-en_cuts_${dataset}.log
+  done
+fi
+
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+  log "Stage 7: Compute fbank for musan"
   mkdir -p data/fbank
   if [ ! -e data/fbank/.musan.done ]; then
     ./local/compute_fbank_musan.py
@@ -181,8 +175,8 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
   fi
 fi
 
-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-  log "Stage 7: Prepare BPE based lang"
+if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+  log "Stage 8: Prepare BPE based lang"
   for vocab_size in ${vocab_sizes[@]}; do
     lang_dir=data/lang_bpe_${vocab_size}_${lang}