enhance documentation

2025-08-26 18:24:18 +00:00 · 2024-03-26 14:56:29 +08:00 · 2024-03-26 14:56:29 +08:00 · f4c187286a
commit f4c187286a
parent 7a8c9b7f53
2 changed files with 36 additions and 11 deletions
--- a/egs/audioset/AT/local/generate_audioset_manifest.py
+++ b/egs/audioset/AT/local/generate_audioset_manifest.py
@ -1,7 +1,30 @@
 #!/usr/bin/env python3
 # Copyright    2023  Xiaomi Corp.        (authors: Xiaoyu Yang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file generates the manifests and computes the fbank features for the
 AudioSet dataset. The generated manifests and features are stored in data/fbank.
 """
 import argparse
 import csv
 import glob
 import logging
 import os
 import torch
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
@ -15,8 +38,13 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
-def parse_csv(csv_file="downloads/audioset/full_train_asedata_with_duration.csv"):
+def parse_csv(csv_file):
-
+    # The content of the csv file should be something like this
    # ------------------------------------------------------
    # filename  label
    # dataset/AudioSet/balanced/xxxx.wav 0;451
    # dataset/AudioSet/balanced/xxxy.wav 375
    # ------------------------------------------------------
    mapping = {}
    with open(csv_file, "r") as fin:
        reader = csv.reader(fin, delimiter="\t")
@ -45,7 +73,7 @@ def get_parser():
    parser.add_argument(
        "--feat-output-dir",
        type=str,
-        default="data/fbank_audioset",
+        default="data/fbank",
    )
    return parser
@ -59,12 +87,9 @@ def main():
    split = args.split
    feat_output_dir = args.feat_output_dir
-    num_jobs = 15
+    num_jobs = min(15, os.cpu_count())
    num_mel_bins = 80
    import pdb
    pdb.set_trace()
    if split in ["balanced", "unbalanced"]:
        csv_file = "downloads/audioset/full_train_asedata_with_duration.csv"
    elif split == "eval":
@ -100,7 +125,7 @@ def main():
            supervision.audio_event = labels[cut_id]
        except KeyError:
            logging.info(f"No labels found for {cut_id}.")
-            supervision.audio_event = ""
+            continue
        cut.supervisions = [supervision]
        new_cuts.append(cut)
@ -115,7 +140,7 @@ def main():
    with get_executor() as ex:
        cuts = cuts.compute_and_store_features(
            extractor=extractor,
-            storage_path=f"{feat_output_dir}/{split}_{args.split}_feats",
+            storage_path=f"{feat_output_dir}/{split}_{split}_feats",
            num_jobs=num_jobs if ex is None else 80,
            executor=ex,
            storage_type=LilcomChunkyWriter,
--- a/egs/audioset/AT/zipformer/at_datamodule.py
+++ b/egs/audioset/AT/zipformer/at_datamodule.py
@ -56,7 +56,7 @@ class AudioSetATDatamodule:
    DataModule for k2 audio tagging (AT) experiments.
-    It contains all the common data pipeline modules used in ASR
+    It contains all the common data pipeline modules used in AT
    experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
@ -64,7 +64,7 @@ class AudioSetATDatamodule:
    - augmentation,
    - on-the-fly feature extraction
-    This class should be derived for specific corpora used in ASR tasks.
+    This class should be derived for specific corpora used in AT tasks.
    """
    def __init__(self, args: argparse.Namespace):