From f4c187286a742d56658854c6b25427649a63ebf6 Mon Sep 17 00:00:00 2001
From: marcoyang <marcoyang1998@gmail.com>
Date: Tue, 26 Mar 2024 14:56:29 +0800
Subject: [PATCH] enhance documentation

---
 .../AT/local/generate_audioset_manifest.py    | 43 +++++++++++++++----
 egs/audioset/AT/zipformer/at_datamodule.py    |  4 +-
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/egs/audioset/AT/local/generate_audioset_manifest.py b/egs/audioset/AT/local/generate_audioset_manifest.py
index 060337a72..8d4f4ec98 100644
--- a/egs/audioset/AT/local/generate_audioset_manifest.py
+++ b/egs/audioset/AT/local/generate_audioset_manifest.py
@@ -1,7 +1,30 @@
+#!/usr/bin/env python3
+# Copyright    2023  Xiaomi Corp.        (authors: Xiaoyu Yang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This file generates the manifest and computes the fbank features for the
+AudioSet dataset. Manifests and features are stored in data/fbank by default.
+"""
+
 import argparse
 import csv
 import glob
 import logging
+import os
 
 import torch
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
@@ -15,8 +38,13 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 
 
-def parse_csv(csv_file="downloads/audioset/full_train_asedata_with_duration.csv"):
-
+def parse_csv(csv_file):
+    # The content of the csv file should be something like this:
+    # ------------------------------------------------------
+    # filename  label
+    # dataset/AudioSet/balanced/xxxx.wav 0;451
+    # dataset/AudioSet/balanced/xxxy.wav 375
+    # ------------------------------------------------------
     mapping = {}
     with open(csv_file, "r") as fin:
         reader = csv.reader(fin, delimiter="\t")
@@ -45,7 +73,7 @@ def get_parser():
     parser.add_argument(
         "--feat-output-dir",
         type=str,
-        default="data/fbank_audioset",
+        default="data/fbank",
     )
 
     return parser
@@ -59,12 +87,9 @@ def main():
     split = args.split
     feat_output_dir = args.feat_output_dir
 
-    num_jobs = 15
+    num_jobs = min(15, os.cpu_count())
     num_mel_bins = 80
 
-    import pdb
-
-    pdb.set_trace()
     if split in ["balanced", "unbalanced"]:
         csv_file = "downloads/audioset/full_train_asedata_with_duration.csv"
     elif split == "eval":
@@ -100,7 +125,7 @@ def main():
             supervision.audio_event = labels[cut_id]
         except KeyError:
             logging.info(f"No labels found for {cut_id}.")
-            supervision.audio_event = ""
+            continue
         cut.supervisions = [supervision]
         new_cuts.append(cut)
 
@@ -115,7 +140,7 @@ def main():
     with get_executor() as ex:
         cuts = cuts.compute_and_store_features(
             extractor=extractor,
-            storage_path=f"{feat_output_dir}/{split}_{args.split}_feats",
+            storage_path=f"{feat_output_dir}/{split}_{split}_feats",
             num_jobs=num_jobs if ex is None else 80,
             executor=ex,
             storage_type=LilcomChunkyWriter,
diff --git a/egs/audioset/AT/zipformer/at_datamodule.py b/egs/audioset/AT/zipformer/at_datamodule.py
index 77483a6b2..cbb639ec7 100644
--- a/egs/audioset/AT/zipformer/at_datamodule.py
+++ b/egs/audioset/AT/zipformer/at_datamodule.py
@@ -56,7 +56,7 @@ class AudioSetATDatamodule:
     DataModule for k2 audio tagging (AT) experiments.
 
 
-    It contains all the common data pipeline modules used in ASR
+    It contains all the common data pipeline modules used in AT
     experiments, e.g.:
     - dynamic batch size,
     - bucketing samplers,
@@ -64,7 +64,7 @@ class AudioSetATDatamodule:
     - augmentation,
     - on-the-fly feature extraction
 
-    This class should be derived for specific corpora used in ASR tasks.
+    This class should be derived for specific corpora used in AT tasks.
     """
 
     def __init__(self, args: argparse.Namespace):