change data -> data_{dataset}

sathvik udupa 2023-05-02 14:02:23 +05:30
parent 99487d3000
commit 92c3f61f21
10 changed files with 696 additions and 62 deletions

View File

@@ -52,12 +52,11 @@ class _SeedWorkers:
fix_random_seed(self.seed + worker_id)
class LibriSpeechAsrDataModule:
class MUCSAsrDataModule:
"""
DataModule for k2 ASR experiments.
It assumes there is always one train and valid dataloader,
but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
and test-other).
DataModule for k2 ASR experiments.
It assumes there is always one train dataloader, one valid dataloader, and one test dataloader.
This is modified from the LibriSpeech asr_datamodule.
It contains all the common data pipeline modules used in ASR
experiments, e.g.:

View File

@@ -38,7 +38,7 @@ import k2
import torch
import torch.multiprocessing as mp
import torch.nn as nn
from asr_datamodule import LibriSpeechAsrDataModule
from asr_datamodule import MUCSAsrDataModule
from conformer import Conformer
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed
@@ -687,7 +687,7 @@ def run(rank, world_size, args):
if checkpoints:
optimizer.load_state_dict(checkpoints["optimizer"])
librispeech = LibriSpeechAsrDataModule(args)
librispeech = MUCSAsrDataModule(args)
# params.full_libri = False
# if params.full_libri:
# train_cuts = librispeech.train_all_shuf_cuts()
@@ -800,7 +800,7 @@ def scan_pessimistic_batches_for_oom(
def main():
parser = get_parser()
LibriSpeechAsrDataModule.add_arguments(parser)
MUCSAsrDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
args.lang_dir = Path(args.lang_dir)

View File

@@ -1 +0,0 @@
../../../librispeech/ASR/local/compile_hlg.py

egs/mucs/ASR/local/compile_hlg.py Executable file
View File

@@ -0,0 +1,167 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script takes as input lang_dir and generates HLG from
- H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt
- L, the lexicon, built from lang_dir/L_disambig.pt
Caution: We use a lexicon that contains disambiguation symbols
- G, the LM, built from <datadir>/lm/G_n_gram.fst.txt, where <datadir> is the first path component of lang_dir (e.g. data_bn-en)
The generated HLG is saved in $lang_dir/HLG.pt
"""
import argparse
import logging
from pathlib import Path
import k2
import torch
from icefall.lexicon import Lexicon
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--lm",
type=str,
default="G_3_gram",
help="""Stem name for LM used in HLG compiling.
""",
)
parser.add_argument(
"--lang-dir",
type=str,
help="""Input and output directory.
""",
)
return parser.parse_args()
def compile_HLG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
"""
Args:
lang_dir:
The language directory, e.g., data_bn-en/lang_phone or data_bn-en/lang_bpe_400.
lm:
The LM stem name, e.g., G_3_gram.
Return:
An FSA representing HLG.
"""
lexicon = Lexicon(lang_dir)
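# lang_dir looks like "data_bn-en/lang_bpe_400"; its first path component is the
# dataset-specific data directory, which also holds the lm/ subdirectory.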
datapath = str(lang_dir).split('/')[0]
max_token_id = max(lexicon.tokens)
logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
H = k2.ctc_topo(max_token_id)
L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
if Path(f"{datapath}/lm/{lm}.pt").is_file():
logging.info(f"Loading pre-compiled {lm}")
d = torch.load(f"{datapath}/lm/{lm}.pt")
G = k2.Fsa.from_dict(d)
else:
logging.info(f"Loading {lm}.fst.txt")
with open(f"{datapath}/lm/{lm}.fst.txt") as f:
G = k2.Fsa.from_openfst(f.read(), acceptor=False)
torch.save(G.as_dict(), f"{datapath}/lm/{lm}.pt")
first_token_disambig_id = lexicon.token_table["#0"]
first_word_disambig_id = lexicon.word_table["#0"]
L = k2.arc_sort(L)
G = k2.arc_sort(G)
logging.info("Intersecting L and G")
LG = k2.compose(L, G)
logging.info(f"LG shape: {LG.shape}")
logging.info("Connecting LG")
LG = k2.connect(LG)
logging.info(f"LG shape after k2.connect: {LG.shape}")
logging.info(type(LG.aux_labels))
logging.info("Determinizing LG")
LG = k2.determinize(LG)
logging.info(type(LG.aux_labels))
logging.info("Connecting LG after k2.determinize")
LG = k2.connect(LG)
logging.info("Removing disambiguation symbols on LG")
LG.labels[LG.labels >= first_token_disambig_id] = 0
# See https://github.com/k2-fsa/k2/issues/874
# for why we need to set LG.properties to None
LG.__dict__["_properties"] = None
assert isinstance(LG.aux_labels, k2.RaggedTensor)
LG.aux_labels.values[LG.aux_labels.values >= first_word_disambig_id] = 0
LG = k2.remove_epsilon(LG)
logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}")
LG = k2.connect(LG)
LG.aux_labels = LG.aux_labels.remove_values_eq(0)
logging.info("Arc sorting LG")
LG = k2.arc_sort(LG)
logging.info("Composing H and LG")
# CAUTION: The name of the inner_labels is fixed
# to `tokens`. If you want to change it, please
# also change other places in icefall that are using
# it.
HLG = k2.compose(H, LG, inner_labels="tokens")
logging.info("Connecting LG")
HLG = k2.connect(HLG)
logging.info("Arc sorting LG")
HLG = k2.arc_sort(HLG)
logging.info(f"HLG.shape: {HLG.shape}")
return HLG
def main():
args = get_args()
lang_dir = Path(args.lang_dir)
if (lang_dir / "HLG.pt").is_file():
logging.info(f"{lang_dir}/HLG.pt already exists - skipping")
return
logging.info(f"Processing {lang_dir}")
HLG = compile_HLG(lang_dir, args.lm)
logging.info(f"Saving HLG.pt to {lang_dir}")
torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt")
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
main()

egs/mucs/ASR/local/compute_fbank_mucs.py Normal file → Executable file
View File

@@ -59,6 +59,16 @@ def get_args():
type=str,
help="""Dataset parts to compute fbank. If None, we will use all""",
)
parser.add_argument(
"--manifestpath",
type=str,
help="""Dataset parts to compute fbank. If None, we will use all""",
)
parser.add_argument(
"--fbankpath",
type=str,
help="""Dataset parts to compute fbank. If None, we will use all""",
)
return parser.parse_args()
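# prepare.sh (below) invokes this script as:
#   ./local/compute_fbank_mucs.py --manifestpath $datadir/manifests/ --fbankpath $datadir/fbank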
@@ -67,8 +77,8 @@ def compute_fbank_mucs(
bpe_model: Optional[str] = None,
dataset: Optional[str] = None,
):
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
src_dir = Path(args.manifestpath)
output_dir = Path(args.fbankpath)
num_jobs = min(48, os.cpu_count())
num_mel_bins = 80

View File

@@ -0,0 +1,87 @@
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation
# Johns Hopkins University (author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch
$exclude = 0;
$field = 1;
$shifted = 0;
do {
$shifted=0;
if ($ARGV[0] eq "--exclude") {
$exclude = 1;
shift @ARGV;
$shifted=1;
}
if ($ARGV[0] eq "-f") {
$field = $ARGV[1];
shift @ARGV; shift @ARGV;
$shifted=1
}
} while ($shifted);
if(@ARGV < 1 || @ARGV > 2) {
die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
"Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
"Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
"only the lines that were *not* in id_list.\n" .
"Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
"If your older scripts (written before Oct 2014) stopped working and you used the\n" .
"-f option, add 1 to the argument.\n" .
"See also: utils/filter_scp.pl .\n";
}
$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
@A = split;
@A>=1 || die "Invalid id-list file line $_";
$seen{$A[0]} = 1;
}
if ($field == 1) { # Treat this as special case, since it is common.
while(<>) {
$_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
# $1 is what we filter on.
if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
print $_;
}
}
} else {
while(<>) {
@A = split;
@A > 0 || die "Invalid scp file line $_";
@A >= $field || die "Invalid scp file line $_";
if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
print $_;
}
}
}
# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
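For readers less familiar with Perl, the core filtering logic translates to Python roughly as follows (a sketch, not part of the commit; the function name is illustrative):

import sys

def filter_scp(id_list_path, scp_lines, field=1, exclude=False):
    # Collect the first token of each line in the id list.
    seen = set()
    with open(id_list_path) as f:
        for line in f:
            toks = line.split()
            if toks:
                seen.add(toks[0])
    # Print lines whose field-th token is (or, with exclude=True, is not) in the id list.
    for line in scp_lines:
        toks = line.split()
        if len(toks) < field:
            raise ValueError(f"Invalid scp line: {line!r}")
        if (toks[field - 1] in seen) != exclude:
            sys.stdout.write(line)

# filter_scp("dev/utt2spk", open("train/text")) mirrors:
#   local/filter_scp.pl dev/utt2spk <train/text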

View File

@@ -0,0 +1,196 @@
#!/usr/bin/env bash
# Copyright 2010-2011 Microsoft Corporation
# 2012-2013 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script operates on a data directory, such as in data/train/.
# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data
# for what these directories contain.
# This script creates a subset of that data, consisting of some specified
# number of utterances. (The selected utterances are distributed evenly
# throughout the file, by the program ./subset_scp.pl).
# The options below are mutually exclusive; use at most one of them.
# If you give the --per-spk option, it will attempt to select the supplied
# number of utterances for each speaker (typically you would supply a much
# smaller number in this case).
# If you give the --speakers option, it selects a subset of n randomly
# selected speakers.
# If you give the --shortest option, it will give you the n shortest utterances.
# If you give the --first option, it will just give you the n first utterances.
# If you give the --last option, it will just give you the n last utterances.
# If you give the --spk-list or --utt-list option, it reads the
# speakers/utterances to keep from <speaker-list-file>/<utt-list-file> (note,
# in this case there is no <num-utt> positional parameter; see usage message.)
shortest=false
perspk=false
speakers=false
first_opt=
spk_list=
utt_list=
expect_args=3
case $1 in
--first|--last) first_opt=$1; shift ;;
--per-spk) perspk=true; shift ;;
--shortest) shortest=true; shift ;;
--speakers) speakers=true; shift ;;
--spk-list) shift; spk_list=$1; shift; expect_args=2 ;;
--utt-list) shift; utt_list=$1; shift; expect_args=2 ;;
--*) echo "$0: invalid option '$1'"; exit 1 ;;
esac
if [ $# != $expect_args ]; then
echo "Usage:"
echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>"
echo " subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>"
echo " subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>"
echo "By default, randomly selects <num-utt> utterances from the data directory."
echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances"
echo "With --per-spk, selects <num-utt> utterances per speaker, if available."
echo "With --first, selects the first <num-utt> utterances"
echo "With --last, selects the last <num-utt> utterances"
echo "With --shortest, selects the shortest <num-utt> utterances."
echo "With --spk-list, reads the speakers to keep from <speaker-list-file>"
echo "With --utt-list, reads the utterances to keep from <utt-list-file>"
exit 1;
fi
srcdir=$1
if [[ $spk_list || $utt_list ]]; then
numutt=
destdir=$2
else
numutt=$2
destdir=$3
fi
export LC_ALL=C
if [ ! -f $srcdir/utt2spk ]; then
echo "$0: no such file $srcdir/utt2spk"
exit 1
fi
if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then
echo "$0: cannot subset to more utterances than you originally had."
exit 1
fi
if $shortest && [ ! -f $srcdir/feats.scp ]; then
echo "$0: you selected --shortest but no feats.scp exist."
exit 1
fi
mkdir -p $destdir || exit 1
if [[ $spk_list ]]; then
local/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1;
utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1;
elif [[ $utt_list ]]; then
local/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1;
local/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1;
elif $speakers; then
utils/shuffle_list.pl < $srcdir/spk2utt |
awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' |
sort > $destdir/spk2utt
utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
elif $perspk; then
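# For each speaker, print the speaker-id followed by $numutt of its utterances,
# chosen evenly spaced across that speaker's utterance list.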
awk '{ n='$numutt'; printf("%s ",$1);
skip=1; while(n*(skip+1) <= NF-1) { skip++; }
for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); }
printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
else
if $shortest; then
# Select $numutt shortest utterances.
. ./path.sh
if [ -f $srcdir/utt2num_frames ]; then
ln -sf $(utils/make_absolute.sh $srcdir)/utt2num_frames $destdir/tmp.len
else
feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1;
fi
sort -n -k2 $destdir/tmp.len |
awk '{print $1}' |
head -$numutt >$destdir/tmp.uttlist
local/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk
rm $destdir/tmp.uttlist $destdir/tmp.len
else
# Select $numutt random utterances.
local/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1;
fi
local/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt
fi
# Perform filtering. utt2spk and spk2utt files already exist by this point.
# Filter by utterance.
[ -f $srcdir/feats.scp ] &&
local/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp
[ -f $srcdir/vad.scp ] &&
local/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp
[ -f $srcdir/utt2lang ] &&
local/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang
[ -f $srcdir/utt2dur ] &&
local/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur
[ -f $srcdir/utt2num_frames ] &&
local/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames
[ -f $srcdir/utt2uniq ] &&
local/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq
[ -f $srcdir/wav.scp ] &&
local/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp
[ -f $srcdir/utt2warp ] &&
local/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp
[ -f $srcdir/text ] &&
local/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text
# Filter by speaker.
[ -f $srcdir/spk2warp ] &&
local/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp
[ -f $srcdir/spk2gender ] &&
local/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender
[ -f $srcdir/cmvn.scp ] &&
local/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp
# Filter by recording-id.
if [ -f $srcdir/segments ]; then
local/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments
# Recording-ids are in segments.
awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco
# The next line overrides the wav.scp filtering done above, which would be incorrect
# here: when a segments file exists, wav.scp is indexed by recording-id, not utterance-id.
[ -f $srcdir/wav.scp ] &&
local/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
else
# No segments; recording-ids are in wav.scp.
awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco
fi
[ -f $srcdir/reco2file_and_channel ] &&
local/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
[ -f $srcdir/reco2dur ] &&
local/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur
# Filter the STM file for proper sclite scoring.
# Copy over the comments from STM file.
[ -f $srcdir/stm ] &&
(grep "^;;" $srcdir/stm
local/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm
rm $destdir/reco
# Copy frame_shift if present.
[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir
srcutts=$(wc -l <$srcdir/utt2spk)
destutts=$(wc -l <$destdir/utt2spk)
echo "$0: reducing #utt from $srcutts to $destutts"
exit 0
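For orientation, the --first branch of the above boils down to the following Python (a sketch, not part of the commit; it mirrors only the per-utterance filtering, while the real script also rebuilds spk2utt and filters speaker- and recording-level files):

import os

def subset_first(srcdir, destdir, num_utt):
    # Keep the first num_utt lines of utt2spk, like `subset_data_dir.sh --first`.
    os.makedirs(destdir, exist_ok=True)
    with open(os.path.join(srcdir, "utt2spk")) as f:
        kept = [line for _, line in zip(range(num_utt), f)]
    with open(os.path.join(destdir, "utt2spk"), "w") as f:
        f.writelines(kept)
    keep_ids = {line.split()[0] for line in kept}
    # Filter each per-utterance file down to the kept utterance-ids.
    for name in ("text", "wav.scp", "feats.scp", "utt2dur"):
        src = os.path.join(srcdir, name)
        if not os.path.exists(src):
            continue
        with open(src) as fin, open(os.path.join(destdir, name), "w") as fout:
            for line in fin:
                toks = line.split()
                if toks and toks[0] in keep_ids:
                    fout.write(line)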

egs/mucs/ASR/local/subset_scp.pl Executable file
View File

@@ -0,0 +1,105 @@
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program selects a subset of N elements in the scp.
# By default, it selects them evenly from throughout the scp, in order to avoid
# selecting too many from the same speaker. It prints them on the standard
# output.
# With the option --first, it just selects the N first utterances.
# With the option --last, it just selects the N last utterances.
# Last modified by JHU & HKUST @2013
$quiet = 0;
$first = 0;
$last = 0;
if (@ARGV > 0 && $ARGV[0] eq "--quiet") {
shift;
$quiet = 1;
}
if (@ARGV > 0 && $ARGV[0] eq "--first") {
shift;
$first = 1;
}
if (@ARGV > 0 && $ARGV[0] eq "--last") {
shift;
$last = 1;
}
if(@ARGV < 2 ) {
die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" .
" --quiet causes it to not die if N < num lines in scp.\n" .
" --first and --last make it equivalent to head or tail.\n" .
"See also: filter_scp.pl\n";
}
$N = shift @ARGV;
if($N == 0) {
die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\"";
}
$inscp = shift @ARGV;
open(I, "<$inscp") || die "Opening input scp file $inscp";
@F = ();
while(<I>) {
push @F, $_;
}
$numlines = @F;
if($N > $numlines) {
if ($quiet) {
$N = $numlines;
} else {
die "You requested from subset_scp.pl more elements than available: $N > $numlines";
}
}
sub select_n {
my ($start,$end,$num_needed) = @_;
my $diff = $end - $start;
if ($num_needed > $diff) {
die "select_n: code error";
}
if ($diff == 1 ) {
if ($num_needed > 0) {
print $F[$start];
}
} else {
my $halfdiff = int($diff/2);
my $halfneeded = int($num_needed/2);
select_n($start, $start+$halfdiff, $halfneeded);
select_n($start+$halfdiff, $end, $num_needed - $halfneeded);
}
}
if ( ! $first && ! $last) {
if ($N > 0) {
select_n(0, $numlines, $N);
}
} else {
if ($first) { # --first option: same as head.
for ($n = 0; $n < $N; $n++) {
print $F[$n];
}
} else { # --last option: same as tail.
for ($n = @F - $N; $n < @F; $n++) {
print $F[$n];
}
}
}
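The core of this script is select_n, which picks N lines spread evenly through the input by recursive halving. The same algorithm in Python (a sketch, not part of the commit):

def select_evenly(lines, n):
    # Return n items spread evenly across lines, mirroring select_n above.
    def rec(start, end, needed):
        diff = end - start
        if needed > diff:
            raise ValueError("select_evenly: internal error")
        if diff == 1:
            return lines[start:end] if needed > 0 else []
        half = diff // 2
        half_needed = needed // 2
        return rec(start, start + half, half_needed) + rec(start + half, end, needed - half_needed)
    return rec(0, len(lines), n) if n > 0 else []

# e.g. select_evenly(list("abcde"), 2) returns ['b', 'e']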

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# converts an utt2spk file to a spk2utt file.
# Takes input from the stdin or from a file argument;
# output goes to the standard out.
if ( @ARGV > 1 ) {
die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
}
while(<>){
@A = split(" ", $_);
@A == 2 || die "Invalid line in utt2spk file: $_";
($u,$s) = @A;
if(!$seen_spk{$s}) {
$seen_spk{$s} = 1;
push @spklist, $s;
}
push (@{$spk_hash{$s}}, "$u");
}
foreach $s (@spklist) {
$l = join(' ',@{$spk_hash{$s}});
print "$s $l\n";
}
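The same conversion in Python (a sketch, not part of the commit): group utterance-ids by speaker, preserving the order in which speakers first appear:

import sys

spk2utt = {}  # dicts preserve insertion order in Python 3.7+
for line in sys.stdin:
    utt, spk = line.split()  # raises, like the Perl die, if a line has != 2 fields
    spk2utt.setdefault(spk, []).append(utt)
for spk, utts in spk2utt.items():
    print(spk, " ".join(utts))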

View File

@@ -6,28 +6,30 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
set -eou pipefail
nj=60
stage=6
stage=9
stop_stage=9
# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
# directories and files. Download them from https://www.openslr.org/resources/104/
#
# - $dl_dir/hi-en
dl_dir=$PWD/download
espnet_path=/home/wtc7/espnet/egs2/MUCS/asr1/data/hi-en/
mkdir -p $dl_dir
raw_data_path="/data/Database/MUCS/"
dataset="bn-en" #hin-en or bn-en
datadir="data_"$dataset
raw_kaldi_files_path=$dl_dir/$dataset/
. shared/parse_options.sh || exit 1
# vocab size for sentence piece models.
# It will generate $datadir/lang_bpe_xxx,
# $datadir/lang_bpe_yyy
vocab_size=400
# All files generated by this script are saved in "$datadir".
# You can safely remove "$datadir" and rerun this script to regenerate it.
mkdir -p data
mkdir -p $datadir
log() {
# This function is from espnet
@@ -38,43 +40,73 @@ log() {
log "dl_dir: $dl_dir"
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
log "Stage -1: prepare LM files"
mkdir -p $dl_dir/lm
if [ ! -e $dl_dir/lm/.done ]; then
./local/prepare_lm_files.py --out-dir=$dl_dir/lm --data-path=$espnet_path --mode="train"
touch $dl_dir/lm/.done
fi
log "Stage -1: prepare data files"
mkdir -p $dl_dir/$dataset
for x in train dev test train_all; do
if [ -d "$dl_dir/$dataset/$x" ]; then rm -Rf $dl_dir/$dataset/$x; fi
done
mkdir -p $dl_dir/$dataset/{train,test,dev}
cp -r $raw_data_path/$dataset/"train"/"transcripts"/* $dl_dir/$dataset/"train"
cp -r $raw_data_path/$dataset/"test"/"transcripts"/* $dl_dir/$dataset/"test"
for x in train test
do
cp $dl_dir/$dataset/$x/"wav.scp" $dl_dir/$dataset/$x/"wav.scp_old"
cat $dl_dir/$dataset/$x/"wav.scp" | cut -d' ' -f1 > $dl_dir/$dataset/$x/wav_ids
cat $dl_dir/$dataset/$x/"wav.scp" | cut -d' ' -f2 | awk -v var="$raw_data_path/$dataset/$x/" '{print var$1}' > $dl_dir/$dataset/$x/wav_ids_with_fullpath
paste -d' ' $dl_dir/$dataset/$x/wav_ids $dl_dir/$dataset/$x/wav_ids_with_fullpath > $dl_dir/$dataset/$x/"wav.scp"
rm $dl_dir/$dataset/$x/wav_ids
rm $dl_dir/$dataset/$x/wav_ids_with_fullpath
done
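# Hold out the first 1000 utterances as the dev set and keep the rest as the new train set;
# the untouched original is preserved as train_all.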
./local/subset_data_dir.sh --first $dl_dir/$dataset/"train" 1000 $dl_dir/$dataset/"dev"
total=$(wc -l $dl_dir/$dataset/"train"/"text" | cut -d' ' -f1)
count=$(expr $total - 1000)
./local/subset_data_dir.sh --last $dl_dir/$dataset/"train" $count $dl_dir/$dataset/"train_reduced"
mv $dl_dir/$dataset/"train" $dl_dir/$dataset/"train_all"
mv $dl_dir/$dataset/"train_reduced" $dl_dir/$dataset/"train"
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data"
log "Stage 0: prepare LM files"
mkdir -p $raw_kaldi_files_path/lm
if [ ! -e $raw_kaldi_files_path/lm/.done ]; then
./local/prepare_lm_files.py --out-dir=$dl_dir/lm --data-path=$raw_kaldi_files_path --mode="train"
touch $raw_kaldi_files_path/lm/.done
fi
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare MUCS manifest"
# We assume that you have downloaded the MUCS corpus
# to $dl_dir/
mkdir -p data/manifests
if [ ! -e data/manifests/.mucs.done ]; then
mkdir -p $datadir/manifests
if [ ! -e $datadir/manifests/.mucs.done ]; then
# generate lhotse manifests from kaldi style files
./local/prepare_manifest.py "$espnet_path" $nj data/manifests
./local/prepare_manifest.py "$raw_kaldi_files_path" $nj $datadir/manifests
touch data/manifests/.mucs.done
touch $datadir/manifests/.mucs.done
fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Compute fbank for mucs"
mkdir -p data/fbank
if [ ! -e data/fbank/.mucs.done ]; then
./local/compute_fbank_mucs.py
touch data/fbank/.mucs.done
mkdir -p $datadir/fbank
if [ ! -e $datadir/fbank/.mucs.done ]; then
./local/compute_fbank_mucs.py --manifestpath $datadir/manifests/ --fbankpath $datadir/fbank
touch $datadir/fbank/.mucs.done
fi
# exit
if [ ! -e data/fbank/.mucs-validated.done ]; then
log "Validating data/fbank for mucs"
if [ ! -e $datadir/fbank/.mucs-validated.done ]; then
log "Validating $datadir/fbank for mucs"
parts=(
train
test
@@ -82,9 +114,9 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
)
for part in ${parts[@]}; do
python3 ./local/validate_manifest.py \
data/fbank/mucs_cuts_${part}.jsonl.gz
$datadir/fbank/mucs_cuts_${part}.jsonl.gz
done
touch data/fbank/.mucs-validated.done
touch $datadir/fbank/.mucs-validated.done
fi
fi
@@ -92,7 +124,7 @@ fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Prepare phone based lang"
lang_dir=data/lang_phone
lang_dir=$datadir/lang_phone
mkdir -p $lang_dir
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
@@ -124,11 +156,11 @@ fi
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Prepare BPE based lang"
lang_dir=data/lang_bpe_${vocab_size}
lang_dir=$datadir/lang_bpe_${vocab_size}
mkdir -p $lang_dir
# We reuse words.txt from phone based lexicon
# so that the two can share G.pt later.
cp data/lang_phone/words.txt $lang_dir
cp $datadir/lang_phone/words.txt $lang_dir
if [ ! -f $lang_dir/transcript_words.txt ]; then
log "Generate data for BPE training"
@@ -172,7 +204,7 @@ fi
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Stage 7: Train LM from training data"
lang_dir=data/lang_bpe_${vocab_size}
lang_dir=$datadir/lang_bpe_${vocab_size}
if [ ! -f $lang_dir/lm_3.arpa ]; then
./shared/make_kn_lm.py \
@@ -195,37 +227,31 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
# We assume you have installed kaldilm. If not, please install
# it using: pip install kaldilm
mkdir -p data/lm
if [ ! -f data/lm/G_3_gram.fst.txt ]; then
mkdir -p $datadir/lm
if [ ! -f $datadir/lm/G_3_gram.fst.txt ]; then
# It is used in building HLG
python3 -m kaldilm \
--read-symbol-table="data/lang_phone/words.txt" \
--read-symbol-table="$datadir/lang_phone/words.txt" \
--disambig-symbol='#0' \
--max-order=3 \
data/lang_bpe_${vocab_size}/lm_3.arpa > data/lm/G_3_gram.fst.txt
$datadir/lang_bpe_${vocab_size}/lm_3.arpa > $datadir/lm/G_3_gram.fst.txt
fi
if [ ! -f data/lm/G_4_gram.fst.txt ]; then
if [ ! -f $datadir/lm/G_4_gram.fst.txt ]; then
# It is used for LM rescoring
python3 -m kaldilm \
--read-symbol-table="data/lang_phone/words.txt" \
--read-symbol-table="$datadir/lang_phone/words.txt" \
--disambig-symbol='#0' \
--max-order=4 \
data/lang_bpe_${vocab_size}/lm_4.arpa > data/lm/G_4_gram.fst.txt
$datadir/lang_bpe_${vocab_size}/lm_4.arpa > $datadir/lm/G_4_gram.fst.txt
fi
fi
if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
log "Stage 9: Compile HLG"
# ./local/compile_hlg.py --lang-dir data/lang_phone
# Note: If ./local/compile_hlg.py throws OOM,
# please switch to the following command
#
# ./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone
lang_dir=data/lang_bpe_${vocab_size}
lang_dir=$datadir/lang_bpe_${vocab_size}
./local/compile_hlg.py --lang-dir $lang_dir
fi

View File

@@ -1,17 +1,24 @@
#!/bin/bash
export CUDA_VISIBLE_DEVICES="0"
set -e
dataset='bn-en'
datadir=data_"$dataset"
bpe=400
./conformer_ctc/train.py \
--num-epochs 60 \
--max-duration 300 \
--exp-dir ./conformer_ctc/exp_with_devset_split_bpe400 \
--lang-dir data/lang_bpe_400 \
--exp-dir ./conformer_ctc/exp_"$dataset"_bpe"$bpe" \
--manifest-dir $datadir/fbank \
--lang-dir $datadir/lang_bpe_"$bpe" \
--enable-musan False

./conformer_ctc/decode.py \
--epoch 59 \
--epoch 60 \
--avg 10 \
--exp-dir ./conformer_ctc/exp_with_devset_split_bpe400 \
--manifest-dir $datadir/fbank \
--exp-dir ./conformer_ctc/exp_"$dataset"_bpe"$bpe" \
--max-duration 100 \
--lang-dir ./data/lang_bpe_400
--lang-dir $datadir/lang_bpe_"$bpe"