Implement recipe for Fluent Speech Commands dataset (#1469)

---------

Signed-off-by: Xinyuan Li <xli257@c13.clsp.jhu.edu>
Henry Li Xinyuan 2024-01-31 09:53:36 -05:00 committed by GitHub
parent 37b975cac9
commit b07d5472c5
24 changed files with 2124 additions and 3 deletions

View File

@ -0,0 +1,9 @@
## Fluent Speech Commands recipe
This is a recipe for the Fluent Speech Commands dataset, a spoken language understanding (SLU) corpus that maps short utterances (such as "turn the lights on in the kitchen") to action frames (such as {"action": "activate", "object": "lights", "location": "kitchen"}). The training set contains 23,132 utterances and the test set contains 3,793 utterances.
Dataset Paper link: <https://paperswithcode.com/dataset/fluent-speech-commands>
cd icefall/egs/fluent_speech_commands/
Data preparation: ./prepare.sh
Training: python transducer/train.py
Decoding: python transducer/decode.py
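For orientation, here is a minimal sketch (not part of the recipe) of how the dataset's CSV annotations map to such frames. It assumes the standard Fluent Speech Commands layout with data/train_data.csv and the transcription/action/object/location columns that local/generate_lexicon.py also reads:

import pandas

# Read the FSC training annotations (the same file local/generate_lexicon.py uses).
data = pandas.read_csv(
    "path/to/fluent_speech_commands/data/train_data.csv", index_col=0, header=0
)

# Each row pairs a transcription with an (action, object, location) frame.
for _, row in data.head(3).iterrows():
    frame = {
        "action": row["action"],
        "object": row["object"],
        "location": row["location"],
    }
    print(row["transcription"], "->", frame)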

View File

@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
This script takes as input lang_dir and generates HLG from
- H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt
- L, the lexicon, built from lang_dir/L_disambig.pt
Caution: We use a lexicon that contains disambiguation symbols
- G, the LM, built from lang_dir/G.fst.txt
The generated HLG is saved in $lang_dir/HLG.pt
"""
import argparse
import logging
from pathlib import Path
import k2
import torch
from icefall.lexicon import Lexicon
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--lang-dir",
type=str,
help="""Input and output directory.
""",
)
return parser.parse_args()
def compile_HLG(lang_dir: str) -> k2.Fsa:
"""
Args:
lang_dir:
The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
Return:
An FSA representing HLG.
"""
lexicon = Lexicon(lang_dir)
max_token_id = max(lexicon.tokens)
logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
H = k2.ctc_topo(max_token_id)
L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
logging.info("Loading G.fst.txt")
with open(lang_dir / "G.fst.txt") as f:
G = k2.Fsa.from_openfst(f.read(), acceptor=False)
first_token_disambig_id = lexicon.token_table["#0"]
first_word_disambig_id = lexicon.word_table["#0"]
L = k2.arc_sort(L)
G = k2.arc_sort(G)
logging.info("Intersecting L and G")
LG = k2.compose(L, G)
logging.info(f"LG shape: {LG.shape}")
logging.info("Connecting LG")
LG = k2.connect(LG)
logging.info(f"LG shape after k2.connect: {LG.shape}")
logging.info(type(LG.aux_labels))
logging.info("Determinizing LG")
LG = k2.determinize(LG)
logging.info(type(LG.aux_labels))
logging.info("Connecting LG after k2.determinize")
LG = k2.connect(LG)
logging.info("Removing disambiguation symbols on LG")
# LG.labels[LG.labels >= first_token_disambig_id] = 0
# see https://github.com/k2-fsa/k2/pull/1140
labels = LG.labels
labels[labels >= first_token_disambig_id] = 0
LG.labels = labels
assert isinstance(LG.aux_labels, k2.RaggedTensor)
LG.aux_labels.values[LG.aux_labels.values >= first_word_disambig_id] = 0
LG = k2.remove_epsilon(LG)
logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}")
LG = k2.connect(LG)
LG.aux_labels = LG.aux_labels.remove_values_eq(0)
logging.info("Arc sorting LG")
LG = k2.arc_sort(LG)
logging.info("Composing H and LG")
# CAUTION: The name of the inner_labels is fixed
# to `tokens`. If you want to change it, please
# also change other places in icefall that are using
# it.
HLG = k2.compose(H, LG, inner_labels="tokens")
logging.info("Connecting LG")
HLG = k2.connect(HLG)
logging.info("Arc sorting LG")
HLG = k2.arc_sort(HLG)
logging.info(f"HLG.shape: {HLG.shape}")
return HLG
def main():
args = get_args()
lang_dir = Path(args.lang_dir)
if (lang_dir / "HLG.pt").is_file():
logging.info(f"{lang_dir}/HLG.pt already exists - skipping")
return
logging.info(f"Processing {lang_dir}")
HLG = compile_HLG(lang_dir)
logging.info(f"Saving HLG.pt to {lang_dir}")
torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt")
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
main()
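As a quick sanity check (a sketch, not part of the committed script), the saved graph can be loaded back the same way this script loads L_disambig.pt; the path below assumes the prepare.sh defaults:

import k2
import torch

# Load the decoding graph written by compile_hlg.py for the frames lexicon.
HLG = k2.Fsa.from_dict(torch.load("data/lm/frames/HLG.pt"))
print("HLG shape:", HLG.shape, "num_arcs:", HLG.num_arcs)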

View File

@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""
This file computes fbank features of the Fluent Speech Commands dataset.
It looks for manifests in the directory data/manifests.
The generated fbank features are saved in data/fbank.
"""
import argparse
import logging
import os
from pathlib import Path
import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
# Torch's multithreaded behavior needs to be disabled or it wastes a
# lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
def compute_fbank_slu(manifest_dir, fbanks_dir):
src_dir = Path(manifest_dir)
output_dir = Path(fbanks_dir)
# This dataset is rather small, so we use only one job
num_jobs = min(1, os.cpu_count())
num_mel_bins = 23
dataset_parts = (
"train",
"valid",
"test",
)
prefix = "slu"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
assert len(manifests) == len(dataset_parts), (
len(manifests),
len(dataset_parts),
list(manifests.keys()),
dataset_parts,
)
extractor = Fbank(FbankConfig(sampling_rate=16000, num_mel_bins=num_mel_bins))
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_file.is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
cut_set = CutSet.from_manifests(
recordings=m["recordings"],
supervisions=m["supervisions"],
)
if "train" in partition:
cut_set = (
cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 1, # use one job
executor=ex,
storage_type=LilcomChunkyWriter,
)
cut_set.to_file(cuts_file)
parser = argparse.ArgumentParser()
parser.add_argument("manifest_dir")
parser.add_argument("fbanks_dir")
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
args = parser.parse_args()
logging.basicConfig(format=formatter, level=logging.INFO)
compute_fbank_slu(args.manifest_dir, args.fbanks_dir)
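A short sketch (assuming the default data/fbanks output directory) of how the resulting cut manifests can be inspected; slu_datamodule.py loads them the same way, and the lhotse SLU recipe is expected to attach the frame labels under supervision.custom["frames"], which is what the training and decoding scripts read:

from lhotse import load_manifest_lazy

# Open the training cuts written by compute_fbank_slu.py.
cuts = load_manifest_lazy("data/fbanks/slu_cuts_train.jsonl.gz")

# Peek at the first cut: duration plus the frame annotation used as the label.
for cut in cuts:
    print(cut.id, cut.duration, cut.supervisions[0].custom["frames"])
    break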

View File

@ -0,0 +1,59 @@
import argparse
import pandas
from tqdm import tqdm
def generate_lexicon(corpus_dir, lm_dir):
data = pandas.read_csv(
str(corpus_dir) + "/data/train_data.csv", index_col=0, header=0
)
vocab_transcript = set()
vocab_frames = set()
transcripts = data["transcription"].tolist()
frames = list(
i
for i in zip(
data["action"].tolist(), data["object"].tolist(), data["location"].tolist()
)
)
for transcript in tqdm(transcripts):
for word in transcript.split():
vocab_transcript.add(word)
for frame in tqdm(frames):
for word in frame:
vocab_frames.add("_".join(word.split()))
with open(lm_dir + "/words_transcript.txt", "w") as lexicon_transcript_file:
lexicon_transcript_file.write("<UNK> 1" + "\n")
lexicon_transcript_file.write("<s> 2" + "\n")
lexicon_transcript_file.write("</s> 0" + "\n")
id = 3
for vocab in vocab_transcript:
lexicon_transcript_file.write(vocab + " " + str(id) + "\n")
id += 1
with open(lm_dir + "/words_frames.txt", "w") as lexicon_frames_file:
lexicon_frames_file.write("<UNK> 1" + "\n")
lexicon_frames_file.write("<s> 2" + "\n")
lexicon_frames_file.write("</s> 0" + "\n")
id = 3
for vocab in vocab_frames:
lexicon_frames_file.write(vocab + " " + str(id) + "\n")
id += 1
parser = argparse.ArgumentParser()
parser.add_argument("corpus_dir")
parser.add_argument("lm_dir")
def main():
args = parser.parse_args()
generate_lexicon(args.corpus_dir, args.lm_dir)
main()
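The two word tables written above are plain "symbol id" lines ("</s>" gets id 0, "<UNK>" id 1, "<s>" id 2, and the vocabulary from id 3 onwards). A small hypothetical helper, not part of the recipe, that reads such a table back for inspection:

def read_word_table(path):
    """Read a `symbol id` table (e.g. words_frames.txt) into a dict."""
    word2id = {}
    with open(path) as f:
        for line in f:
            if line.strip():
                sym, idx = line.split()
                word2id[sym] = int(idx)
    return word2id

# For the frames table this yields entries such as
# {"</s>": 0, "<UNK>": 1, "<s>": 2, "change_language": 3, ...} (order not guaranteed).
print(read_word_table("data/lm/words_frames.txt"))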

View File

@ -0,0 +1,371 @@
#!/usr/bin/env python3
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
"""
This script takes as input the directory lm_dir containing the word lists
"words_frames.txt" and "words_transcript.txt" (each word is treated as its
own token) and, for each list (name = "frames" or "transcript"), does the following:
1. Add disambiguation symbols to the lexicon and generate lexicon_disambig_{name}.txt
2. Generate tokens_{name}.txt, the token table mapping a token to a unique integer.
3. Generate words_{name}.txt, the word table mapping a word to a unique integer.
4. Generate L_{name}.pt, in k2 format. It can be loaded by
d = torch.load("L_frames.pt")
lexicon = k2.Fsa.from_dict(d)
5. Generate L_disambig_{name}.pt, in k2 format.
"""
import argparse
import math
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Tuple
import k2
import torch
from icefall.lexicon import read_lexicon, write_lexicon
Lexicon = List[Tuple[str, List[str]]]
def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
"""Write a symbol to ID mapping to a file.
Note:
No need to implement `read_mapping` as it can be done
through :func:`k2.SymbolTable.from_file`.
Args:
filename:
Filename to save the mapping.
sym2id:
A dict mapping symbols to IDs.
Returns:
Return None.
"""
with open(filename, "w", encoding="utf-8") as f:
for sym, i in sym2id.items():
f.write(f"{sym} {i}\n")
def get_tokens(lexicon: Lexicon) -> List[str]:
"""Get tokens from a lexicon.
Args:
lexicon:
It is the return value of :func:`read_lexicon`.
Returns:
Return a list of unique tokens.
"""
ans = set()
for _, tokens in lexicon:
ans.update(tokens)
sorted_ans = sorted(list(ans))
return sorted_ans
def get_words(lexicon: Lexicon) -> List[str]:
"""Get words from a lexicon.
Args:
lexicon:
It is the return value of :func:`read_lexicon`.
Returns:
Return a list of unique words.
"""
ans = set()
for word, _ in lexicon:
ans.add(word)
sorted_ans = sorted(list(ans))
return sorted_ans
def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]:
"""It adds pseudo-token disambiguation symbols #1, #2 and so on
at the ends of tokens to ensure that all pronunciations are different,
and that none is a prefix of another.
See also add_lex_disambig.pl from kaldi.
Args:
lexicon:
It is returned by :func:`read_lexicon`.
Returns:
Return a tuple with two elements:
- The output lexicon with disambiguation symbols
- The ID of the max disambiguation symbol that appears
in the lexicon
"""
# (1) Work out the count of each token-sequence in the
# lexicon.
count = defaultdict(int)
for _, tokens in lexicon:
count[" ".join(tokens)] += 1
# (2) For each left sub-sequence of each token-sequence, note down
# that it exists (for identifying prefixes of longer strings).
issubseq = defaultdict(int)
for _, tokens in lexicon:
tokens = tokens.copy()
tokens.pop()
while tokens:
issubseq[" ".join(tokens)] = 1
tokens.pop()
# (3) For each entry in the lexicon:
# if the token sequence is unique and is not a
# prefix of another word, no disambig symbol.
# Else output #1, or #2, #3, ... if the same token-seq
# has already been assigned a disambig symbol.
ans = []
# We start with #1 since #0 has its own purpose
first_allowed_disambig = 1
max_disambig = first_allowed_disambig - 1
last_used_disambig_symbol_of = defaultdict(int)
for word, tokens in lexicon:
tokenseq = " ".join(tokens)
assert tokenseq != ""
if issubseq[tokenseq] == 0 and count[tokenseq] == 1:
ans.append((word, tokens))
continue
cur_disambig = last_used_disambig_symbol_of[tokenseq]
if cur_disambig == 0:
cur_disambig = first_allowed_disambig
else:
cur_disambig += 1
if cur_disambig > max_disambig:
max_disambig = cur_disambig
last_used_disambig_symbol_of[tokenseq] = cur_disambig
tokenseq += f" #{cur_disambig}"
ans.append((word, tokenseq.split()))
return ans, max_disambig
def generate_id_map(symbols: List[str]) -> Dict[str, int]:
"""Generate ID maps, i.e., map a symbol to a unique ID.
Args:
symbols:
A list of unique symbols.
Returns:
A dict containing the mapping between symbols and IDs.
"""
return {sym: i for i, sym in enumerate(symbols)}
def add_self_loops(
arcs: List[List[Any]], disambig_token: int, disambig_word: int
) -> List[List[Any]]:
"""Adds self-loops to states of an FST to propagate disambiguation symbols
through it. They are added on each state with non-epsilon output symbols
on at least one arc out of the state.
See also fstaddselfloops.pl from Kaldi. One difference is that
Kaldi uses OpenFst style FSTs and it has multiple final states.
This function uses k2 style FSTs and it does not need to add self-loops
to the final state.
The input label of a self-loop is `disambig_token`, while the output
label is `disambig_word`.
Args:
arcs:
A list-of-list. The sublist contains
`[src_state, dest_state, label, aux_label, score]`
disambig_token:
It is the token ID of the symbol `#0`.
disambig_word:
It is the word ID of the symbol `#0`.
Return:
Return new `arcs` containing self-loops.
"""
states_needs_self_loops = set()
for arc in arcs:
src, dst, ilabel, olabel, score = arc
if olabel != 0:
states_needs_self_loops.add(src)
ans = []
for s in states_needs_self_loops:
ans.append([s, s, disambig_token, disambig_word, 0])
return arcs + ans
def lexicon_to_fst(
lexicon: Lexicon,
token2id: Dict[str, int],
word2id: Dict[str, int],
sil_token: str = "!SIL",
sil_prob: float = 0.5,
need_self_loops: bool = False,
) -> k2.Fsa:
"""Convert a lexicon to an FST (in k2 format) with optional silence at
the beginning and end of each word.
Args:
lexicon:
The input lexicon. See also :func:`read_lexicon`
token2id:
A dict mapping tokens to IDs.
word2id:
A dict mapping words to IDs.
sil_token:
The silence token.
sil_prob:
The probability for adding a silence at the beginning and end
of the word.
need_self_loops:
If True, add self-loop to states with non-epsilon output symbols
on at least one arc out of the state. The input label for this
self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
Returns:
Return an instance of `k2.Fsa` representing the given lexicon.
"""
assert sil_prob > 0.0 and sil_prob < 1.0
# CAUTION: we use score, i.e, negative cost.
sil_score = math.log(sil_prob)
no_sil_score = math.log(1.0 - sil_prob)
start_state = 0
loop_state = 1 # words enter and leave from here
sil_state = 2 # words terminate here when followed by silence; this state
# has a silence transition to loop_state.
next_state = 3 # the next un-allocated state, will be incremented as we go.
arcs = []
# assert token2id["<eps>"] == 0
# assert word2id["<eps>"] == 0
eps = 0
sil_token = token2id[sil_token]
arcs.append([start_state, loop_state, eps, eps, no_sil_score])
arcs.append([start_state, sil_state, eps, eps, sil_score])
arcs.append([sil_state, loop_state, sil_token, eps, 0])
for word, tokens in lexicon:
assert len(tokens) > 0, f"{word} has no pronunciations"
cur_state = loop_state
word = word2id[word]
tokens = [token2id[i] for i in tokens]
for i in range(len(tokens) - 1):
w = word if i == 0 else eps
arcs.append([cur_state, next_state, tokens[i], w, 0])
cur_state = next_state
next_state += 1
# now for the last token of this word
# It has two out-going arcs, one to the loop state,
# the other one to the sil_state.
i = len(tokens) - 1
w = word if i == 0 else eps
arcs.append([cur_state, loop_state, tokens[i], w, no_sil_score])
arcs.append([cur_state, sil_state, tokens[i], w, sil_score])
if need_self_loops:
disambig_token = word2id["#0"]
disambig_word = word2id["#0"]
arcs = add_self_loops(
arcs,
disambig_token=disambig_token,
disambig_word=disambig_word,
)
final_state = next_state
arcs.append([loop_state, final_state, -1, -1, 0])
arcs.append([final_state])
arcs = sorted(arcs, key=lambda arc: arc[0])
arcs = [[str(i) for i in arc] for arc in arcs]
arcs = [" ".join(arc) for arc in arcs]
arcs = "\n".join(arcs)
fsa = k2.Fsa.from_str(arcs, acceptor=False)
return fsa
parser = argparse.ArgumentParser()
parser.add_argument("lm_dir")
def main():
args = parser.parse_args()
out_dir = Path(args.lm_dir)
lexicon_filenames = [out_dir / "words_frames.txt", out_dir / "words_transcript.txt"]
names = ["frames", "transcript"]
sil_token = "!SIL"
sil_prob = 0.5
for name, lexicon_filename in zip(names, lexicon_filenames):
lexicon = read_lexicon(lexicon_filename)
tokens = get_words(lexicon)
words = get_words(lexicon)
new_lexicon = []
for lexicon_item in lexicon:
new_lexicon.append((lexicon_item[0], [lexicon_item[0]]))
lexicon = new_lexicon
lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
for i in range(max_disambig + 1):
disambig = f"#{i}"
assert disambig not in tokens
tokens.append(f"#{i}")
tokens = ["<eps>"] + tokens
words = ["eps"] + words + ["#0", "!SIL"]
token2id = generate_id_map(tokens)
word2id = generate_id_map(words)
write_mapping(out_dir / ("tokens_" + name + ".txt"), token2id)
write_mapping(out_dir / ("words_" + name + ".txt"), word2id)
write_lexicon(out_dir / ("lexicon_disambig_" + name + ".txt"), lexicon_disambig)
L = lexicon_to_fst(
lexicon,
token2id=word2id,
word2id=word2id,
sil_token=sil_token,
sil_prob=sil_prob,
)
L_disambig = lexicon_to_fst(
lexicon_disambig,
token2id=word2id,
word2id=word2id,
sil_token=sil_token,
sil_prob=sil_prob,
need_self_loops=True,
)
torch.save(L.as_dict(), out_dir / ("L_" + name + ".pt"))
torch.save(L_disambig.as_dict(), out_dir / ("L_disambig_" + name + ".pt"))
if False:
# Just for debugging, will remove it
L.labels_sym = k2.SymbolTable.from_file(out_dir / "tokens.txt")
L.aux_labels_sym = k2.SymbolTable.from_file(out_dir / "words.txt")
L_disambig.labels_sym = L.labels_sym
L_disambig.aux_labels_sym = L.aux_labels_sym
L.draw(out_dir / "L.png", title="L")
L_disambig.draw(out_dir / "L_disambig.png", title="L_disambig")
main()

View File

@ -0,0 +1,103 @@
#!/usr/bin/env bash
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
set -eou pipefail
stage=1
stop_stage=5
data_dir=path/to/fluent/speech/commands
target_root_dir=data/
lang_dir=${target_root_dir}/lang_phone
lm_dir=${target_root_dir}/lm
manifest_dir=${target_root_dir}/manifests
fbanks_dir=${target_root_dir}/fbanks
. shared/parse_options.sh || exit 1
mkdir -p $lang_dir
mkdir -p $lm_dir
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "data_dir: $data_dir"
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare slu manifest"
mkdir -p $manifest_dir
lhotse prepare slu $data_dir $manifest_dir
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Compute fbank for SLU"
mkdir -p $fbanks_dir
python ./local/compute_fbank_slu.py $manifest_dir $fbanks_dir
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Prepare lang"
# NOTE: "<UNK> SIL" is added for implementation convenience
# as the graph compiler code requires that there is an OOV word
# in the lexicon.
python ./local/generate_lexicon.py $data_dir $lm_dir
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Train LM"
# We use a unigram G
./shared/make_kn_lm.py \
-ngram-order 1 \
-text $lm_dir/words_transcript.txt \
-lm $lm_dir/G_transcript.arpa
./shared/make_kn_lm.py \
-ngram-order 1 \
-text $lm_dir/words_frames.txt \
-lm $lm_dir/G_frames.arpa
python ./local/prepare_lang.py $lm_dir
if [ ! -f $lm_dir/G_transcript.fst.txt ]; then
python -m kaldilm \
--read-symbol-table="$lm_dir/words_transcript.txt" \
$lm_dir/G_transcript.arpa > $lm_dir/G_transcript.fst.txt
fi
if [ ! -f $lm_dir/G_frames.fst.txt ]; then
python -m kaldilm \
--read-symbol-table="$lm_dir/words_frames.txt" \
$lm_dir/G_frames.arpa > $lm_dir/G_frames.fst.txt
fi
mkdir -p $lm_dir/frames
mkdir -p $lm_dir/transcript
chmod -R +777 .
for i in G_frames.arpa G_frames.fst.txt L_disambig_frames.pt L_frames.pt lexicon_disambig_frames.txt tokens_frames.txt words_frames.txt;
do
j=${i//"_frames"/}
mv "$lm_dir/$i" $lm_dir/frames/$j
done
for i in G_transcript.arpa G_transcript.fst.txt L_disambig_transcript.pt L_transcript.pt lexicon_disambig_transcript.txt tokens_transcript.txt words_transcript.txt;
do
j=${i//"_transcript"/}
mv "$lm_dir/$i" $lm_dir/transcript/$j
done
fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Compile HLG"
./local/compile_hlg.py --lang-dir $lm_dir/frames
./local/compile_hlg.py --lang-dir $lm_dir/transcript
fi

View File

@ -0,0 +1 @@
../../icefall/shared/

View File

@ -0,0 +1,71 @@
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import torch
from transducer.model import Transducer
def greedy_search(
model: Transducer, encoder_out: torch.Tensor, id2word: dict
) -> List[str]:
"""
Args:
model:
An instance of `Transducer`.
encoder_out:
A tensor of shape (N, T, C) from the encoder. Only N==1 is supported for now.
id2word:
A dict mapping word IDs to words.
Returns:
Return the decoded result as a list of words.
"""
assert encoder_out.ndim == 3
# support only batch_size == 1 for now
assert encoder_out.size(0) == 1, encoder_out.size(0)
blank_id = model.decoder.blank_id
device = model.device
sos = torch.tensor([blank_id], device=device).reshape(1, 1)
decoder_out, (h, c) = model.decoder(sos)
T = encoder_out.size(1)
t = 0
hyp = []
max_u = 1000 # terminate after this number of steps
u = 0
while t < T and u < max_u:
# fmt: off
current_encoder_out = encoder_out[:, t:t+1, :]
# fmt: on
logits = model.joiner(current_encoder_out, decoder_out)
log_prob = logits.log_softmax(dim=-1)
# log_prob is (N, 1, 1)
# TODO: Use logits.argmax()
y = log_prob.argmax()
if y != blank_id:
hyp.append(y.item())
y = y.reshape(1, 1)
decoder_out, (h, c) = model.decoder(y, (h, c))
u += 1
else:
t += 1
# id2word = {1: "YES", 2: "NO"}
hyp = [id2word[i] for i in hyp]
return hyp

View File

@ -0,0 +1 @@
../../../librispeech/ASR/transducer_stateless/conformer.py

View File

@ -0,0 +1,346 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
from typing import List, Tuple
import torch
import torch.nn as nn
from transducer.beam_search import greedy_search
from transducer.conformer import Conformer
from transducer.decoder import Decoder
from transducer.joiner import Joiner
from transducer.model import Transducer
from transducer.slu_datamodule import SluDataModule
from icefall.checkpoint import average_checkpoints, load_checkpoint
from icefall.env import get_env_info
from icefall.utils import (
AttributeDict,
setup_logger,
store_transcripts,
write_error_stats,
)
def get_id2word(params):
id2word = {}
# 0 is blank
id = 1
try:
with open(Path(params.lang_dir) / "lexicon_disambig.txt") as lexicon_file:
for line in lexicon_file:
if len(line.strip()) > 0:
id2word[id] = line.split()[0]
id += 1
except:
pass
return id2word
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--epoch",
type=int,
default=6,
help="It specifies the checkpoint to use for decoding."
"Note: Epoch counts from 0.",
)
parser.add_argument(
"--avg",
type=int,
default=1,
help="Number of checkpoints to average. Automatically select "
"consecutive checkpoints before the checkpoint specified by "
"'--epoch'. ",
)
parser.add_argument(
"--exp-dir",
type=str,
default="transducer/exp",
help="Directory from which to load the checkpoints",
)
parser.add_argument("--lang-dir", type=str, default="data/lm/frames")
return parser
def get_params() -> AttributeDict:
params = AttributeDict(
{
"feature_dim": 23,
"lang_dir": Path("data/lm/frames"),
# encoder/decoder params
"vocab_size": 3, # blank, yes, no
"blank_id": 0,
"embedding_dim": 32,
"hidden_dim": 16,
"num_decoder_layers": 4,
}
)
vocab_size = 1
with open(params.lang_dir / "lexicon_disambig.txt") as lexicon_file:
for line in lexicon_file:
if (
len(line.strip()) > 0
): # and '<UNK>' not in line and '<s>' not in line and '</s>' not in line:
vocab_size += 1
params.vocab_size = vocab_size
return params
def decode_one_batch(
params: AttributeDict, model: nn.Module, batch: dict, id2word: dict
) -> List[List[str]]:
"""Decode one batch and return the result in a list-of-list.
Each sublist contains the decoded words for an utterance in the batch.
Args:
params:
It's the return value of :func:`get_params`.
model:
The neural model.
batch:
It is the return value from iterating
`lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
for the format of the `batch`.
(https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py)
Returns:
Return the decoding result. `len(ans)` == batch size.
"""
device = model.device
feature = batch["inputs"]
feature = feature.to(device)
# at entry, feature is (N, T, C)
feature_lens = batch["supervisions"]["num_frames"].to(device)
encoder_out, encoder_out_lens = model.encoder(x=feature, x_lens=feature_lens)
hyps = []
batch_size = encoder_out.size(0)
for i in range(batch_size):
# fmt: off
encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
# fmt: on
hyp = greedy_search(model=model, encoder_out=encoder_out_i, id2word=id2word)
hyps.append(hyp)
# hyps = [[word_table[i] for i in ids] for ids in hyps]
return hyps
def decode_dataset(
dl: torch.utils.data.DataLoader,
params: AttributeDict,
model: nn.Module,
) -> List[Tuple[List[int], List[int]]]:
"""Decode dataset.
Args:
dl:
PyTorch's dataloader containing the dataset to decode.
params:
It is returned by :func:`get_params`.
model:
The neural model.
Returns:
Return a list of tuples, each of which contains
(cut_id, ref_words, hyp_words), where ref_words is the reference
transcript and hyp_words is the predicted result.
"""
results = []
num_cuts = 0
try:
num_batches = len(dl)
except TypeError:
num_batches = "?"
id2word = get_id2word(params)
results = []
for batch_idx, batch in enumerate(dl):
texts = [
" ".join(a.supervisions[0].custom["frames"])
for a in batch["supervisions"]["cut"]
]
texts = [
"<s> " + a.replace("change language", "change_language") + " </s>"
for a in texts
]
cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
hyps = decode_one_batch(
params=params, model=model, batch=batch, id2word=id2word
)
this_batch = []
assert len(hyps) == len(texts)
for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
ref_words = ref_text.split()
this_batch.append((cut_id, ref_words, hyp_words))
results.extend(this_batch)
num_cuts += len(batch["supervisions"]["text"])
if batch_idx % 100 == 0:
batch_str = f"{batch_idx}/{num_batches}"
logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
return results
def save_results(
exp_dir: Path,
test_set_name: str,
results: List[Tuple[List[int], List[int]]],
) -> None:
"""Save results to `exp_dir`.
Args:
exp_dir:
The output directory. This function creates the following files inside
this directory:
- recogs-{test_set_name}.txt
It contains the reference and hypothesis results, like below::
ref=['NO', 'NO', 'NO', 'YES', 'NO', 'NO', 'NO', 'YES']
hyp=['NO', 'NO', 'NO', 'YES', 'NO', 'NO', 'NO', 'YES']
ref=['NO', 'NO', 'YES', 'NO', 'YES', 'NO', 'NO', 'YES']
hyp=['NO', 'NO', 'YES', 'NO', 'YES', 'NO', 'NO', 'YES']
- errs-{test_set_name}.txt
It contains the detailed WER.
test_set_name:
The name of the test set, which will be part of the result filename.
results:
A list of tuples, each of which contains (cut_id, ref_words, hyp_words).
Returns:
Return None.
"""
recog_path = exp_dir / f"recogs-{test_set_name}.txt"
results = sorted(results)
store_transcripts(filename=recog_path, texts=results)
logging.info(f"The transcripts are stored in {recog_path}")
# The following prints out WERs, per-word error statistics and aligned
# ref/hyp pairs.
errs_filename = exp_dir / f"errs-{test_set_name}.txt"
with open(errs_filename, "w") as f:
write_error_stats(f, f"{test_set_name}", results)
logging.info("Wrote detailed error stats to {}".format(errs_filename))
def get_transducer_model(params: AttributeDict):
# encoder = Tdnn(
# num_features=params.feature_dim,
# output_dim=params.hidden_dim,
# )
encoder = Conformer(
num_features=params.feature_dim,
output_dim=params.hidden_dim,
)
decoder = Decoder(
vocab_size=params.vocab_size,
embedding_dim=params.embedding_dim,
blank_id=params.blank_id,
num_layers=params.num_decoder_layers,
hidden_dim=params.hidden_dim,
embedding_dropout=0.4,
rnn_dropout=0.4,
)
joiner = Joiner(input_dim=params.hidden_dim, output_dim=params.vocab_size)
transducer = Transducer(encoder=encoder, decoder=decoder, joiner=joiner)
return transducer
@torch.no_grad()
def main():
parser = get_parser()
SluDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
params = get_params()
params.update(vars(args))
params["env_info"] = get_env_info()
setup_logger(f"{params.exp_dir}/log/log-decode")
logging.info("Decoding started")
logging.info(params)
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
logging.info(f"device: {device}")
model = get_transducer_model(params)
if params.avg == 1:
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
else:
start = params.epoch - params.avg + 1
filenames = []
for i in range(start, params.epoch + 1):
if start >= 0:
filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
logging.info(f"averaging {filenames}")
model.load_state_dict(average_checkpoints(filenames))
model.to(device)
model.eval()
model.device = device
# we need cut ids to display recognition results.
args.return_cuts = True
slu = SluDataModule(args)
test_dl = slu.test_dataloaders()
results = decode_dataset(
dl=test_dl,
params=params,
model=model,
)
test_set_name = str(args.feature_dir).split("/")[-2]
save_results(exp_dir=params.exp_dir, test_set_name=test_set_name, results=results)
logging.info("Done!")
if __name__ == "__main__":
main()

View File

@ -0,0 +1 @@
../../../yesno/ASR/transducer/decoder.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/transducer_stateless/encoder_interface.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/transducer/joiner.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/transducer/model.py

View File

@ -0,0 +1,289 @@
# Copyright 2021 Piotr Żelasko
# 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from functools import lru_cache
from pathlib import Path
from typing import List
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
CutConcatenate,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SimpleCutSampler,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from torch.utils.data import DataLoader
from icefall.dataset.datamodule import DataModule
from icefall.utils import str2bool
class SluDataModule(DataModule):
"""
DataModule for k2 ASR experiments.
It assumes there is always one train dataloader,
but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
and test-other).
It contains all the common data pipeline modules used in ASR
experiments, e.g.:
- dynamic batch size,
- bucketing samplers,
- cut concatenation,
- augmentation,
- on-the-fly feature extraction
"""
@classmethod
def add_arguments(cls, parser: argparse.ArgumentParser):
super().add_arguments(parser)
group = parser.add_argument_group(
title="ASR data related options",
description="These options are used for the preparation of "
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
"effective batch sizes, sampling strategies, applied data "
"augmentations, etc.",
)
group.add_argument(
"--feature-dir",
type=Path,
default=Path("data/fbanks"),
help="Path to directory with train/test cuts.",
)
group.add_argument(
"--max-duration",
type=int,
default=30.0,
help="Maximum pooled recordings duration (seconds) in a "
"single batch. You can reduce it if it causes CUDA OOM.",
)
group.add_argument(
"--bucketing-sampler",
type=str2bool,
default=False,
help="When enabled, the batches will come from buckets of "
"similar duration (saves padding frames).",
)
group.add_argument(
"--num-buckets",
type=int,
default=10,
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
group.add_argument(
"--concatenate-cuts",
type=str2bool,
default=False,
help="When enabled, utterances (cuts) will be concatenated "
"to minimize the amount of padding.",
)
group.add_argument(
"--duration-factor",
type=float,
default=1.0,
help="Determines the maximum duration of a concatenated cut "
"relative to the duration of the longest cut in a batch.",
)
group.add_argument(
"--gap",
type=float,
default=1.0,
help="The amount of padding (in seconds) inserted between "
"concatenated cuts. This padding is filled with noise when "
"noise augmentation is used.",
)
group.add_argument(
"--on-the-fly-feats",
type=str2bool,
default=False,
help="When enabled, use on-the-fly cut mixing and feature "
"extraction. Will drop existing precomputed feature manifests "
"if available.",
)
group.add_argument(
"--shuffle",
type=str2bool,
default=True,
help="When enabled (=default), the examples will be "
"shuffled for each epoch.",
)
group.add_argument(
"--return-cuts",
type=str2bool,
default=True,
help="When enabled, each batch will have the "
"field: batch['supervisions']['cut'] with the cuts that "
"were used to construct it.",
)
group.add_argument(
"--num-workers",
type=int,
default=2,
help="The number of training dataloader workers that "
"collect the batches.",
)
def train_dataloaders(self) -> DataLoader:
logging.info("About to get train cuts")
cuts_train = self.train_cuts()
logging.info("About to create train dataset")
transforms = []
if self.args.concatenate_cuts:
logging.info(
f"Using cut concatenation with duration factor "
f"{self.args.duration_factor} and gap {self.args.gap}."
)
# Cut concatenation should be the first transform in the list,
# so that if we e.g. mix noise in, it will fill the gaps between
# different utterances.
transforms = [
CutConcatenate(
duration_factor=self.args.duration_factor, gap=self.args.gap
)
] + transforms
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
if self.args.on_the_fly_feats:
# NOTE: the PerturbSpeed transform should be added only if we
# remove it from data prep stage.
# Add on-the-fly speed perturbation; since originally it would
# have increased epoch size by 3, we will apply prob 2/3 and use
# 3x more epochs.
# Speed perturbation probably should come first before
# concatenation, but in principle the transforms order doesn't have
# to be strict (e.g. could be randomized)
# transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa
# Drop feats to be on the safe side.
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=OnTheFlyFeatures(
FbankConfig(sampling_rate=16000, num_mel_bins=23)  # FSC audio is 16 kHz
),
return_cuts=self.args.return_cuts,
)
if self.args.bucketing_sampler:
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
drop_last=True,
)
else:
logging.info("Using SimpleCutSampler.")
train_sampler = SimpleCutSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
)
logging.info("About to create train dataloader")
train_dl = DataLoader(
train,
sampler=train_sampler,
batch_size=None,
num_workers=self.args.num_workers,
persistent_workers=True,
)
return train_dl
def valid_dataloaders(self) -> DataLoader:
logging.info("About to get valid cuts")
cuts_valid = self.valid_cuts()
logging.debug("About to create valid dataset")
valid = K2SpeechRecognitionDataset(
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=23)))
if self.args.on_the_fly_feats
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.debug("About to create valid dataloader")
valid_dl = DataLoader(
valid,
batch_size=None,
sampler=sampler,
num_workers=self.args.num_workers,
persistent_workers=True,
)
return valid_dl
def test_dataloaders(self) -> DataLoader:
logging.info("About to get test cuts")
cuts_test = self.test_cuts()
logging.debug("About to create test dataset")
test = K2SpeechRecognitionDataset(
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=23)))
if self.args.on_the_fly_feats
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = DynamicBucketingSampler(
cuts_test,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
test,
batch_size=None,
sampler=sampler,
num_workers=self.args.num_workers,
persistent_workers=True,
)
return test_dl
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
cuts_train = load_manifest_lazy(
self.args.feature_dir / "slu_cuts_train.jsonl.gz"
)
return cuts_train
@lru_cache()
def valid_cuts(self) -> List[CutSet]:
logging.info("About to get valid cuts")
cuts_valid = load_manifest_lazy(
self.args.feature_dir / "slu_cuts_valid.jsonl.gz"
)
return cuts_valid
@lru_cache()
def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts")
cuts_test = load_manifest_lazy(self.args.feature_dir / "slu_cuts_test.jsonl.gz")
return cuts_test

View File

@ -0,0 +1 @@
../../../librispeech/ASR/transducer_stateless/subsampling.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/transducer/test_conformer.py

View File

@ -0,0 +1 @@
../../../yesno/ASR/transducer/test_decoder.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/transducer/test_joiner.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/transducer/test_transducer.py

View File

@ -0,0 +1,625 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
from shutil import copyfile
from typing import List, Optional, Tuple
import k2
import torch
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from lhotse.utils import fix_random_seed
from slu_datamodule import SluDataModule
from torch import Tensor
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.utils import clip_grad_norm_
from transducer.conformer import Conformer
# from torch.utils.tensorboard import SummaryWriter
from transducer.decoder import Decoder
from transducer.joiner import Joiner
from transducer.model import Transducer
from icefall.checkpoint import load_checkpoint
from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
def get_word2id(params):
word2id = {}
# 0 is blank
id = 1
with open(Path(params.lang_dir) / "lexicon_disambig.txt") as lexicon_file:
for line in lexicon_file:
if len(line.strip()) > 0:
word2id[line.split()[0]] = id
id += 1
return word2id
def get_labels(texts: List[str], word2id) -> k2.RaggedTensor:
"""
Args:
texts:
A list of transcripts.
Returns:
Return a ragged tensor containing the corresponding word ID.
"""
# blank is 0
word_ids = []
for t in texts:
words = t.split()
ids = [word2id[w] for w in words]
word_ids.append(ids)
return k2.RaggedTensor(word_ids)
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--world-size",
type=int,
default=1,
help="Number of GPUs for DDP training.",
)
parser.add_argument(
"--master-port",
type=int,
default=12354,
help="Master port to use for DDP training.",
)
parser.add_argument(
"--tensorboard",
type=str2bool,
default=True,
help="Should various information be logged in tensorboard.",
)
parser.add_argument(
"--num-epochs",
type=int,
default=7,
help="Number of epochs to train.",
)
parser.add_argument(
"--start-epoch",
type=int,
default=0,
help="""Resume training from from this epoch.
If it is positive, it will load checkpoint from
tdnn/exp/epoch-{start_epoch-1}.pt
""",
)
parser.add_argument(
"--exp-dir",
type=str,
default="transducer/exp",
help="Directory to save results",
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="The seed for random generators intended for reproducibility",
)
parser.add_argument("--lang-dir", type=str, default="data/lm/frames")
return parser
def get_params() -> AttributeDict:
"""Return a dict containing training parameters.
All training related parameters that are not passed from the commandline
are saved in the variable `params`.
Commandline options are merged into `params` after they are parsed, so
you can also access them via `params`.
Explanation of options saved in `params`:
- lr: It specifies the initial learning rate
- feature_dim: The model input dim. It has to match the one used
in computing features.
- weight_decay: The weight_decay for the optimizer.
- start_epoch: If it is not zero, load checkpoint `start_epoch-1`
and continue training from that checkpoint.
- best_train_loss: Best training loss so far. It is used to select
the model that has the lowest training loss. It is
updated during the training.
- best_valid_loss: Best validation loss so far. It is used to select
the model that has the lowest validation loss. It is
updated during the training.
- best_train_epoch: It is the epoch that has the best training loss.
- best_valid_epoch: It is the epoch that has the best validation loss.
- batch_idx_train: Used to write statistics to tensorboard. It
contains number of batches trained so far across
epochs.
- log_interval: Print training loss if `batch_idx % log_interval` is 0
- valid_interval: Run validation if `batch_idx % valid_interval` is 0
- reset_interval: Reset statistics if `batch_idx % reset_interval` is 0
"""
params = AttributeDict(
{
"lr": 1e-4,
"feature_dim": 23,
"weight_decay": 1e-6,
"start_epoch": 0,
"best_train_loss": float("inf"),
"best_valid_loss": float("inf"),
"best_train_epoch": -1,
"best_valid_epoch": -1,
"batch_idx_train": 0,
"log_interval": 100,
"reset_interval": 20,
"valid_interval": 3000,
"exp_dir": Path("transducer/exp"),
"lang_dir": Path("data/lm/frames"),
# encoder/decoder params
"vocab_size": 3, # blank, yes, no
"blank_id": 0,
"embedding_dim": 32,
"hidden_dim": 16,
"num_decoder_layers": 4,
}
)
vocab_size = 1
with open(Path(params.lang_dir) / "lexicon_disambig.txt") as lexicon_file:
for line in lexicon_file:
if (
len(line.strip()) > 0
): # and '<UNK>' not in line and '<s>' not in line and '</s>' not in line:
vocab_size += 1
params.vocab_size = vocab_size
return params
def load_checkpoint_if_available(
params: AttributeDict,
model: nn.Module,
optimizer: Optional[torch.optim.Optimizer] = None,
scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
) -> None:
"""Load checkpoint from file.
If params.start_epoch is positive, it will load the checkpoint from
`params.start_epoch - 1`. Otherwise, this function does nothing.
Apart from loading state dict for `model`, `optimizer` and `scheduler`,
it also updates `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
and `best_valid_loss` in `params`.
Args:
params:
The return value of :func:`get_params`.
model:
The training model.
optimizer:
The optimizer that we are using.
scheduler:
The learning rate scheduler we are using.
Returns:
Return None.
"""
if params.start_epoch <= 0:
return
filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
saved_params = load_checkpoint(
filename,
model=model,
optimizer=optimizer,
scheduler=scheduler,
)
keys = [
"best_train_epoch",
"best_valid_epoch",
"batch_idx_train",
"best_train_loss",
"best_valid_loss",
]
for k in keys:
params[k] = saved_params[k]
return saved_params
def save_checkpoint(
params: AttributeDict,
model: nn.Module,
optimizer: torch.optim.Optimizer,
scheduler: torch.optim.lr_scheduler._LRScheduler,
rank: int = 0,
) -> None:
"""Save model, optimizer, scheduler and training stats to file.
Args:
params:
It is returned by :func:`get_params`.
model:
The training model.
"""
if rank != 0:
return
filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
save_checkpoint_impl(
filename=filename,
model=model,
params=params,
optimizer=optimizer,
scheduler=scheduler,
rank=rank,
)
if params.best_train_epoch == params.cur_epoch:
best_train_filename = params.exp_dir / "best-train-loss.pt"
copyfile(src=filename, dst=best_train_filename)
if params.best_valid_epoch == params.cur_epoch:
best_valid_filename = params.exp_dir / "best-valid-loss.pt"
copyfile(src=filename, dst=best_valid_filename)
def compute_loss(
params: AttributeDict, model: nn.Module, batch: dict, is_training: bool, word2ids
) -> Tuple[Tensor, MetricsTracker]:
"""
Compute RNN-T loss given the model and its inputs.
Args:
params:
Parameters for training. See :func:`get_params`.
model:
The model for training. It is an instance of Transducer in our case.
batch:
A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
for the content in it.
is_training:
True for training. False for validation. When it is True, this
function enables autograd during computation; when it is False, it
disables autograd.
"""
device = model.device
feature = batch["inputs"]
# at entry, feature is (N, T, C)
assert feature.ndim == 3
feature = feature.to(device)
feature_lens = batch["supervisions"]["num_frames"].to(device)
texts = [
" ".join(a.supervisions[0].custom["frames"])
for a in batch["supervisions"]["cut"]
]
texts = [
"<s> " + a.replace("change language", "change_language") + " </s>"
for a in texts
]
labels = get_labels(texts, word2ids).to(device)
with torch.set_grad_enabled(is_training):
loss = model(x=feature, x_lens=feature_lens, y=labels)
assert loss.requires_grad == is_training
info = MetricsTracker()
info["frames"] = feature.size(0)
info["loss"] = loss.detach().cpu().item()
return loss, info
def compute_validation_loss(
params: AttributeDict,
model: nn.Module,
valid_dl: torch.utils.data.DataLoader,
word2ids,
world_size: int = 1,
) -> MetricsTracker:
"""Run the validation process. The validation loss
is saved in `params.valid_loss`.
"""
model.eval()
tot_loss = MetricsTracker()
for batch_idx, batch in enumerate(valid_dl):
loss, loss_info = compute_loss(
params=params,
model=model,
batch=batch,
is_training=False,
word2ids=word2ids,
)
assert loss.requires_grad is False
tot_loss = tot_loss + loss_info
if world_size > 1:
tot_loss.reduce(loss.device)
loss_value = tot_loss["loss"] / tot_loss["frames"]
if loss_value < params.best_valid_loss:
params.best_valid_epoch = params.cur_epoch
params.best_valid_loss = loss_value
return tot_loss
def train_one_epoch(
params: AttributeDict,
model: nn.Module,
optimizer: torch.optim.Optimizer,
train_dl: torch.utils.data.DataLoader,
valid_dl: torch.utils.data.DataLoader,
word2ids,
tb_writer: None,
world_size: int = 1,
) -> None:
"""Train the model for one epoch.
The training loss from the mean of all frames is saved in
`params.train_loss`. It runs the validation process every
`params.valid_interval` batches.
Args:
params:
It is returned by :func:`get_params`.
model:
The model for training.
optimizer:
The optimizer we are using.
train_dl:
Dataloader for the training dataset.
valid_dl:
Dataloader for the validation dataset.
tb_writer:
Writer to write log messages to tensorboard.
world_size:
Number of nodes in DDP training. If it is 1, DDP is disabled.
"""
model.train()
tot_loss = MetricsTracker()
for batch_idx, batch in enumerate(train_dl):
params.batch_idx_train += 1
batch_size = len(batch["supervisions"]["text"])
loss, loss_info = compute_loss(
params=params, model=model, batch=batch, is_training=True, word2ids=word2ids
)
# summary stats.
tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
optimizer.zero_grad()
loss.backward()
clip_grad_norm_(model.parameters(), 5.0, 2.0)
optimizer.step()
if batch_idx % params.log_interval == 0:
logging.info(
f"Epoch {params.cur_epoch}, "
f"batch {batch_idx}, loss[{loss_info}], "
f"tot_loss[{tot_loss}], batch size: {batch_size}"
)
if batch_idx % params.log_interval == 0:
if tb_writer is not None:
loss_info.write_summary(
tb_writer, "train/current_", params.batch_idx_train
)
tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
if batch_idx > 0 and batch_idx % params.valid_interval == 0:
valid_info = compute_validation_loss(
params=params,
model=model,
valid_dl=valid_dl,
world_size=world_size,
word2ids=word2ids,
)
model.train()
logging.info(f"Epoch {params.cur_epoch}, validation {valid_info}")
if tb_writer is not None:
valid_info.write_summary(
tb_writer,
"train/valid_",
params.batch_idx_train,
)
loss_value = tot_loss["loss"] / tot_loss["frames"]
params.train_loss = loss_value
if params.train_loss < params.best_train_loss:
params.best_train_epoch = params.cur_epoch
params.best_train_loss = params.train_loss
def get_transducer_model(params: AttributeDict):
encoder = Conformer(
num_features=params.feature_dim,
output_dim=params.hidden_dim,
)
decoder = Decoder(
vocab_size=params.vocab_size,
embedding_dim=params.embedding_dim,
blank_id=params.blank_id,
num_layers=params.num_decoder_layers,
hidden_dim=params.hidden_dim,
embedding_dropout=0.4,
rnn_dropout=0.4,
)
joiner = Joiner(input_dim=params.hidden_dim, output_dim=params.vocab_size)
transducer = Transducer(encoder=encoder, decoder=decoder, joiner=joiner)
return transducer
def run(rank, world_size, args):
"""
Args:
rank:
It is a value between 0 and `world_size-1`, which is
passed automatically by `mp.spawn()` in :func:`main`.
The node with rank 0 is responsible for saving checkpoint.
world_size:
Number of GPUs for DDP training.
args:
The return value of get_parser().parse_args()
"""
params = get_params()
params.update(vars(args))
params["env_info"] = get_env_info()
word2ids = get_word2id(params)
fix_random_seed(params.seed)
if world_size > 1:
setup_dist(rank, world_size, params.master_port)
setup_logger(f"{params.exp_dir}/log/log-train")
logging.info("Training started")
logging.info(params)
# if args.tensorboard and rank == 0:
# tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
# else:
# tb_writer = None
tb_writer = None
if torch.cuda.is_available():
device = torch.device("cuda", rank)
else:
device = torch.device("cpu")
logging.info(f"device: {device}")
model = get_transducer_model(params)
checkpoints = load_checkpoint_if_available(params=params, model=model)
model.to(device)
if world_size > 1:
model = DDP(model, device_ids=[rank])
model.device = device
optimizer = optim.Adam(
model.parameters(),
lr=params.lr,
weight_decay=params.weight_decay,
)
if checkpoints:
optimizer.load_state_dict(checkpoints["optimizer"])
slu = SluDataModule(args)
train_dl = slu.train_dataloaders()
# The dataset also provides a valid split, but for simplicity we use
# the test data as the validation set here.
valid_dl = slu.test_dataloaders()
for epoch in range(params.start_epoch, params.num_epochs):
fix_random_seed(params.seed + epoch)
train_dl.sampler.set_epoch(epoch)
if tb_writer is not None:
tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
params.cur_epoch = epoch
train_one_epoch(
params=params,
model=model,
optimizer=optimizer,
train_dl=train_dl,
valid_dl=valid_dl,
tb_writer=tb_writer,
world_size=world_size,
word2ids=word2ids,
)
save_checkpoint(
params=params,
model=model,
optimizer=optimizer,
scheduler=None,
rank=rank,
)
logging.info("Done!")
if world_size > 1:
torch.distributed.barrier()
cleanup_dist()
def main():
parser = get_parser()
SluDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
world_size = args.world_size
assert world_size >= 1
if world_size > 1:
mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
else:
run(rank=0, world_size=1, args=args)
if __name__ == "__main__":
main()

View File

@ -0,0 +1 @@
../../../librispeech/ASR/transducer_stateless/transformer.py

View File

@ -33,7 +33,7 @@ parser.add_argument(
"-ngram-order", "-ngram-order",
type=int, type=int,
default=4, default=4,
choices=[2, 3, 4, 5, 6, 7], choices=[1, 2, 3, 4, 5, 6, 7],
help="Order of n-gram", help="Order of n-gram",
) )
parser.add_argument("-text", type=str, default=None, help="Path to the corpus file") parser.add_argument("-text", type=str, default=None, help="Path to the corpus file")
@ -105,7 +105,7 @@ class NgramCounts:
# do as follows: self.counts[3][[5,6,7]][8] += 1.0 where the [3] indexes an # do as follows: self.counts[3][[5,6,7]][8] += 1.0 where the [3] indexes an
# array, the [[5,6,7]] indexes a dict, and the [8] indexes a dict. # array, the [[5,6,7]] indexes a dict, and the [8] indexes a dict.
def __init__(self, ngram_order, bos_symbol="<s>", eos_symbol="</s>"): def __init__(self, ngram_order, bos_symbol="<s>", eos_symbol="</s>"):
assert ngram_order >= 2 assert ngram_order >= 1
self.ngram_order = ngram_order self.ngram_order = ngram_order
self.bos_symbol = bos_symbol self.bos_symbol = bos_symbol
@ -169,6 +169,9 @@ class NgramCounts:
with open(filename, encoding=default_encoding) as fp: with open(filename, encoding=default_encoding) as fp:
for line in fp: for line in fp:
line = line.strip(strip_chars) line = line.strip(strip_chars)
if self.ngram_order == 1:
self.add_raw_counts_from_line(line.split()[0])
else:
self.add_raw_counts_from_line(line) self.add_raw_counts_from_line(line)
lines_processed += 1 lines_processed += 1
if lines_processed == 0 or args.verbose > 0: if lines_processed == 0 or args.verbose > 0: