mirror of https://github.com/k2-fsa/icefall.git
synced 2025-09-08 16:44:20 +00:00
Merge remote-tracking branch 'dan/master' into update-giga-libri-results
This commit is contained in:
commit ad3fb63ad6

.github/scripts/download-gigaspeech-dev-test-dataset.sh (vendored, executable file, 15 lines added)

@@ -0,0 +1,15 @@
#!/usr/bin/env bash

# This script downloads the pre-computed fbank features for
# the dev and test datasets of GigaSpeech.
#
# You will find the directory `~/tmp/giga-dev-dataset-fbank` after running
# this script.

mkdir -p ~/tmp
cd ~/tmp

git lfs install
git clone https://huggingface.co/csukuangfj/giga-dev-dataset-fbank

ls -lh giga-dev-dataset-fbank/data/fbank
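If you want to inspect what was downloaded, the fbank features ship with Lhotse manifests. A minimal sketch, assuming Lhotse is installed; the manifest filename below is a guess, so check the `ls` output above for the real names:

from lhotse import load_manifest

# Hypothetical manifest name; `ls giga-dev-dataset-fbank/data/fbank`
# shows the actual files.
cuts = load_manifest("giga-dev-dataset-fbank/data/fbank/cuts_DEV.jsonl.gz")
print(next(iter(cuts)))  # one cut, with its pre-computed fbank feature metadata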
.github/scripts/run-gigaspeech-pruned-transducer-stateless2-2022-05-12.sh (vendored, executable file, 49 lines added)

@@ -0,0 +1,49 @@
#!/usr/bin/env bash

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

cd egs/gigaspeech/ASR

repo_url=https://huggingface.co/wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2

log "Downloading pre-trained model from $repo_url"
git lfs install
git clone $repo_url
repo=$(basename $repo_url)

echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
  mkdir -p pruned_transducer_stateless2/exp
  ln -s $PWD/$repo/exp/pretrained-epoch-29-avg-11.pt pruned_transducer_stateless2/exp/epoch-999.pt
  ln -s $PWD/$repo/data/lang_bpe_500 data/

  ls -lh data
  ls -lh data/lang_bpe_500
  ls -lh data/fbank
  ls -lh pruned_transducer_stateless2/exp

  log "Decoding dev and test"

  # use a small value for decoding with CPU
  max_duration=100

  # Test only greedy_search to reduce CI running time
  # for method in greedy_search fast_beam_search modified_beam_search; do
  for method in greedy_search; do
    log "Decoding with $method"

    ./pruned_transducer_stateless2/decode.py \
      --decoding-method $method \
      --epoch 999 \
      --avg 1 \
      --max-duration $max_duration \
      --exp-dir pruned_transducer_stateless2/exp
  done

  rm pruned_transducer_stateless2/exp/*.pt
fi
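Why epoch 999? decode.py resolves checkpoints as exp/epoch-<N>.pt and averages the last --avg of them, so symlinking the pretrained file as epoch-999.pt and passing --epoch 999 --avg 1 loads exactly that checkpoint, with no averaging. A quick sanity check you could run before decoding (a sketch, not part of the CI script):

import torch

# Confirm the symlinked checkpoint is loadable on CPU.
ckpt = torch.load(
    "pruned_transducer_stateless2/exp/epoch-999.pt", map_location="cpu"
)
print(sorted(ckpt.keys()) if isinstance(ckpt, dict) else type(ckpt))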
.github/workflows/run-gigaspeech-2022-05-13.yml (vendored, normal file, 120 lines added)

@@ -0,0 +1,120 @@
# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
#
# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: run-gigaspeech-2022-05-13
# stateless transducer + k2 pruned rnnt-loss + reworked conformer

on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]

  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"

jobs:
  run_gigaspeech_2022_05_13:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]

      fail-fast: false

    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'

      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install

      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}

      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh

      - name: Download GigaSpeech dev/test dataset
        shell: bash
        run: |
          sudo apt-get install -y -q git-lfs

          .github/scripts/download-gigaspeech-dev-test-dataset.sh

      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          ln -s ~/tmp/giga-dev-dataset-fbank/data egs/gigaspeech/ASR/

          ls -lh egs/gigaspeech/ASR/data/fbank

          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

          .github/scripts/run-gigaspeech-pruned-transducer-stateless2-2022-05-12.sh

      - name: Display decoding results for gigaspeech pruned_transducer_stateless2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/gigaspeech/ASR/
          # Install tree before using it below.
          sudo apt-get -qq install tree
          tree ./pruned_transducer_stateless2/exp

          cd pruned_transducer_stateless2
          echo "results for pruned_transducer_stateless2"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2

      - name: Upload decoding results for gigaspeech pruned_transducer_stateless2
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          # Note: matrix.torch is not defined in this workflow's matrix,
          # so ${{ matrix.torch }} expands to an empty string here.
          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-gigaspeech-pruned_transducer_stateless2-2022-05-12
          path: egs/gigaspeech/ASR/pruned_transducer_stateless2/exp/
@@ -145,7 +145,14 @@ def generate_lexicon(
     sp = spm.SentencePieceProcessor()
     sp.load(str(model_file))

-    words_pieces: List[List[str]] = sp.encode(words, out_type=str)
+    # Convert word to word piece IDs instead of word piece strings
+    # to avoid OOV tokens.
+    words_pieces_ids: List[List[int]] = sp.encode(words, out_type=int)
+
+    # Now convert word piece IDs back to word piece strings.
+    words_pieces: List[List[str]] = [
+        sp.id_to_piece(ids) for ids in words_pieces_ids
+    ]

     lexicon = []
     for word, pieces in zip(words, words_pieces):
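The change above sidesteps a subtle SentencePiece behavior: encoding straight to strings can emit pieces (for example, raw characters unseen during BPE training) that are not in the model's vocabulary, whereas encoding to IDs maps anything unknown to the <unk> ID, so the round trip through id_to_piece can only yield in-vocabulary pieces. A minimal sketch, assuming a trained bpe.model; the OOV-bearing word is invented:

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("bpe.model")  # assumed path to a trained BPE model

word = "naïve€"  # hypothetical word with characters unseen in training

# Direct string encoding may yield pieces outside the vocabulary.
print(sp.encode(word, out_type=str))

# ID encoding maps unknown content to the <unk> ID, so converting the
# IDs back to pieces stays inside the vocabulary.
ids = sp.encode(word, out_type=int)
print(sp.id_to_piece(ids))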
egs/librispeech/ASR/local/validate_bpe_lexicon.py (executable file, 77 lines added)

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script checks that there are no OOV tokens in the BPE-based lexicon.

Usage example:

    python3 ./local/validate_bpe_lexicon.py \
            --lexicon /path/to/lexicon.txt \
            --bpe-model /path/to/bpe.model
"""

import argparse
from pathlib import Path
from typing import List, Tuple

import sentencepiece as spm

from icefall.lexicon import read_lexicon

# Map word to word pieces
Lexicon = List[Tuple[str, List[str]]]


def get_args():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--lexicon",
        required=True,
        type=Path,
        help="Path to lexicon.txt",
    )

    parser.add_argument(
        "--bpe-model",
        required=True,
        type=Path,
        help="Path to bpe.model",
    )

    return parser.parse_args()


def main():
    args = get_args()
    assert args.lexicon.is_file(), args.lexicon
    assert args.bpe_model.is_file(), args.bpe_model

    lexicon = read_lexicon(args.lexicon)

    sp = spm.SentencePieceProcessor()
    sp.load(str(args.bpe_model))

    word_pieces = set(sp.id_to_piece(list(range(sp.vocab_size()))))
    for word, pieces in lexicon:
        for p in pieces:
            if p not in word_pieces:
                raise ValueError(f"The word {word} contains an OOV token {p}")


if __name__ == "__main__":
    main()
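For reference, read_lexicon parses the usual Kaldi-style lexicon.txt layout: one word per line followed by its pieces, whitespace-separated (for example, a line like `HELLO ▁HE LL O`; the pieces here are invented). A sketch of inspecting a lexicon interactively, with an assumed path:

from icefall.lexicon import read_lexicon

# Assumed path; prepare.sh generates one lexicon.txt per lang_bpe_* dir.
lexicon = read_lexicon("data/lang_bpe_500/lexicon.txt")
word, pieces = lexicon[0]
print(word, pieces)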
@@ -184,13 +184,20 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
       done > $lang_dir/transcript_words.txt
     fi

-    ./local/train_bpe_model.py \
-      --lang-dir $lang_dir \
-      --vocab-size $vocab_size \
-      --transcript $lang_dir/transcript_words.txt
+    if [ ! -f $lang_dir/bpe.model ]; then
+      ./local/train_bpe_model.py \
+        --lang-dir $lang_dir \
+        --vocab-size $vocab_size \
+        --transcript $lang_dir/transcript_words.txt
+    fi

     if [ ! -f $lang_dir/L_disambig.pt ]; then
       ./local/prepare_lang_bpe.py --lang-dir $lang_dir
+
+      log "Validating $lang_dir/lexicon.txt"
+      ./local/validate_bpe_lexicon.py \
+        --lexicon $lang_dir/lexicon.txt \
+        --bpe-model $lang_dir/bpe.model
     fi
   done
 fi