use a separate script to download data and generate features

2025-12-11 06:55:27 +00:00 · 2023-12-22 16:27:39 +08:00 · 2023-12-22 16:27:39 +08:00 · ed286895bc
commit ed286895bc
parent 4fa40f5dcd
4 changed files with 173 additions and 29 deletions
--- a/.github/scripts/docker/Dockerfile
+++ b/.github/scripts/docker/Dockerfile
@ -1,7 +1,55 @@
-FROM k2fsa/icefall:torch1.13.0-cuda11.6
+ARG PYTHON_VERSION=3.8

-WORKDIR /workspace/icefall
+FROM python:${PYTHON_VERSION}

-RUN cd egs/librispeech/ASR && \
-    ./prepare.sh --stop-stage 1 && \
-    ls -lh download data
+RUN apt-get update -y && \
+    apt-get install -qq -y \
+    git \
+    git-lfs \
+    less \
+    vim \
+    && \
+    apt-get clean && \
+    rm -rf /var/cache/apt/archives /var/lib/apt/lists
+
+ARG K2_VERSION="1.24.4.dev20231220+cpu.torch1.13.0"
+ARG KALDIFEAT_VERSION="1.25.3.dev20231221+cpu.torch1.13.0"
+ARG TORCHAUDIO_VERSION="0.13.0"
+ARG TORCH_VERSION="1.13.0"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+# Install dependencies. Keep version specifiers quoted: sh parses a bare ">=" as an output redirection.
+RUN pip install --no-cache-dir \
+      torch==${TORCH_VERSION} torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/cpu/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cpu.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cpu.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      six \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+# RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+#     cd /workspace/icefall && \
+#     pip install --no-cache-dir -r requirements.txt
+#
+# ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+#
+# WORKDIR /workspace/icefall
--- a/.github/scripts/docker/run.sh
+++ b/.github/scripts/docker/run.sh
@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+set -ex
+
+cd /icefall
+export PYTHONPATH=/icefall:$PYTHONPATH
+
+cd egs/librispeech/ASR
+
+# We don't download the LM file since it is so large that it will
+# cause OOM error for CI later.
+mkdir -p download/lm
+pushd download/lm
+wget -q http://www.openslr.org/resources/11/librispeech-vocab.txt
+wget -q http://www.openslr.org/resources/11/librispeech-lexicon.txt
+wget -q http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz
+ls -lh
+gunzip librispeech-lm-norm.txt.gz
+
+ls -lh
+popd
+
+pushd download/
+wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/LibriSpeech.tar.bz2
+tar xf LibriSpeech.tar.bz2
+rm LibriSpeech.tar.bz2
+
+cd LibriSpeech
+ln -s train-clean-100 train-clean-360
+ln -s train-clean-100 train-other-500 # NOTE(review): was a self-link (train-other-500 -> train-other-500); aliased like train-clean-360 - confirm
+popd
+
+mkdir -p data/manifests
+
+lhotse prepare librispeech -j 2 -p dev-clean -p dev-other -p test-clean -p test-other -p train-clean-100 download/LibriSpeech data/manifests
+ls -lh data/manifests
+
+./local/compute_fbank_librispeech.py --dataset "dev-clean dev-other test-clean test-other train-clean-100" --perturb-speed False
+ls -lh data/fbank
+
+./prepare.sh --stage 5 --stop-stage 6
+
+./zipformer/train.py \
+  --world-size 1 \
+  --num-epochs 1 \
+  --start-epoch 1 \
+  --use-fp16 0 \
+  --exp-dir zipformer/exp-small \
+  --causal 0 \
+  --num-encoder-layers 1,1,1,1,1,1 \
+  --feedforward-dim 64,96,96,96,96,96 \
+  --encoder-dim 32,64,64,64,64,64 \
+  --encoder-unmasked-dim 32,32,32,32,32,32 \
+  --base-lr 0.04 \
+  --full-libri 0 \
+  --enable-musan 0 \
+  --max-duration 30 \
+  --print-diagnostics 1
--- a/.github/workflows/build-cpu-docker.yml
+++ b/.github/workflows/build-cpu-docker.yml
@ -0,0 +1,49 @@
+name: build-cpu-docker
+on:
+  push:
+    branches:
+      - ci-train-2
+  workflow_dispatch:
+
+concurrency:
+  group: build-cpu-docker-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-cpu-docker:
+    name: ${{ matrix.python-version }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.8", "3.9", "3.10"]
+
+    steps:
+      # refer to https://github.com/actions/checkout
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Free space
+        shell: bash
+        run: |
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+
+      - name: 'Login to GitHub Container Registry'
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build docker Image
+        shell: bash
+        run: |
+          cd .github/scripts/docker
+
+          docker build -t ghcr.io/csukuangfj/icefall:cpu-py${{ matrix.python-version }} --build-arg PYTHON_VERSION=${{ matrix.python-version }} .
+          docker image ls
+          docker push ghcr.io/csukuangfj/icefall:cpu-py${{ matrix.python-version }}
--- a/.github/workflows/train-librispeech.yml
+++ b/.github/workflows/train-librispeech.yml
@ -11,47 +11,37 @@ concurrency:

 jobs:
  train-librispeech:
-    name: ${{ matrix.image }}
+    name: ${{ matrix.python-version }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
+        python-version: ["3.8", "3.9", "3.10"]

    steps:
      # refer to https://github.com/actions/checkout
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

-      - name: Rename
-        shell: bash
-        run: |
-          cp -v .github/scripts/docker/Dockerfile ./Dockerfile
-
      - name: Free space
        shell: bash
        run: |
          df -h
          rm -rf /opt/hostedtoolcache
          df -h
+          echo "pwd: $PWD"
+          echo "github.workspace ${{ github.workspace }}"

-      - name: Log in to Docker Hub
-        uses: docker/login-action@v2
+      - name: Run the build process with Docker
+        uses: addnab/docker-run-action@v3
        with:
-          username: ${{ secrets.DOCKER_USERNAME }}
-          password: ${{ secrets.DOCKER_PASSWORD }}
+            image: ghcr.io/csukuangfj/icefall:cpu-py${{ matrix.python-version }}
+            options: |
+              --volume ${{ github.workspace }}/:/icefall
+            shell: bash
+            run: |
+              ls -lh /icefall

-      - name: 'Login to GitHub Container Registry'
-        uses: docker/login-action@v2
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Build docker Image
-        shell: bash
-        run: |
-          cp -v .github/scripts/docker/Dockerfile ./Dockerfile
-          docker build -t ghcr.io/k2-fsa/icefall:librispeech .
-          docker image ls
+              /icefall/.github/scripts/docker/run.sh