Merge branch 'k2-fsa:master' into musan

Commit 345b2ab1b0 by Bailey Machiko Hirota, 2025-07-11 17:52:46 +09:00 (committed by GitHub)
519 changed files with 29044 additions and 2823 deletions

.github/scripts/baker_zh/TTS/run-matcha.sh vendored Executable file

@@ -0,0 +1,167 @@
#!/usr/bin/env bash
set -ex
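# -e: abort on the first failing command; -x: echo each command so the CI log
# shows exactly which step ran.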
apt-get update
apt-get install -y sox
python3 -m pip install numba conformer==0.3.2 diffusers librosa
python3 -m pip install jieba
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/baker_zh/TTS
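# Patch prepare.sh so the CI smoke test works on tiny sizes (600 -> 8,
# "first 100" -> "first 3", 500 -> 5) and finishes quickly.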
sed -i.bak s/600/8/g ./prepare.sh
sed -i.bak s/"first 100"/"first 3"/g ./prepare.sh
sed -i.bak s/500/5/g ./prepare.sh
git diff
function prepare_data() {
# We have created a subset of the data for testing
#
mkdir -p download
pushd download
wget -q https://huggingface.co/csukuangfj/tmp-files/resolve/main/BZNSYP-samples.tar.bz2
tar xvf BZNSYP-samples.tar.bz2
mv BZNSYP-samples BZNSYP
rm BZNSYP-samples.tar.bz2
popd
./prepare.sh
tree .
}
function train() {
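# Train for a single epoch on the toy subset; the sed below shrinks a
# constant in train.py (1500 -> 3) so the run stays short.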
pushd ./matcha
sed -i.bak s/1500/3/g ./train.py
git diff .
popd
./matcha/train.py \
--exp-dir matcha/exp \
--num-epochs 1 \
--save-every-n 1 \
--num-buckets 2 \
--tokens data/tokens.txt \
--max-duration 20
ls -lh matcha/exp
}
function infer() {
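# Matcha-TTS predicts mel spectrograms; a pre-trained HiFiGAN vocoder
# (generator_v2) is downloaded below to turn them into waveforms.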
curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2
./matcha/infer.py \
--num-buckets 2 \
--epoch 1 \
--exp-dir ./matcha/exp \
--tokens data/tokens.txt \
--cmvn ./data/fbank/cmvn.json \
--vocoder ./generator_v2 \
--input-text "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。" \
--output-wav ./generated.wav
ls -lh *.wav
soxi ./generated.wav
rm -v ./generated.wav
rm -v generator_v2
}
function export_onnx() {
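# The 1-epoch CI checkpoint is only a smoke test; fetch a fully trained
# checkpoint (epoch-2000.pt) and its matching cmvn.json for the export.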
pushd matcha/exp
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-baker-matcha-zh-2024-12-27/resolve/main/epoch-2000.pt
popd
pushd data/fbank
rm -v *.json
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-baker-matcha-zh-2024-12-27/resolve/main/cmvn.json
popd
./matcha/export_onnx.py \
--exp-dir ./matcha/exp \
--epoch 2000 \
--tokens ./data/tokens.txt \
--cmvn ./data/fbank/cmvn.json
ls -lh *.onnx
if false; then
# The CI machine does not have enough memory to run it
#
curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2
curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v3
python3 ./matcha/export_onnx_hifigan.py
else
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v1.onnx
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v2.onnx
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v3.onnx
fi
ls -lh *.onnx
python3 ./matcha/generate_lexicon.py
for v in v1 v2 v3; do
python3 ./matcha/onnx_pretrained.py \
--acoustic-model ./model-steps-6.onnx \
--vocoder ./hifigan_$v.onnx \
--tokens ./data/tokens.txt \
--lexicon ./lexicon.txt \
--input-text "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。" \
--output-wav /icefall/generated-matcha-tts-steps-6-$v.wav
done
ls -lh /icefall/*.wav
soxi /icefall/generated-matcha-tts-steps-6-*.wav
cp ./model-steps-*.onnx /icefall
d=matcha-icefall-zh-baker
mkdir $d
cp -v data/tokens.txt $d
cp -v lexicon.txt $d
cp model-steps-3.onnx $d
pushd $d
curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
tar xvf dict.tar.bz2
rm dict.tar.bz2
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
cat >README.md <<EOF
# Introduction
This model is trained using the dataset from
https://en.data-baker.com/datasets/freeDatasets/
The dataset contains 10000 Chinese sentences recorded by a native female
Chinese speaker, about 12 hours of audio in total.
**Note**: The dataset is for non-commercial use only.
You can find the training code at
https://github.com/k2-fsa/icefall/tree/master/egs/baker_zh/TTS
EOF
ls -lh
popd
tar cvjf $d.tar.bz2 $d
mv $d.tar.bz2 /icefall
mv $d /icefall
}
prepare_data
train
infer
export_onnx
rm -rfv generator_v* matcha/exp
git checkout .


@@ -49,15 +49,15 @@ RUN pip install --no-cache-dir \
kaldifst \
kaldilm \
librosa \
matplotlib \
"matplotlib<=3.9.4" \
multi_quantization \
numba \
"numpy<2.0" \
onnxoptimizer \
onnxsim \
onnx \
onnx==1.17.0 \
onnxmltools \
onnxruntime \
onnxruntime==1.17.1 \
piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html \
pypinyin==0.50.0 \
pytest \


@@ -2,9 +2,29 @@
# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
import argparse
import json
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--min-torch-version",
        help="minimum torch version",
    )
    parser.add_argument(
        "--torch-version",
        help="torch version",
    )
    parser.add_argument(
        "--python-version",
        help="python version",
    )
    return parser.parse_args()

def version_gt(a, b):
    a_major, a_minor = list(map(int, a.split(".")))[:2]
    b_major, b_minor = list(map(int, b.split(".")))[:2]
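    # Example: "2.5.1" maps to (2, 5); patch components are ignored, so
    # "2.4.0" and "2.4.1" are indistinguishable to this comparison.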
@@ -42,27 +62,38 @@ def get_torchaudio_version(torch_version):
    return torch_version

def get_matrix():
    k2_version = "1.24.4.dev20241029"
    kaldifeat_version = "1.25.5.dev20241029"
    version = "20241029"
def get_matrix(min_torch_version, specified_torch_version, specified_python_version):
    k2_version = "1.24.4.dev20250630"
    kaldifeat_version = "1.25.5.dev20250630"
    version = "20250630"
    # torchaudio 2.5.0 does not support python 3.13
    python_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
    python_version = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
    torch_version = []
    # torch_version += ["1.13.0", "1.13.1"]
    # torch_version += ["2.0.0", "2.0.1"]
    # torch_version += ["2.1.0", "2.1.1", "2.1.2"]
    # torch_version += ["2.2.0", "2.2.1", "2.2.2"]
    torch_version += ["1.13.0", "1.13.1"]
    torch_version += ["2.0.0", "2.0.1"]
    torch_version += ["2.1.0", "2.1.1", "2.1.2"]
    torch_version += ["2.2.0", "2.2.1", "2.2.2"]
    # Test only torch >= 2.3.0
    torch_version += ["2.3.0", "2.3.1"]
    torch_version += ["2.4.0"]
    torch_version += ["2.4.1"]
    torch_version += ["2.5.0"]
    torch_version += ["2.5.1"]
    torch_version += ["2.6.0", "2.7.0", "2.7.1"]
    if specified_torch_version:
        torch_version = [specified_torch_version]
    if specified_python_version:
        python_version = [specified_python_version]
    matrix = []
    for p in python_version:
        for t in torch_version:
            if min_torch_version and version_gt(min_torch_version, t):
                continue
            # torchaudio <= 1.13.x supports only python <= 3.10
            if version_gt(p, "3.10") and not version_gt(t, "2.0"):
@@ -96,7 +127,12 @@ def get_matrix():
def main():
    matrix = get_matrix()
    args = get_args()
    matrix = get_matrix(
        min_torch_version=args.min_torch_version,
        specified_torch_version=args.torch_version,
        specified_python_version=args.python_version,
    )
    print(json.dumps({"include": matrix}))


@@ -1,7 +1,7 @@
#!/usr/bin/env python3
def main():
def get_v1_2_0_files():
    prefix = (
        "https://github.com/csukuangfj/piper-phonemize/releases/download/2023.12.5/"
    )
@@ -19,9 +19,70 @@ def main():
"piper_phonemize-1.2.0-cp39-cp39-macosx_10_14_x86_64.whl",
"piper_phonemize-1.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
]
ans = [prefix + f for f in files]
ans.sort()
return ans
def get_v1_3_0_files():
prefix = (
"https://github.com/csukuangfj/piper-phonemize/releases/download/2025.06.23/"
)
files = [
"piper_phonemize-1.3.0-cp310-cp310-macosx_10_9_universal2.whl",
"piper_phonemize-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl",
"piper_phonemize-1.3.0-cp310-cp310-macosx_11_0_arm64.whl",
"piper_phonemize-1.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
"piper_phonemize-1.3.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl",
"piper_phonemize-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
"piper_phonemize-1.3.0-cp310-cp310-win_amd64.whl",
"piper_phonemize-1.3.0-cp311-cp311-macosx_10_9_universal2.whl",
"piper_phonemize-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl",
"piper_phonemize-1.3.0-cp311-cp311-macosx_11_0_arm64.whl",
"piper_phonemize-1.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
"piper_phonemize-1.3.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl",
"piper_phonemize-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
"piper_phonemize-1.3.0-cp311-cp311-win_amd64.whl",
"piper_phonemize-1.3.0-cp312-cp312-macosx_10_13_universal2.whl",
"piper_phonemize-1.3.0-cp312-cp312-macosx_10_13_x86_64.whl",
"piper_phonemize-1.3.0-cp312-cp312-macosx_11_0_arm64.whl",
"piper_phonemize-1.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
"piper_phonemize-1.3.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl",
"piper_phonemize-1.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
"piper_phonemize-1.3.0-cp312-cp312-win_amd64.whl",
"piper_phonemize-1.3.0-cp313-cp313-macosx_10_13_universal2.whl",
"piper_phonemize-1.3.0-cp313-cp313-macosx_10_13_x86_64.whl",
"piper_phonemize-1.3.0-cp313-cp313-macosx_11_0_arm64.whl",
"piper_phonemize-1.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
"piper_phonemize-1.3.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl",
"piper_phonemize-1.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
"piper_phonemize-1.3.0-cp313-cp313-win_amd64.whl",
"piper_phonemize-1.3.0-cp38-cp38-macosx_10_9_universal2.whl",
"piper_phonemize-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl",
"piper_phonemize-1.3.0-cp38-cp38-macosx_11_0_arm64.whl",
"piper_phonemize-1.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
"piper_phonemize-1.3.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl",
"piper_phonemize-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
"piper_phonemize-1.3.0-cp38-cp38-win_amd64.whl",
"piper_phonemize-1.3.0-cp39-cp39-macosx_10_9_universal2.whl",
"piper_phonemize-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl",
"piper_phonemize-1.3.0-cp39-cp39-macosx_11_0_arm64.whl",
"piper_phonemize-1.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
"piper_phonemize-1.3.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl",
"piper_phonemize-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
"piper_phonemize-1.3.0-cp39-cp39-win_amd64.whl",
]
ans = [prefix + f for f in files]
ans.sort()
return ans
def main():
files = get_v1_3_0_files() + get_v1_2_0_files()
with open("piper_phonemize.html", "w") as f:
for file in files:
url = prefix + file
for url in files:
file = url.split("/")[-1]
f.write(f'<a href="{url}">{file}</a><br/>\n')
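            # Each generated line looks like:
            # <a href="https://.../piper_phonemize-1.3.0-....whl">piper_phonemize-1.3.0-....whl</a><br/>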

.github/scripts/librispeech/ASR/run_rknn.sh vendored Executable file

@@ -0,0 +1,275 @@
#!/usr/bin/env bash
set -ex
python3 -m pip install kaldi-native-fbank soundfile librosa
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/librispeech/ASR
# https://huggingface.co/csukuangfj/k2fsa-zipformer-chinese-english-mixed
# sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20
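# Export the bilingual zh-en streaming zipformer to ONNX, then convert the
# ONNX models to RKNN for each supported Rockchip NPU platform.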
function export_2023_02_20() {
d=exp_2023_02_20
mkdir $d
pushd $d
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-chinese-english-mixed/resolve/main/exp/pretrained.pt
mv pretrained.pt epoch-99.pt
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-chinese-english-mixed/resolve/main/data/lang_char_bpe/tokens.txt
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-chinese-english-mixed/resolve/main/test_wavs/0.wav
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-chinese-english-mixed/resolve/main/test_wavs/1.wav
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-chinese-english-mixed/resolve/main/test_wavs/2.wav
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-chinese-english-mixed/resolve/main/test_wavs/3.wav
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-chinese-english-mixed/resolve/main/test_wavs/4.wav
ls -lh
popd
./pruned_transducer_stateless7_streaming/export-onnx-zh.py \
--dynamic-batch 0 \
--enable-int8-quantization 0 \
--tokens $d/tokens.txt \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $d/ \
--decode-chunk-len 64 \
--num-encoder-layers "2,4,3,2,4" \
--feedforward-dims "1024,1024,1536,1536,1024" \
--nhead "8,8,8,8,8" \
--encoder-dims "384,384,384,384,384" \
--attention-dims "192,192,192,192,192" \
--encoder-unmasked-dims "256,256,256,256,256" \
--zipformer-downsampling-factors "1,2,4,8,2" \
--cnn-module-kernels "31,31,31,31,31" \
--decoder-dim 512 \
--joiner-dim 512
ls -lh $d/
./pruned_transducer_stateless7_streaming/onnx_pretrained.py \
--encoder-model-filename $d/encoder-epoch-99-avg-1.onnx \
--decoder-model-filename $d/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $d/joiner-epoch-99-avg-1.onnx \
--tokens $d/tokens.txt \
$d/0.wav
./pruned_transducer_stateless7_streaming/onnx_pretrained.py \
--encoder-model-filename $d/encoder-epoch-99-avg-1.onnx \
--decoder-model-filename $d/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $d/joiner-epoch-99-avg-1.onnx \
--tokens $d/tokens.txt \
$d/1.wav
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
dst=sherpa-onnx-$platform-streaming-zipformer-bilingual-zh-en-2023-02-20
mkdir -p $dst
./pruned_transducer_stateless7_streaming/export_rknn.py \
--in-encoder $d/encoder-epoch-99-avg-1.onnx \
--in-decoder $d/decoder-epoch-99-avg-1.onnx \
--in-joiner $d/joiner-epoch-99-avg-1.onnx \
--out-encoder $dst/encoder.rknn \
--out-decoder $dst/decoder.rknn \
--out-joiner $dst/joiner.rknn \
--target-platform $platform 2>/dev/null
ls -lh $dst/
./pruned_transducer_stateless7_streaming/test_rknn_on_cpu_simulator.py \
--encoder $d/encoder-epoch-99-avg-1.onnx \
--decoder $d/decoder-epoch-99-avg-1.onnx \
--joiner $d/joiner-epoch-99-avg-1.onnx \
--tokens $d/tokens.txt \
--wav $d/0.wav
cp $d/tokens.txt $dst
mkdir $dst/test_wavs
cp $d/*.wav $dst/test_wavs
tar cjvf $dst.tar.bz2 $dst
ls -lh $dst.tar.bz2
mv $dst.tar.bz2 /icefall/
ls -lh $dst/
echo "---"
rm -rf $dst
done
}
# https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t
# sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16
function export_2023_02_16() {
d=exp_2023_02_16
mkdir $d
pushd $d
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t/resolve/main/exp/pretrained.pt
mv pretrained.pt epoch-99.pt
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t/resolve/main/data/lang_char_bpe/tokens.txt
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t/resolve/main/test_wavs/0.wav
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t/resolve/main/test_wavs/1.wav
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t/resolve/main/test_wavs/2.wav
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t/resolve/main/test_wavs/3.wav
curl -SL -O https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t/resolve/main/test_wavs/4.wav
ls -lh
popd
./pruned_transducer_stateless7_streaming/export-onnx-zh.py \
--dynamic-batch 0 \
--enable-int8-quantization 0 \
--tokens $d/tokens.txt \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $d/ \
--decode-chunk-len 64 \
\
--num-encoder-layers 2,2,2,2,2 \
--feedforward-dims 768,768,768,768,768 \
--nhead 4,4,4,4,4 \
--encoder-dims 256,256,256,256,256 \
--attention-dims 192,192,192,192,192 \
--encoder-unmasked-dims 192,192,192,192,192 \
\
--zipformer-downsampling-factors "1,2,4,8,2" \
--cnn-module-kernels "31,31,31,31,31" \
--decoder-dim 512 \
--joiner-dim 512
ls -lh $d/
./pruned_transducer_stateless7_streaming/onnx_pretrained.py \
--encoder-model-filename $d/encoder-epoch-99-avg-1.onnx \
--decoder-model-filename $d/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $d/joiner-epoch-99-avg-1.onnx \
--tokens $d/tokens.txt \
$d/0.wav
./pruned_transducer_stateless7_streaming/onnx_pretrained.py \
--encoder-model-filename $d/encoder-epoch-99-avg-1.onnx \
--decoder-model-filename $d/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $d/joiner-epoch-99-avg-1.onnx \
--tokens $d/tokens.txt \
$d/1.wav
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
dst=sherpa-onnx-$platform-streaming-zipformer-small-bilingual-zh-en-2023-02-16
mkdir -p $dst
./pruned_transducer_stateless7_streaming/export_rknn.py \
--in-encoder $d/encoder-epoch-99-avg-1.onnx \
--in-decoder $d/decoder-epoch-99-avg-1.onnx \
--in-joiner $d/joiner-epoch-99-avg-1.onnx \
--out-encoder $dst/encoder.rknn \
--out-decoder $dst/decoder.rknn \
--out-joiner $dst/joiner.rknn \
--target-platform $platform 2>/dev/null
ls -lh $dst/
./pruned_transducer_stateless7_streaming/test_rknn_on_cpu_simulator.py \
--encoder $d/encoder-epoch-99-avg-1.onnx \
--decoder $d/decoder-epoch-99-avg-1.onnx \
--joiner $d/joiner-epoch-99-avg-1.onnx \
--tokens $d/tokens.txt \
--wav $d/0.wav
cp $d/tokens.txt $dst
mkdir $dst/test_wavs
cp $d/*.wav $dst/test_wavs
tar cjvf $dst.tar.bz2 $dst
ls -lh $dst.tar.bz2
mv $dst.tar.bz2 /icefall/
ls -lh $dst/
echo "---"
rm -rf $dst
done
}
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-en-2023-06-26-english
function export_2023_06_26() {
d=exp_2023_06_26
mkdir $d
pushd $d
curl -SL -O https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/exp/pretrained.pt
mv pretrained.pt epoch-99.pt
curl -SL -O https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/data/lang_bpe_500/tokens.txt
curl -SL -o 0.wav https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/test_wavs/1089-134686-0001.wav
curl -SL -o 1.wav https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/test_wavs/1221-135766-0001.wav
curl -SL -o 2.wav https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/test_wavs/1221-135766-0002.wav
ls -lh
popd
./zipformer/export-onnx-streaming.py \
--dynamic-batch 0 \
--enable-int8-quantization 0 \
--tokens $d/tokens.txt \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $d \
--use-ctc 0 \
--use-transducer 1 \
\
--chunk-size 32 \
--left-context-frames 128 \
--causal 1
ls -lh $d/
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
dst=sherpa-onnx-$platform-streaming-zipformer-en-2023-06-26
mkdir -p $dst
./zipformer/export_rknn_transducer_streaming.py \
--in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \
--in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \
--in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \
--out-encoder $dst/encoder.rknn \
--out-decoder $dst/decoder.rknn \
--out-joiner $dst/joiner.rknn \
--target-platform $platform
ls -lh $dst/
cp $d/tokens.txt $dst
mkdir $dst/test_wavs
cp $d/*.wav $dst/test_wavs
tar cjvf $dst.tar.bz2 $dst
ls -lh $dst.tar.bz2
mv $dst.tar.bz2 /icefall/
ls -lh $dst/
echo "---"
rm -rf $dst
done
}
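# $rknn_toolkit2_version is expected to be set by the CI workflow; which
# models get exported depends on the installed rknn-toolkit2 version.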
if [[ $rknn_toolkit2_version == "2.1.0" ]]; then
export_2023_02_16
export_2023_02_20
else
export_2023_06_26
fi


@@ -56,7 +56,8 @@ function infer() {
curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
./matcha/inference.py \
./matcha/infer.py \
--num-buckets 2 \
--epoch 1 \
--exp-dir ./matcha/exp \
--tokens data/tokens.txt \
@@ -76,7 +77,7 @@ function export_onnx() {
popd
pushd data/fbank
rm -v *.json
rm -fv *.json
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/data/cmvn.json
popd
@@ -89,7 +90,7 @@ function export_onnx() {
ls -lh *.onnx
if false; then
# THe CI machine does not have enough memory to run it
# The CI machine does not have enough memory to run it
#
curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2
@@ -97,19 +98,54 @@ function export_onnx() {
python3 ./matcha/export_onnx_hifigan.py
else
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v1.onnx
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v2.onnx
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v3.onnx
fi
ls -lh *.onnx
python3 ./matcha/onnx_pretrained.py \
--acoustic-model ./model-steps-6.onnx \
--vocoder ./hifigan_v1.onnx \
--tokens ./data/tokens.txt \
--input-text "how are you doing?" \
--output-wav /icefall/generated-matcha-tts-steps-6-v1.wav
for v in v1 v2 v3; do
python3 ./matcha/onnx_pretrained.py \
--acoustic-model ./model-steps-6.onnx \
--vocoder ./hifigan_$v.onnx \
--tokens ./data/tokens.txt \
--input-text "how are you doing?" \
--output-wav /icefall/generated-matcha-tts-steps-6-$v.wav
done
ls -lh /icefall/*.wav
soxi /icefall/generated-matcha-tts-steps-6-v1.wav
soxi /icefall/generated-matcha-tts-steps-6-*.wav
cp ./model-steps-*.onnx /icefall
d=matcha-icefall-en_US-ljspeech
mkdir $d
cp -v data/tokens.txt $d
cp model-steps-3.onnx $d
pushd $d
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
tar xf espeak-ng-data.tar.bz2
rm espeak-ng-data.tar.bz2
cat >README.md <<EOF
# Introduction
This model is trained using the dataset from
https://keithito.com/LJ-Speech-Dataset/
The dataset contains only 1 female speaker.
You can find the training code at
https://github.com/k2-fsa/icefall/tree/master/egs/ljspeech/TTS#matcha
EOF
ls -lh
popd
tar cvjf $d.tar.bz2 $d
mv $d.tar.bz2 /icefall
mv $d /icefall
}
prepare_data
@@ -118,3 +154,4 @@ infer
export_onnx
rm -rfv generator_v* matcha/exp
git checkout .


@@ -1,200 +0,0 @@
#!/usr/bin/env bash
set -ex
git config --global user.name "k2-fsa"
git config --global user.email "csukuangfj@gmail.com"
git config --global lfs.allowincompletepush true
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "pwd: $PWD"
cd egs/multi_zh-hans/ASR
repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-2023-9-2
log "Downloading pre-trained model from $repo_url"
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
cd exp
git lfs pull --include pretrained.pt
ln -s pretrained.pt epoch-99.pt
cd ../data/lang_bpe_2000
ls -lh
git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
git lfs pull --include "*.model"
ls -lh
popd
log "--------------------------------------------"
log "Export non-streaming ONNX transducer models "
log "--------------------------------------------"
./zipformer/export-onnx.py \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $repo/exp \
--causal False
ls -lh $repo/exp
./zipformer/onnx_pretrained.py \
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/DEV_T0000000000.wav \
$repo/test_wavs/DEV_T0000000001.wav \
$repo/test_wavs/DEV_T0000000002.wav \
$repo/test_wavs/TEST_MEETING_T0000000113.wav \
$repo/test_wavs/TEST_MEETING_T0000000219.wav \
$repo/test_wavs/TEST_MEETING_T0000000351.wav
rm -rf $repo
repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05
log "Downloading pre-trained model from $repo_url"
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
cd exp/
git lfs pull --include pretrained.pt
rm -fv epoch-20.pt
rm -fv *.onnx
ln -s pretrained.pt epoch-20.pt
cd ../data/lang_bpe_2000
ls -lh
git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
git lfs pull --include "*.model"
ls -lh
popd
log "----------------------------------------"
log "Export streaming ONNX CTC models "
log "----------------------------------------"
./zipformer/export-onnx-streaming-ctc.py \
--exp-dir $repo/exp \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
--causal 1 \
--avg 1 \
--epoch 20 \
--use-averaged-model 0 \
--chunk-size 16 \
--left-context-frames 128 \
--use-ctc 1
ls -lh $repo/exp/
log "------------------------------------------------------------"
log "Test exported streaming ONNX CTC models (greedy search) "
log "------------------------------------------------------------"
test_wavs=(
DEV_T0000000000.wav
DEV_T0000000001.wav
DEV_T0000000002.wav
TEST_MEETING_T0000000113.wav
TEST_MEETING_T0000000219.wav
TEST_MEETING_T0000000351.wav
)
for w in ${test_wavs[@]}; do
./zipformer/onnx_pretrained-streaming-ctc.py \
--model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/$w
done
log "Upload onnx CTC models to huggingface"
url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
GIT_LFS_SKIP_SMUDGE=1 git clone $url
dst=$(basename $url)
cp -v $repo/exp/ctc*.onnx $dst
cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
cp -v $repo/data/lang_bpe_2000/bpe.model $dst
mkdir -p $dst/test_wavs
cp -v $repo/test_wavs/*.wav $dst/test_wavs
cd $dst
git lfs track "*.onnx" "bpe.model"
ls -lh
file bpe.model
git status
git add .
git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
rm -rf .git
rm -fv .gitattributes
cd ..
tar cjfv $dst.tar.bz2 $dst
ls -lh *.tar.bz2
mv -v $dst.tar.bz2 ../../../
log "----------------------------------------"
log "Export streaming ONNX transducer models "
log "----------------------------------------"
./zipformer/export-onnx-streaming.py \
--exp-dir $repo/exp \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
--causal 1 \
--avg 1 \
--epoch 20 \
--use-averaged-model 0 \
--chunk-size 16 \
--left-context-frames 128 \
--use-ctc 0
ls -lh $repo/exp
log "------------------------------------------------------------"
log "Test exported streaming ONNX transducer models (Python code)"
log "------------------------------------------------------------"
log "test fp32"
./zipformer/onnx_pretrained-streaming.py \
--encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/DEV_T0000000000.wav
log "test int8"
./zipformer/onnx_pretrained-streaming.py \
--encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/DEV_T0000000000.wav
log "Upload onnx transducer models to huggingface"
url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12
GIT_LFS_SKIP_SMUDGE=1 git clone $url
dst=$(basename $url)
cp -v $repo/exp/encoder*.onnx $dst
cp -v $repo/exp/decoder*.onnx $dst
cp -v $repo/exp/joiner*.onnx $dst
cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
cp -v $repo/data/lang_bpe_2000/bpe.model $dst
mkdir -p $dst/test_wavs
cp -v $repo/test_wavs/*.wav $dst/test_wavs
cd $dst
git lfs track "*.onnx" bpe.model
git add .
git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
rm -rf .git
rm -fv .gitattributes
cd ..
tar cjfv $dst.tar.bz2 $dst
ls -lh *.tar.bz2
mv -v $dst.tar.bz2 ../../../

.github/scripts/multi_zh-hans/ASR/run.sh vendored Executable file

@@ -0,0 +1,756 @@
#!/usr/bin/env bash
set -ex
git config --global user.name "k2-fsa"
git config --global user.email "csukuangfj@gmail.com"
git config --global lfs.allowincompletepush true
python3 -m pip install onnxmltools==1.13.0 onnx==1.17.0 onnxruntime==1.17.1 sherpa-onnx
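# Pin onnx/onnxruntime; the versions match the pins in the project Dockerfile.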
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/multi_zh-hans/ASR
log "pwd: $PWD"
function run_2023_9_2() {
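# Export the non-streaming zipformer to ONNX and check the fp32, int8 and
# fp16 variants with onnx_pretrained.py.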
repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-2023-9-2
log "Downloading pre-trained model from $repo_url"
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
cd exp
git lfs pull --include pretrained.pt
ln -s pretrained.pt epoch-99.pt
cd ../data/lang_bpe_2000
ls -lh
git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
git lfs pull --include "*.model"
ls -lh
popd
log "--------------------------------------------"
log "Export non-streaming ONNX transducer models "
log "--------------------------------------------"
./zipformer/export-onnx.py \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $repo/exp \
--causal False \
--fp16 1
ls -lh $repo/exp
./zipformer/onnx_pretrained.py \
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/DEV_T0000000000.wav \
$repo/test_wavs/DEV_T0000000001.wav \
$repo/test_wavs/DEV_T0000000002.wav \
$repo/test_wavs/TEST_MEETING_T0000000113.wav \
$repo/test_wavs/TEST_MEETING_T0000000219.wav \
$repo/test_wavs/TEST_MEETING_T0000000351.wav
./zipformer/onnx_pretrained.py \
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.int8.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.int8.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/DEV_T0000000000.wav \
$repo/test_wavs/DEV_T0000000001.wav \
$repo/test_wavs/DEV_T0000000002.wav \
$repo/test_wavs/TEST_MEETING_T0000000113.wav \
$repo/test_wavs/TEST_MEETING_T0000000219.wav \
$repo/test_wavs/TEST_MEETING_T0000000351.wav
./zipformer/onnx_pretrained.py \
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.fp16.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.fp16.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.fp16.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/DEV_T0000000000.wav \
$repo/test_wavs/DEV_T0000000001.wav \
$repo/test_wavs/DEV_T0000000002.wav \
$repo/test_wavs/TEST_MEETING_T0000000113.wav \
$repo/test_wavs/TEST_MEETING_T0000000219.wav \
$repo/test_wavs/TEST_MEETING_T0000000351.wav
rm -rf $repo
}
function run_2023_11_05_streaming() {
repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05
log "Downloading pre-trained model from $repo_url"
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
cd exp/
git lfs pull --include pretrained.pt
rm -fv epoch-20.pt
rm -fv *.onnx
ln -s pretrained.pt epoch-20.pt
cd ../data/lang_bpe_2000
ls -lh
git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
git lfs pull --include "*.model"
ls -lh
popd
log "----------------------------------------"
log "Export streaming ONNX CTC models "
log "----------------------------------------"
./zipformer/export-onnx-streaming-ctc.py \
--exp-dir $repo/exp \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
--causal 1 \
--avg 1 \
--epoch 20 \
--use-averaged-model 0 \
--chunk-size 16 \
--left-context-frames 128 \
--use-ctc 1 \
--fp16 1
ls -lh $repo/exp/
log "------------------------------------------------------------"
log "Test exported streaming ONNX CTC models (greedy search) "
log "------------------------------------------------------------"
test_wavs=(
DEV_T0000000000.wav
DEV_T0000000001.wav
DEV_T0000000002.wav
TEST_MEETING_T0000000113.wav
TEST_MEETING_T0000000219.wav
TEST_MEETING_T0000000351.wav
)
for w in ${test_wavs[@]}; do
log "----fp32----"
./zipformer/onnx_pretrained-streaming-ctc.py \
--model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/$w
log "----int8----"
./zipformer/onnx_pretrained-streaming-ctc.py \
--model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/$w
log "----fp16----"
./zipformer/onnx_pretrained-streaming-ctc.py \
--model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.fp16.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/$w
done
log "Upload onnx CTC models to huggingface"
name=(
sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-int8-2023-12-13
sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-fp16-2023-12-13
)
for n in ${name[@]}; do
url=https://huggingface.co/k2-fsa/$n
GIT_LFS_SKIP_SMUDGE=1 git clone $url
dst=$(basename $url)
if [[ $n == sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 ]]; then
cp -v $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.onnx $dst
elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-int8-2023-12-13 ]]; then
cp -v $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx $dst
elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-fp16-2023-12-13 ]]; then
cp -v $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.fp16.onnx $dst
fi
cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
cp -v $repo/data/lang_bpe_2000/bpe.model $dst
mkdir -p $dst/test_wavs
cp -v $repo/test_wavs/*.wav $dst/test_wavs
cd $dst
git lfs track "*.onnx" "bpe.model" "*.wav"
ls -lh
file bpe.model
git status
git add .
git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
rm -rf .git
rm -fv .gitattributes
cd ..
tar cjfv $dst.tar.bz2 $dst
ls -lh *.tar.bz2
mv -v $dst.tar.bz2 ../../../
done
log "----------------------------------------"
log "Export streaming ONNX transducer models "
log "----------------------------------------"
./zipformer/export-onnx-streaming.py \
--exp-dir $repo/exp \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
--causal 1 \
--avg 1 \
--epoch 20 \
--use-averaged-model 0 \
--chunk-size 16 \
--left-context-frames 128 \
--use-ctc 0 \
--fp16 1
ls -lh $repo/exp
log "------------------------------------------------------------"
log "Test exported streaming ONNX transducer models (Python code)"
log "------------------------------------------------------------"
log "test fp32"
./zipformer/onnx_pretrained-streaming.py \
--encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/DEV_T0000000000.wav
log "test int8"
./zipformer/onnx_pretrained-streaming.py \
--encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/DEV_T0000000000.wav
log "test fp16"
./zipformer/onnx_pretrained-streaming.py \
--encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.fp16.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.fp16.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.fp16.onnx \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
$repo/test_wavs/DEV_T0000000000.wav
name=(
sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-13
sherpa-onnx-streaming-zipformer-multi-zh-hans-int8-2023-12-13
sherpa-onnx-streaming-zipformer-multi-zh-hans-fp16-2023-12-13
)
for n in ${name[@]}; do
url=https://huggingface.co/csukuangfj/$n
GIT_LFS_SKIP_SMUDGE=1 git clone $url
dst=$(basename $url)
if [[ $n == sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-13 ]]; then
cp -v $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.onnx $dst
cp -v $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx $dst
cp -v $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.onnx $dst
elif [[ $n == sherpa-onnx-streaming-zipformer-multi-zh-hans-int8-2023-12-13 ]]; then
cp -v $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx $dst
cp -v $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx $dst
cp -v $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx $dst
elif [[ $n == sherpa-onnx-streaming-zipformer-multi-zh-hans-fp16-2023-12-13 ]]; then
cp -v $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.fp16.onnx $dst
cp -v $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.fp16.onnx $dst
cp -v $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.fp16.onnx $dst
fi
cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
cp -v $repo/data/lang_bpe_2000/bpe.model $dst
mkdir -p $dst/test_wavs
cp -v $repo/test_wavs/*.wav $dst/test_wavs
cd $dst
git lfs track "*.onnx" "bpe.model" "*.wav"
ls -lh
file bpe.model
git status
git add .
git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true
log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
rm -rf .git
rm -fv .gitattributes
cd ..
tar cjfv $dst.tar.bz2 $dst
ls -lh *.tar.bz2
mv -v $dst.tar.bz2 ../../../
done
}
function run_2023_12_12_streaming() {
log "Upload onnx transducer models to huggingface"
url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12
GIT_LFS_SKIP_SMUDGE=1 git clone $url
dst=$(basename $url)
cp -v $repo/exp/encoder*.onnx $dst
cp -v $repo/exp/decoder*.onnx $dst
cp -v $repo/exp/joiner*.onnx $dst
cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
cp -v $repo/data/lang_bpe_2000/bpe.model $dst
mkdir -p $dst/test_wavs
cp -v $repo/test_wavs/*.wav $dst/test_wavs
cd $dst
git lfs track "*.onnx" bpe.model "*.wav"
git add .
git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
rm -rf .git
rm -fv .gitattributes
cd ..
tar cjfv $dst.tar.bz2 $dst
ls -lh *.tar.bz2
mv -v $dst.tar.bz2 ../../../
}
function run_yuekai_large() {
repo_url=https://csukuangfj:${HF_TOKEN}@huggingface.co/yuekai/icefall-asr-multi-zh-hans-zipformer-large
log "Downloading pre-trained model from $repo_url"
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include pretrained.pt
mv pretrained.pt epoch-99.pt
curl -SL -O https://huggingface.co/pingzxy/icefall-asr-multi-zh-hans-zipformer-large-onnx/resolve/main/tokens.txt
popd
log "----------------------------------------"
log "Export streaming ONNX CTC models "
log "----------------------------------------"
./zipformer/export-onnx-streaming-ctc.py \
--exp-dir $repo/ \
--tokens $repo/tokens.txt \
--causal 1 \
--avg 1 \
--epoch 99 \
--use-averaged-model 0 \
--chunk-size 16 \
--left-context-frames 128 \
--use-ctc 1 \
\
--num-encoder-layers 2,2,4,5,4,2 \
--feedforward-dim 768,1024,1536,2048,1536,768 \
--encoder-dim 256,384,512,768,512,256 \
--encoder-unmasked-dim 192,192,256,320,256,192 \
\
--fp16 1 \
--use-whisper-features 1
ls -lh $repo/
pushd $repo
cat >README.md <<EOF
# Introduction
This model is converted from
https://huggingface.co/yuekai/icefall-asr-multi-zh-hans-zipformer-large
The training code can be found at
https://github.com/k2-fsa/icefall/blob/master/egs/multi_zh-hans/ASR/RESULTS.md#multi-chinese-datasets-char-based-training-results-streaming-on-zipformer-large-model
EOF
mv -v ctc-epoch-99-avg-1-chunk-16-left-128.fp16.onnx model.fp16.onnx
mv -v ctc-epoch-99-avg-1-chunk-16-left-128.int8.onnx model.int8.onnx
mv -v ctc-epoch-99-avg-1-chunk-16-left-128.onnx model.onnx
ls -lh *.onnx
mkdir test_wavs
cd test_wavs
curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/0.wav
curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/1.wav
curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/8k.wav
popd
for w in 0.wav 1.wav 8k.wav; do
log "---fp32---"
sherpa-onnx \
--zipformer2-ctc-model=$repo/model.onnx \
--tokens=$repo/tokens.txt \
$repo/test_wavs/$w
log "---int8---"
sherpa-onnx \
--zipformer2-ctc-model=$repo/model.int8.onnx \
--tokens=$repo/tokens.txt \
$repo/test_wavs/$w
log "---fp16---"
sherpa-onnx \
--zipformer2-ctc-model=$repo/model.fp16.onnx \
--tokens=$repo/tokens.txt \
$repo/test_wavs/$w
done
name=(
sherpa-onnx-streaming-zipformer-ctc-zh-2025-06-30
sherpa-onnx-streaming-zipformer-ctc-zh-int8-2025-06-30
sherpa-onnx-streaming-zipformer-ctc-zh-fp16-2025-06-30
)
for n in ${name[@]}; do
url=https://huggingface.co/csukuangfj/$n
GIT_LFS_SKIP_SMUDGE=1 git clone $url
dst=$(basename $url)
if [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-2025-06-30 ]]; then
cp -v $repo/model.onnx $dst
elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-int8-2025-06-30 ]]; then
cp -v $repo/model.int8.onnx $dst
elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-fp16-2025-06-30 ]]; then
cp -v $repo/model.fp16.onnx $dst
fi
cp -v $repo/tokens.txt $dst
cp -v $repo/README.md $dst
mkdir -p $dst/test_wavs
cp -v $repo/test_wavs/*.wav $dst/test_wavs
cd $dst
git lfs track "*.onnx" "*.wav"
ls -lh
git status
git add .
git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true
log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
rm -rf .git
rm -fv .gitattributes
cd ..
tar cjfv $dst.tar.bz2 $dst
ls -lh *.tar.bz2
mv -v $dst.tar.bz2 ../../../
done
rm $repo/*.onnx
log "----------------------------------------"
log "Export streaming ONNX transducer models "
log "----------------------------------------"
./zipformer/export-onnx-streaming.py \
--exp-dir $repo \
--tokens $repo/tokens.txt \
--causal 1 \
--avg 1 \
--epoch 99 \
--use-averaged-model 0 \
--chunk-size 16 \
--left-context-frames 128 \
--use-ctc 0 \
\
--num-encoder-layers 2,2,4,5,4,2 \
--feedforward-dim 768,1024,1536,2048,1536,768 \
--encoder-dim 256,384,512,768,512,256 \
--encoder-unmasked-dim 192,192,256,320,256,192 \
\
--fp16 1 \
--use-whisper-features 1
ls -lh $repo
pushd $repo
for m in encoder decoder joiner; do
mv -v $m-epoch-99-avg-1-chunk-16-left-128.onnx $m.onnx
mv -v $m-epoch-99-avg-1-chunk-16-left-128.fp16.onnx $m.fp16.onnx
mv -v $m-epoch-99-avg-1-chunk-16-left-128.int8.onnx $m.int8.onnx
done
ls -lh *.onnx
popd
for w in 0.wav 1.wav 8k.wav; do
log "---fp32---"
sherpa-onnx \
--encoder=$repo/encoder.onnx \
--decoder=$repo/decoder.onnx \
--joiner=$repo/joiner.onnx \
--tokens=$repo/tokens.txt \
$repo/test_wavs/$w
log "---int8---"
sherpa-onnx \
--encoder=$repo/encoder.int8.onnx \
--decoder=$repo/decoder.onnx \
--joiner=$repo/joiner.int8.onnx \
--tokens=$repo/tokens.txt \
$repo/test_wavs/$w
log "---fp16---"
sherpa-onnx \
--encoder=$repo/encoder.fp16.onnx \
--decoder=$repo/decoder.fp16.onnx \
--joiner=$repo/joiner.fp16.onnx \
--tokens=$repo/tokens.txt \
$repo/test_wavs/$w
done
name=(
sherpa-onnx-streaming-zipformer-zh-2025-06-30
sherpa-onnx-streaming-zipformer-zh-int8-2025-06-30
sherpa-onnx-streaming-zipformer-zh-fp16-2025-06-30
)
for n in ${name[@]}; do
url=https://huggingface.co/csukuangfj/$n
GIT_LFS_SKIP_SMUDGE=1 git clone $url
dst=$(basename $url)
if [[ $n == sherpa-onnx-streaming-zipformer-zh-2025-06-30 ]]; then
cp -v $repo/encoder.onnx $dst
cp -v $repo/decoder.onnx $dst
cp -v $repo/joiner.onnx $dst
elif [[ $n == sherpa-onnx-streaming-zipformer-zh-int8-2025-06-30 ]]; then
cp -v $repo/encoder.int8.onnx $dst
cp -v $repo/decoder.onnx $dst
cp -v $repo/joiner.int8.onnx $dst
elif [[ $n == sherpa-onnx-streaming-zipformer-zh-fp16-2025-06-30 ]]; then
cp -v $repo/encoder.fp16.onnx $dst
cp -v $repo/decoder.fp16.onnx $dst
cp -v $repo/joiner.fp16.onnx $dst
fi
cp -v $repo/tokens.txt $dst
cp -v $repo/README.md $dst
mkdir -p $dst/test_wavs
cp -v $repo/test_wavs/*.wav $dst/test_wavs
cd $dst
git lfs track "*.onnx" "*.wav"
ls -lh
git status
git add .
git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true
log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
rm -rf .git
rm -fv .gitattributes
cd ..
tar cjfv $dst.tar.bz2 $dst
ls -lh *.tar.bz2
mv -v $dst.tar.bz2 ../../../
done
}
function run_yuekai_xl() {
repo_url=https://csukuangfj:${HF_TOKEN}@huggingface.co/yuekai/icefall-asr-multi-zh-hans-zipformer-xl
log "Downloading pre-trained model from $repo_url"
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include pretrained.pt
git lfs pull --include data/lang_bpe_2000/bpe.model
mv pretrained.pt epoch-99.pt
ls -lh *.pt
popd
log "----------------------------------------"
log "Export streaming ONNX CTC models "
log "----------------------------------------"
./zipformer/export-onnx-streaming-ctc.py \
--exp-dir $repo/ \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
--causal 1 \
--avg 1 \
--epoch 99 \
--use-averaged-model 0 \
--chunk-size 16 \
--left-context-frames 128 \
--use-ctc 1 \
\
--num-encoder-layers 2,3,5,6,5,3 \
--feedforward-dim 1536,2048,3072,4096,3072,1536 \
--encoder-dim 512,768,1024,1536,1024,512 \
--encoder-unmasked-dim 192,192,256,320,256,192 \
--decoder-dim 768 --joiner-dim 768 \
--value-head-dim 18 \
--query-head-dim 48 \
--num-heads 4,4,4,8,4,4 \
\
--fp16 1 \
--use-whisper-features 1 \
--use-external-data 1
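# --use-external-data writes the weights to separate .weights files next to
# the .onnx graphs (ONNX's protobuf format caps a single file at 2 GB).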
mv -v ctc-epoch-99-avg-1-chunk-16-left-128.int8.onnx model.int8.onnx
mv -v ctc-epoch-99-avg-1-chunk-16-left-128.fp16.onnx model.fp16.onnx
ls -lh *.onnx
mkdir test_wavs
pushd test_wavs
curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/0.wav
curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/1.wav
curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/8k.wav
popd
for w in 0.wav 1.wav 8k.wav; do
log "---int8---"
sherpa-onnx \
--zipformer2-ctc-model=./model.int8.onnx \
--tokens=$repo/data/lang_bpe_2000/tokens.txt \
test_wavs/$w
log "---fp16---"
sherpa-onnx \
--zipformer2-ctc-model=./model.fp16.onnx \
--tokens=$repo/data/lang_bpe_2000/tokens.txt \
test_wavs/$w
done
pushd $repo
cat >README.md <<EOF
# Introduction
This model is converted from
https://huggingface.co/yuekai/icefall-asr-multi-zh-hans-zipformer-xl
The training code can be found at
https://github.com/k2-fsa/icefall/blob/master/egs/multi_zh-hans/ASR/RESULTS.md#multi-chinese-datasets-char-based-training-results-streaming-on-zipformer-xl-model
EOF
popd
name=(
sherpa-onnx-streaming-zipformer-ctc-zh-xlarge-int8-2025-06-30
sherpa-onnx-streaming-zipformer-ctc-zh-xlarge-fp16-2025-06-30
)
for n in ${name[@]}; do
url=https://huggingface.co/csukuangfj/$n
GIT_LFS_SKIP_SMUDGE=1 git clone $url
dst=$(basename $url)
if [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-xlarge-fp16-2025-06-30 ]]; then
cp -v model.fp16.onnx $dst
elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-xlarge-int8-2025-06-30 ]]; then
cp -v model.int8.onnx $dst
fi
cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
cp -v $repo/data/lang_bpe_2000/bpe.model $dst
cp -v $repo/README.md $dst
mkdir -p $dst/test_wavs
cp -v ./test_wavs/*.wav $dst/test_wavs
cd $dst
git lfs track "*.onnx" "*.wav" "bpe.model"
ls -lh
git status
git add .
git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true
log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
rm -rf .git
rm -fv .gitattributes
cd ..
ls -lh $dst
tar cjfv $dst.tar.bz2 $dst
ls -lh *.tar.bz2
mv -v $dst.tar.bz2 ../../../
done
rm -fv *.onnx *.weights
log "----------------------------------------"
log "Export streaming ONNX transducer models "
log "----------------------------------------"
./zipformer/export-onnx-streaming.py \
--exp-dir $repo/ \
--tokens $repo/data/lang_bpe_2000/tokens.txt \
--causal 1 \
--avg 1 \
--epoch 99 \
--use-averaged-model 0 \
--chunk-size 16 \
--left-context-frames 128 \
--use-ctc 0 \
\
--num-encoder-layers 2,3,5,6,5,3 \
--feedforward-dim 1536,2048,3072,4096,3072,1536 \
--encoder-dim 512,768,1024,1536,1024,512 \
--encoder-unmasked-dim 192,192,256,320,256,192 \
--decoder-dim 768 --joiner-dim 768 \
--value-head-dim 18 \
--query-head-dim 48 \
--num-heads 4,4,4,8,4,4 \
\
--fp16 1 \
--use-whisper-features 1 \
--use-external-data 1
ls -lh *.onnx
ls -lh *.weights
mv encoder-epoch-99-avg-1-chunk-16-left-128.fp16.onnx encoder.fp16.onnx
mv encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx encoder.int8.onnx
mv $repo/decoder-epoch-99-avg-1-chunk-16-left-128.onnx decoder.onnx
mv $repo/decoder-epoch-99-avg-1-chunk-16-left-128.fp16.onnx decoder.fp16.onnx
mv $repo/joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx joiner.int8.onnx
mv $repo/joiner-epoch-99-avg-1-chunk-16-left-128.fp16.onnx joiner.fp16.onnx
name=(
sherpa-onnx-streaming-zipformer-zh-xlarge-int8-2025-06-30
sherpa-onnx-streaming-zipformer-zh-xlarge-fp16-2025-06-30
)
for n in ${name[@]}; do
url=https://huggingface.co/csukuangfj/$n
GIT_LFS_SKIP_SMUDGE=1 git clone $url
dst=$(basename $url)
if [[ $n == sherpa-onnx-streaming-zipformer-zh-xlarge-fp16-2025-06-30 ]]; then
cp -v encoder.fp16.onnx $dst
cp -v decoder.fp16.onnx $dst
cp -v joiner.fp16.onnx $dst
elif [[ $n == sherpa-onnx-streaming-zipformer-zh-xlarge-int8-2025-06-30 ]]; then
cp -v encoder.int8.onnx $dst
cp -v decoder.onnx $dst
cp -v joiner.int8.onnx $dst
fi
cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
cp -v $repo/data/lang_bpe_2000/bpe.model $dst
cp -v $repo/README.md $dst
mkdir -p $dst/test_wavs
cp -v ./test_wavs/*.wav $dst/test_wavs
cd $dst
git lfs track "*.onnx" "*.wav" "bpe.model"
ls -lh
git status
git add .
git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true
log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
rm -rf .git
rm -fv .gitattributes
cd ..
ls -lh $dst
tar cjfv $dst.tar.bz2 $dst
ls -lh *.tar.bz2
mv -v $dst.tar.bz2 ../../../
done
rm -fv *.onnx *.weights
}
# run_yuekai_large
# run_yuekai_xl
# run_2023_9_2
run_2023_11_05_streaming
# run_2023_12_12_streaming

.github/scripts/multi_zh-hans/ASR/run_rknn.sh vendored Executable file

@@ -0,0 +1,73 @@
#!/usr/bin/env bash
set -ex
python3 -m pip install kaldi-native-fbank soundfile librosa
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/multi_zh-hans/ASR
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12-chinese
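# Export the streaming zipformer transducer to ONNX, then convert it to RKNN
# for each supported Rockchip platform.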
function export_2023_11_05() {
d=exp
mkdir $d
pushd $d
curl -SL -O https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/data/lang_bpe_2000/tokens.txt
curl -SL -O https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/exp/pretrained.pt
mv pretrained.pt epoch-99.pt
curl -SL -o 0.wav https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/test_wavs/DEV_T0000000000.wav
curl -SL -o 1.wav https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/test_wavs/DEV_T0000000001.wav
curl -SL -o 2.wav https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/test_wavs/DEV_T0000000002.wav
ls -lh
popd
./zipformer/export-onnx-streaming.py \
--dynamic-batch 0 \
--enable-int8-quantization 0 \
--tokens $d/tokens.txt \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $d \
--use-ctc 0 \
--use-transducer 1 \
--chunk-size 32 \
--left-context-frames 128 \
--causal 1
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
dst=sherpa-onnx-$platform-streaming-zipformer-multi-zh-hans-2023-12-12
mkdir -p $dst
./zipformer/export_rknn_transducer_streaming.py \
--in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \
--in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \
--in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \
--out-encoder $dst/encoder.rknn \
--out-decoder $dst/decoder.rknn \
--out-joiner $dst/joiner.rknn \
--target-platform $platform
cp $d/tokens.txt $dst
mkdir $dst/test_wavs
cp $d/*.wav $dst/test_wavs
tar cjvf $dst.tar.bz2 $dst
ls -lh $dst.tar.bz2
mv $dst.tar.bz2 /icefall/
ls -lh $dst/
echo "---"
rm -rf $dst
done
}
export_2023_11_05

.github/scripts/wenetspeech/ASR/run_rknn.sh vendored Executable file

@@ -0,0 +1,196 @@
#!/usr/bin/env bash
set -ex
python3 -m pip install kaldi-native-fbank soundfile librosa
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/wenetspeech/ASR
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#k2-fsa-icefall-asr-zipformer-wenetspeech-streaming-small-chinese
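# Export the small streaming zipformer (note the smaller encoder dims below),
# then convert the ONNX models to RKNN for each supported Rockchip platform.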
function export_2025_03_02() {
d=exp_2025_03_02
mkdir $d
pushd $d
curl -SL -O https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/data/lang_char/tokens.txt
curl -SL -O https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/exp/pretrained.pt
mv pretrained.pt epoch-99.pt
curl -SL -o 0.wav https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/test_wavs/DEV_T0000000000.wav
curl -SL -o 1.wav https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/test_wavs/DEV_T0000000001.wav
curl -SL -o 2.wav https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/test_wavs/DEV_T0000000002.wav
ls -lh
popd
./zipformer/export-onnx-streaming.py \
--dynamic-batch 0 \
--enable-int8-quantization 0 \
--tokens $d/tokens.txt \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $d \
--use-ctc 0 \
--use-transducer 1 \
\
--num-encoder-layers 2,2,2,2,2,2 \
--feedforward-dim 512,768,768,768,768,768 \
--encoder-dim 192,256,256,256,256,256 \
--encoder-unmasked-dim 192,192,192,192,192,192 \
\
--chunk-size 32 \
--left-context-frames 128 \
--causal 1
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
dst=sherpa-onnx-$platform-streaming-zipformer-small-zh-2025-03-02
mkdir -p $dst
./zipformer/export_rknn_transducer_streaming.py \
--in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \
--in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \
--in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \
--out-encoder $dst/encoder.rknn \
--out-decoder $dst/decoder.rknn \
--out-joiner $dst/joiner.rknn \
--target-platform $platform
cp $d/tokens.txt $dst
mkdir $dst/test_wavs
cp $d/*.wav $dst/test_wavs
tar cjvf $dst.tar.bz2 $dst
ls -lh $dst.tar.bz2
mv $dst.tar.bz2 /icefall/
ls -lh $dst/
echo "---"
rm -rf $dst
done
rm -rf $d
}
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#k2-fsa-icefall-asr-zipformer-wenetspeech-streaming-large-chinese
function export_2025_03_03() {
d=exp_2025_03_03
mkdir $d
pushd $d
curl -SL -O https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/data/lang_char/tokens.txt
curl -SL -O https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/pretrained.pt
mv pretrained.pt epoch-99.pt
curl -SL -o 0.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000000.wav
curl -SL -o 1.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000001.wav
curl -SL -o 2.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000002.wav
ls -lh
popd
./zipformer/export-onnx-streaming.py \
--dynamic-batch 0 \
--enable-int8-quantization 0 \
--tokens $d/tokens.txt \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $d \
--use-ctc 0 \
--use-transducer 1 \
\
--chunk-size 32 \
--left-context-frames 128 \
--causal 1
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
dst=sherpa-onnx-$platform-streaming-zipformer-zh-2025-03-03
mkdir -p $dst
./zipformer/export_rknn_transducer_streaming.py \
--in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \
--in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \
--in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \
--out-encoder $dst/encoder.rknn \
--out-decoder $dst/decoder.rknn \
--out-joiner $dst/joiner.rknn \
--target-platform $platform
cp $d/tokens.txt $dst
mkdir $dst/test_wavs
cp $d/*.wav $dst/test_wavs
tar cjvf $dst.tar.bz2 $dst
ls -lh $dst.tar.bz2
mv $dst.tar.bz2 /icefall/
ls -lh $dst/
echo "---"
rm -rf $dst
done
rm -rf $d
}
function export_2023_06_15() {
d=exp_2023_06_15
mkdir $d
pushd $d
curl -SL -O https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/data/lang_char/tokens.txt
curl -SL -O https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/pretrained.pt
mv pretrained.pt epoch-99.pt
curl -SL -o 0.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000000.wav
curl -SL -o 1.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000001.wav
curl -SL -o 2.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000002.wav
ls -lh
popd
./zipformer/export-onnx-streaming.py \
--dynamic-batch 0 \
--enable-int8-quantization 0 \
--tokens $d/tokens.txt \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $d \
--use-ctc 0 \
--use-transducer 1 \
\
--chunk-size 32 \
--left-context-frames 128 \
--causal 1
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
dst=sherpa-onnx-$platform-streaming-zipformer-zh-2023-06-15
mkdir -p $dst
./zipformer/export_rknn_transducer_streaming.py \
--in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \
--in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \
--in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \
--out-encoder $dst/encoder.rknn \
--out-decoder $dst/decoder.rknn \
--out-joiner $dst/joiner.rknn \
--target-platform $platform
cp $d/tokens.txt $dst
mkdir $dst/test_wavs
cp $d/*.wav $dst/test_wavs
tar cjvf $dst.tar.bz2 $dst
ls -lh $dst.tar.bz2
mv $dst.tar.bz2 /icefall/
ls -lh $dst/
echo "---"
rm -rf $dst
done
}
export_2025_03_02
export_2025_03_03
export_2023_06_15

View File

@ -17,7 +17,7 @@ concurrency:
jobs:
generate_build_matrix:
if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event.label.name == 'ready' || github.event_name == 'push' || github.event_name == 'aishell')
if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
# see https://github.com/pytorch/pytorch/pull/50633
runs-on: ubuntu-latest
@ -31,8 +31,8 @@ jobs:
id: set-matrix
run: |
# outputting for debugging purposes
python ./.github/scripts/docker/generate_build_matrix.py
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
echo "::set-output name=matrix::${MATRIX}"
aishell:
needs: generate_build_matrix

View File

@ -30,8 +30,8 @@ jobs:
id: set-matrix
run: |
# outputting for debugging purposes
python ./.github/scripts/docker/generate_build_matrix.py
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
echo "::set-output name=matrix::${MATRIX}"
audioset:
@ -83,7 +83,7 @@ jobs:
ls -lh ./model-onnx/*
- name: Upload model to huggingface
if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
@ -116,7 +116,7 @@ jobs:
rm -rf huggingface
- name: Prepare for release
if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
shell: bash
run: |
d=sherpa-onnx-zipformer-audio-tagging-2024-04-09
@ -125,7 +125,7 @@ jobs:
ls -lh
- name: Release exported onnx models
if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
uses: svenstaro/upload-release-action@v2
with:
file_glob: true

152
.github/workflows/baker_zh.yml vendored Normal file
View File

@ -0,0 +1,152 @@
name: baker_zh
on:
push:
branches:
- master
- baker-matcha-2
pull_request:
branches:
- master
workflow_dispatch:
concurrency:
group: baker-zh-${{ github.ref }}
cancel-in-progress: true
jobs:
generate_build_matrix:
if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
# see https://github.com/pytorch/pytorch/pull/50633
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Generating build matrix
id: set-matrix
run: |
# outputting for debugging purposes
python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
echo "::set-output name=matrix::${MATRIX}"
baker_zh:
needs: generate_build_matrix
name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Free space
shell: bash
run: |
ls -lh
df -h
rm -rf /opt/hostedtoolcache
df -h
echo "pwd: $PWD"
echo "github.workspace ${{ github.workspace }}"
- name: Run tests
uses: addnab/docker-run-action@v3
with:
image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
options: |
--volume ${{ github.workspace }}/:/icefall
shell: bash
run: |
export PYTHONPATH=/icefall:$PYTHONPATH
cd /icefall
pip install onnx==1.17.0
pip list
git config --global --add safe.directory /icefall
.github/scripts/baker_zh/TTS/run-matcha.sh
- name: display files
shell: bash
run: |
ls -lh
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
with:
name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
path: ./*.wav
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
with:
name: step-2
path: ./model-steps-2.onnx
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
with:
name: step-3
path: ./model-steps-3.onnx
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
with:
name: step-4
path: ./model-steps-4.onnx
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
with:
name: step-5
path: ./model-steps-5.onnx
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
with:
name: step-6
path: ./model-steps-6.onnx
- name: Upload models to huggingface
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
shell: bash
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
d=matcha-icefall-zh-baker
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/$d hf
cp -av $d/* hf/
pushd hf
git add .
git config --global user.name "csukuangfj"
git config --global user.email "csukuangfj@gmail.com"
git config --global lfs.allowincompletepush true
git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$d main || true
popd
- name: Release exported onnx models
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
uses: svenstaro/upload-release-action@v2
with:
file_glob: true
overwrite: true
file: matcha-icefall-*.tar.bz2
repo_name: k2-fsa/sherpa-onnx
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
tag: tts-models

View File

@ -34,7 +34,7 @@ concurrency:
jobs:
build-doc:
if: github.event.label.name == 'doc' || github.event_name == 'push'
# if: github.event.label.name == 'doc' || github.event_name == 'push'
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
@ -43,7 +43,7 @@ jobs:
python-version: ["3.8"]
steps:
# refer to https://github.com/actions/checkout
- uses: actions/checkout@v2
- uses: actions/checkout@v4
with:
fetch-depth: 0

View File

@ -29,8 +29,9 @@ jobs:
id: set-matrix
run: |
# outputting for debugging purposes
python ./.github/scripts/docker/generate_build_matrix.py
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
# MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10" --min-torch-version "2.6.0")
echo "::set-output name=matrix::${MATRIX}"
librispeech:
needs: generate_build_matrix

View File

@ -30,8 +30,8 @@ jobs:
id: set-matrix
run: |
# outputting for debugging purposes
python ./.github/scripts/docker/generate_build_matrix.py
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
echo "::set-output name=matrix::${MATRIX}"
ljspeech:
@ -70,6 +70,10 @@ jobs:
cd /icefall
git config --global --add safe.directory /icefall
pip install "matplotlib<=3.9.4"
pip list
.github/scripts/ljspeech/TTS/run-matcha.sh
.github/scripts/ljspeech/TTS/run.sh
@ -79,13 +83,13 @@ jobs:
ls -lh
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
with:
name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
path: ./*.wav
- name: Release exported onnx models
if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
uses: svenstaro/upload-release-action@v2
with:
file_glob: true
@ -94,3 +98,69 @@ jobs:
repo_name: k2-fsa/sherpa-onnx
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
tag: tts-models
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
with:
name: step-2
path: ./model-steps-2.onnx
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
with:
name: step-3
path: ./model-steps-3.onnx
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
with:
name: step-4
path: ./model-steps-4.onnx
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
with:
name: step-5
path: ./model-steps-5.onnx
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
with:
name: step-6
path: ./model-steps-6.onnx
- name: Upload models to huggingface
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
shell: bash
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
d=matcha-icefall-en_US-ljspeech
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/$d hf
cp -av $d/* hf/
pushd hf
git lfs track "cmn_dict"
git lfs track "ru_dict"
git add .
git config --global user.name "csukuangfj"
git config --global user.email "csukuangfj@gmail.com"
git config --global lfs.allowincompletepush true
git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$d main || true
popd
- name: Release exported onnx models
if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
uses: svenstaro/upload-release-action@v2
with:
file_glob: true
overwrite: true
file: matcha-icefall-*.tar.bz2
repo_name: k2-fsa/sherpa-onnx
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
tag: tts-models

View File

@ -1,4 +1,4 @@
name: run-multi-zh-hans
name: multi-zh-hans
on:
push:
@ -8,65 +8,72 @@ on:
workflow_dispatch:
concurrency:
group: run-multi-zh-hans-${{ github.ref }}
group: multi-zh-hans-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: write
jobs:
generate_build_matrix:
if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
# see https://github.com/pytorch/pytorch/pull/50633
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Generating build matrix
id: set-matrix
run: |
# outputting for debugging purposes
python ./.github/scripts/docker/generate_build_matrix.py --torch-version "2.7.0" --python-version "3.11"
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --torch-version "2.7.0" --python-version "3.11")
echo "::set-output name=matrix::${MATRIX}"
multi-zh-hans:
runs-on: ${{ matrix.os }}
needs: generate_build_matrix
name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: [3.8]
${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
cache-dependency-path: '**/requirements-ci.txt'
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
uses: actions/cache@v2
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
- name: Free space
shell: bash
run: |
.github/scripts/install-kaldifeat.sh
df -h
rm -rf /opt/hostedtoolcache
df -h
echo "pwd: $PWD"
echo "github.workspace ${{ github.workspace }}"
- name: export-model
- name: Test with multi_zh-hans
uses: addnab/docker-run-action@v3
with:
image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
options: |
--volume ${{ github.workspace }}/:/icefall
shell: bash
run: |
export PYTHONPATH=/icefall:$PYTHONPATH
export HF_TOKEN=${{ secrets.HF_TOKEN }}
cd /icefall
git config --global --add safe.directory /icefall
.github/scripts/multi_zh-hans/ASR/run.sh
- name: Show models
shell: bash
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
.github/scripts/multi-zh-hans.sh
ls -lh
ls -lh *.tar.bz2
- name: upload model to https://github.com/k2-fsa/sherpa-onnx
uses: svenstaro/upload-release-action@v2

134
.github/workflows/rknn.yml vendored Normal file
View File

@ -0,0 +1,134 @@
name: rknn
on:
push:
branches:
- master
- rknn-zipformer2
pull_request:
branches:
- master
workflow_dispatch:
concurrency:
group: rknn-${{ github.ref }}
cancel-in-progress: true
jobs:
rknn:
name: RKNN ${{ matrix.recipe }} ${{ matrix.rknn_toolkit2_version }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.10"]
k2-version: ["1.24.4.dev20241029"]
kaldifeat-version: ["1.25.5.dev20241029"]
torch-version: ["2.0.0"]
torchaudio-version: ["2.0.1"]
version: ["20241218"]
# recipe: ["librispeech", "wenetspeech", "multi_zh-hans"]
recipe: ["librispeech"]
rknn_toolkit2_version: ["2.2.0", "2.1.0"]
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Export RKNN model
uses: addnab/docker-run-action@v3
with:
image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
options: |
--volume ${{ github.workspace }}/:/icefall
shell: bash
run: |
cat /etc/*release
lsb_release -a
uname -a
python3 --version
export PYTHONPATH=/icefall:$PYTHONPATH
cd /icefall
git config --global --add safe.directory /icefall
python3 -m torch.utils.collect_env
python3 -m k2.version
pip list
export rknn_toolkit2_version=${{ matrix.rknn_toolkit2_version }}
if [[ $rknn_toolkit2_version == "2.1.0" ]]; then
# for the folder pruned_transducer_stateless7_streaming
curl -SL -O https://huggingface.co/csukuangfj/rknn-toolkit2/resolve/main/rknn_toolkit2-2.1.0%2B708089d1-cp310-cp310-linux_x86_64.whl
else
# for the folder zipformer/
curl -SL -O https://huggingface.co/csukuangfj/rknn-toolkit2/resolve/main/rknn_toolkit2-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
fi
# Install rknn
pip install ./*.whl "numpy<=1.26.4"
pip list | grep rknn
echo "---"
pip list
echo "---"
recipe=${{ matrix.recipe }}
.github/scripts/$recipe/ASR/run_rknn.sh > log-$recipe.txt 2>&1 || true
- uses: actions/upload-artifact@v4
with:
name: log-${{ matrix.recipe }}-${{ matrix.rknn_toolkit2_version }}
path: ./log-*.txt
- name: Display results
shell: bash
run: |
ls -lh *rk*.tar.bz2 || true
- name: Release to GitHub
uses: svenstaro/upload-release-action@v2
with:
file_glob: true
overwrite: true
file: sherpa-onnx-*.tar.bz2
repo_name: k2-fsa/sherpa-onnx
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
tag: asr-models
- name: Upload model to huggingface
if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
rm -rf huggingface
export GIT_LFS_SKIP_SMUDGE=1
git clone https://huggingface.co/csukuangfj/sherpa-onnx-rknn-models huggingface
cd huggingface
git fetch
git pull
git merge -m "merge remote" --ff origin main
dst=streaming-asr
mkdir -p $dst
cp ../*rk*.tar.bz2 $dst/ || true
ls -lh $dst
git add .
git status
git commit -m "update models"
git status
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-rknn-models main || true
rm -rf huggingface

View File

@ -36,7 +36,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
python-version: [3.8]
python-version: ["3.10"]
fail-fast: false
steps:
@ -69,7 +69,7 @@ jobs:
working-directory: ${{github.workspace}}
run: |
black --check --diff .
- name: Run isort
shell: bash
working-directory: ${{github.workspace}}

View File

@ -30,8 +30,8 @@ jobs:
id: set-matrix
run: |
# outputting for debugging purposes
python ./.github/scripts/docker/generate_build_matrix.py
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
echo "::set-output name=matrix::${MATRIX}"
test:
needs: generate_build_matrix

View File

@ -30,8 +30,9 @@ jobs:
id: set-matrix
run: |
# outputting for debugging purposes
python ./.github/scripts/docker/generate_build_matrix.py
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
# MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10" --min-torch-version "2.5.0")
echo "::set-output name=matrix::${MATRIX}"
yesno:
needs: generate_build_matrix

View File

@ -383,3 +383,7 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
[vctk]: egs/vctk/TTS
[ljspeech]: egs/ljspeech/TTS
[libritts_tts]: egs/libritts/TTS
## Acknowledgements
Some contributors to this project were supported by Xiaomi Corporation. Others were supported by National Science Foundation CCRI award 2120435. This is not an exhaustive list of sources of support.

View File

@ -41,7 +41,7 @@ To give you an idea of what ``tdnn/exp/pretrained.pt`` contains, we can use the
.. code-block:: python3
>>> import torch
>>> m = torch.load("tdnn/exp/pretrained.pt")
>>> m = torch.load("tdnn/exp/pretrained.pt", weights_only=False)
>>> list(m.keys())
['model']
>>> list(m["model"].keys())

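For context: PyTorch 2.6 flipped the default of `torch.load`'s `weights_only` argument from False to True, and icefall checkpoints and k2 graphs pickle objects outside the `weights_only` allowlist, hence the explicit `weights_only=False` added throughout these files. A minimal illustration of the pattern, using the `tdnn/exp/pretrained.pt` checkpoint shown above:

```python
import torch

# An icefall checkpoint is a dict holding the model state_dict alongside
# other training state, so we must opt out of the safe-loading default.
ckpt = torch.load("tdnn/exp/pretrained.pt", map_location="cpu", weights_only=False)
state_dict = ckpt["model"]  # plain parameter tensors
```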
View File

@ -28,7 +28,7 @@ consisting of words and tokens (i.e., phones) and does the following:
4. Generate L.pt, in k2 format. It can be loaded by
d = torch.load("L.pt")
d = torch.load("L.pt", weights_only=False)
lexicon = k2.Fsa.from_dict(d)
5. Generate L_disambig.pt, in k2 format.

View File

@ -224,7 +224,7 @@ def main():
logging.info("Creating model")
model = get_transducer_model(params)
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()

View File

@ -79,7 +79,13 @@ from icefall.checkpoint import save_checkpoint_with_global_batch_idx
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
AttributeDict,
MetricsTracker,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -638,7 +644,7 @@ def train_one_epoch(
params.batch_idx_train += 1
batch_size = len(batch["supervisions"]["text"])
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -912,7 +918,7 @@ def scan_pessimistic_batches_for_oom(
# warmup = 0.0 is so that the derivs for the pruned loss stay zero
# (i.e. are not remembered by the decaying-average in adam), because
# we want to avoid these params being subject to shrinkage in adam.
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,

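For context: `torch.cuda.amp.autocast` is deprecated in recent PyTorch in favor of the device-generic `torch.amp.autocast`, which is presumably why these training scripts switch to a `torch_autocast` helper imported from `icefall.utils`. A minimal sketch of such a compatibility shim (the real implementation lives in icefall/utils.py and may differ):

```python
from contextlib import contextmanager

import torch


@contextmanager
def torch_autocast(device_type: str = "cuda", **kwargs):
    """Dispatch to whichever autocast API this PyTorch version provides.

    Newer PyTorch exposes torch.amp.autocast(device_type, ...); older
    versions only have torch.cuda.amp.autocast(...).
    """
    if hasattr(torch, "amp") and hasattr(torch.amp, "autocast"):
        with torch.amp.autocast(device_type=device_type, **kwargs):
            yield
    else:
        with torch.cuda.amp.autocast(**kwargs):
            yield
```

With a shim like this, `with torch_autocast(enabled=params.use_fp16):` behaves identically across PyTorch versions, so the call sites below need no version checks of their own.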
View File

@ -1,5 +1,63 @@
## Results
### Aishell training results (zipformer + CR-CTC)
See <https://github.com/k2-fsa/icefall/pull/1980> for more details.
[zipformer](./zipformer)
#### Non-streaming
##### medium-scale model, number of model parameters: 66218471, i.e., 66.2 M
| decoding method | test | dev | comment |
|--------------------------------------|------------|------------|---------------------|
| ctc-greedy-search | 3.98 | 3.69 | --epoch 60 --avg 28 |
| ctc-prefix-beam-search | 3.98 | 3.70 | --epoch 60 --avg 21 |
The training command using 2 32G-V100 GPUs is:
```bash
export CUDA_VISIBLE_DEVICES="0,1"
./zipformer/train.py \
--world-size 2 \
--num-epochs 60 \
--start-epoch 1 \
--use-fp16 1 \
--context-size 1 \
--enable-musan 0 \
--exp-dir zipformer/exp \
--max-duration 500 \
--base-lr 0.045 \
--lr-batches 7500 \
--lr-epochs 18 \
--spec-aug-time-warp-factor 20 \
--use-ctc 1 \
--use-cr-ctc 1 \
--use-transducer 0 \
--enable-spec-aug 0 \
--cr-loss-scale 0.2
```
The decoding command is:
```bash
export CUDA_VISIBLE_DEVICES="0"
for m in ctc-greedy-search ctc-prefix-beam-search; do
./zipformer/ctc_decode.py \
--epoch 60 \
--avg 28 \
--exp-dir zipformer/exp \
--use-cr-ctc 1 \
--use-ctc 1 \
--use-transducer 0 \
--max-duration 600 \
--decoding-method $m
done
```
Pretrained models, training logs, decoding logs, tensorboard and decoding results
are available at
<https://huggingface.co/MistMoon/icefall-asr-aishell-zipformer-medium-cr-ctc-20250702>
### Aishell training results (Fine-tuning Pretrained Models)
#### Whisper
[./whisper](./whisper)

View File

@ -503,7 +503,7 @@ def main():
else:
H = None
HLG = k2.Fsa.from_dict(
torch.load(f"{params.lang_dir}/HLG.pt", map_location=device)
torch.load(f"{params.lang_dir}/HLG.pt", map_location=device, weights_only=False)
)
assert HLG.requires_grad is False

View File

@ -249,7 +249,7 @@ def main():
use_feat_batchnorm=params.use_feat_batchnorm,
)
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()
@ -315,7 +315,7 @@ def main():
hyps = [[token_sym_table[i] for i in ids] for ids in token_ids]
elif params.method in ["1best", "attention-decoder"]:
logging.info(f"Loading HLG from {params.HLG}")
HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu"))
HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu", weights_only=False))
HLG = HLG.to(device)
if not hasattr(HLG, "lm_scores"):
# For whole-lattice-rescoring and attention-decoder

View File

@ -516,7 +516,7 @@ def main():
else:
H = None
HLG = k2.Fsa.from_dict(
torch.load(f"{params.lang_dir}/HLG.pt", map_location=device)
torch.load(f"{params.lang_dir}/HLG.pt", map_location=device, weights_only=False)
)
assert HLG.requires_grad is False

View File

@ -28,7 +28,7 @@ consisting of words and tokens (i.e., phones) and does the following:
4. Generate L.pt, in k2 format. It can be loaded by
d = torch.load("L.pt")
d = torch.load("L.pt", weights_only=False)
lexicon = k2.Fsa.from_dict(d)
5. Generate L_disambig.pt, in k2 format.

View File

@ -227,7 +227,7 @@ def main():
logging.info("About to create model")
model = get_transducer_model(params)
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()

View File

@ -72,7 +72,13 @@ from icefall.checkpoint import save_checkpoint_with_global_batch_idx
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
AttributeDict,
MetricsTracker,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -688,7 +694,7 @@ def train_one_epoch(
batch_size = len(batch["supervisions"]["text"])
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -989,7 +995,7 @@ def scan_pessimistic_batches_for_oom(
# warmup = 0.0 is so that the derivs for the pruned loss stay zero
# (i.e. are not remembered by the decaying-average in adam), because
# we want to avoid these params being subject to shrinkage in adam.
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,

View File

@ -23,7 +23,7 @@ import torch.nn as nn
from encoder_interface import EncoderInterface
from scaling import ScaledLinear
from icefall.utils import add_sos
from icefall.utils import add_sos, torch_autocast
class Transducer(nn.Module):
@ -184,7 +184,7 @@ class Transducer(nn.Module):
lm = simple_lm_proj(decoder_out)
am = simple_am_proj(encoder_out)
with torch.cuda.amp.autocast(enabled=False):
with torch_autocast(enabled=False):
simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
lm=lm.float(),
am=am.float(),
@ -219,7 +219,7 @@ class Transducer(nn.Module):
# prior to do_rnnt_pruning (this is an optimization for speed).
logits = joiner(am_pruned, lm_pruned, project_input=False)
with torch.cuda.amp.autocast(enabled=False):
with torch_autocast(enabled=False):
pruned_loss = k2.rnnt_loss_pruned(
logits=logits.float(),
symbols=y_padded,

View File

@ -228,7 +228,7 @@ def main():
logging.info("About to create model")
model = get_transducer_model(params)
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()

View File

@ -94,7 +94,13 @@ from icefall.checkpoint import (
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
AttributeDict,
MetricsTracker,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -797,7 +803,7 @@ def train_one_epoch(
aishell = is_aishell(batch["supervisions"]["cut"][0])
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -1202,7 +1208,7 @@ def scan_pessimistic_batches_for_oom(
# warmup = 0.0 is so that the derivs for the pruned loss stay zero
# (i.e. are not remembered by the decaying-average in adam), because
# we want to avoid these params being subject to shrinkage in adam.
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,

View File

@ -94,6 +94,7 @@ from icefall.utils import (
filter_uneven_sized_batch,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -809,7 +810,7 @@ def train_one_epoch(
batch_size = len(batch["supervisions"]["text"])
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -1206,7 +1207,7 @@ def scan_pessimistic_batches_for_oom(
for criterion, cuts in batches.items():
batch = train_dl.dataset[cuts]
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,

View File

@ -773,7 +773,7 @@ def main():
lg_filename = params.lang_dir / "LG.pt"
logging.info(f"Loading {lg_filename}")
decoding_graph = k2.Fsa.from_dict(
torch.load(lg_filename, map_location=device)
torch.load(lg_filename, map_location=device, weights_only=False)
)
decoding_graph.scores *= params.ngram_lm_scale
else:

View File

@ -237,7 +237,7 @@ def main():
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()

View File

@ -87,6 +87,7 @@ from icefall.utils import (
setup_logger,
str2bool,
tokenize_by_CJK_char,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -802,7 +803,7 @@ def train_one_epoch(
batch_size = len(batch["supervisions"]["text"])
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -1202,7 +1203,7 @@ def scan_pessimistic_batches_for_oom(
for criterion, cuts in batches.items():
batch = train_dl.dataset[cuts]
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,

View File

@ -81,7 +81,13 @@ from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
AttributeDict,
MetricsTracker,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -812,7 +818,7 @@ def train_one_epoch(
batch_size = len(batch["supervisions"]["text"])
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -1202,7 +1208,7 @@ def scan_pessimistic_batches_for_oom(
for criterion, cuts in batches.items():
batch = train_dl.dataset[cuts]
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,

View File

@ -92,7 +92,7 @@ class AishellAsrDataModule:
group.add_argument(
"--num-buckets",
type=int,
default=30,
default=15,
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
@ -275,8 +275,7 @@ class AishellAsrDataModule:
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
buffer_size=self.args.num_buckets * 2000,
shuffle_buffer_size=self.args.num_buckets * 5000,
buffer_size=self.args.num_buckets * 5000,
drop_last=self.args.drop_last,
)
else:

View File

@ -337,7 +337,7 @@ def main():
logging.info(f"device: {device}")
HLG = k2.Fsa.from_dict(torch.load(f"{params.lang_dir}/HLG.pt", map_location="cpu"))
HLG = k2.Fsa.from_dict(torch.load(f"{params.lang_dir}/HLG.pt", map_location="cpu", weights_only=False))
HLG = HLG.to(device)
assert HLG.requires_grad is False

View File

@ -139,13 +139,13 @@ def main():
subsampling_factor=params.subsampling_factor,
)
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"])
model.to(device)
model.eval()
logging.info(f"Loading HLG from {params.HLG}")
HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu"))
HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu", weights_only=False))
HLG = HLG.to(device)
if not hasattr(HLG, "lm_scores"):
# For whole-lattice-rescoring and attention-decoder

View File

@ -245,7 +245,7 @@ def main():
logging.info("Creating model")
model = get_transducer_model(params)
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()

View File

@ -225,7 +225,7 @@ def main():
logging.info("About to create model")
model = get_transducer_model(params)
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()

View File

@ -225,7 +225,7 @@ def main():
logging.info("About to create model")
model = get_transducer_model(params)
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"])
model.to(device)
model.eval()

View File

@ -89,10 +89,10 @@ def average_checkpoints(
"""
n = len(filenames)
if "model" in torch.load(filenames[0], map_location=device):
avg = torch.load(filenames[0], map_location=device)["model"]
if "model" in torch.load(filenames[0], map_location=device, weights_only=False):
avg = torch.load(filenames[0], map_location=device, weights_only=False)["model"]
else:
avg = torch.load(filenames[0], map_location=device)
avg = torch.load(filenames[0], map_location=device, weights_only=False)
# Identify shared parameters. Two parameters are said to be shared
# if they have the same data_ptr
@ -107,10 +107,10 @@ def average_checkpoints(
uniqued_names = list(uniqued.values())
for i in range(1, n):
if "model" in torch.load(filenames[i], map_location=device):
state_dict = torch.load(filenames[i], map_location=device)["model"]
if "model" in torch.load(filenames[i], map_location=device, weights_only=False):
state_dict = torch.load(filenames[i], map_location=device, weights_only=False)["model"]
else:
state_dict = torch.load(filenames[i], map_location=device)
state_dict = torch.load(filenames[i], map_location=device, weights_only=False)
for k in uniqued_names:
avg[k] += state_dict[k]
@ -440,7 +440,7 @@ def main():
start = params.epoch - params.avg
assert start >= 1, start
checkpoint = torch.load(
f"{params.exp_dir}/epoch-{params.epoch}.pt", map_location="cpu"
f"{params.exp_dir}/epoch-{params.epoch}.pt", map_location="cpu", weights_only=False
)
if "model" not in checkpoint:
# deepspeed converted checkpoint only contains model state_dict
@ -469,7 +469,7 @@ def main():
torch.save(model.state_dict(), filename)
else:
checkpoint = torch.load(
f"{params.exp_dir}/epoch-{params.epoch}.pt", map_location="cpu"
f"{params.exp_dir}/epoch-{params.epoch}.pt", map_location="cpu", weights_only=False
)
if "model" not in checkpoint:
model.load_state_dict(checkpoint, strict=True)

View File

@ -81,6 +81,7 @@ from icefall.utils import (
filter_uneven_sized_batch,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -514,7 +515,7 @@ def compute_validation_loss(
tot_loss = MetricsTracker()
for batch_idx, batch in enumerate(valid_dl):
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
tokenizer=tokenizer,
@ -608,7 +609,7 @@ def train_one_epoch(
)
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
tokenizer=tokenizer,

View File

@ -0,0 +1,540 @@
#!/usr/bin/env python3
#
# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang,
# Liyong Guo,
# Quandong Wang,
# Zengwei Yao,
# Zhifeng Han,)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
(1) ctc-greedy-search (with cr-ctc)
./zipformer/ctc_decode.py \
--epoch 60 \
--avg 28 \
--exp-dir ./zipformer/exp \
--use-cr-ctc 1 \
--use-ctc 1 \
--use-transducer 0 \
--max-duration 600 \
--decoding-method ctc-greedy-search
(2) ctc-prefix-beam-search (with cr-ctc)
./zipformer/ctc_decode.py \
--epoch 60 \
--avg 21 \
--exp-dir zipformer/exp \
--use-cr-ctc 1 \
--use-ctc 1 \
--use-transducer 0 \
--max-duration 600 \
--decoding-method ctc-prefix-beam-search
"""
import argparse
import logging
import math
import os
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import k2
import torch
import torch.nn as nn
from asr_datamodule import AishellAsrDataModule
from lhotse.cut import Cut
from train import add_model_arguments, get_model, get_params
from icefall.checkpoint import (
average_checkpoints,
average_checkpoints_with_averaged_model,
find_checkpoints,
load_checkpoint,
)
from icefall.decode import (
ctc_greedy_search,
ctc_prefix_beam_search,
)
from icefall.lexicon import Lexicon
from icefall.utils import (
AttributeDict,
make_pad_mask,
setup_logger,
store_transcripts,
str2bool,
write_error_stats,
)
LOG_EPS = math.log(1e-10)
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--epoch",
type=int,
default=30,
help="""It specifies the checkpoint to use for decoding.
Note: Epoch counts from 1.
You can specify --avg to use more checkpoints for model averaging.""",
)
parser.add_argument(
"--iter",
type=int,
default=0,
help="""If positive, --epoch is ignored and it
will use the checkpoint exp_dir/checkpoint-iter.pt.
You can specify --avg to use more checkpoints for model averaging.
""",
)
parser.add_argument(
"--avg",
type=int,
default=15,
help="Number of checkpoints to average. Automatically select "
"consecutive checkpoints before the checkpoint specified by "
"'--epoch' and '--iter'",
)
parser.add_argument(
"--use-averaged-model",
type=str2bool,
default=True,
help="Whether to load averaged model. Currently it only supports "
"using --epoch. If True, it would decode with the averaged model "
"over the epoch range from `epoch-avg` (excluded) to `epoch`."
"Actually only the models with epoch number of `epoch-avg` and "
"`epoch` are loaded for averaging. ",
)
parser.add_argument(
"--exp-dir",
type=str,
default="zipformer/exp",
help="The experiment dir",
)
parser.add_argument(
"--lang-dir",
type=Path,
default="data/lang_char",
help="The lang dir containing word table and LG graph",
)
parser.add_argument(
"--decoding-method",
type=str,
default="ctc-greedy-search",
help="""Decoding method.
Supported values are:
- (1) ctc-greedy-search. Use CTC greedy search. Decoded token IDs are
mapped back to characters via the token table in lang_dir.
It needs no n-gram LM.
- (2) ctc-prefix-beam-search. Extract n paths with the given beam; the
best of the n paths is the decoding result.
""",
)
add_model_arguments(parser)
return parser
def get_decoding_params() -> AttributeDict:
"""Parameters for decoding."""
params = AttributeDict(
{
"beam": 4, # for prefix-beam-search
}
)
return params
def decode_one_batch(
params: AttributeDict,
model: nn.Module,
lexicon: Lexicon,
batch: dict,
) -> Dict[str, Tuple[List[List[str]], List[List[Tuple[float, float]]]]]:
"""Decode one batch and return the result in a dict. The dict has the
following format:
- key: It indicates the setting used for decoding, e.g.,
"ctc-greedy-search" or "ctc-prefix-beam-search".
- value: It contains the decoding result. `len(value)` equals the
batch size. `value[i]` is the decoding result for the i-th
utterance in the given batch.
Args:
params:
It's the return value of :func:`get_params`.
model:
The neural model.
batch:
It is the return value from iterating
`lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
for the format of the `batch`.
Returns:
Return the decoding result. See above description for the format of
the returned dict.
"""
device = next(model.parameters()).device
feature = batch["inputs"]
assert feature.ndim == 3
feature = feature.to(device)
# at entry, feature is (N, T, C)
supervisions = batch["supervisions"]
feature_lens = supervisions["num_frames"].to(device)
if params.causal:
# this seems to cause insertions at the end of the utterance if used with zipformer.
pad_len = 30
feature_lens += pad_len
feature = torch.nn.functional.pad(
feature,
pad=(0, 0, 0, pad_len),
value=LOG_EPS,
)
x, x_lens = model.encoder_embed(feature, feature_lens)
src_key_padding_mask = make_pad_mask(x_lens)
x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C)
encoder_out, encoder_out_lens = model.encoder(x, x_lens, src_key_padding_mask)
encoder_out = encoder_out.permute(1, 0, 2) # (T, N, C) -> (N, T, C)
ctc_output = model.ctc_output(encoder_out) # (N, T, C)
hyp_tokens = []
hyps = []
if params.decoding_method == "ctc-greedy-search":
hyp_tokens = ctc_greedy_search(
ctc_output=ctc_output,
encoder_out_lens=encoder_out_lens,
)
elif params.decoding_method == "ctc-prefix-beam-search":
hyp_tokens = ctc_prefix_beam_search(
ctc_output=ctc_output,
encoder_out_lens=encoder_out_lens,
)
else:
raise ValueError(
f"Unsupported decoding method: {params.decoding_method}"
)
for i in range(encoder_out.size(0)):
hyps.append([lexicon.token_table[idx] for idx in hyp_tokens[i]])
if params.decoding_method == "ctc-greedy-search":
return {"ctc-greedy-search" : hyps}
elif params.decoding_method == "ctc-prefix-beam-search":
return {"ctc-prefix-beam-search" : hyps}
else:
assert False, f"Unsupported decoding method: {params.decoding_method}"
def decode_dataset(
dl: torch.utils.data.DataLoader,
params: AttributeDict,
model: nn.Module,
lexicon: Lexicon,
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
dl:
PyTorch's dataloader containing the dataset to decode.
params:
It is returned by :func:`get_params`.
model:
The neural model.
Returns:
Return a dict whose key is the decoding method, e.g.,
"ctc-greedy-search" or "ctc-prefix-beam-search".
Its value is a list of tuples, each containing
the cut id, the reference transcript, and the predicted result.
"""
num_cuts = 0
try:
num_batches = len(dl)
except TypeError:
num_batches = "?"
log_interval = 20
results = defaultdict(list)
for batch_idx, batch in enumerate(dl):
texts = batch["supervisions"]["text"]
texts = [list("".join(text.split())) for text in texts]
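# Remove spaces and split into characters for character-level scoring,
# e.g. "你好 世界" -> ["你", "好", "世", "界"].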
cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
hyps_dict = decode_one_batch(
params=params,
model=model,
lexicon=lexicon,
batch=batch,
)
for name, hyps in hyps_dict.items():
this_batch = []
assert len(hyps) == len(texts)
for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
this_batch.append((cut_id, ref_text, hyp_words))
results[name].extend(this_batch)
num_cuts += len(texts)
if batch_idx % log_interval == 0:
batch_str = f"{batch_idx}/{num_batches}"
logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
return results
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():
recog_path = (
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
)
results = sorted(results)
store_transcripts(filename=recog_path, texts=results, char_level=True)
logging.info(f"The transcripts are stored in {recog_path}")
# The following prints out CERs (compute_CER=True below), per-character
# error statistics, and aligned ref/hyp pairs.
errs_filename = (
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
)
with open(errs_filename, "w") as f:
wer = write_error_stats(
f,
f"{test_set_name}-{key}",
results,
enable_log=True,
compute_CER=True,
)
test_set_wers[key] = wer
logging.info("Wrote detailed error stats to {}".format(errs_filename))
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
errs_info = (
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
)
with open(errs_info, "w") as f:
print("settings\tWER", file=f)
for key, val in test_set_wers:
print("{}\t{}".format(key, val), file=f)
s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
note = "\tbest for {}".format(test_set_name)
for key, val in test_set_wers:
s += "{}\t{}{}\n".format(key, val, note)
note = ""
logging.info(s)
@torch.no_grad()
def main():
parser = get_parser()
AishellAsrDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
args.lang_dir = Path(args.lang_dir)
params = get_params()
# add decoding params
params.update(get_decoding_params())
params.update(vars(args))
assert params.decoding_method in (
"ctc-greedy-search",
"ctc-prefix-beam-search",
) # support ctc-greedy-search and ctc-prefix-beam-search
params.res_dir = params.exp_dir / params.decoding_method
if params.iter > 0:
params.suffix = f"iter-{params.iter}-avg-{params.avg}"
else:
params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
if params.causal:
assert (
"," not in params.chunk_size
), "chunk_size should be one value in decoding."
assert (
"," not in params.left_context_frames
), "left_context_frames should be one value in decoding."
params.suffix += f"-chunk-{params.chunk_size}"
params.suffix += f"-left-context-{params.left_context_frames}"
if "prefix-beam-search" in params.decoding_method:
params.suffix += f"_beam-{params.beam}"
if params.use_averaged_model:
params.suffix += "-use-averaged-model"
setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
logging.info("Decoding started")
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
params.device = device
logging.info(f"Device: {device}")
lexicon = Lexicon(params.lang_dir)
params.blank_id = lexicon.token_table["<blk>"]
params.vocab_size = max(lexicon.tokens) + 1
logging.info(params)
logging.info("About to create model")
model = get_model(params)
if not params.use_averaged_model:
if params.iter > 0:
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
: params.avg
]
if len(filenames) == 0:
raise ValueError(
f"No checkpoints found for"
f" --iter {params.iter}, --avg {params.avg}"
)
elif len(filenames) < params.avg:
raise ValueError(
f"Not enough checkpoints ({len(filenames)}) found for"
f" --iter {params.iter}, --avg {params.avg}"
)
logging.info(f"averaging {filenames}")
model.to(device)
model.load_state_dict(average_checkpoints(filenames, device=device))
elif params.avg == 1:
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
else:
start = params.epoch - params.avg + 1
filenames = []
for i in range(start, params.epoch + 1):
if i >= 1:
filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
logging.info(f"averaging {filenames}")
model.to(device)
model.load_state_dict(average_checkpoints(filenames, device=device))
else:
if params.iter > 0:
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
: params.avg + 1
]
if len(filenames) == 0:
raise ValueError(
f"No checkpoints found for"
f" --iter {params.iter}, --avg {params.avg}"
)
elif len(filenames) < params.avg + 1:
raise ValueError(
f"Not enough checkpoints ({len(filenames)}) found for"
f" --iter {params.iter}, --avg {params.avg}"
)
filename_start = filenames[-1]
filename_end = filenames[0]
logging.info(
"Calculating the averaged model over iteration checkpoints"
f" from {filename_start} (excluded) to {filename_end}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
else:
assert params.avg > 0, params.avg
start = params.epoch - params.avg
assert start >= 1, start
filename_start = f"{params.exp_dir}/epoch-{start}.pt"
filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
logging.info(
f"Calculating the averaged model over epoch range from "
f"{start} (excluded) to {params.epoch}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
model.to(device)
model.eval()
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
# we need cut ids to display recognition results.
args.return_cuts = True
aishell = AishellAsrDataModule(args)
dev_cuts = aishell.valid_cuts()
dev_dl = aishell.valid_dataloaders(dev_cuts)
test_cuts = aishell.test_cuts()
test_dl = aishell.test_dataloaders(test_cuts)
test_sets = ["dev", "test"]
test_dls = [dev_dl, test_dl]
for test_set, test_dl in zip(test_sets, test_dls):
results_dict = decode_dataset(
dl=test_dl,
params=params,
model=model,
lexicon=lexicon,
)
save_results(
params=params,
test_set_name=test_set,
results_dict=results_dict,
)
logging.info("Done!")
if __name__ == "__main__":
main()

View File

@ -761,7 +761,7 @@ def main():
lg_filename = params.lang_dir / "LG.pt"
logging.info(f"Loading {lg_filename}")
decoding_graph = k2.Fsa.from_dict(
torch.load(lg_filename, map_location=device)
torch.load(lg_filename, map_location=device, weights_only=False)
)
decoding_graph.scores *= params.ngram_lm_scale
else:

View File

@ -783,7 +783,7 @@ def main():
lg_filename = params.lang_dir / "LG.pt"
logging.info(f"Loading {lg_filename}")
decoding_graph = k2.Fsa.from_dict(
torch.load(lg_filename, map_location=device)
torch.load(lg_filename, map_location=device, weights_only=False)
)
decoding_graph.scores *= params.ngram_lm_scale
else:

View File

@ -298,7 +298,7 @@ def main():
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()

View File

@ -64,6 +64,7 @@ from asr_datamodule import AishellAsrDataModule
from decoder import Decoder
from joiner import Joiner
from lhotse.cut import Cut
from lhotse.dataset import SpecAugment
from lhotse.dataset.sampling.base import CutSampler
from lhotse.utils import fix_random_seed
from model import AsrModel
@ -95,6 +96,7 @@ from icefall.utils import (
get_parameter_groups_with_lrs,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -239,6 +241,27 @@ def add_model_arguments(parser: argparse.ArgumentParser):
chunk left-context frames will be chosen randomly from this list; else not relevant.""",
)
parser.add_argument(
"--use-transducer",
type=str2bool,
default=True,
help="If True, use Transducer head.",
)
parser.add_argument(
"--use-ctc",
type=str2bool,
default=False,
help="If True, use CTC head.",
)
parser.add_argument(
"--use-cr-ctc",
type=str2bool,
default=False,
help="If True, use consistency-regularized CTC.",
)
def get_parser():
parser = argparse.ArgumentParser(
@ -379,6 +402,27 @@ def get_parser():
with this parameter before adding to the final loss.""",
)
parser.add_argument(
"--ctc-loss-scale",
type=float,
default=0.2,
help="Scale for CTC loss.",
)
parser.add_argument(
"--cr-loss-scale",
type=float,
default=0.2,
help="Scale for consistency-regularization loss.",
)
parser.add_argument(
"--time-mask-ratio",
type=float,
default=2.5,
help="When using cr-ctc, we increase the amount of time-masking in SpecAugment.",
)
parser.add_argument(
"--seed",
type=int,
@ -582,8 +626,13 @@ def get_joiner_model(params: AttributeDict) -> nn.Module:
def get_model(params: AttributeDict) -> nn.Module:
encoder_embed = get_encoder_embed(params)
encoder = get_encoder_model(params)
decoder = get_decoder_model(params)
joiner = get_joiner_model(params)
if params.use_transducer:
decoder = get_decoder_model(params)
joiner = get_joiner_model(params)
else:
decoder = None
joiner = None
model = AsrModel(
encoder_embed=encoder_embed,
@ -593,9 +642,27 @@ def get_model(params: AttributeDict) -> nn.Module:
encoder_dim=int(max(params.encoder_dim.split(","))),
decoder_dim=params.decoder_dim,
vocab_size=params.vocab_size,
use_transducer=params.use_transducer,
use_ctc=params.use_ctc,
)
return model
def get_spec_augment(params: AttributeDict) -> SpecAugment:
num_frame_masks = int(10 * params.time_mask_ratio)
max_frames_mask_fraction = 0.15 * params.time_mask_ratio
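# With the default --time-mask-ratio 2.5 this yields num_frame_masks = 25
# and max_frames_mask_fraction = 0.375 (SpecAugment defaults: 10 and 0.15).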
logging.info(
f"num_frame_masks: {num_frame_masks}, "
f"max_frames_mask_fraction: {max_frames_mask_fraction}"
)
spec_augment = SpecAugment(
time_warp_factor=0, # Do time warping in model.py
num_frame_masks=num_frame_masks, # default: 10
features_mask_size=27,
num_feature_masks=2,
frames_mask_size=100,
max_frames_mask_fraction=max_frames_mask_fraction, # default: 0.15
)
return spec_augment
def load_checkpoint_if_available(
params: AttributeDict,
@ -722,6 +789,7 @@ def compute_loss(
graph_compiler: CharCtcTrainingGraphCompiler,
batch: dict,
is_training: bool,
spec_augment: Optional[SpecAugment] = None,
) -> Tuple[Tensor, MetricsTracker]:
"""
Compute CTC loss given the model and its inputs.
@ -738,6 +806,8 @@ def compute_loss(
True for training. False for validation. When it is True, this
function enables autograd during computation; when it is False, it
disables autograd.
spec_augment:
The SpecAugment instance used only when use_cr_ctc is True.
warmup: a floating point value which increases throughout training;
values >= 1.0 are fully warmed up and have all modules present.
"""
@ -757,6 +827,21 @@ def compute_loss(
y = graph_compiler.texts_to_ids(texts)
y = k2.RaggedTensor(y).to(device)
use_cr_ctc = params.use_cr_ctc
use_spec_aug = use_cr_ctc and is_training
if use_spec_aug:
supervision_intervals = batch["supervisions"]
supervision_segments = torch.stack(
[
supervision_intervals["sequence_idx"],
supervision_intervals["start_frame"],
supervision_intervals["num_frames"],
],
dim=1,
) # shape: (S, 3)
else:
supervision_segments = None
with torch.set_grad_enabled(is_training):
losses = model(
x=feature,
@ -765,25 +850,40 @@ def compute_loss(
prune_range=params.prune_range,
am_scale=params.am_scale,
lm_scale=params.lm_scale,
use_cr_ctc=use_cr_ctc,
use_spec_aug=use_spec_aug,
spec_augment=spec_augment,
supervision_segments=supervision_segments,
time_warp_factor=params.spec_aug_time_warp_factor,
)
simple_loss, pruned_loss = losses[:2]
if params.use_ctc:
simple_loss, pruned_loss, ctc_loss, _, cr_loss = losses[:5]
else:
simple_loss, pruned_loss = losses[:2]
s = params.simple_loss_scale
# take down the scale on the simple loss from 1.0 at the start
# to params.simple_loss_scale by warm_step.
simple_loss_scale = (
s
if batch_idx_train >= warm_step
else 1.0 - (batch_idx_train / warm_step) * (1.0 - s)
)
pruned_loss_scale = (
1.0
if batch_idx_train >= warm_step
else 0.1 + 0.9 * (batch_idx_train / warm_step)
)
loss = 0.0
loss = simple_loss_scale * simple_loss + pruned_loss_scale * pruned_loss
if params.use_transducer:
s = params.simple_loss_scale
# take down the scale on the simple loss from 1.0 at the start
# to params.simple_loss_scale by warm_step.
simple_loss_scale = (
s
if batch_idx_train >= warm_step
else 1.0 - (batch_idx_train / warm_step) * (1.0 - s)
)
pruned_loss_scale = (
1.0
if batch_idx_train >= warm_step
else 0.1 + 0.9 * (batch_idx_train / warm_step)
)
loss += simple_loss_scale * simple_loss + pruned_loss_scale * pruned_loss
if params.use_ctc:
loss += params.ctc_loss_scale * ctc_loss
if use_cr_ctc:
loss += params.cr_loss_scale * cr_loss
assert loss.requires_grad == is_training
info = MetricsTracker()
@ -793,8 +893,13 @@ def compute_loss(
# Note: We use reduction=sum while computing the loss.
info["loss"] = loss.detach().cpu().item()
info["simple_loss"] = simple_loss.detach().cpu().item()
info["pruned_loss"] = pruned_loss.detach().cpu().item()
if params.use_transducer:
info["simple_loss"] = simple_loss.detach().cpu().item()
info["pruned_loss"] = pruned_loss.detach().cpu().item()
if params.use_ctc:
info["ctc_loss"] = ctc_loss.detach().cpu().item()
if params.use_cr_ctc:
info["cr_loss"] = cr_loss.detach().cpu().item()
return loss, info
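To make the warm-up behaviour above easier to follow, here is the same schedule factored into a small standalone function (a sketch mirroring the code above, not part of the diff). Both scales interpolate linearly until `warm_step`; afterwards the simple loss stays at `s` and the pruned loss at 1.0:

```python
def loss_scales(batch_idx_train: int, warm_step: int, s: float):
    """Return (simple_loss_scale, pruned_loss_scale) as used above."""
    if batch_idx_train >= warm_step:
        return s, 1.0
    frac = batch_idx_train / warm_step
    # simple loss: 1.0 -> s; pruned loss: 0.1 -> 1.0
    return 1.0 - frac * (1.0 - s), 0.1 + 0.9 * frac
```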
@ -842,6 +947,7 @@ def train_one_epoch(
train_dl: torch.utils.data.DataLoader,
valid_dl: torch.utils.data.DataLoader,
scaler: GradScaler,
spec_augment: Optional[SpecAugment] = None,
model_avg: Optional[nn.Module] = None,
tb_writer: Optional[SummaryWriter] = None,
world_size: int = 1,
@ -868,6 +974,8 @@ def train_one_epoch(
Dataloader for the validation dataset.
scaler:
The scaler used for mix precision training.
spec_augment:
The SpecAugment instance used only when use_cr_ctc is True.
model_avg:
The stored model averaged from the start of training.
tb_writer:
@ -910,13 +1018,14 @@ def train_one_epoch(
batch_size = len(batch["supervisions"]["text"])
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
graph_compiler=graph_compiler,
batch=batch,
is_training=True,
spec_augment=spec_augment,
)
# summary stats
tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
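The swap from `torch.cuda.amp.autocast(enabled=...)` to `torch_autocast(enabled=...)` recurs throughout this diff. The helper is imported from `icefall.utils`; a minimal sketch of what such a wrapper plausibly looks like (an assumption, not the actual implementation) is:

```python
from contextlib import contextmanager

import torch


@contextmanager
def torch_autocast(device_type: str = "cuda", **kwargs):
    # Prefer the non-deprecated torch.amp API (newer PyTorch warns on
    # torch.cuda.amp.autocast) and fall back for older versions.
    if hasattr(torch, "amp") and hasattr(torch.amp, "autocast"):
        with torch.amp.autocast(device_type=device_type, **kwargs):
            yield
    else:
        with torch.cuda.amp.autocast(**kwargs):
            yield
```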
@ -1082,6 +1191,9 @@ def run(rank, world_size, args):
params.blank_id = lexicon.token_table["<blk>"]
params.vocab_size = max(lexicon.tokens) + 1
if not params.use_transducer:
params.ctc_loss_scale = 1.0
logging.info(params)
logging.info("About to create model")
@ -1090,6 +1202,12 @@ def run(rank, world_size, args):
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
if params.use_cr_ctc:
assert not params.enable_spec_aug # we will do spec_augment in model.py
spec_augment = get_spec_augment(params)
else:
spec_augment = None
assert params.save_every_n >= params.average_period
model_avg: Optional[nn.Module] = None
if rank == 0:
@ -1199,6 +1317,7 @@ def run(rank, world_size, args):
optimizer=optimizer,
graph_compiler=graph_compiler,
params=params,
spec_augment=spec_augment,
)
scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
@ -1226,6 +1345,7 @@ def run(rank, world_size, args):
train_dl=train_dl,
valid_dl=valid_dl,
scaler=scaler,
spec_augment=spec_augment,
tb_writer=tb_writer,
world_size=world_size,
rank=rank,
@ -1292,6 +1412,7 @@ def scan_pessimistic_batches_for_oom(
optimizer: torch.optim.Optimizer,
graph_compiler: CharCtcTrainingGraphCompiler,
params: AttributeDict,
spec_augment: Optional[SpecAugment] = None,
):
from lhotse.dataset import find_pessimistic_batches
@ -1302,13 +1423,14 @@ def scan_pessimistic_batches_for_oom(
for criterion, cuts in batches.items():
batch = train_dl.dataset[cuts]
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,
graph_compiler=graph_compiler,
batch=batch,
is_training=True,
spec_augment=spec_augment,
)
loss.backward()
optimizer.zero_grad()
@ -1343,8 +1465,7 @@ def main():
run(rank=0, world_size=1, args=args)
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
if __name__ == "__main__":
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
main()


@ -92,6 +92,7 @@ from icefall.utils import (
setup_logger,
str2bool,
tokenize_by_CJK_char,
torch_autocast,
)
@ -495,7 +496,7 @@ def train_one_epoch(
batch_size = len(batch["supervisions"]["text"])
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -895,7 +896,7 @@ def scan_pessimistic_batches_for_oom(
for criterion, cuts in batches.items():
batch = train_dl.dataset[cuts]
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,
@ -935,8 +936,7 @@ def main():
run(rank=0, world_size=1, args=args)
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
if __name__ == "__main__":
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
main()


@ -104,7 +104,7 @@ class AiShell2AsrDataModule:
group.add_argument(
"--num-buckets",
type=int,
default=30,
default=15,
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
@ -296,8 +296,7 @@ class AiShell2AsrDataModule:
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
buffer_size=self.args.num_buckets * 2000,
shuffle_buffer_size=self.args.num_buckets * 5000,
buffer_size=self.args.num_buckets * 5000,
drop_last=self.args.drop_last,
)
else:
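The change above folds the separate `shuffle_buffer_size` into a single, larger `buffer_size` for lhotse's `DynamicBucketingSampler`. A hedged usage sketch (the manifest path and durations below are placeholders):

```python
from lhotse import CutSet
from lhotse.dataset import DynamicBucketingSampler

cuts = CutSet.from_file("data/fbank/cuts_train.jsonl.gz")  # placeholder path

num_buckets = 15
sampler = DynamicBucketingSampler(
    cuts,
    max_duration=200.0,
    shuffle=True,
    num_buckets=num_buckets,
    buffer_size=num_buckets * 5000,  # also serves as the shuffling buffer
    drop_last=True,
)
```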


@ -728,7 +728,7 @@ def main():
lg_filename = params.lang_dir / "LG.pt"
logging.info(f"Loading {lg_filename}")
decoding_graph = k2.Fsa.from_dict(
torch.load(lg_filename, map_location=device)
torch.load(lg_filename, map_location=device, weights_only=False)
)
decoding_graph.scores *= params.ngram_lm_scale
else:


@ -226,7 +226,7 @@ def main():
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()
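The repeated `weights_only=False` additions in this diff track a PyTorch behaviour change: starting with PyTorch 2.6, `torch.load` defaults to `weights_only=True` and refuses to unpickle arbitrary Python objects, such as the k2 FSA dictionaries stored in `LG.pt` or `L.pt`. For trusted checkpoint files the old behaviour is restored explicitly (the filename below is a placeholder):

```python
import torch

# PyTorch >= 2.6 rejects pickled non-tensor objects unless told otherwise.
checkpoint = torch.load("epoch-30.pt", map_location="cpu", weights_only=False)
```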


@ -90,7 +90,13 @@ from icefall.checkpoint import (
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
AttributeDict,
MetricsTracker,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -734,7 +740,7 @@ def train_one_epoch(
batch_size = len(batch["supervisions"]["text"])
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -1062,7 +1068,7 @@ def scan_pessimistic_batches_for_oom(
for criterion, cuts in batches.items():
batch = train_dl.dataset[cuts]
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,


@ -28,7 +28,7 @@ consisting of words and tokens (i.e., phones) and does the following:
4. Generate L.pt, in k2 format. It can be loaded by
d = torch.load("L.pt")
d = torch.load("L.pt", weights_only=False)
lexicon = k2.Fsa.from_dict(d)
5. Generate L_disambig.pt, in k2 format.


@ -238,7 +238,7 @@ def main():
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()


@ -83,7 +83,13 @@ from icefall.checkpoint import (
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
AttributeDict,
MetricsTracker,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -727,7 +733,7 @@ def train_one_epoch(
batch_size = len(batch["supervisions"]["text"])
# print(batch["supervisions"])
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -1034,7 +1040,7 @@ def scan_pessimistic_batches_for_oom(
# warmup = 0.0 is so that the derivs for the pruned loss stay zero
# (i.e. are not remembered by the decaying-average in adam), because
# we want to avoid these params being subject to shrinkage in adam.
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,


@ -28,7 +28,7 @@ consisting of words and tokens (i.e., phones) and does the following:
4. Generate L.pt, in k2 format. It can be loaded by
d = torch.load("L.pt")
d = torch.load("L.pt", weights_only=False)
lexicon = k2.Fsa.from_dict(d)
5. Generate L_disambig.pt, in k2 format.


@ -224,7 +224,7 @@ def main():
logging.info("Creating model")
model = get_transducer_model(params)
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()


@ -79,7 +79,13 @@ from icefall.checkpoint import save_checkpoint_with_global_batch_idx
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
AttributeDict,
MetricsTracker,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -638,7 +644,7 @@ def train_one_epoch(
params.batch_idx_train += 1
batch_size = len(batch["supervisions"]["text"])
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -912,7 +918,7 @@ def scan_pessimistic_batches_for_oom(
# warmup = 0.0 is so that the derivs for the pruned loss stay zero
# (i.e. are not remembered by the decaying-average in adam), because
# we want to avoid these params being subject to shrinkage in adam.
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,


@ -73,7 +73,13 @@ from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
AttributeDict,
MetricsTracker,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -782,7 +788,7 @@ def train_one_epoch(
batch_size = len(batch["supervisions"]["text"])
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -1127,7 +1133,7 @@ def scan_pessimistic_batches_for_oom(
for criterion, cuts in batches.items():
batch = train_dl.dataset[cuts]
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,


@ -672,7 +672,7 @@ def main():
lg_filename = params.lang_dir / "LG.pt"
logging.info(f"Loading {lg_filename}")
decoding_graph = k2.Fsa.from_dict(
torch.load(lg_filename, map_location=device)
torch.load(lg_filename, map_location=device, weights_only=False)
)
decoding_graph.scores *= params.ngram_lm_scale
else:


@ -71,7 +71,13 @@ from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
AttributeDict,
MetricsTracker,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -773,7 +779,7 @@ def train_one_epoch(
batch_size = len(batch["supervisions"]["text"])
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -1134,7 +1140,7 @@ def scan_pessimistic_batches_for_oom(
for criterion, cuts in batches.items():
batch = train_dl.dataset[cuts]
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,


@ -76,7 +76,13 @@ from icefall.checkpoint import (
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
AttributeDict,
MetricsTracker,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -1067,7 +1073,7 @@ def train_one_epoch(
batch_size = batch["inputs"].shape[0]
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -1257,7 +1263,7 @@ def run(rank, world_size, args):
logging.info(
f"Initializing model with checkpoint from {params.model_init_ckpt}"
)
init_ckpt = torch.load(params.model_init_ckpt, map_location=device)
init_ckpt = torch.load(params.model_init_ckpt, map_location=device, weights_only=False)
model.load_state_dict(init_ckpt["model"], strict=False)
if world_size > 1:


@ -76,7 +76,13 @@ from icefall.checkpoint import (
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
AttributeDict,
MetricsTracker,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -1058,7 +1064,7 @@ def train_one_epoch(
batch_size = batch["inputs"].shape[0]
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -1248,7 +1254,7 @@ def run(rank, world_size, args):
logging.info(
f"Initializing model with checkpoint from {params.model_init_ckpt}"
)
init_ckpt = torch.load(params.model_init_ckpt, map_location=device)
init_ckpt = torch.load(params.model_init_ckpt, map_location=device, weights_only=False)
model.load_state_dict(init_ckpt["model"], strict=False)
if world_size > 1:


@ -141,7 +141,7 @@ def main():
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()


@ -74,6 +74,7 @@ from icefall.utils import (
get_parameter_groups_with_lrs,
setup_logger,
str2bool,
torch_autocast,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -799,7 +800,7 @@ def train_one_epoch(
num_samples += batch_size
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
@ -1148,7 +1149,7 @@ def scan_pessimistic_batches_for_oom(
for criterion, cuts in batches.items():
batch = train_dl.dataset[cuts]
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
with torch_autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,

egs/baker_zh/TTS/.gitignore vendored Normal file

@ -0,0 +1,6 @@
path.sh
*.onnx
*.wav
generator_v1
generator_v2
generator_v3

egs/baker_zh/TTS/README.md Normal file

@ -0,0 +1,146 @@
# Introduction
This recipe uses the dataset from
https://en.data-baker.com/datasets/freeDatasets/
The dataset contains 10,000 Chinese sentences recorded by a native Chinese female speaker,
about 12 hours of audio in total.
**Note**: The dataset is for non-commercial use only.
# matcha
[./matcha](./matcha) contains the code for training [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS)
Checkpoints and training logs can be found [here](https://huggingface.co/csukuangfj/icefall-tts-baker-matcha-zh-2024-12-27).
The pull-request for this recipe can be found at <https://github.com/k2-fsa/icefall/pull/1849>
The training command is given below:
```bash
python3 ./matcha/train.py \
--exp-dir ./matcha/exp-1/ \
--num-workers 4 \
--world-size 1 \
--num-epochs 2000 \
--max-duration 1200 \
--bucketing-sampler 1 \
--start-epoch 1
```
To run inference, use:
```bash
# Download Hifigan vocoder. We use Hifigan v2 below. You can select from v1, v2, or v3
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2
python3 ./matcha/infer.py \
--epoch 2000 \
--exp-dir ./matcha/exp-1 \
--vocoder ./generator_v2 \
--tokens ./data/tokens.txt \
--cmvn ./data/fbank/cmvn.json \
--input-text "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。" \
--output-wav ./generated.wav
```
```bash
soxi ./generated.wav
```
prints:
```
Input File : './generated.wav'
Channels : 1
Sample Rate : 22050
Precision : 16-bit
Duration : 00:00:17.31 = 381696 samples ~ 1298.29 CDDA sectors
File Size : 763k
Bit Rate : 353k
Sample Encoding: 16-bit Signed Integer PCM
```
https://github.com/user-attachments/assets/88d4e88f-ebc4-4f32-b216-16d46b966024
To export the checkpoint to onnx:
```bash
python3 ./matcha/export_onnx.py \
--exp-dir ./matcha/exp-1 \
--epoch 2000 \
--tokens ./data/tokens.txt \
--cmvn ./data/fbank/cmvn.json
```
The above command generates the following files:
```
-rw-r--r-- 1 kuangfangjun root 72M Dec 27 18:53 model-steps-2.onnx
-rw-r--r-- 1 kuangfangjun root 73M Dec 27 18:54 model-steps-3.onnx
-rw-r--r-- 1 kuangfangjun root 73M Dec 27 18:54 model-steps-4.onnx
-rw-r--r-- 1 kuangfangjun root 74M Dec 27 18:55 model-steps-5.onnx
-rw-r--r-- 1 kuangfangjun root 74M Dec 27 18:57 model-steps-6.onnx
```
where the 2 in `model-steps-2.onnx` means it uses 2 steps for the ODE solver.
**HINT**: If you get the following error while running `export_onnx.py`:
```
torch.onnx.errors.UnsupportedOperatorError: Exporting the operator
'aten::scaled_dot_product_attention' to ONNX opset version 14 is not supported.
```
please use `torch>=2.2.0`.
To export the Hifigan vocoder to onnx, please use:
```bash
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v3
python3 ./matcha/export_onnx_hifigan.py
```
The above command generates 3 files:
- hifigan_v1.onnx
- hifigan_v2.onnx
- hifigan_v3.onnx
**HINT**: You can download pre-exported hifigan ONNX models from
<https://github.com/k2-fsa/sherpa-onnx/releases/tag/vocoder-models>
To use the generated onnx files to generate speech from text, please run:
```bash
# First, generate ./lexicon.txt
python3 ./matcha/generate_lexicon.py
python3 ./matcha/onnx_pretrained.py \
--acoustic-model ./model-steps-4.onnx \
--vocoder ./hifigan_v2.onnx \
--tokens ./data/tokens.txt \
--lexicon ./lexicon.txt \
--input-text "在一个阳光明媚的夏天,小马、小羊和小狗它们一块儿在广阔的草地上,嬉戏玩耍,这时小猴来了,还带着它心爱的足球活蹦乱跳地跑前、跑后教小马、小羊、小狗踢足球。" \
--output-wav ./1.wav
```
```bash
soxi ./1.wav
Input File : './1.wav'
Channels : 1
Sample Rate : 22050
Precision : 16-bit
Duration : 00:00:16.37 = 360960 samples ~ 1227.76 CDDA sectors
File Size : 722k
Bit Rate : 353k
Sample Encoding: 16-bit Signed Integer PCM
```
https://github.com/user-attachments/assets/578d04bb-fee8-47e5-9984-a868dcce610e


@ -0,0 +1 @@
../matcha/audio.py


@ -0,0 +1,112 @@
#!/usr/bin/env python3
# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang,
# Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file computes fbank features of the baker-zh dataset.
It looks for manifests in the directory data/manifests.
The generated fbank features are saved in data/fbank.
"""
import argparse
import logging
import os
from pathlib import Path
import torch
from fbank import MatchaFbank, MatchaFbankConfig
from lhotse import CutSet, LilcomChunkyWriter, load_manifest
from lhotse.audio import RecordingSet
from lhotse.supervision import SupervisionSet
from icefall.utils import get_executor
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--num-jobs",
type=int,
default=4,
help="""It specifies the checkpoint to use for decoding.
Note: Epoch counts from 1.
""",
)
return parser
def compute_fbank_baker_zh(num_jobs: int):
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
if num_jobs < 1:
num_jobs = os.cpu_count()
logging.info(f"num_jobs: {num_jobs}")
logging.info(f"src_dir: {src_dir}")
logging.info(f"output_dir: {output_dir}")
config = MatchaFbankConfig(
n_fft=1024,
n_mels=80,
sampling_rate=22050,
hop_length=256,
win_length=1024,
f_min=0,
f_max=8000,
)
if not torch.cuda.is_available():
config.device = "cpu"
prefix = "baker_zh"
suffix = "jsonl.gz"
extractor = MatchaFbank(config)
with get_executor() as ex: # Initialize the executor only once.
cuts_filename = f"{prefix}_cuts.{suffix}"
logging.info(f"Processing {cuts_filename}")
cut_set = load_manifest(src_dir / cuts_filename).resample(22050)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/{prefix}_feats",
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=LilcomChunkyWriter,
)
cut_set.to_file(output_dir / cuts_filename)
if __name__ == "__main__":
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
args = get_parser().parse_args()
compute_fbank_baker_zh(args.num_jobs)


@ -0,0 +1 @@
../../../ljspeech/TTS/local/compute_fbank_statistics.py


@ -0,0 +1,121 @@
#!/usr/bin/env python3
import argparse
import re
from typing import List
import jieba
from lhotse import load_manifest
from pypinyin import Style, lazy_pinyin, load_phrases_dict
load_phrases_dict(
{
"行长": [["hang2"], ["zhang3"]],
"银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
}
)
whiter_space_re = re.compile(r"\s+")
punctuations_re = [
(re.compile(x[0], re.IGNORECASE), x[1])
for x in [
("", ","),
("", "."),
("", "!"),
("", "?"),
("", '"'),
("", '"'),
("", "'"),
("", "'"),
("", ":"),
("", ","),
("", ""),
("", ""),
]
]
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--in-file",
type=str,
required=True,
help="Input cutset.",
)
parser.add_argument(
"--out-file",
type=str,
required=True,
help="Output cutset.",
)
return parser
def normalize_white_spaces(text):
return whiter_space_re.sub(" ", text)
def normalize_punctuations(text):
for regex, replacement in punctuations_re:
text = re.sub(regex, replacement, text)
return text
def split_text(text: str) -> List[str]:
"""
Example input: '你好呀，You are 一个好人。 去银行存钱？How about you?'
Example output: ['你好', '呀', ',', 'you are', '一个', '好人', '.', '去', '银行', '存钱', '?', 'how about you', '?']
"""
text = text.lower()
text = normalize_white_spaces(text)
text = normalize_punctuations(text)
ans = []
for seg in jieba.cut(text):
if seg in ",.!?:\"'":
ans.append(seg)
elif seg == " " and len(ans) > 0:
if ord("a") <= ord(ans[-1][-1]) <= ord("z"):
ans[-1] += seg
elif ord("a") <= ord(seg[0]) <= ord("z"):
if len(ans) == 0:
ans.append(seg)
continue
if ans[-1][-1] == " ":
ans[-1] += seg
continue
ans.append(seg)
else:
ans.append(seg)
ans = [s.strip() for s in ans]
return ans
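A usage sketch of `split_text` (the exact segmentation depends on jieba's dictionary, so the output is indicative):

```python
print(split_text("你好呀，You are 一个好人。"))
# expected along the lines of: ['你好', '呀', ',', 'you are', '一个', '好人', '.']
```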
def main():
args = get_parser().parse_args()
cuts = load_manifest(args.in_file)
for c in cuts:
assert len(c.supervisions) == 1, (len(c.supervisions), c.supervisions)
text = c.supervisions[0].normalized_text
text_list = split_text(text)
tokens = lazy_pinyin(text_list, style=Style.TONE3, tone_sandhi=True)
c.tokens = tokens
cuts.to_file(args.out_file)
print(f"saved to {args.out_file}")
if __name__ == "__main__":
main()


@ -0,0 +1 @@
../matcha/fbank.py


@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""
This file generates the file tokens.txt.
Usage:
python3 ./local/generate_tokens.py --tokens data/tokens.txt
"""
import argparse
from typing import List
import jieba
from pypinyin import Style, lazy_pinyin, pinyin_dict
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--tokens",
type=str,
required=True,
help="Path to to save tokens.txt.",
)
return parser
def generate_token_list() -> List[str]:
token_set = set()
word_dict = pinyin_dict.pinyin_dict
i = 0
for key in word_dict:
if not (0x4E00 <= key <= 0x9FFF):
continue
w = chr(key)
t = lazy_pinyin(w, style=Style.TONE3, tone_sandhi=True)[0]
token_set.add(t)
no_digit = set()
for t in token_set:
if t[-1] not in "1234":
no_digit.add(t)
else:
no_digit.add(t[:-1])
no_digit.add("dei")
no_digit.add("tou")
no_digit.add("dia")
for t in no_digit:
token_set.add(t)
for i in range(1, 5):
token_set.add(f"{t}{i}")
ans = list(token_set)
ans.sort()
punctuations = list(",.!?:\"'")
ans = punctuations + ans
# use ID 0 for blank
# Use ID 1 of _ for padding
ans.insert(0, " ")
ans.insert(1, "_") #
return ans
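A quick sanity check of the pinyin tokens that feed this inventory (the output is indicative):

```python
from pypinyin import Style, lazy_pinyin

print(lazy_pinyin("银行", style=Style.TONE3, tone_sandhi=True))
# expected: ['yin2', 'hang2'], i.e. tone-numbered tokens as generated above
```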
def main():
args = get_parser().parse_args()
token_list = generate_token_list()
with open(args.tokens, "w", encoding="utf-8") as f:
for indx, token in enumerate(token_list):
f.write(f"{token} {indx}\n")
if __name__ == "__main__":
main()


@ -0,0 +1,70 @@
#!/usr/bin/env python3
# Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang,
# Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script checks the following assumptions of the generated manifest:
- Single supervision per cut
We will add more checks later if needed.
Usage example:
python3 ./local/validate_manifest.py \
./data/spectrogram/baker_zh_cuts_all.jsonl.gz
"""
import argparse
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest_lazy
from lhotse.dataset.speech_synthesis import validate_for_tts
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"manifest",
type=Path,
help="Path to the manifest file",
)
return parser.parse_args()
def main():
args = get_args()
manifest = args.manifest
logging.info(f"Validating {manifest}")
assert manifest.is_file(), f"{manifest} does not exist"
cut_set = load_manifest_lazy(manifest)
assert isinstance(cut_set, CutSet), type(cut_set)
validate_for_tts(cut_set)
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
main()


@ -0,0 +1 @@
../../../ljspeech/TTS/matcha/audio.py


@ -0,0 +1,207 @@
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
"""
This script exports a Matcha-TTS model to ONNX.
Note that the model outputs fbank. You need to use a vocoder to convert
it to audio. See also ./export_onnx_hifigan.py
python3 ./matcha/export_onnx.py \
--exp-dir ./matcha/exp-1 \
--epoch 2000 \
--tokens ./data/tokens.txt \
--cmvn ./data/fbank/cmvn.json
"""
import argparse
import json
import logging
from pathlib import Path
from typing import Any, Dict
import onnx
import torch
from tokenizer import Tokenizer
from train import get_model, get_params
from icefall.checkpoint import load_checkpoint
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--epoch",
type=int,
default=2000,
help="""It specifies the checkpoint to use for decoding.
Note: Epoch counts from 1.
""",
)
parser.add_argument(
"--exp-dir",
type=Path,
default="matcha/exp-new-3",
help="""The experiment dir.
It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved
""",
)
parser.add_argument(
"--tokens",
type=Path,
default="data/tokens.txt",
)
parser.add_argument(
"--cmvn",
type=str,
default="data/fbank/cmvn.json",
help="""Path to vocabulary.""",
)
return parser
def add_meta_data(filename: str, meta_data: Dict[str, Any]):
"""Add meta data to an ONNX model. It is changed in-place.
Args:
filename:
Filename of the ONNX model to be changed.
meta_data:
Key-value pairs.
"""
model = onnx.load(filename)
while len(model.metadata_props):
model.metadata_props.pop()
for key, value in meta_data.items():
meta = model.metadata_props.add()
meta.key = key
meta.value = str(value)
onnx.save(model, filename)
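The stored metadata can be read back with onnxruntime; this is how `onnx_pretrained.py` later recovers e.g. the sample rate. The filename below is a placeholder:

```python
import onnxruntime as ort

sess = ort.InferenceSession("model-steps-2.onnx", providers=["CPUExecutionProvider"])
print(sess.get_modelmeta().custom_metadata_map)
```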
class ModelWrapper(torch.nn.Module):
def __init__(self, model, num_steps: int = 5):
super().__init__()
self.model = model
self.num_steps = num_steps
def forward(
self,
x: torch.Tensor,
x_lengths: torch.Tensor,
noise_scale: torch.Tensor,
length_scale: torch.Tensor,
) -> torch.Tensor:
"""
Args:
x: (batch_size, num_tokens), torch.int64
x_lengths: (batch_size,), torch.int64
noise_scale: (1,), torch.float32
length_scale: (1,), torch.float32
Returns:
mel: (batch_size, feat_dim, num_frames)
"""
mel = self.model.synthesise(
x=x,
x_lengths=x_lengths,
n_timesteps=self.num_steps,
temperature=noise_scale,
length_scale=length_scale,
)["mel"]
# mel: (batch_size, feat_dim, num_frames)
return mel
@torch.inference_mode()
def main():
parser = get_parser()
args = parser.parse_args()
params = get_params()
params.update(vars(args))
tokenizer = Tokenizer(params.tokens)
params.pad_id = tokenizer.pad_id
params.vocab_size = tokenizer.vocab_size
params.model_args.n_vocab = params.vocab_size
with open(params.cmvn) as f:
stats = json.load(f)
params.data_args.data_statistics.mel_mean = stats["fbank_mean"]
params.data_args.data_statistics.mel_std = stats["fbank_std"]
params.model_args.data_statistics.mel_mean = stats["fbank_mean"]
params.model_args.data_statistics.mel_std = stats["fbank_std"]
logging.info(params)
logging.info("About to create model")
model = get_model(params)
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
for num_steps in [2, 3, 4, 5, 6]:
logging.info(f"num_steps: {num_steps}")
wrapper = ModelWrapper(model, num_steps=num_steps)
wrapper.eval()
# Use a large value so the rotary position embedding in the text
# encoder has a large initial length
x = torch.ones(1, 1000, dtype=torch.int64)
x_lengths = torch.tensor([x.shape[1]], dtype=torch.int64)
noise_scale = torch.tensor([1.0])
length_scale = torch.tensor([1.0])
opset_version = 14
filename = f"model-steps-{num_steps}.onnx"
torch.onnx.export(
wrapper,
(x, x_lengths, noise_scale, length_scale),
filename,
opset_version=opset_version,
input_names=["x", "x_length", "noise_scale", "length_scale"],
output_names=["mel"],
dynamic_axes={
"x": {0: "N", 1: "L"},
"x_length": {0: "N"},
"mel": {0: "N", 2: "L"},
},
)
meta_data = {
"model_type": "matcha-tts",
"language": "Chinese",
"has_espeak": 0,
"n_speakers": 1,
"jieba": 1,
"sample_rate": 22050,
"version": 1,
"pad_id": params.pad_id,
"model_author": "icefall",
"maintainer": "k2-fsa",
"dataset": "baker-zh",
"use_eos_bos": 0,
"dataset_url": "https://www.data-baker.com/open_source.html",
"dataset_comment": "The dataset is for non-commercial use only.",
"num_ode_steps": num_steps,
}
add_meta_data(filename=filename, meta_data=meta_data)
print(meta_data)
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
main()


@ -0,0 +1 @@
../../../ljspeech/TTS/matcha/export_onnx_hifigan.py


@ -0,0 +1 @@
../../../ljspeech/TTS/matcha/fbank.py


@ -0,0 +1,42 @@
#!/usr/bin/env python3
import jieba
from pypinyin import Style, lazy_pinyin, load_phrases_dict, phrases_dict, pinyin_dict
from tokenizer import Tokenizer
load_phrases_dict(
{
"行长": [["hang2"], ["zhang3"]],
"银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
}
)
def main():
filename = "lexicon.txt"
tokens = "./data/tokens.txt"
tokenizer = Tokenizer(tokens)
word_dict = pinyin_dict.pinyin_dict
phrases = phrases_dict.phrases_dict
i = 0
with open(filename, "w", encoding="utf-8") as f:
for key in word_dict:
if not (0x4E00 <= key <= 0x9FFF):
continue
w = chr(key)
tokens = lazy_pinyin(w, style=Style.TONE3, tone_sandhi=True)[0]
f.write(f"{w} {tokens}\n")
for key in phrases:
tokens = lazy_pinyin(key, style=Style.TONE3, tone_sandhi=True)
tokens = " ".join(tokens)
f.write(f"{key} {tokens}\n")
if __name__ == "__main__":
main()


@ -0,0 +1 @@
../../../ljspeech/TTS/matcha/hifigan

egs/baker_zh/TTS/matcha/infer.py Executable file

@ -0,0 +1,342 @@
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
"""
python3 ./matcha/infer.py \
--epoch 2000 \
--exp-dir ./matcha/exp-1 \
--vocoder ./generator_v2 \
--tokens ./data/tokens.txt \
--cmvn ./data/fbank/cmvn.json \
--input-text "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。" \
--output-wav ./generated.wav
"""
import argparse
import datetime as dt
import json
import logging
from pathlib import Path
import soundfile as sf
import torch
import torch.nn as nn
from hifigan.config import v1, v2, v3
from hifigan.denoiser import Denoiser
from hifigan.models import Generator as HiFiGAN
from local.convert_text_to_tokens import split_text
from pypinyin import Style, lazy_pinyin
from tokenizer import Tokenizer
from train import get_model, get_params
from tts_datamodule import BakerZhTtsDataModule
from icefall.checkpoint import load_checkpoint
from icefall.utils import AttributeDict, setup_logger
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--epoch",
type=int,
default=4000,
help="""It specifies the checkpoint to use for decoding.
Note: Epoch counts from 1.
""",
)
parser.add_argument(
"--exp-dir",
type=Path,
default="matcha/exp",
help="""The experiment dir.
It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved
""",
)
parser.add_argument(
"--vocoder",
type=Path,
default="./generator_v1",
help="Path to the vocoder",
)
parser.add_argument(
"--tokens",
type=Path,
default="data/tokens.txt",
)
parser.add_argument(
"--cmvn",
type=str,
default="data/fbank/cmvn.json",
help="""Path to vocabulary.""",
)
# The following arguments are used for inference on single text
parser.add_argument(
"--input-text",
type=str,
required=False,
help="The text to generate speech for",
)
parser.add_argument(
"--output-wav",
type=str,
required=False,
help="The filename of the wave to save the generated speech",
)
parser.add_argument(
"--sampling-rate",
type=int,
default=22050,
help="The sampling rate of the generated speech (default: 22050 for baker_zh)",
)
return parser
def load_vocoder(checkpoint_path: Path) -> nn.Module:
checkpoint_path = str(checkpoint_path)
if checkpoint_path.endswith("v1"):
h = AttributeDict(v1)
elif checkpoint_path.endswith("v2"):
h = AttributeDict(v2)
elif checkpoint_path.endswith("v3"):
h = AttributeDict(v3)
else:
raise ValueError(f"supports only v1, v2, and v3, given {checkpoint_path}")
hifigan = HiFiGAN(h).to("cpu")
hifigan.load_state_dict(
torch.load(checkpoint_path, map_location="cpu", weights_only=False)["generator"]
)
_ = hifigan.eval()
hifigan.remove_weight_norm()
return hifigan
def to_waveform(
mel: torch.Tensor, vocoder: nn.Module, denoiser: nn.Module
) -> torch.Tensor:
audio = vocoder(mel).clamp(-1, 1)
audio = denoiser(audio.squeeze(0), strength=0.00025).cpu().squeeze()
return audio.squeeze()
def process_text(text: str, tokenizer: Tokenizer, device: str = "cpu") -> dict:
text = split_text(text)
tokens = lazy_pinyin(text, style=Style.TONE3, tone_sandhi=True)
x = tokenizer.texts_to_token_ids([tokens])
x = torch.tensor(x, dtype=torch.long, device=device)
x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
return {"x_orig": text, "x": x, "x_lengths": x_lengths}
def synthesize(
model: nn.Module,
tokenizer: Tokenizer,
n_timesteps: int,
text: str,
length_scale: float,
temperature: float,
device: str = "cpu",
spks=None,
) -> dict:
text_processed = process_text(text=text, tokenizer=tokenizer, device=device)
start_t = dt.datetime.now()
output = model.synthesise(
text_processed["x"],
text_processed["x_lengths"],
n_timesteps=n_timesteps,
temperature=temperature,
spks=spks,
length_scale=length_scale,
)
# merge everything to one dict
output.update({"start_t": start_t, **text_processed})
return output
def infer_dataset(
dl: torch.utils.data.DataLoader,
params: AttributeDict,
model: nn.Module,
vocoder: nn.Module,
denoiser: nn.Module,
tokenizer: Tokenizer,
) -> None:
"""Decode dataset.
The ground-truth and generated audio pairs will be saved to `params.save_wav_dir`.
Args:
dl:
PyTorch's dataloader containing the dataset to decode.
params:
It is returned by :func:`get_params`.
model:
The neural model.
vocoder:
The vocoder used to convert mel to audio.
denoiser:
Used to reduce vocoder artifacts in the generated audio.
tokenizer:
Used to convert text to tokens.
"""
device = next(model.parameters()).device
num_cuts = 0
log_interval = 5
try:
num_batches = len(dl)
except TypeError:
num_batches = "?"
for batch_idx, batch in enumerate(dl):
batch_size = len(batch["tokens"])
texts = [c.supervisions[0].normalized_text for c in batch["cut"]]
audio = batch["audio"]
audio_lens = batch["audio_lens"].tolist()
cut_ids = [cut.id for cut in batch["cut"]]
for i in range(batch_size):
output = synthesize(
model=model,
tokenizer=tokenizer,
n_timesteps=params.n_timesteps,
text=texts[i],
length_scale=params.length_scale,
temperature=params.temperature,
device=device,
)
output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
sf.write(
file=params.save_wav_dir / f"{cut_ids[i]}_pred.wav",
data=output["waveform"],
samplerate=params.data_args.sampling_rate,
subtype="PCM_16",
)
sf.write(
file=params.save_wav_dir / f"{cut_ids[i]}_gt.wav",
data=audio[i].numpy(),
samplerate=params.data_args.sampling_rate,
subtype="PCM_16",
)
num_cuts += batch_size
if batch_idx % log_interval == 0:
batch_str = f"{batch_idx}/{num_batches}"
logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
@torch.inference_mode()
def main():
parser = get_parser()
BakerZhTtsDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
params = get_params()
params.update(vars(args))
params.suffix = f"epoch-{params.epoch}"
params.res_dir = params.exp_dir / "infer" / params.suffix
params.save_wav_dir = params.res_dir / "wav"
params.save_wav_dir.mkdir(parents=True, exist_ok=True)
setup_logger(f"{params.res_dir}/log-infer-{params.suffix}")
logging.info("Infer started")
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
logging.info(f"Device: {device}")
tokenizer = Tokenizer(params.tokens)
params.vocab_size = tokenizer.vocab_size
params.model_args.n_vocab = params.vocab_size
with open(params.cmvn) as f:
stats = json.load(f)
params.data_args.data_statistics.mel_mean = stats["fbank_mean"]
params.data_args.data_statistics.mel_std = stats["fbank_std"]
params.model_args.data_statistics.mel_mean = stats["fbank_mean"]
params.model_args.data_statistics.mel_std = stats["fbank_std"]
# Number of ODE Solver steps
params.n_timesteps = 2
# Changes to the speaking rate
params.length_scale = 1.0
# Sampling temperature
params.temperature = 0.667
logging.info(params)
logging.info("About to create model")
model = get_model(params)
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
model.to(device)
model.eval()
# we need cut ids to organize tts results.
args.return_cuts = True
baker_zh = BakerZhTtsDataModule(args)
test_cuts = baker_zh.test_cuts()
test_dl = baker_zh.test_dataloaders(test_cuts)
if not Path(params.vocoder).is_file():
raise ValueError(f"{params.vocoder} does not exist")
vocoder = load_vocoder(params.vocoder)
vocoder.to(device)
denoiser = Denoiser(vocoder, mode="zeros")
denoiser.to(device)
if params.input_text is not None and params.output_wav is not None:
logging.info("Synthesizing a single text")
output = synthesize(
model=model,
tokenizer=tokenizer,
n_timesteps=params.n_timesteps,
text=params.input_text,
length_scale=params.length_scale,
temperature=params.temperature,
device=device,
)
output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
sf.write(
file=params.output_wav,
data=output["waveform"],
samplerate=params.sampling_rate,
subtype="PCM_16",
)
else:
logging.info("Decoding the test set")
infer_dataset(
dl=test_dl,
params=params,
model=model,
vocoder=vocoder,
denoiser=denoiser,
tokenizer=tokenizer,
)
if __name__ == "__main__":
main()


@ -0,0 +1 @@
../../../ljspeech/TTS/matcha/model.py


@ -0,0 +1 @@
../../../ljspeech/TTS/matcha/models


@ -0,0 +1 @@
../../../ljspeech/TTS/matcha/monotonic_align


@ -0,0 +1,316 @@
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
"""
python3 ./matcha/onnx_pretrained.py \
--acoustic-model ./model-steps-4.onnx \
--vocoder ./hifigan_v2.onnx \
--tokens ./data/tokens.txt \
--lexicon ./lexicon.txt \
--input-text "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。" \
--output-wav ./b.wav
"""
import argparse
import datetime as dt
import logging
import re
from typing import Dict, List
import jieba
import onnxruntime as ort
import soundfile as sf
import torch
from infer import load_vocoder
from utils import intersperse
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--acoustic-model",
type=str,
required=True,
help="Path to the acoustic model",
)
parser.add_argument(
"--tokens",
type=str,
required=True,
help="Path to the tokens.txt",
)
parser.add_argument(
"--lexicon",
type=str,
required=True,
help="Path to the lexicon.txt",
)
parser.add_argument(
"--vocoder",
type=str,
required=True,
help="Path to the vocoder",
)
parser.add_argument(
"--input-text",
type=str,
required=True,
help="The text to generate speech for",
)
parser.add_argument(
"--output-wav",
type=str,
required=True,
help="The filename of the wave to save the generated speech",
)
return parser
class OnnxHifiGANModel:
def __init__(
self,
filename: str,
):
session_opts = ort.SessionOptions()
session_opts.inter_op_num_threads = 1
session_opts.intra_op_num_threads = 1
self.session_opts = session_opts
self.model = ort.InferenceSession(
filename,
sess_options=self.session_opts,
providers=["CPUExecutionProvider"],
)
for i in self.model.get_inputs():
print(i)
print("-----")
for i in self.model.get_outputs():
print(i)
def __call__(self, x: torch.Tensor):
assert x.ndim == 3, x.shape
assert x.shape[0] == 1, x.shape
audio = self.model.run(
[self.model.get_outputs()[0].name],
{
self.model.get_inputs()[0].name: x.numpy(),
},
)[0]
# audio: (batch_size, num_samples)
return torch.from_numpy(audio)
class OnnxModel:
def __init__(
self,
filename: str,
):
session_opts = ort.SessionOptions()
session_opts.inter_op_num_threads = 1
session_opts.intra_op_num_threads = 2
self.session_opts = session_opts
self.model = ort.InferenceSession(
filename,
sess_options=self.session_opts,
providers=["CPUExecutionProvider"],
)
logging.info(f"{self.model.get_modelmeta().custom_metadata_map}")
metadata = self.model.get_modelmeta().custom_metadata_map
self.sample_rate = int(metadata["sample_rate"])
for i in self.model.get_inputs():
print(i)
print("-----")
for i in self.model.get_outputs():
print(i)
def __call__(self, x: torch.Tensor):
assert x.ndim == 2, x.shape
assert x.shape[0] == 1, x.shape
x_lengths = torch.tensor([x.shape[1]], dtype=torch.int64)
print("x_lengths", x_lengths)
print("x", x.shape)
noise_scale = torch.tensor([1.0], dtype=torch.float32)
length_scale = torch.tensor([1.0], dtype=torch.float32)
mel = self.model.run(
[self.model.get_outputs()[0].name],
{
self.model.get_inputs()[0].name: x.numpy(),
self.model.get_inputs()[1].name: x_lengths.numpy(),
self.model.get_inputs()[2].name: noise_scale.numpy(),
self.model.get_inputs()[3].name: length_scale.numpy(),
},
)[0]
# mel: (batch_size, feat_dim, num_frames)
return torch.from_numpy(mel)
def read_tokens(filename: str) -> Dict[str, int]:
token2id = dict()
with open(filename, encoding="utf-8") as f:
for line in f.readlines():
info = line.rstrip().split()
if len(info) == 1:
# case of space
token = " "
idx = int(info[0])
else:
token, idx = info[0], int(info[1])
assert token not in token2id, token
token2id[token] = idx
return token2id
def read_lexicon(filename: str) -> Dict[str, List[str]]:
word2token = dict()
with open(filename, encoding="utf-8") as f:
for line in f.readlines():
info = line.rstrip().split()
w = info[0]
tokens = info[1:]
word2token[w] = tokens
return word2token
def convert_word_to_tokens(word2tokens: Dict[str, List[str]], word: str) -> List[str]:
if word in word2tokens:
return word2tokens[word]
if len(word) == 1:
return []
ans = []
for w in word:
t = convert_word_to_tokens(word2tokens, w)
ans.extend(t)
return ans
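An illustration of the per-character fallback with a toy lexicon (hypothetical entries, not the generated `lexicon.txt`):

```python
word2tokens = {"银": ["yin2"], "行": ["hang2"]}
print(convert_word_to_tokens(word2tokens, "银行长"))
# -> ['yin2', 'hang2']: "银行长" is not a key, so the function recurses over
#    the characters; "长" has no entry and is skipped.
```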
def normalize_text(text):
whiter_space_re = re.compile(r"\s+")
punctuations_re = [
(re.compile(x[0], re.IGNORECASE), x[1])
for x in [
("", ","),
("", "."),
("", "!"),
("", "?"),
("", '"'),
("", '"'),
("", "'"),
("", "'"),
("", ":"),
("", ","),
]
]
for regex, replacement in punctuations_re:
text = re.sub(regex, replacement, text)
return text
@torch.no_grad()
def main():
params = get_parser().parse_args()
logging.info(vars(params))
token2id = read_tokens(params.tokens)
word2tokens = read_lexicon(params.lexicon)
text = normalize_text(params.input_text)
seg = jieba.cut(text)
tokens = []
for s in seg:
if s in token2id:
tokens.append(s)
continue
t = convert_word_to_tokens(word2tokens, s)
if t:
tokens.extend(t)
model = OnnxModel(params.acoustic_model)
vocoder = OnnxHifiGANModel(params.vocoder)
x = []
for t in tokens:
if t in token2id:
x.append(token2id[t])
x = intersperse(x, item=token2id["_"])
x = torch.tensor(x, dtype=torch.int64).unsqueeze(0)
start_t = dt.datetime.now()
mel = model(x)
end_t = dt.datetime.now()
start_t2 = dt.datetime.now()
audio = vocoder(mel)
end_t2 = dt.datetime.now()
print("audio", audio.shape) # (1, 1, num_samples)
audio = audio.squeeze()
sample_rate = model.sample_rate
t = (end_t - start_t).total_seconds()
t2 = (end_t2 - start_t2).total_seconds()
rtf_am = t * sample_rate / audio.shape[-1]
rtf_vocoder = t2 * sample_rate / audio.shape[-1]
print("RTF for acoustic model ", rtf_am)
print("RTF for vocoder", rtf_vocoder)
# skip denoiser
sf.write(params.output_wav, audio, sample_rate, "PCM_16")
logging.info(f"Saved to {params.output_wav}")
if __name__ == "__main__":
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
main()
"""
|HifiGAN |RTF |#Parameters (M)|
|----------|-----|---------------|
|v1 |0.818| 13.926 |
|v2 |0.101| 0.925 |
|v3 |0.118| 1.462 |
|Num steps|Acoustic Model RTF|
|---------|------------------|
| 2 | 0.039 |
| 3 | 0.047 |
| 4 | 0.071 |
| 5 | 0.076 |
| 6 | 0.103 |
"""


@ -0,0 +1,119 @@
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
import logging
from typing import Dict, List
import tacotron_cleaner.cleaners
try:
from piper_phonemize import phonemize_espeak
except Exception as ex:
raise RuntimeError(
f"{ex}\nPlease run\n"
"pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
)
from utils import intersperse
# This tokenizer supports both English and Chinese.
# We assume you have used
# ../local/convert_text_to_tokens.py
# to process your text
class Tokenizer(object):
def __init__(self, tokens: str):
"""
Args:
tokens: the file that maps tokens to ids
"""
# Parse token file
self.token2id: Dict[str, int] = {}
with open(tokens, "r", encoding="utf-8") as f:
for line in f.readlines():
info = line.rstrip().split()
if len(info) == 1:
# case of space
token = " "
id = int(info[0])
else:
token, id = info[0], int(info[1])
assert token not in self.token2id, token
self.token2id[token] = id
# Refer to https://github.com/rhasspy/piper/blob/master/TRAINING.md
self.pad_id = self.token2id["_"] # padding
self.space_id = self.token2id[" "] # word separator (whitespace)
self.vocab_size = len(self.token2id)
def texts_to_token_ids(
self,
sentence_list: List[List[str]],
intersperse_blank: bool = True,
lang: str = "en-us",
) -> List[List[int]]:
"""
Args:
sentence_list:
A list of sentences.
intersperse_blank:
Whether to intersperse blanks in the token sequence.
lang:
Language argument passed to phonemize_espeak().
Returns:
Return a list of token id list [utterance][token_id]
"""
token_ids_list = []
for sentence in sentence_list:
tokens_list = []
for word in sentence:
if word in self.token2id:
tokens_list.append(word)
continue
tmp_tokens_list = phonemize_espeak(word, lang)
for t in tmp_tokens_list:
tokens_list.extend(t)
token_ids = []
for t in tokens_list:
if t not in self.token2id:
logging.warning(f"Skip OOV {t} {sentence}")
continue
if t == " " and len(token_ids) > 0 and token_ids[-1] == self.space_id:
continue
token_ids.append(self.token2id[t])
if intersperse_blank:
token_ids = intersperse(token_ids, self.pad_id)
token_ids_list.append(token_ids)
return token_ids_list
def test_tokenizer():
import jieba
from pypinyin import Style, lazy_pinyin
tokenizer = Tokenizer("data/tokens.txt")
text1 = "今天is Monday, tomorrow is 星期二"
text2 = "你好吗? 我很好, how about you?"
text1 = list(jieba.cut(text1))
text2 = list(jieba.cut(text2))
tokens1 = lazy_pinyin(text1, style=Style.TONE3, tone_sandhi=True)
tokens2 = lazy_pinyin(text2, style=Style.TONE3, tone_sandhi=True)
print(tokens1)
print(tokens2)
ids = tokenizer.texts_to_token_ids([tokens1, tokens2])
print(ids)
if __name__ == "__main__":
test_tokenizer()

egs/baker_zh/TTS/matcha/train.py Executable file

@ -0,0 +1,717 @@
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
import argparse
import json
import logging
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, Optional, Union
import k2
import torch
import torch.multiprocessing as mp
import torch.nn as nn
from lhotse.utils import fix_random_seed
from model import fix_len_compatibility
from models.matcha_tts import MatchaTTS
from tokenizer import Tokenizer
from torch.cuda.amp import GradScaler, autocast
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import Optimizer
from torch.utils.tensorboard import SummaryWriter
from tts_datamodule import BakerZhTtsDataModule
from utils import MetricsTracker
from icefall.checkpoint import load_checkpoint, save_checkpoint
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.utils import AttributeDict, setup_logger, str2bool
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--world-size",
type=int,
default=1,
help="Number of GPUs for DDP training.",
)
parser.add_argument(
"--master-port",
type=int,
default=12335,
help="Master port to use for DDP training.",
)
parser.add_argument(
"--tensorboard",
type=str2bool,
default=True,
help="Should various information be logged in tensorboard.",
)
parser.add_argument(
"--num-epochs",
type=int,
default=1000,
help="Number of epochs to train.",
)
parser.add_argument(
"--start-epoch",
type=int,
default=1,
help="""Resume training from this epoch. It should be positive.
If larger than 1, it will load checkpoint from
exp-dir/epoch-{start_epoch-1}.pt
""",
)
parser.add_argument(
"--exp-dir",
type=Path,
default="matcha/exp",
help="""The experiment dir.
It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved
""",
)
parser.add_argument(
"--tokens",
type=str,
default="data/tokens.txt",
help="""Path to vocabulary.""",
)
parser.add_argument(
"--cmvn",
type=str,
default="data/fbank/cmvn.json",
help="""Path to vocabulary.""",
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="The seed for random generators intended for reproducibility",
)
parser.add_argument(
"--save-every-n",
type=int,
default=10,
help="""Save checkpoint after processing this number of epochs"
periodically. We save checkpoint to exp-dir/ whenever
params.cur_epoch % save_every_n == 0. The checkpoint filename
has the form: f'exp-dir/epoch-{params.cur_epoch}.pt'.
Since it will take around 1000 epochs, we suggest using a large
save_every_n to save disk space.
""",
)
parser.add_argument(
"--use-fp16",
type=str2bool,
default=False,
help="Whether to use half precision training.",
)
return parser
def get_data_statistics():
return AttributeDict(
{
"mel_mean": 0,
"mel_std": 1,
}
)
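These placeholder statistics are overwritten at run time with values from `data/fbank/cmvn.json` (computed during data preparation), mirroring what `infer.py` and `export_onnx.py` above do:

```python
import json

with open("data/fbank/cmvn.json") as f:
    stats = json.load(f)

mel_mean = stats["fbank_mean"]
mel_std = stats["fbank_std"]
```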
def _get_data_params() -> AttributeDict:
params = AttributeDict(
{
"name": "baker-zh",
"train_filelist_path": "./filelists/ljs_audio_text_train_filelist.txt",
"valid_filelist_path": "./filelists/ljs_audio_text_val_filelist.txt",
# "batch_size": 64,
# "num_workers": 1,
# "pin_memory": False,
"cleaners": ["english_cleaners2"],
"add_blank": True,
"n_spks": 1,
"n_fft": 1024,
"n_feats": 80,
"sampling_rate": 22050,
"hop_length": 256,
"win_length": 1024,
"f_min": 0,
"f_max": 8000,
"seed": 1234,
"load_durations": False,
"data_statistics": get_data_statistics(),
}
)
return params
def _get_model_params() -> AttributeDict:
n_feats = 80
filter_channels_dp = 256
encoder_params_p_dropout = 0.1
params = AttributeDict(
{
"n_spks": 1, # for baker-zh.
"spk_emb_dim": 64,
"n_feats": n_feats,
"out_size": None, # or use 172
"prior_loss": True,
"use_precomputed_durations": False,
"data_statistics": get_data_statistics(),
"encoder": AttributeDict(
{
"encoder_type": "RoPE Encoder", # not used
"encoder_params": AttributeDict(
{
"n_feats": n_feats,
"n_channels": 192,
"filter_channels": 768,
"filter_channels_dp": filter_channels_dp,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": encoder_params_p_dropout,
"spk_emb_dim": 64,
"n_spks": 1,
"prenet": True,
}
),
"duration_predictor_params": AttributeDict(
{
"filter_channels_dp": filter_channels_dp,
"kernel_size": 3,
"p_dropout": encoder_params_p_dropout,
}
),
}
),
"decoder": AttributeDict(
{
"channels": [256, 256],
"dropout": 0.05,
"attention_head_dim": 64,
"n_blocks": 1,
"num_mid_blocks": 2,
"num_heads": 2,
"act_fn": "snakebeta",
}
),
"cfm": AttributeDict(
{
"name": "CFM",
"solver": "euler",
"sigma_min": 1e-4,
}
),
"optimizer": AttributeDict(
{
"lr": 1e-4,
"weight_decay": 0.0,
}
),
}
)
return params
def get_params():
params = AttributeDict(
{
"model_args": _get_model_params(),
"data_args": _get_data_params(),
"best_train_loss": float("inf"),
"best_valid_loss": float("inf"),
"best_train_epoch": -1,
"best_valid_epoch": -1,
"batch_idx_train": -1, # 0
"log_interval": 10,
"valid_interval": 1500,
"env_info": get_env_info(),
}
)
return params
def get_model(params):
m = MatchaTTS(**params.model_args)
return m
def load_checkpoint_if_available(
params: AttributeDict, model: nn.Module
) -> Optional[Dict[str, Any]]:
"""Load checkpoint from file.
If params.start_epoch is larger than 1, it will load the checkpoint from
`params.start_epoch - 1`.
Apart from loading state dict for `model` and `optimizer` it also updates
`best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
and `best_valid_loss` in `params`.
Args:
params:
The return value of :func:`get_params`.
model:
The training model.
Returns:
Return a dict containing previously saved training info.
"""
if params.start_epoch > 1:
filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
else:
return None
assert filename.is_file(), f"{filename} does not exist!"
saved_params = load_checkpoint(filename, model=model)
keys = [
"best_train_epoch",
"best_valid_epoch",
"batch_idx_train",
"best_train_loss",
"best_valid_loss",
]
for k in keys:
params[k] = saved_params[k]
return saved_params
def prepare_input(batch: dict, tokenizer: Tokenizer, device: torch.device, params):
"""Parse batch data"""
mel_mean = params.data_args.data_statistics.mel_mean
mel_std_inv = 1 / params.data_args.data_statistics.mel_std
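    # Normalize only the valid frames of each utterance with the global
    # fbank mean/std loaded from cmvn.json; padded frames are zeroed out.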
for i in range(batch["features"].shape[0]):
n = batch["features_lens"][i]
batch["features"][i : i + 1, :n, :] = (
batch["features"][i : i + 1, :n, :] - mel_mean
) * mel_std_inv
batch["features"][i : i + 1, n:, :] = 0
audio = batch["audio"].to(device)
features = batch["features"].to(device)
audio_lens = batch["audio_lens"].to(device)
features_lens = batch["features_lens"].to(device)
tokens = batch["tokens"]
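    # Map token strings to ids, interspersing a blank id between tokens,
    # then use the ragged tensor's row splits to recover per-utterance
    # token lengths before padding to a dense (B, T) tensor.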
tokens = tokenizer.texts_to_token_ids(tokens, intersperse_blank=True)
tokens = k2.RaggedTensor(tokens)
row_splits = tokens.shape.row_splits(1)
tokens_lens = row_splits[1:] - row_splits[:-1]
tokens = tokens.to(device)
tokens_lens = tokens_lens.to(device)
# a tensor of shape (B, T)
tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)
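    # Pad the feature frames so the sequence length is compatible with the
    # decoder's downsampling (fix_len_compatibility is imported from model.py).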
max_feature_length = fix_len_compatibility(features.shape[1])
if max_feature_length > features.shape[1]:
pad = max_feature_length - features.shape[1]
features = torch.nn.functional.pad(features, (0, 0, 0, pad))
# features_lens[features_lens.argmax()] += pad
return audio, audio_lens, features, features_lens.long(), tokens, tokens_lens.long()
def compute_validation_loss(
params: AttributeDict,
model: Union[nn.Module, DDP],
tokenizer: Tokenizer,
valid_dl: torch.utils.data.DataLoader,
world_size: int = 1,
rank: int = 0,
) -> MetricsTracker:
"""Run the validation process."""
model.eval()
device = model.device if isinstance(model, DDP) else next(model.parameters()).device
get_losses = model.module.get_losses if isinstance(model, DDP) else model.get_losses
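    # When wrapped in DDP, the real model lives in model.module.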
    # used to summarize the stats over iterations
tot_loss = MetricsTracker()
with torch.no_grad():
for batch_idx, batch in enumerate(valid_dl):
(
audio,
audio_lens,
features,
features_lens,
tokens,
tokens_lens,
) = prepare_input(batch, tokenizer, device, params)
losses = get_losses(
{
"x": tokens,
"x_lengths": tokens_lens,
"y": features.permute(0, 2, 1),
"y_lengths": features_lens,
"spks": None, # should change it for multi-speakers
"durations": None,
}
)
batch_size = len(batch["tokens"])
loss_info = MetricsTracker()
loss_info["samples"] = batch_size
s = 0
for key, value in losses.items():
v = value.detach().item()
loss_info[key] = v * batch_size
s += v * batch_size
loss_info["tot_loss"] = s
# summary stats
tot_loss = tot_loss + loss_info
if world_size > 1:
tot_loss.reduce(device)
loss_value = tot_loss["tot_loss"] / tot_loss["samples"]
if loss_value < params.best_valid_loss:
params.best_valid_epoch = params.cur_epoch
params.best_valid_loss = loss_value
return tot_loss
def train_one_epoch(
params: AttributeDict,
model: Union[nn.Module, DDP],
tokenizer: Tokenizer,
optimizer: Optimizer,
train_dl: torch.utils.data.DataLoader,
valid_dl: torch.utils.data.DataLoader,
scaler: GradScaler,
tb_writer: Optional[SummaryWriter] = None,
world_size: int = 1,
rank: int = 0,
) -> None:
"""Train the model for one epoch.
The training loss from the mean of all frames is saved in
`params.train_loss`. It runs the validation process every
`params.valid_interval` batches.
Args:
params:
It is returned by :func:`get_params`.
model:
The model for training.
optimizer:
The optimizer.
train_dl:
Dataloader for the training dataset.
valid_dl:
Dataloader for the validation dataset.
scaler:
        The scaler used for mixed precision training.
tb_writer:
Writer to write log messages to tensorboard.
"""
model.train()
device = model.device if isinstance(model, DDP) else next(model.parameters()).device
get_losses = model.module.get_losses if isinstance(model, DDP) else model.get_losses
# used to track the stats over iterations in one epoch
tot_loss = MetricsTracker()
saved_bad_model = False
def save_bad_model(suffix: str = ""):
save_checkpoint(
filename=params.exp_dir / f"bad-model{suffix}-{rank}.pt",
model=model,
params=params,
optimizer=optimizer,
scaler=scaler,
rank=0,
)
for batch_idx, batch in enumerate(train_dl):
params.batch_idx_train += 1
# audio: (N, T), float32
# features: (N, T, C), float32
# audio_lens, (N,), int32
# features_lens, (N,), int32
# tokens: List[List[str]], len(tokens) == N
batch_size = len(batch["tokens"])
(
audio,
audio_lens,
features,
features_lens,
tokens,
tokens_lens,
) = prepare_input(batch, tokenizer, device, params)
try:
with autocast(enabled=params.use_fp16):
losses = get_losses(
{
"x": tokens,
"x_lengths": tokens_lens,
"y": features.permute(0, 2, 1),
"y_lengths": features_lens,
"spks": None, # should change it for multi-speakers
"durations": None,
}
)
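                # get_losses returns a dict of individual loss terms
                # (e.g., duration/prior/diffusion losses); train on their sum.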
loss = sum(losses.values())
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
loss_info = MetricsTracker()
loss_info["samples"] = batch_size
s = 0
for key, value in losses.items():
v = value.detach().item()
loss_info[key] = v * batch_size
s += v * batch_size
loss_info["tot_loss"] = s
tot_loss = tot_loss + loss_info
except: # noqa
save_bad_model()
raise
if params.batch_idx_train % 100 == 0 and params.use_fp16:
# If the grad scale was less than 1, try increasing it.
# The _growth_interval of the grad scaler is configurable,
# but we can't configure it to have different
# behavior depending on the current grad scale.
cur_grad_scale = scaler._scale.item()
if cur_grad_scale < 8.0 or (
cur_grad_scale < 32.0 and params.batch_idx_train % 400 == 0
):
scaler.update(cur_grad_scale * 2.0)
if cur_grad_scale < 0.01:
if not saved_bad_model:
save_bad_model(suffix="-first-warning")
saved_bad_model = True
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
save_bad_model()
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
if params.batch_idx_train % params.log_interval == 0:
cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
logging.info(
f"Epoch {params.cur_epoch}, batch {batch_idx}, "
f"global_batch_idx: {params.batch_idx_train}, "
f"batch size: {batch_size}, "
f"loss[{loss_info}], tot_loss[{tot_loss}], "
+ (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
)
if tb_writer is not None:
loss_info.write_summary(
tb_writer, "train/current_", params.batch_idx_train
)
tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
if params.use_fp16:
tb_writer.add_scalar(
"train/grad_scale", cur_grad_scale, params.batch_idx_train
)
if params.batch_idx_train % params.valid_interval == 1:
logging.info("Computing validation loss")
valid_info = compute_validation_loss(
params=params,
model=model,
tokenizer=tokenizer,
valid_dl=valid_dl,
world_size=world_size,
rank=rank,
)
model.train()
logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
logging.info(
"Maximum memory allocated so far is "
f"{torch.cuda.max_memory_allocated()//1000000}MB"
)
if tb_writer is not None:
valid_info.write_summary(
tb_writer, "train/valid_", params.batch_idx_train
)
loss_value = tot_loss["tot_loss"] / tot_loss["samples"]
params.train_loss = loss_value
if params.train_loss < params.best_train_loss:
params.best_train_epoch = params.cur_epoch
params.best_train_loss = params.train_loss
def run(rank, world_size, args):
params = get_params()
params.update(vars(args))
fix_random_seed(params.seed)
if world_size > 1:
setup_dist(rank, world_size, params.master_port)
setup_logger(f"{params.exp_dir}/log/log-train")
logging.info("Training started")
if args.tensorboard and rank == 0:
tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
else:
tb_writer = None
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", rank)
logging.info(f"Device: {device}")
tokenizer = Tokenizer(params.tokens)
params.pad_id = tokenizer.pad_id
params.vocab_size = tokenizer.vocab_size
params.model_args.n_vocab = params.vocab_size
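    # Load the global fbank mean/std produced during data preparation;
    # both the data pipeline and the model normalize features with them.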
with open(params.cmvn) as f:
stats = json.load(f)
params.data_args.data_statistics.mel_mean = stats["fbank_mean"]
params.data_args.data_statistics.mel_std = stats["fbank_std"]
params.model_args.data_statistics.mel_mean = stats["fbank_mean"]
params.model_args.data_statistics.mel_std = stats["fbank_std"]
logging.info(params)
print(params)
logging.info("About to create model")
model = get_model(params)
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of parameters: {num_param}")
assert params.start_epoch > 0, params.start_epoch
checkpoints = load_checkpoint_if_available(params=params, model=model)
model.to(device)
if world_size > 1:
logging.info("Using DDP")
model = DDP(model, device_ids=[rank], find_unused_parameters=True)
optimizer = torch.optim.Adam(model.parameters(), **params.model_args.optimizer)
logging.info("About to create datamodule")
baker_zh = BakerZhTtsDataModule(args)
train_cuts = baker_zh.train_cuts()
train_dl = baker_zh.train_dataloaders(train_cuts)
valid_cuts = baker_zh.valid_cuts()
valid_dl = baker_zh.valid_dataloaders(valid_cuts)
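    # Start the grad scaler at 1.0 (instead of PyTorch's much larger
    # default) to reduce the chance of overflow early in fp16 training.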
scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
if checkpoints and "grad_scaler" in checkpoints:
logging.info("Loading grad scaler state dict")
scaler.load_state_dict(checkpoints["grad_scaler"])
for epoch in range(params.start_epoch, params.num_epochs + 1):
logging.info(f"Start epoch {epoch}")
fix_random_seed(params.seed + epoch - 1)
if "sampler" in train_dl:
train_dl.sampler.set_epoch(epoch - 1)
params.cur_epoch = epoch
if tb_writer is not None:
tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
train_one_epoch(
params=params,
model=model,
tokenizer=tokenizer,
optimizer=optimizer,
train_dl=train_dl,
valid_dl=valid_dl,
scaler=scaler,
tb_writer=tb_writer,
world_size=world_size,
rank=rank,
)
if epoch % params.save_every_n == 0 or epoch == params.num_epochs:
filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
save_checkpoint(
filename=filename,
params=params,
model=model,
optimizer=optimizer,
scaler=scaler,
rank=rank,
)
if rank == 0:
if params.best_train_epoch == params.cur_epoch:
best_train_filename = params.exp_dir / "best-train-loss.pt"
copyfile(src=filename, dst=best_train_filename)
if params.best_valid_epoch == params.cur_epoch:
best_valid_filename = params.exp_dir / "best-valid-loss.pt"
copyfile(src=filename, dst=best_valid_filename)
logging.info("Done!")
if world_size > 1:
torch.distributed.barrier()
cleanup_dist()
def main():
parser = get_parser()
BakerZhTtsDataModule.add_arguments(parser)
args = parser.parse_args()
world_size = args.world_size
assert world_size >= 1
if world_size > 1:
mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
else:
run(rank=0, world_size=1, args=args)
if __name__ == "__main__":
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
main()


@ -0,0 +1,340 @@
# Copyright 2021 Piotr Żelasko
# Copyright 2022-2023 Xiaomi Corporation (Authors: Mingshuang Luo,
# Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Optional
import torch
from fbank import MatchaFbank, MatchaFbankConfig
from lhotse import CutSet, load_manifest_lazy
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
CutConcatenate,
CutMix,
DynamicBucketingSampler,
PrecomputedFeatures,
SimpleCutSampler,
SpeechSynthesisDataset,
)
from lhotse.dataset.input_strategies import ( # noqa F401 For AudioSamples
AudioSamples,
OnTheFlyFeatures,
)
from lhotse.utils import fix_random_seed
from torch.utils.data import DataLoader
from icefall.utils import str2bool
class _SeedWorkers:
def __init__(self, seed: int):
self.seed = seed
def __call__(self, worker_id: int):
fix_random_seed(self.seed + worker_id)
class BakerZhTtsDataModule:
"""
DataModule for tts experiments.
It assumes there is always one train and valid dataloader,
but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
and test-other).
    It contains all the common data pipeline modules used in TTS
experiments, e.g.:
- dynamic batch size,
- bucketing samplers,
- cut concatenation,
- on-the-fly feature extraction
    This class should be derived for specific corpora used in TTS tasks.
"""
def __init__(self, args: argparse.Namespace):
self.args = args
@classmethod
def add_arguments(cls, parser: argparse.ArgumentParser):
group = parser.add_argument_group(
title="TTS data related options",
description="These options are used for the preparation of "
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
"effective batch sizes, sampling strategies, applied data "
"augmentations, etc.",
)
group.add_argument(
"--manifest-dir",
type=Path,
default=Path("data/fbank"),
help="Path to directory with train/valid/test cuts.",
)
group.add_argument(
"--max-duration",
type=int,
default=200.0,
help="Maximum pooled recordings duration (seconds) in a "
"single batch. You can reduce it if it causes CUDA OOM.",
)
group.add_argument(
"--bucketing-sampler",
type=str2bool,
default=True,
help="When enabled, the batches will come from buckets of "
"similar duration (saves padding frames).",
)
group.add_argument(
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
group.add_argument(
"--on-the-fly-feats",
type=str2bool,
default=False,
help="When enabled, use on-the-fly cut mixing and feature "
"extraction. Will drop existing precomputed feature manifests "
"if available.",
)
group.add_argument(
"--shuffle",
type=str2bool,
default=True,
help="When enabled (=default), the examples will be "
"shuffled for each epoch.",
)
group.add_argument(
"--drop-last",
type=str2bool,
default=True,
help="Whether to drop last batch. Used by sampler.",
)
group.add_argument(
"--return-cuts",
type=str2bool,
default=False,
help="When enabled, each batch will have the "
"field: batch['cut'] with the cuts that "
"were used to construct it.",
)
group.add_argument(
"--num-workers",
type=int,
default=2,
help="The number of training dataloader workers that "
"collect the batches.",
)
group.add_argument(
"--input-strategy",
type=str,
default="PrecomputedFeatures",
help="AudioSamples or PrecomputedFeatures",
)
def train_dataloaders(
self,
cuts_train: CutSet,
sampler_state_dict: Optional[Dict[str, Any]] = None,
) -> DataLoader:
"""
Args:
cuts_train:
CutSet for training.
sampler_state_dict:
The state dict for the training sampler.
"""
logging.info("About to create train dataset")
train = SpeechSynthesisDataset(
return_text=False,
return_tokens=True,
feature_input_strategy=eval(self.args.input_strategy)(),
return_cuts=self.args.return_cuts,
)
if self.args.on_the_fly_feats:
sampling_rate = 22050
config = MatchaFbankConfig(
n_fft=1024,
n_mels=80,
sampling_rate=sampling_rate,
hop_length=256,
win_length=1024,
f_min=0,
f_max=8000,
)
train = SpeechSynthesisDataset(
return_text=False,
return_tokens=True,
feature_input_strategy=OnTheFlyFeatures(MatchaFbank(config)),
return_cuts=self.args.return_cuts,
)
if self.args.bucketing_sampler:
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
buffer_size=self.args.num_buckets * 2000,
shuffle_buffer_size=self.args.num_buckets * 5000,
drop_last=self.args.drop_last,
)
else:
logging.info("Using SimpleCutSampler.")
train_sampler = SimpleCutSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
)
logging.info("About to create train dataloader")
if sampler_state_dict is not None:
logging.info("Loading sampler state dict")
train_sampler.load_state_dict(sampler_state_dict)
# 'seed' is derived from the current random state, which will have
# previously been set in the main process.
seed = torch.randint(0, 100000, ()).item()
worker_init_fn = _SeedWorkers(seed)
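        # batch_size=None: the Lhotse sampler already emits complete batches
        # (CutSets), so the DataLoader must not batch again.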
train_dl = DataLoader(
train,
sampler=train_sampler,
batch_size=None,
num_workers=self.args.num_workers,
persistent_workers=True,
pin_memory=True,
worker_init_fn=worker_init_fn,
)
return train_dl
def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
logging.info("About to create dev dataset")
if self.args.on_the_fly_feats:
sampling_rate = 22050
config = MatchaFbankConfig(
n_fft=1024,
n_mels=80,
sampling_rate=sampling_rate,
hop_length=256,
win_length=1024,
f_min=0,
f_max=8000,
)
validate = SpeechSynthesisDataset(
return_text=False,
return_tokens=True,
feature_input_strategy=OnTheFlyFeatures(MatchaFbank(config)),
return_cuts=self.args.return_cuts,
)
else:
validate = SpeechSynthesisDataset(
return_text=False,
return_tokens=True,
feature_input_strategy=eval(self.args.input_strategy)(),
return_cuts=self.args.return_cuts,
)
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
num_buckets=self.args.num_buckets,
shuffle=False,
)
logging.info("About to create valid dataloader")
valid_dl = DataLoader(
validate,
sampler=valid_sampler,
batch_size=None,
num_workers=2,
persistent_workers=True,
pin_memory=True,
)
return valid_dl
def test_dataloaders(self, cuts: CutSet) -> DataLoader:
logging.info("About to create test dataset")
if self.args.on_the_fly_feats:
sampling_rate = 22050
config = MatchaFbankConfig(
n_fft=1024,
n_mels=80,
sampling_rate=sampling_rate,
hop_length=256,
win_length=1024,
f_min=0,
f_max=8000,
)
test = SpeechSynthesisDataset(
return_text=False,
return_tokens=True,
feature_input_strategy=OnTheFlyFeatures(MatchaFbank(config)),
return_cuts=self.args.return_cuts,
)
else:
test = SpeechSynthesisDataset(
return_text=False,
return_tokens=True,
feature_input_strategy=eval(self.args.input_strategy)(),
return_cuts=self.args.return_cuts,
)
test_sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
num_buckets=self.args.num_buckets,
shuffle=False,
)
logging.info("About to create test dataloader")
test_dl = DataLoader(
test,
batch_size=None,
sampler=test_sampler,
num_workers=self.args.num_workers,
)
return test_dl
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
return load_manifest_lazy(
self.args.manifest_dir / "baker_zh_cuts_train.jsonl.gz"
)
@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get validation cuts")
return load_manifest_lazy(
self.args.manifest_dir / "baker_zh_cuts_valid.jsonl.gz"
)
@lru_cache()
def test_cuts(self) -> CutSet:
logging.info("About to get test cuts")
return load_manifest_lazy(
self.args.manifest_dir / "baker_zh_cuts_test.jsonl.gz"
)


@ -0,0 +1 @@
../../../ljspeech/TTS/matcha/utils.py

151
egs/baker_zh/TTS/prepare.sh Executable file

@ -0,0 +1,151 @@
#!/usr/bin/env bash
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
set -eou pipefail
stage=-1
stop_stage=100
dl_dir=$PWD/download
mkdir -p $dl_dir
. shared/parse_options.sh || exit 1
# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "dl_dir: $dl_dir"
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
log "Stage -1: build monotonic_align lib (used by ./matcha)"
for recipe in matcha; do
if [ ! -d $recipe/monotonic_align/build ]; then
cd $recipe/monotonic_align
python3 setup.py build_ext --inplace
cd ../../
else
log "monotonic_align lib for $recipe already built"
fi
done
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data"
  # The directory $dl_dir/BZNSYP contains the following 3 directories
# ls -lh $dl_dir/BZNSYP/
# total 0
# drwxr-xr-x 10002 kuangfangjun root 0 Jan 4 2019 PhoneLabeling
# drwxr-xr-x 3 kuangfangjun root 0 Jan 31 2019 ProsodyLabeling
# drwxr-xr-x 10003 kuangfangjun root 0 Aug 26 17:45 Wave
# If you have trouble accessing huggingface.co, please use
#
# cd $dl_dir
# wget https://huggingface.co/openspeech/BZNSYP/resolve/main/BZNSYP.tar.bz2
# tar xf BZNSYP.tar.bz2
# cd ..
# If you have pre-downloaded it to /path/to/BZNSYP, you can create a symlink
#
# ln -sfv /path/to/BZNSYP $dl_dir/BZNSYP
#
if [ ! -d $dl_dir/BZNSYP/Wave ]; then
lhotse download baker-zh $dl_dir
fi
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare baker-zh manifest"
# We assume that you have downloaded the baker corpus
# to $dl_dir/BZNSYP
mkdir -p data/manifests
if [ ! -e data/manifests/.baker-zh.done ]; then
lhotse prepare baker-zh $dl_dir/BZNSYP data/manifests
touch data/manifests/.baker-zh.done
fi
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Generate tokens.txt"
if [ ! -e data/tokens.txt ]; then
python3 ./local/generate_tokens.py --tokens data/tokens.txt
fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Generate raw cutset"
if [ ! -e data/manifests/baker_zh_cuts_raw.jsonl.gz ]; then
lhotse cut simple \
-r ./data/manifests/baker_zh_recordings_all.jsonl.gz \
-s ./data/manifests/baker_zh_supervisions_all.jsonl.gz \
./data/manifests/baker_zh_cuts_raw.jsonl.gz
fi
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Convert text to tokens"
if [ ! -e data/manifests/baker_zh_cuts.jsonl.gz ]; then
python3 ./local/convert_text_to_tokens.py \
--in-file ./data/manifests/baker_zh_cuts_raw.jsonl.gz \
--out-file ./data/manifests/baker_zh_cuts.jsonl.gz
fi
fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Generate fbank (used by ./matcha)"
mkdir -p data/fbank
if [ ! -e data/fbank/.baker-zh.done ]; then
./local/compute_fbank_baker_zh.py
touch data/fbank/.baker-zh.done
fi
if [ ! -e data/fbank/.baker-zh-validated.done ]; then
log "Validating data/fbank for baker-zh (used by ./matcha)"
python3 ./local/validate_manifest.py \
data/fbank/baker_zh_cuts.jsonl.gz
touch data/fbank/.baker-zh-validated.done
fi
fi
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Split the baker-zh cuts into train, valid and test sets (used by ./matcha)"
if [ ! -e data/fbank/.baker_zh_split.done ]; then
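    # Hold out the last 600 cuts: the first 100 of them become the valid
    # set and the last 500 the test set; the remaining cuts form the train set.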
lhotse subset --last 600 \
data/fbank/baker_zh_cuts.jsonl.gz \
data/fbank/baker_zh_cuts_validtest.jsonl.gz
lhotse subset --first 100 \
data/fbank/baker_zh_cuts_validtest.jsonl.gz \
data/fbank/baker_zh_cuts_valid.jsonl.gz
lhotse subset --last 500 \
data/fbank/baker_zh_cuts_validtest.jsonl.gz \
data/fbank/baker_zh_cuts_test.jsonl.gz
rm data/fbank/baker_zh_cuts_validtest.jsonl.gz
n=$(( $(gunzip -c data/fbank/baker_zh_cuts.jsonl.gz | wc -l) - 600 ))
lhotse subset --first $n \
data/fbank/baker_zh_cuts.jsonl.gz \
data/fbank/baker_zh_cuts_train.jsonl.gz
touch data/fbank/.baker_zh_split.done
fi
fi
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Stage 6: Compute fbank mean and std (used by ./matcha)"
if [ ! -f ./data/fbank/cmvn.json ]; then
./local/compute_fbank_statistics.py ./data/fbank/baker_zh_cuts_train.jsonl.gz ./data/fbank/cmvn.json
fi
fi

1
egs/baker_zh/TTS/shared Symbolic link

@ -0,0 +1 @@
../../../icefall/shared


@ -73,11 +73,11 @@ def compile_HLG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
max_token_id = max(lexicon.tokens)
logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
H = k2.ctc_topo(max_token_id)
-    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
+    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt", weights_only=False))
if Path(f"{lang_dir}/lm/{lm}.pt").is_file():
logging.info(f"Loading pre-compiled {lm}")
-        d = torch.load(f"{lang_dir}/lm/{lm}.pt")
+        d = torch.load(f"{lang_dir}/lm/{lm}.pt", weights_only=False)
G = k2.Fsa.from_dict(d)
else:
logging.info(f"Loading {lm}.fst.txt")

Some files were not shown because too many files have changed in this diff.