minor fixes

zr_jin 2023-09-02 17:41:57 +08:00
parent 28a361795f
commit 9d25e6e7f0
9 changed files with 20 additions and 17 deletions

View File

@@ -27,7 +27,7 @@ This recipe includes scripts for training Zipformer model using multiple Chinese
 |MagicData|755|https://www.openslr.org/68/|
 |AliMeeting|100|https://openslr.org/119/|
 |WeNetSpeech|10,000|https://github.com/wenet-e2e/WenetSpeech|
-|KeSpeech|1,542|https://openreview.net/forum?id=b3Zoeq2sCLq|
+|KeSpeech|1,542|https://github.com/KeSpeech/KeSpeech|
 
 # Included Test Sets

View File

@@ -80,7 +80,7 @@ def compute_fbank_magicdata(num_mel_bins: int = 80, speed_perturb: bool = False)
             )
             if "train" in partition and speed_perturb:
                 cut_set = (
-                    (cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1))
+                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
@@ -117,6 +117,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
     args = get_args()
-    compute_fbank_thchs30(
+    compute_fbank_magicdata(
         num_mel_bins=args.num_mel_bins, speed_perturb=args.speed_perturb
     )
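The two hunks above hold the actual fixes in this script: the redundant outer parentheses around the perturbed CutSet sum are removed, and the `__main__` block now calls `compute_fbank_magicdata()` instead of the copy-pasted `compute_fbank_thchs30()`. For context, a minimal sketch of the lhotse speed-perturbation idiom this hunk touches, assuming an existing cut manifest (the path below is a placeholder, not taken from the recipe):

```python
# Sketch of the speed-perturbation idiom from the hunk above (lhotse).
# The manifest path is hypothetical.
from lhotse import CutSet

cut_set = CutSet.from_file("data/manifests/magicdata_cuts_train.jsonl.gz")

# perturb_speed() lazily resamples each cut at the given factor; "+"
# concatenates CutSets, so the training set grows to roughly 3x its size.
cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
```

The same two fixes repeat below for the Primewords and ST-CMDS scripts.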

View File

@@ -117,6 +117,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
     args = get_args()
-    compute_fbank_thchs30(
+    compute_fbank_primewords(
         num_mel_bins=args.num_mel_bins, speed_perturb=args.speed_perturb
     )

View File

@@ -80,7 +80,7 @@ def compute_fbank_stcmds(num_mel_bins: int = 80, speed_perturb: bool = False):
             )
             if "train" in partition and speed_perturb:
                 cut_set = (
-                    (cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1))
+                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
@@ -116,6 +116,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
     args = get_args()
-    compute_fbank_thchs30(
+    compute_fbank_stcmds(
         num_mel_bins=args.num_mel_bins, speed_perturb=args.speed_perturb
     )

View File

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright 2021 Xiaomi Corp. (authors: Zengrui Jin)
+# Copyright 2023 Xiaomi Corp. (authors: Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -15,10 +15,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+# This script tokenizes the training transcript by CJK characters
+# and saves the result to transcript_chars.txt, which is used
+# to train the BPE model later.
+
 import argparse
 from pathlib import Path
 
 from tqdm.auto import tqdm
 
 from icefall.utils import tokenize_by_CJK_char
@@ -52,11 +57,8 @@ def main():
     with open(text, "r", encoding="utf-8") as fin:
         text_lines = fin.readlines()
-        tokenized_lines = []
-        for line in tqdm(text_lines, desc="Tokenizing training transcript"):
-            tokenized_lines.append(f"{tokenize_by_CJK_char(line)}\n")
     with open(transcript_path, "w+", encoding="utf-8") as fout:
-        fout.writelines(tokenized_lines)
+        fout.writelines([f"{tokenize_by_CJK_char(line)}\n" for line in text_lines])
 
 
 if __name__ == "__main__":
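Net effect in this file: the copyright year is updated, a header comment documents the script's purpose, and the tqdm progress loop is collapsed into a single list comprehension over the input lines. A quick sketch of what `tokenize_by_CJK_char` (an icefall utility) produces; the sample string is illustrative:

```python
# Illustration of the icefall tokenizer used above; requires icefall on
# PYTHONPATH. The input string is made up for this example.
from icefall.utils import tokenize_by_CJK_char

line = "你好世界 hello world"
print(tokenize_by_CJK_char(line))
# Expected: "你 好 世 界 hello world" -- each CJK character becomes its
# own token while non-CJK words stay intact.
```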

View File

@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # Copyright 2021 Johns Hopkins University (Piotr Żelasko)
 # Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
+# Copyright 2023 Xiaomi Corp. (Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -32,7 +33,6 @@ from icefall import setup_logger
 def normalize_text(
     utt: str,
-    # punct_pattern=re.compile(r"<(COMMA|PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
     punct_pattern=re.compile(r"<(PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
     whitespace_pattern=re.compile(r"\s\s+"),
 ) -> str:
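This hunk only deletes a stale commented-out default (the variant that also stripped `<COMMA>` tags); the active pattern is unchanged. The function body is not part of the hunk, so the sketch below of how such defaults are typically applied is an assumption, not the file's actual implementation:

```python
import re

def normalize_text(
    utt: str,
    punct_pattern=re.compile(r"<(PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
    whitespace_pattern=re.compile(r"\s\s+"),
) -> str:
    # Assumed body: drop the tagged punctuation tokens, then collapse
    # the double spaces they leave behind.
    return whitespace_pattern.sub(" ", punct_pattern.sub("", utt)).strip()

print(normalize_text("HELLO <COMMA> WORLD <PERIOD>"))
# -> "HELLO <COMMA> WORLD": <COMMA> tags are kept by the active pattern.
```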

View File

@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+# Copyright 2023 Xiaomi Corp. (authors: Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #

View File

@@ -5,7 +5,6 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail
 
-nj=16
 stage=-1
 stop_stage=100
 num_splits=100
@@ -256,11 +255,12 @@ if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
   log "Stage 12: Prepare KeSpeech"
   if [ ! -d $dl_dir/KeSpeech ]; then
     log "Abort! Please download KeSpeech first."
+    log "KeSpeech download link: https://github.com/KeSpeech/KeSpeech"
   fi
 
   if [ ! -f data/manifests/.kespeech.done ]; then
     mkdir -p data/manifests
-    lhotse prepare kespeech -j $nj $dl_dir/KeSpeech data/manifests/kespeech
+    lhotse prepare kespeech -j 16 $dl_dir/KeSpeech data/manifests/kespeech
     touch data/manifests/.kespeech.done
   fi
@@ -303,7 +303,7 @@ if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
 fi
 
 if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
-  log "Stage 13: BPE model training"
+  log "Stage 13: BPE model training (note that we use transcripts of wenetspeech only for BPE training)"
   ./local/prepare_for_bpe_model.py --lang-dir ./data/lang_char --text ./data/lang_char/text
 
   for vocab_size in ${vocab_sizes[@]}; do
@@ -348,7 +348,7 @@ if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
 fi
 
 if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
-  log "Stage 14: Prepare G"
+  log "Stage 14: Prepare G (note that we use ngram lm of wenetspeech only for G preparation)"
 
   if [ -d ../../wenetspeech/ASR/data/lang_char/ ]; then
     cd data
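Stage 13 first tokenizes the WenetSpeech-only transcript with `local/prepare_for_bpe_model.py` (the script shown earlier), then trains one BPE model per vocabulary size. A hedged sketch of the underlying sentencepiece call; the paths, model type, and options are illustrative, not the recipe's exact flags:

```python
# Sketch of the BPE training that Stage 13 drives, via sentencepiece.
# All paths and options here are assumptions for illustration.
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="data/lang_char/transcript_chars.txt",  # output of prepare_for_bpe_model.py
    model_prefix="data/lang_bpe_2000/bpe",        # hypothetical output prefix
    vocab_size=2000,
    model_type="bpe",
    character_coverage=0.98,  # <1.0 lets very rare CJK chars map to <unk>
)
```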

View File

@@ -322,7 +322,7 @@ class AsrDataModule:
             sampler=train_sampler,
             batch_size=None,
             num_workers=self.args.num_workers,
-            persistent_workers=False,
+            persistent_workers=True,
             worker_init_fn=worker_init_fn,
         )
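The final hunk flips `persistent_workers` on for the training DataLoader. With `persistent_workers=True`, PyTorch keeps worker processes alive across epochs instead of re-forking them each time the loader is re-iterated, which avoids repeated dataset and sampler re-initialization. A minimal, self-contained illustration with a toy dataset (the recipe's actual dataset and sampler are not reproduced here):

```python
# Toy illustration of persistent_workers; ToyDataset is a stand-in for
# the recipe's dataset, not part of icefall.
import torch
from torch.utils.data import DataLoader, Dataset

class ToyDataset(Dataset):
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        return torch.tensor(idx)

dl = DataLoader(
    ToyDataset(),
    batch_size=2,
    num_workers=2,
    persistent_workers=True,  # workers survive between epochs
)

for epoch in range(2):  # no worker respawn between these epochs
    for batch in dl:
        pass
```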