diff --git a/egs/librispeech/ASR/zipformer/decoder.py b/egs/librispeech/ASR/zipformer/decoder.py index e8db988f6..45432d570 100644 --- a/egs/librispeech/ASR/zipformer/decoder.py +++ b/egs/librispeech/ASR/zipformer/decoder.py @@ -58,6 +58,7 @@ class Decoder(nn.Module): self.embedding = nn.Embedding( num_embeddings=vocab_size, embedding_dim=decoder_dim, + padding_idx=blank_id, ) # the balancers are to avoid any drift in the magnitude of the # embeddings, which would interact badly with parameter averaging. diff --git a/egs/librispeech/ASR/zipformer/model.py b/egs/librispeech/ASR/zipformer/model.py index 0c3ea6a86..9b7494972 100644 --- a/egs/librispeech/ASR/zipformer/model.py +++ b/egs/librispeech/ASR/zipformer/model.py @@ -333,7 +333,7 @@ class AsrModel(nn.Module): simple_loss, pruned_loss = self.forward_transducer( encoder_out=encoder_out, encoder_out_lens=encoder_out_lens, - y=y.to(x.device), + y=y, y_lens=y_lens, prune_range=prune_range, am_scale=am_scale, diff --git a/egs/librispeech/ASR/zipformer/train.py b/egs/librispeech/ASR/zipformer/train.py index bc3e9c1ba..1d1bee947 100755 --- a/egs/librispeech/ASR/zipformer/train.py +++ b/egs/librispeech/ASR/zipformer/train.py @@ -789,7 +789,7 @@ def compute_loss( texts = batch["supervisions"]["text"] y = sp.encode(texts, out_type=int) - y = k2.RaggedTensor(y) + y = k2.RaggedTensor(y).to(device) with torch.set_grad_enabled(is_training): simple_loss, pruned_loss, ctc_loss = model( diff --git a/egs/librispeech/ASR/zipformer/zipformer.py b/egs/librispeech/ASR/zipformer/zipformer.py index 7d98dbeb1..612356a50 100644 --- a/egs/librispeech/ASR/zipformer/zipformer.py +++ b/egs/librispeech/ASR/zipformer/zipformer.py @@ -2190,7 +2190,7 @@ class ConvolutionModule(nn.Module): x = self.in_proj(x) # (time, batch, 2*channels) - x, s = x.chunk(2, dim=2) + x, s = x.chunk(2, dim=-1) s = self.sigmoid(s) x = x * s # (time, batch, channels) diff --git a/egs/multi_en/ASR/prepare.sh b/egs/multi_en/ASR/prepare.sh index 65969a913..2e429fb3a 
100755 --- a/egs/multi_en/ASR/prepare.sh +++ b/egs/multi_en/ASR/prepare.sh @@ -66,6 +66,17 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then if [ -e ../../librispeech/ASR/data/fbank/.librispeech.done ]; then cd data/fbank ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz) . + ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_cuts_dev-clean.jsonl.gz) . + ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_cuts_dev-other.jsonl.gz) . + ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_cuts_test-clean.jsonl.gz) . + ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_cuts_test-other.jsonl.gz) . + ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_feats_train-clean-100) . + ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_feats_train-clean-360) . + ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_feats_train-other-500) . + ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_feats_dev-clean) . + ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_feats_dev-other) . + ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_feats_test-clean) . + ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_feats_test-other) . cd ../.. else log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 3 --stop-stage 3" diff --git a/egs/multi_en/ASR/zipformer/train.py b/egs/multi_en/ASR/zipformer/train.py index bfb6e116c..a42613a27 100755 --- a/egs/multi_en/ASR/zipformer/train.py +++ b/egs/multi_en/ASR/zipformer/train.py @@ -790,7 +790,7 @@ def compute_loss( texts = batch["supervisions"]["text"] y = sp.encode(texts, out_type=int) - y = k2.RaggedTensor(y) + y = k2.RaggedTensor(y).to(device) with torch.set_grad_enabled(is_training): simple_loss, pruned_loss, ctc_loss = model(