diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
new file mode 100644
index 000000000..159147a3e
--- /dev/null
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -0,0 +1,23 @@
+## Results
+
+### LibriSpeech BPE training results (Conformer-CTC)
+#### 2021-08-19
+(Wei Kang): Result of https://github.com/k2-fsa/icefall/pull/13
+
+TensorBoard log is available at https://tensorboard.dev/experiment/GnRzq8WWQW62dK4bklXBTg/#scalars
+
+Pretrained model is available at https://huggingface.co/pkufool/conformer_ctc
+
+The best decoding results (WER) are listed below. We obtained these results by averaging the models from epoch 15 to 34 and using the `attention-decoder` decoding method with `num_paths` set to 100.
+
+||test-clean|test-other|
+|--|--|--|
+|WER| 2.57% | 5.94% |
+
+To get more unique paths, we scaled `lattice.scores` by 0.5 (see https://github.com/k2-fsa/icefall/pull/10#discussion_r690951662 for more details). We searched over `lm_score_scale` and `attention_score_scale` for the best results; the scales that produced the WER above are listed below.
+
+||lm_scale|attention_scale|
+|--|--|--|
+|test-clean|1.3|1.2|
+|test-other|1.2|1.1|
+
diff --git a/egs/librispeech/ASR/conformer_ctc/decode.py b/egs/librispeech/ASR/conformer_ctc/decode.py
index 604ac005e..c17a8b284 100755
--- a/egs/librispeech/ASR/conformer_ctc/decode.py
+++ b/egs/librispeech/ASR/conformer_ctc/decode.py
@@ -317,6 +317,7 @@ def decode_dataset(
     results = []

     num_cuts = 0
+    tot_num_batches = len(dl)

     results = defaultdict(list)
     for batch_idx, batch in enumerate(dl):
@@ -346,6 +347,8 @@
         if batch_idx % 100 == 0:
             logging.info(
+                f"batch {batch_idx}/{tot_num_batches}, cuts processed until now is "
+                f"{num_cuts}"
                 f"batch {batch_idx}, cuts processed until now is {num_cuts}"
             )

     return results
@@ -406,7 +409,7 @@ def main():
     params = get_params()
     params.update(vars(args))

-    setup_logger(f"{params.exp_dir}/log/log-decode")
+    setup_logger(f"{params.exp_dir}/log-{params.method}/log-decode")
     logging.info("Decoding started")
     logging.info(params)

diff --git a/egs/librispeech/ASR/conformer_ctc/train.py b/egs/librispeech/ASR/conformer_ctc/train.py
index 645757ebc..36464439b 100755
--- a/egs/librispeech/ASR/conformer_ctc/train.py
+++ b/egs/librispeech/ASR/conformer_ctc/train.py
@@ -16,6 +16,7 @@ import torch.nn as nn
 from conformer import Conformer
 from lhotse.utils import fix_random_seed
 from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.nn.utils import clip_grad_value_
 from torch.nn.utils import clip_grad_norm_
 from torch.utils.tensorboard import SummaryWriter
 from transformer import Noam
@@ -145,7 +146,6 @@ def get_params() -> AttributeDict:
             "beam_size": 10,
             "reduction": "sum",
             "use_double_scores": True,
-            # "accum_grad": 1,
             "att_rate": 0.7,
             "attention_dim": 512,

@@ -463,7 +463,7 @@ def train_one_epoch(

         optimizer.zero_grad()
         loss.backward()
-        clip_grad_norm_(model.parameters(), 5.0, 2.0)
+        clip_grad_value_(model.parameters(), 5.0)
         optimizer.step()

         loss_cpu = loss.detach().cpu().item()
diff --git a/icefall/dataset/asr_datamodule.py b/icefall/dataset/asr_datamodule.py
index aae7af9ce..73eef9c31 100644
--- a/icefall/dataset/asr_datamodule.py
+++ b/icefall/dataset/asr_datamodule.py
@@ -171,6 +171,8 @@ class AsrDataModule(DataModule):
                 max_duration=self.args.max_duration,
                 shuffle=True,
                 num_buckets=self.args.num_buckets,
+                bucket_method='equal_duration',
+                drop_last=True,
             )
         else:
             logging.info("Using SingleCutSampler.")
@@ -184,8 +186,8 @@ class AsrDataModule(DataModule):
             train,
             sampler=train_sampler,
             batch_size=None,
-            num_workers=4,
-            persistent_workers=True,
+            num_workers=2,
+            persistent_workers=False,
         )

         return train_dl
@@ -214,7 +216,7 @@ class AsrDataModule(DataModule):
             sampler=valid_sampler,
             batch_size=None,
             num_workers=2,
-            persistent_workers=True,
+            persistent_workers=False,
         )

         return valid_dl
diff --git a/icefall/decode.py b/icefall/decode.py
index 8c1eef530..49d642f1c 100644
--- a/icefall/decode.py
+++ b/icefall/decode.py
@@ -750,7 +750,7 @@ def rescore_with_attention_decoder(
     # Since k2.ragged.unique_sequences will reorder paths within a seq,
     # `new2old` is a 1-D torch.Tensor mapping from the output path index
     # to the input path index.
-    # new2old.numel() == unique_word_seqs.tot_size(1)
+    # new2old.numel() == unique_word_seq.tot_size(1)
     unique_word_seq, num_repeats, new2old = k2.ragged.unique_sequences(
         word_seq, need_num_repeats=True, need_new2old_indexes=True
     )
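
As a side note for readers of this patch: the `conformer_ctc/train.py` hunk above swaps norm-based gradient clipping for value-based clipping. Below is a minimal, illustrative PyTorch sketch (not part of the patch; the toy model and loss are made up, and only the 5.0 threshold mirrors the diff) of the difference between the two calls: `clip_grad_norm_` rescales the whole gradient so that its global L2 norm is at most `max_norm`, while `clip_grad_value_` clamps each gradient element independently into `[-clip_value, clip_value]`.

```python
import torch
from torch.nn.utils import clip_grad_norm_, clip_grad_value_

# Toy model and loss, only to produce some gradients.
model = torch.nn.Linear(4, 2)
loss = model(torch.randn(8, 4)).pow(2).sum()
loss.backward()

# Old behaviour (removed in the diff): rescale all gradients together so that
# their combined L2 norm is at most 5.0; relative directions are preserved.
# clip_grad_norm_(model.parameters(), max_norm=5.0, norm_type=2.0)

# New behaviour (added in the diff): clamp every gradient entry independently
# into [-5.0, 5.0]; large individual components are truncated.
clip_grad_value_(model.parameters(), clip_value=5.0)
```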