From f0264bed1b0b2e5d8b7e890e125ed8ca4059cc6c Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Thu, 4 May 2023 16:18:31 +0800
Subject: [PATCH] Fix DDP issue; Change configurations, reducing subsampling
 factor; increase sequence length.

---
 egs/libriheavy/LM/zipformer1/chunk_decoder.py |  2 +-
 egs/libriheavy/LM/zipformer1/lm_datamodule.py | 15 ++++++++++++---
 egs/libriheavy/LM/zipformer1/train.py         | 19 +++++++++----------
 3 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/egs/libriheavy/LM/zipformer1/chunk_decoder.py b/egs/libriheavy/LM/zipformer1/chunk_decoder.py
index 823df602e..7b0827796 100644
--- a/egs/libriheavy/LM/zipformer1/chunk_decoder.py
+++ b/egs/libriheavy/LM/zipformer1/chunk_decoder.py
@@ -119,6 +119,6 @@ class ChunkDecoder(nn.Module):
             # occasionally print out average logprob per position in the chunk.
             l = logprobs.reshape(batch_size, num_chunks, chunk_size).mean(dim=(0, 1))
             l = l.to('cpu').tolist()
-            logging.info(l"Logprobs per position in chunk: {l}")
+            logging.info(f"Logprobs per position in chunk: {l}")
 
         return logprobs
 
diff --git a/egs/libriheavy/LM/zipformer1/lm_datamodule.py b/egs/libriheavy/LM/zipformer1/lm_datamodule.py
index 8a269a179..6b91a1dca 100644
--- a/egs/libriheavy/LM/zipformer1/lm_datamodule.py
+++ b/egs/libriheavy/LM/zipformer1/lm_datamodule.py
@@ -37,7 +37,10 @@ from icefall.utils import str2bool
 class LmDataset(torch.utils.data.IterableDataset):
     def __init__(self,
                  file_list_fn: Path,
-                 bytes_per_segment: int = 200):
+                 bytes_per_segment: int = 200,
+                 world_size: int = 1,
+                 rank: int = 0,
+                 ):
         """
         Initialize LmDataset object.  Args:
           file_list_fn: a file in which each line contains: a number of bytes, then a space, then a filename.
@@ -48,6 +51,7 @@
         self.files = []
         self.num_bytes = []
         self.bytes_per_segment = bytes_per_segment
+        self.ddp_rank = get_rank()
 
         num_bytes = []
         with open(file_list_fn) as f:
@@ -64,18 +68,23 @@
         worker_info = torch.utils.data.get_worker_info()
         num_workers = (1 if worker_info is None else worker_info.num_workers)
+        # world_size is for DDP training, num_workers for data-loader worker processes.
         tot_workers = num_workers * get_world_size()
+
         self.num_segments = tot_bytes // (bytes_per_segment * tot_workers)
 
     def __iter__(self):
         worker_info = torch.utils.data.get_worker_info()
         # id includes both worker (within training job) and rank of training job
-        my_id = (0 if worker_info is None else worker_info.id) + 1000 * get_rank()
+        my_id = (0 if worker_info is None else worker_info.id) + 1000 * self.ddp_rank
         seed = random.randint(0, 10000) + my_id
+        # the next line is a workaround: for some reason, when we ran with --world-size more than 1,
+        # this info message was not printed out.
+        logging.getLogger().setLevel(logging.INFO)
+        logging.info(f"my_id={my_id}, seed={seed}, num_segments={self.num_segments}")
         rng = np.random.default_rng(seed=seed)
         for n in range(self.num_segments):
             # np.random.multinomial / np.random.Generator.multinomial has an interface
diff --git a/egs/libriheavy/LM/zipformer1/train.py b/egs/libriheavy/LM/zipformer1/train.py
index 8f7abed90..e6ddae4e8 100755
--- a/egs/libriheavy/LM/zipformer1/train.py
+++ b/egs/libriheavy/LM/zipformer1/train.py
@@ -121,7 +121,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--num-encoder-layers",
         type=str,
-        default="2,4,5,6",
+        default="2,4,8",
         help="Number of zipformer encoder layers per stack, comma separated.",
     )
 
@@ -129,7 +129,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--downsampling-factor",
         type=str,
-        default="1,2,4,8",
+        default="1,2,4",
         help="Downsampling factor for each stack of encoder layers.",
     )
 
@@ -137,21 +137,21 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--feedforward-dim",
         type=str,
-        default="512,768,1024,1536",
+        default="768,1024,1536",
         help="Feedforward dimension of the zipformer encoder layers, per stack, comma separated.",
     )
 
     parser.add_argument(
         "--num-heads",
         type=str,
-        default="4,4,6,8",
+        default="4,4,8",
         help="Number of attention heads in the zipformer encoder layers: a single int or comma-separated list.",
     )
 
     parser.add_argument(
         "--encoder-dim",
         type=str,
-        default="192,256,384,512",
+        default="256,384,512",
         help="Embedding dimension in encoder stacks: a single int or comma-separated list."
     )
 
@@ -186,7 +186,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--encoder-unmasked-dim",
         type=str,
-        default="192,192,256,256",
+        default="192,192,256",
         help="Unmasked dimensions in the encoders, relates to augmentation during training. "
         "A single int or comma-separated list. Must be <= each corresponding encoder_dim."
     )
 
@@ -194,7 +194,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--cnn-module-kernel",
         type=str,
-        default="31,31,15,15",
+        default="31,31,15",
         help="Sizes of convolutional kernels in convolution modules in each encoder stack: "
         "a single int or comma-separated list.",
     )
 
@@ -214,7 +214,6 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     )
 
 
-
 def get_parser():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -481,8 +480,8 @@ def get_params() -> AttributeDict:
             "valid_interval": 3000,
             "warm_step": 2000,
             "env_info": get_env_info(),
-            "bytes_per_segment": 1024,
-            "batch_size": 64,
+            "bytes_per_segment": 2048,
+            "batch_size": 40,
             "train_file_list": "train.txt",
             "valid_file_list": "valid.txt",
             "num_workers": 4,
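
Note on the DDP fix in lm_datamodule.py (commentary, not part of the patch): moving the
get_rank() call from __iter__ into __init__ matters because __init__ runs in the trainer
process, where the torch.distributed process group is initialized, whereas __iter__ runs
inside DataLoader worker processes, where it may not be; a lazy get_rank() there can
silently fall back to rank 0 on every GPU, so all ranks would draw identically-seeded
segment streams. Below is a minimal, self-contained sketch of the same pattern;
RankAwareDataset and its parameters are hypothetical, and get_rank() is a simplified
stand-in for the icefall helper:

import torch
import torch.distributed as dist


def get_rank() -> int:
    # Simplified stand-in (assumption, not the icefall implementation): falls
    # back to rank 0 when no process group is initialized, which is the
    # situation inside DataLoader worker processes.
    return dist.get_rank() if dist.is_available() and dist.is_initialized() else 0


class RankAwareDataset(torch.utils.data.IterableDataset):
    def __init__(self, num_segments: int = 10):
        self.num_segments = num_segments
        # Capture the rank eagerly, in the trainer process; the value is then
        # pickled into each DataLoader worker along with the dataset object.
        self.ddp_rank = get_rank()

    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        worker_id = 0 if worker_info is None else worker_info.id
        # Combine worker id with the cached rank so every (rank, worker) pair
        # gets a distinct stream id, mirroring the patch.
        my_id = worker_id + 1000 * self.ddp_rank
        for n in range(self.num_segments):
            yield my_id, n

For example, with two DataLoader workers per job and two DDP ranks, the stream ids come
out as 0 and 1 on rank 0 and 1000 and 1001 on rank 1, so no two (rank, worker) pairs
collide; without the cached rank, both GPUs would reuse ids 0 and 1.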