Mirror of https://github.com/k2-fsa/icefall.git
WIP: Begin to add RNNLM.
Finish the dataset part.
parent 30c43b7f69
commit 42dcd53361
@@ -38,7 +38,6 @@ def get_args():
        "--lang-dir",
        type=str,
        help="""Input and output directory.
        It should contain the training corpus: transcript_words.txt.
        The generated bpe.model is saved to this directory.
        """,
    )
18  egs/ptb/LM/README.md  Normal file
@@ -0,0 +1,18 @@
## Description

(Note: the experiments here are only about language modeling)

ptb is short for Penn Treebank.


About the Penn Treebank corpus:

- This corpus is free for research purposes
- ptb.train.txt: train set
- ptb.valid.txt: development set (should be used just for tuning hyper-parameters, but not for training)
- ptb.test.txt: test set for reporting perplexity

You can download the dataset from one of the following URLs:

- https://github.com/townie/PTB-dataset-from-Tomas-Mikolov-s-webpage
- http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
- https://deepai.org/dataset/penn-treebank
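For a quick start, the three files can also be fetched with plain Python; this is a minimal sketch assuming the GitHub mirror listed above is reachable (./prepare.sh further below does the same step with wget):

import os
import urllib.request

# Example only: same mirror as used by ./prepare.sh; "download" mirrors
# that script's default $dl_dir.
base = (
    "https://raw.githubusercontent.com/townie/"
    "PTB-dataset-from-Tomas-Mikolov-s-webpage/master/data"
)
os.makedirs("download", exist_ok=True)
for name in ("ptb.train.txt", "ptb.valid.txt", "ptb.test.txt"):
    urllib.request.urlretrieve(f"{base}/{name}", f"download/{name}")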
146  egs/ptb/LM/local/prepare_lm_training_data.py  Executable file
@@ -0,0 +1,146 @@
#!/usr/bin/env python3

# Copyright (c) 2021 Xiaomi Corporation (authors: Daniel Povey
#                                                  Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script takes a `bpe.model` and a text file such as
`download/ptb.train.txt` and outputs the LM training data to a supplied
directory such as data/bpe_500. The format is as follows:

It creates a PyTorch archive (.pt file), say data/lm_training.pt, which is a
representation of a dict with the following format:

  'words' -> a k2.RaggedTensor of two axes [word][token] with dtype
             torch.int32 containing the BPE representations of each word,
             indexed by integer word ID. (These integer word IDs are present
             in 'sentences'.) The sentencepiece object can be used to turn
             the words and BPE units into string form.
  'sentences' -> a k2.RaggedTensor of two axes [sentence][word] with dtype
             torch.int32 containing all the sentences, as word IDs (we don't
             output the string form of this directly, but it can be worked
             out together with 'words' and the bpe.model).
  'sentence_lengths' -> a 1-D torch.Tensor of dtype torch.int32, containing
             the number of BPE tokens of each sentence.
"""

import argparse
import logging
from pathlib import Path

import k2
import sentencepiece as spm
import torch


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bpe-model",
        type=str,
        help="Input BPE model, e.g. data/bpe_500/bpe.model",
    )
    parser.add_argument(
        "--lm-data",
        type=str,
        help="""Input LM training data as text, e.g.
        download/ptb.train.txt""",
    )
    parser.add_argument(
        "--lm-archive",
        type=str,
        help="""Path to output archive, e.g. data/bpe_500/lm_data.pt;
        look at the source of this script to see the format.""",
    )

    return parser.parse_args()


def main():
    args = get_args()

    if Path(args.lm_archive).exists():
        logging.warning(f"{args.lm_archive} exists - skipping")
        return

    sp = spm.SentencePieceProcessor()
    sp.load(args.bpe_model)

    # word2index is a dictionary from words to integer IDs. No need to reserve
    # space for epsilon, etc.; the words are just used as a convenient way to
    # compress the sequences of BPE pieces.
    word2index = dict()

    word2bpe = []  # Will be a list-of-list-of-int, representing BPE pieces.

    # ptb.train.txt has already converted OOV words to <unk>
    word2bpe.append([sp.unk_id()])
    word2index["<unk>"] = 0

    sentences = []  # Will be a list-of-list-of-int, representing word IDs.

    with open(args.lm_data) as f:
        while True:
            line = f.readline()
            if line == "":
                break
            line_words = line.split()
            for w in line_words:
                if w not in word2index:
                    w_bpe = sp.encode(w)
                    word2index[w] = len(word2bpe)
                    word2bpe.append(w_bpe)
            sentences.append([word2index[w] for w in line_words])

    words = k2.ragged.RaggedTensor(word2bpe)
    sentences = k2.ragged.RaggedTensor(sentences)

    output = dict(words=words, sentences=sentences)

    num_sentences = sentences.dim0
    sentence_lengths = [0] * num_sentences
    for i in range(num_sentences):
        word_ids = sentences[i]

        # NOTE: If word_ids is a tensor with only 1 entry,
        # token_ids is a torch.Tensor
        token_ids = words[word_ids]
        if isinstance(token_ids, k2.RaggedTensor):
            token_ids = token_ids.values

        # token_ids is a 1-D tensor containing the BPE tokens
        # of the current sentence

        sentence_lengths[i] = token_ids.numel()

    output["sentence_lengths"] = torch.tensor(
        sentence_lengths, dtype=torch.int32
    )

    torch.save(output, args.lm_archive)
    logging.info(f"Saved to {args.lm_archive}")


if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )

    logging.basicConfig(format=formatter, level=logging.INFO)

    main()
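The 'words'/'sentences' split keeps the archive compact: the BPE pieces of each distinct word are stored once in `words`, and `sentences` stores only word IDs. Below is a minimal sketch of reading an archive back, with example paths; ./local/test_prepare_lm_training_data.py further down uses the same indexing pattern:

import k2
import sentencepiece as spm
import torch

# Example paths; adjust to wherever prepare_lm_training_data.py wrote its output.
data = torch.load("data/bpe_500/lm_data.pt")
words = data["words"]          # [word][token]: BPE pieces of each word
sentences = data["sentences"]  # [sentence][word]: word IDs

sp = spm.SentencePieceProcessor()
sp.load("data/bpe_500/bpe.model")

word_ids = sentences[0]      # word IDs of the first sentence (1-D tensor)
token_ids = words[word_ids]  # ragged, unless the sentence has a single word
if isinstance(token_ids, k2.RaggedTensor):
    token_ids = token_ids.values
print(sp.decode(token_ids.tolist()))  # the first training sentence as text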
143  egs/ptb/LM/local/sort_lm_training_data.py  Executable file
@@ -0,0 +1,143 @@
#!/usr/bin/env python3

# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file takes as input the filename of LM training data
generated by ./local/prepare_lm_training_data.py and sorts
it by sentence length.

Sentence length equals the number of BPE tokens in a sentence.
"""

import argparse
import logging
from pathlib import Path

import k2
import numpy as np
import torch


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--in-lm-data",
        type=str,
        help="Input LM training data, e.g., data/bpe_500/lm_data.pt",
    )

    parser.add_argument(
        "--out-lm-data",
        type=str,
        help="Output LM training data, e.g., data/bpe_500/sorted_lm_data.pt",
    )

    parser.add_argument(
        "--out-statistics",
        type=str,
        help="Statistics about the LM training data, e.g., data/bpe_500/statistics.txt",
    )

    return parser.parse_args()


def main():
    args = get_args()
    in_lm_data = Path(args.in_lm_data)
    out_lm_data = Path(args.out_lm_data)
    assert in_lm_data.is_file(), f"{in_lm_data}"
    if out_lm_data.is_file():
        logging.warning(f"{out_lm_data} exists - skipping")
        return
    data = torch.load(in_lm_data)
    words2bpe = data["words"]
    sentences = data["sentences"]
    sentence_lengths = data["sentence_lengths"]

    num_sentences = sentences.dim0
    assert num_sentences == sentence_lengths.numel(), (
        num_sentences,
        sentence_lengths.numel(),
    )

    indices = torch.argsort(sentence_lengths, descending=True)

    sorted_sentences = sentences[indices.to(torch.int32)]
    sorted_sentence_lengths = sentence_lengths[indices]

    # Check that sentences are ordered by length
    assert num_sentences == sorted_sentences.dim0, (
        num_sentences,
        sorted_sentences.dim0,
    )

    cur = None
    for i in range(num_sentences):
        word_ids = sorted_sentences[i]
        token_ids = words2bpe[word_ids]
        if isinstance(token_ids, k2.RaggedTensor):
            token_ids = token_ids.values
        if cur is not None:
            assert cur >= token_ids.numel(), (cur, token_ids.numel())

        cur = token_ids.numel()
        assert cur == sorted_sentence_lengths[i]

    data["sentences"] = sorted_sentences
    data["sentence_lengths"] = sorted_sentence_lengths
    torch.save(data, args.out_lm_data)
    logging.info(f"Saved to {args.out_lm_data}")

    statistics = Path(args.out_statistics)

    # Write statistics
    num_words = sorted_sentences.numel()
    num_tokens = sentence_lengths.sum().item()
    max_sentence_length = sentence_lengths[indices[0]]
    min_sentence_length = sentence_lengths[indices[-1]]

    step = 10
    hist, bins = np.histogram(
        sentence_lengths.numpy(),
        bins=np.arange(1, max_sentence_length + step, step),
    )

    histogram = np.stack((bins[:-1], hist)).transpose()

    with open(statistics, "w") as f:
        f.write(f"num_sentences: {num_sentences}\n")
        f.write(f"num_words: {num_words}\n")
        f.write(f"num_tokens: {num_tokens}\n")
        f.write(f"max_sentence_length: {max_sentence_length}\n")
        f.write(f"min_sentence_length: {min_sentence_length}\n")
        f.write("histogram:\n")
        f.write(" bin  count  percent\n")
        for row in histogram:
            f.write(
                f"{int(row[0]):>5} {int(row[1]):>5} "
                f"{100.*row[1]/num_sentences:.3f}%\n"
            )


if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )

    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
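A small sketch, with an example path, of verifying the invariant this script enforces, namely that sentence lengths in the sorted archive are non-increasing:

import k2  # the archive stores k2.RaggedTensor objects
import torch

data = torch.load("data/bpe_500/sorted_lm_data.pt")  # example path
lengths = data["sentence_lengths"]

# Descending sort means every length is >= the one that follows it.
assert torch.all(lengths[:-1] >= lengths[1:])
print(lengths.numel(), "sentences; longest:", lengths[0].item(),
      "shortest:", lengths[-1].item())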
62  egs/ptb/LM/local/test_prepare_lm_training_data.py  Executable file
@@ -0,0 +1,62 @@
#!/usr/bin/env python3

# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pathlib import Path

import sentencepiece as spm
import torch


def main():
    lm_training_data = Path("./data/bpe_500/lm_data.pt")
    bpe_model = Path("./data/bpe_500/bpe.model")
    if not lm_training_data.exists():
        logging.warning(f"{lm_training_data} does not exist - skipping")
        return

    if not bpe_model.exists():
        logging.warning(f"{bpe_model} does not exist - skipping")
        return

    sp = spm.SentencePieceProcessor()
    sp.load(str(bpe_model))

    data = torch.load(lm_training_data)
    words2bpe = data["words"]
    sentences = data["sentences"]

    ss = []
    unk = sp.decode(sp.unk_id()).strip()
    for i in range(10):
        s = sp.decode(words2bpe[sentences[i]].values.tolist())
        s = s.replace(unk, "<unk>")
        ss.append(s)

    for s in ss:
        print(s)
    # You can compare the output with the first 10 lines of ptb.train.txt


if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )

    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
95  egs/ptb/LM/local/train_bpe_model.py  Executable file
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# You can install sentencepiece via:
#
#   pip install sentencepiece
#
# Due to an issue reported in
# https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
#
# Please install a version >=0.1.96

import argparse
import shutil
from pathlib import Path

import sentencepiece as spm


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--out-dir",
        type=str,
        help="""Input and output directory.
        The generated bpe.model is saved to this directory.
        """,
    )

    parser.add_argument(
        "--transcript",
        type=str,
        help="Training transcript.",
    )

    parser.add_argument(
        "--vocab-size",
        type=int,
        help="Vocabulary size for BPE training",
    )

    return parser.parse_args()


def main():
    args = get_args()
    vocab_size = args.vocab_size
    model_type = "unigram"

    model_prefix = f"{args.out_dir}/{model_type}_{vocab_size}"
    train_text = args.transcript
    character_coverage = 1.0
    input_sentence_size = 100000000

    user_defined_symbols = ["<blk>", "<sos/eos>"]
    unk_id = len(user_defined_symbols)
    # Note: unk_id is fixed to 2.
    # If you change it, you should also change other
    # places that are using it.

    model_file = Path(model_prefix + ".model")
    if not model_file.is_file():
        spm.SentencePieceTrainer.train(
            input=train_text,
            vocab_size=vocab_size,
            model_type=model_type,
            model_prefix=model_prefix,
            input_sentence_size=input_sentence_size,
            character_coverage=character_coverage,
            user_defined_symbols=user_defined_symbols,
            unk_id=unk_id,
            bos_id=-1,
            eos_id=-1,
        )

    shutil.copyfile(model_file, f"{args.out_dir}/bpe.model")


if __name__ == "__main__":
    main()
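A short sketch of loading the trained model (the path is an example); the first IDs follow from `user_defined_symbols` and `unk_id` above, i.e. 0 is <blk>, 1 is <sos/eos>, and 2 is <unk>:

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/bpe_500/bpe.model")  # example path

print(sp.id_to_piece(0), sp.id_to_piece(1), sp.unk_id())  # <blk> <sos/eos> 2
ids = sp.encode("the quick brown fox")  # a list of piece IDs
print(ids)
print(sp.decode(ids))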
95  egs/ptb/LM/prepare.sh  Executable file
@@ -0,0 +1,95 @@
#!/usr/bin/env bash

set -eou pipefail

nj=15
stage=-1
stop_stage=100

dl_dir=$PWD/download
# The following files will be downloaded to $dl_dir
#  - ptb.train.txt
#  - ptb.valid.txt
#  - ptb.test.txt

. shared/parse_options.sh || exit 1

# Vocabulary sizes for the sentencepiece models.
# It will generate data/bpe_xxx, data/bpe_yyy
# if the array contains xxx, yyy
vocab_sizes=(
  500
  1000
  2000
  5000
)

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
mkdir -p $dl_dir

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  log "Stage -1: Download data"
  if [ ! -f $dl_dir/.complete ]; then
    url=https://raw.githubusercontent.com/townie/PTB-dataset-from-Tomas-Mikolov-s-webpage/master/data/
    wget --no-verbose --directory-prefix $dl_dir $url/ptb.train.txt
    wget --no-verbose --directory-prefix $dl_dir $url/ptb.valid.txt
    wget --no-verbose --directory-prefix $dl_dir $url/ptb.test.txt
    touch $dl_dir/.complete
  fi
fi

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Train BPE model"

  for vocab_size in ${vocab_sizes[@]}; do
    out_dir=data/bpe_${vocab_size}
    mkdir -p $out_dir
    ./local/train_bpe_model.py \
      --out-dir $out_dir \
      --vocab-size $vocab_size \
      --transcript $dl_dir/ptb.train.txt
  done
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Generate LM training data"
  # Note: ptb.train.txt has already been normalized

  for vocab_size in ${vocab_sizes[@]}; do
    out_dir=data/bpe_${vocab_size}
    mkdir -p $out_dir
    ./local/prepare_lm_training_data.py \
      --bpe-model $out_dir/bpe.model \
      --lm-data $dl_dir/ptb.train.txt \
      --lm-archive $out_dir/lm_data.pt
  done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Sort LM training data"
  # Sort the LM training data generated in stage 1
  # by sentence length in descending order
  # for ease of training.
  #
  # Sentence length equals the number of BPE tokens
  # in a sentence.

  for vocab_size in ${vocab_sizes[@]}; do
    out_dir=data/bpe_${vocab_size}
    mkdir -p $out_dir
    ./local/sort_lm_training_data.py \
      --in-lm-data $out_dir/lm_data.pt \
      --out-lm-data $out_dir/sorted_lm_data.pt \
      --out-statistics $out_dir/statistics.txt
  done
fi
0  egs/ptb/LM/rnn_lm/__init__.py  Normal file

260  egs/ptb/LM/rnn_lm/dataset.py  Normal file
@@ -0,0 +1,260 @@
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Tuple

import k2
import torch


class LmDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        sentences: k2.RaggedTensor,
        words: k2.RaggedTensor,
        sentence_lengths: torch.Tensor,
        max_sent_len: int,
        batch_size: int,
    ):
        """
        Args:
          sentences:
            A ragged tensor of dtype torch.int32 with 2 axes [sentence][word].
          words:
            A ragged tensor of dtype torch.int32 with 2 axes [word][token].
          sentence_lengths:
            A 1-D tensor of dtype torch.int32 containing the number of tokens
            of each sentence.
          max_sent_len:
            Maximum sentence length. It is used to change the batch size
            dynamically. In general, we try to keep the product of
            "max_sent_len in a batch" and "num_of_sent in a batch" being
            a constant.
          batch_size:
            The expected batch size. It is changed dynamically according
            to the "max_sent_len".

        See `../local/prepare_lm_training_data.py` for how `sentences` and
        `words` are generated. We assume that `sentences` are sorted by length.
        See `../local/sort_lm_training_data.py`.
        """
        super().__init__()
        self.sentences = sentences
        self.words = words

        sentence_lengths = sentence_lengths.tolist()

        assert batch_size > 0, batch_size
        assert max_sent_len > 1, max_sent_len
        batch_indexes = []
        num_sentences = sentences.dim0
        cur = 0
        while cur < num_sentences:
            sz = sentence_lengths[cur] // max_sent_len + 1
            # Assume the current sentence has 3 * max_sent_len tokens.
            # In the worst case, the subsequent sentences also have that
            # many tokens, so we reduce the batch size to keep this batch
            # from containing too many tokens.
            actual_batch_size = batch_size // sz + 1
            actual_batch_size = min(actual_batch_size, batch_size)
            end = cur + actual_batch_size
            end = min(end, num_sentences)
            this_batch_indexes = torch.arange(cur, end).tolist()
            batch_indexes.append(this_batch_indexes)
            cur = end
        assert batch_indexes[-1][-1] == num_sentences - 1

        self.batch_indexes = k2.RaggedTensor(batch_indexes)

    def __len__(self) -> int:
        """Return the number of batches in this dataset"""
        return self.batch_indexes.dim0

    def __getitem__(self, i: int) -> k2.RaggedTensor:
        """Get the i'th batch in this dataset.

        Return a ragged tensor with 2 axes [sentence][token].
        """
        assert 0 <= i < len(self), i

        # indexes is a 1-D tensor containing sentence indexes
        indexes = self.batch_indexes[i]

        # sentence_words is a ragged tensor with 2 axes
        # [sentence][word]
        sentence_words = self.sentences[indexes]

        # In case indexes contains only 1 entry, the returned
        # sentence_words is a 1-D tensor; we have to convert
        # it to a ragged tensor
        if isinstance(sentence_words, torch.Tensor):
            sentence_words = k2.RaggedTensor(sentence_words.unsqueeze(0))

        # sentence_word_tokens is a ragged tensor with 3 axes
        # [sentence][word][token]
        sentence_word_tokens = self.words.index(sentence_words)
        assert sentence_word_tokens.num_axes == 3

        sentence_tokens = sentence_word_tokens.remove_axis(1)
        return sentence_tokens


def concat(
    ragged: k2.RaggedTensor, value: int, direction: str
) -> k2.RaggedTensor:
    """Prepend a value to the beginning of each sublist or append a value
    to the end of each sublist.

    Args:
      ragged:
        A ragged tensor with two axes.
      value:
        The value to prepend or append.
      direction:
        It can be either "left" or "right". If it is "left", we
        prepend the value to the beginning of each sublist;
        if it is "right", we append the value to the end of each
        sublist.

    Returns:
      Return a new ragged tensor, whose sublists either start with
      or end with the given value.

    >>> a = k2.RaggedTensor([[1, 3], [5]])
    >>> a
    [ [ 1 3 ] [ 5 ] ]
    >>> concat(a, value=0, direction="left")
    [ [ 0 1 3 ] [ 0 5 ] ]
    >>> concat(a, value=0, direction="right")
    [ [ 1 3 0 ] [ 5 0 ] ]

    """
    dtype = ragged.dtype
    device = ragged.device

    assert ragged.num_axes == 2, f"num_axes: {ragged.num_axes}"
    pad_values = torch.full(
        size=(ragged.tot_size(0), 1),
        fill_value=value,
        device=device,
        dtype=dtype,
    )
    pad = k2.RaggedTensor(pad_values)

    if direction == "left":
        ans = k2.ragged.cat([pad, ragged], axis=1)
    elif direction == "right":
        ans = k2.ragged.cat([ragged, pad], axis=1)
    else:
        raise ValueError(
            f"Unsupported direction: {direction}. "
            'Expect either "left" or "right"'
        )
    return ans


def add_sos(ragged: k2.RaggedTensor, sos_id: int) -> k2.RaggedTensor:
    """Add SOS to each sublist.

    Args:
      ragged:
        A ragged tensor with two axes.
      sos_id:
        The ID of the SOS symbol.

    Returns:
      Return a new ragged tensor, where each sublist starts with SOS.

    >>> a = k2.RaggedTensor([[1, 3], [5]])
    >>> a
    [ [ 1 3 ] [ 5 ] ]
    >>> add_sos(a, sos_id=0)
    [ [ 0 1 3 ] [ 0 5 ] ]

    """
    return concat(ragged, sos_id, direction="left")


def add_eos(ragged: k2.RaggedTensor, eos_id: int) -> k2.RaggedTensor:
    """Add EOS to each sublist.

    Args:
      ragged:
        A ragged tensor with two axes.
      eos_id:
        The ID of the EOS symbol.

    Returns:
      Return a new ragged tensor, where each sublist ends with EOS.

    >>> a = k2.RaggedTensor([[1, 3], [5]])
    >>> a
    [ [ 1 3 ] [ 5 ] ]
    >>> add_eos(a, eos_id=0)
    [ [ 1 3 0 ] [ 5 0 ] ]

    """
    return concat(ragged, eos_id, direction="right")


class LmDatasetCollate:
    def __init__(self, sos_id: int, eos_id: int, blank_id: int):
        """
        Args:
          sos_id:
            Token ID of the SOS symbol.
          eos_id:
            Token ID of the EOS symbol.
          blank_id:
            Token ID of the blank symbol.
        """
        self.sos_id = sos_id
        self.eos_id = eos_id
        self.blank_id = blank_id

    def __call__(
        self, batch: List[k2.RaggedTensor]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return a tuple containing 3 tensors:

        - x, a 2-D tensor of dtype torch.int32; each row contains tokens
          for a sentence starting with `self.sos_id`. It is padded to
          the max sentence length with `self.blank_id`.

        - y, a 2-D tensor of dtype torch.int32; each row contains tokens
          for a sentence ending with `self.eos_id` before padding.
          Then it is padded to the max sentence length with
          `self.blank_id`.

        - lengths, a 1-D tensor of dtype torch.int32, containing the number
          of tokens of each sentence before padding.
        """
        # The batching stuff has already been done in LmDataset
        assert len(batch) == 1
        sentence_tokens = batch[0]
        row_splits = sentence_tokens.shape.row_splits(1)
        sentence_token_lengths = row_splits[1:] - row_splits[:-1]
        sentence_tokens_with_sos = add_sos(sentence_tokens, self.sos_id)
        sentence_tokens_with_eos = add_eos(sentence_tokens, self.eos_id)

        x = sentence_tokens_with_sos.pad(
            mode="constant", padding_value=self.blank_id
        )
        y = sentence_tokens_with_eos.pad(
            mode="constant", padding_value=self.blank_id
        )
        sentence_token_lengths += 1  # plus 1 since we added a SOS

        return x, y, sentence_token_lengths
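The batch-size rule in `LmDataset.__init__` keeps the product of the longest sentence length in a batch and the number of sentences in that batch roughly constant; since the data is sorted longest-first, the first sentence of a batch bounds the rest. A plain-Python sketch of that arithmetic, using the same toy values as the tests below (max_sent_len=3, batch_size=4):

max_sent_len = 3
batch_size = 4

def dynamic_batch_size(first_sentence_length: int) -> int:
    # Same arithmetic as LmDataset.__init__; first_sentence_length is the
    # longest sentence in the batch because sentences are sorted by length.
    sz = first_sentence_length // max_sent_len + 1
    return min(batch_size // sz + 1, batch_size)

print(dynamic_batch_size(2))  # short sentences -> full batch of 4
print(dynamic_batch_size(9))  # 3 * max_sent_len tokens -> batch of 2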
74  egs/ptb/LM/rnn_lm/test_dataset.py  Executable file
@@ -0,0 +1,74 @@
#!/usr/bin/env python3
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import k2
import torch
from rnn_lm.dataset import LmDataset, LmDatasetCollate


def main():
    sentences = k2.RaggedTensor(
        [[0, 1, 2], [1, 0, 1], [0, 1], [1, 3, 0, 2, 0], [3], [0, 2, 1]]
    )
    words = k2.RaggedTensor([[3, 6], [2, 8, 9, 3], [5], [5, 6, 7, 8, 9]])

    num_sentences = sentences.dim0

    sentence_lengths = [0] * num_sentences
    for i in range(num_sentences):
        word_ids = sentences[i]

        # NOTE: If word_ids is a tensor with only 1 entry,
        # token_ids is a torch.Tensor
        token_ids = words[word_ids]
        if isinstance(token_ids, k2.RaggedTensor):
            token_ids = token_ids.values

        # token_ids is a 1-D tensor containing the BPE tokens
        # of the current sentence

        sentence_lengths[i] = token_ids.numel()

    sentence_lengths = torch.tensor(sentence_lengths, dtype=torch.int32)

    indices = torch.argsort(sentence_lengths, descending=True)
    sentences = sentences[indices.to(torch.int32)]
    sentence_lengths = sentence_lengths[indices]

    dataset = LmDataset(
        sentences=sentences,
        words=words,
        sentence_lengths=sentence_lengths,
        max_sent_len=3,
        batch_size=4,
    )
    print(dataset.sentences)
    print(dataset.words)
    print(dataset.batch_indexes)
    print(len(dataset))
    collate_fn = LmDatasetCollate(sos_id=1, eos_id=-1, blank_id=0)
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=1, collate_fn=collate_fn
    )

    for i in dataloader:
        print(i)
    # I've checked the output manually; the output is as expected.


if __name__ == "__main__":
    main()
103  egs/ptb/LM/rnn_lm/test_dataset_ddp.py  Executable file
@@ -0,0 +1,103 @@
#!/usr/bin/env python3
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import k2
import torch
import torch.multiprocessing as mp
from rnn_lm.dataset import LmDataset, LmDatasetCollate
from torch import distributed as dist


def generate_data():
    sentences = k2.RaggedTensor(
        [[0, 1, 2], [1, 0, 1], [0, 1], [1, 3, 0, 2, 0], [3], [0, 2, 1]]
    )
    words = k2.RaggedTensor([[3, 6], [2, 8, 9, 3], [5], [5, 6, 7, 8, 9]])

    num_sentences = sentences.dim0

    sentence_lengths = [0] * num_sentences
    for i in range(num_sentences):
        word_ids = sentences[i]

        # NOTE: If word_ids is a tensor with only 1 entry,
        # token_ids is a torch.Tensor
        token_ids = words[word_ids]
        if isinstance(token_ids, k2.RaggedTensor):
            token_ids = token_ids.values

        # token_ids is a 1-D tensor containing the BPE tokens
        # of the current sentence

        sentence_lengths[i] = token_ids.numel()

    sentence_lengths = torch.tensor(sentence_lengths, dtype=torch.int32)

    indices = torch.argsort(sentence_lengths, descending=True)
    sentences = sentences[indices.to(torch.int32)]
    sentence_lengths = sentence_lengths[indices]

    return sentences, words, sentence_lengths


def run(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12352"

    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    sentences, words, sentence_lengths = generate_data()

    dataset = LmDataset(
        sentences=sentences,
        words=words,
        sentence_lengths=sentence_lengths,
        max_sent_len=3,
        batch_size=4,
    )
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, shuffle=True, drop_last=False
    )

    collate_fn = LmDatasetCollate(sos_id=1, eos_id=-1, blank_id=0)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=1,
        collate_fn=collate_fn,
        sampler=sampler,
        shuffle=False,
    )

    for i in dataloader:
        print(f"rank: {rank}", i)

    dist.destroy_process_group()


def main():
    world_size = 2
    mp.spawn(run, args=(world_size,), nprocs=world_size, join=True)


torch.set_num_threads(1)
torch.set_num_interop_threads(1)

if __name__ == "__main__":
    main()
1  egs/ptb/LM/shared  Symbolic link
@@ -0,0 +1 @@
../../../icefall/shared/