#!/usr/bin/env python3
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

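"""Test the data pipeline in rnn_lm.dataset (LmDataset and LmDatasetCollate)
with multiple processes driven by torch.distributed.

Two worker processes are spawned over the NCCL backend; each one builds a
tiny in-memory dataset and prints the batches assigned to its rank, so at
least two CUDA devices are required to run this script.
"""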
import os

import k2
import torch
import torch.multiprocessing as mp
from rnn_lm.dataset import LmDataset, LmDatasetCollate
from torch import distributed as dist


def generate_data():
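    """Build a small in-memory dataset for testing.

    Returns a tuple (sentences, words, sentence_lengths) with the sentences
    sorted by their length in tokens, longest first.
    """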
    sentences = k2.RaggedTensor(
        [[0, 1, 2], [1, 0, 1], [0, 1], [1, 3, 0, 2, 0], [3], [0, 2, 1]]
    )
    words = k2.RaggedTensor([[3, 6], [2, 8, 9, 3], [5], [5, 6, 7, 8, 9]])
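    # `sentences` holds the word IDs of each sentence, while `words` maps a
    # word ID to its BPE token IDs; indexing `words` with a sentence's word
    # IDs therefore yields that sentence's token sequence.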

    num_sentences = sentences.dim0

    sentence_lengths = [0] * num_sentences
    for i in range(num_sentences):
        word_ids = sentences[i]

        # NOTE: If word_ids is a tensor with only 1 entry,
        # token_ids is a torch.Tensor
        token_ids = words[word_ids]
        if isinstance(token_ids, k2.RaggedTensor):
            token_ids = token_ids.values

        # token_ids is a 1-D tensor containing the BPE tokens
        # of the current sentence

        sentence_lengths[i] = token_ids.numel()

    sentence_lengths = torch.tensor(sentence_lengths, dtype=torch.int32)

    indices = torch.argsort(sentence_lengths, descending=True)
    sentences = sentences[indices.to(torch.int32)]
    sentence_lengths = sentence_lengths[indices]

    return sentences, words, sentence_lengths


def run(rank, world_size):
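    """Entry point of each spawned worker process.

    Initializes the NCCL process group for this rank, builds the dataset,
    sampler, and dataloader, prints every batch this rank receives, and
    finally destroys the process group.
    """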
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12352"

    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    sentences, words, sentence_lengths = generate_data()

    dataset = LmDataset(
        sentences=sentences,
        words=words,
        sentence_lengths=sentence_lengths,
        max_sent_len=3,
        batch_size=4,
    )
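    # DistributedSampler splits the dataset indices across the ranks so that
    # each process works on its own shard; with drop_last=False the index
    # list is padded so every rank gets the same number of items.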
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, shuffle=True, drop_last=False
    )

    collate_fn = LmDatasetCollate(sos_id=1, eos_id=-1, blank_id=0)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=1,
        collate_fn=collate_fn,
        sampler=sampler,
        shuffle=False,
    )
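    # NOTE: shuffle must be False here because a custom sampler is given;
    # PyTorch's DataLoader rejects shuffle=True together with a sampler, and
    # shuffling is already handled by the DistributedSampler. The DataLoader
    # batch_size is kept at 1 since LmDataset appears to group sentences into
    # batches internally (batch_size=4 above).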

    for i in dataloader:
        print(f"rank: {rank}", i)

    dist.destroy_process_group()


def main():
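    """Spawn world_size worker processes and run `run()` in each of them."""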
    world_size = 2
    mp.spawn(run, args=(world_size,), nprocs=world_size, join=True)


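# These calls execute at import time, so they also take effect in the worker
# processes spawned by mp.spawn() (which re-import this module), limiting
# each process to a single intra-op and inter-op thread.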
torch.set_num_threads(1)
torch.set_num_interop_threads(1)

if __name__ == "__main__":
    main()