Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-09 18:12:19 +00:00)
Commit e34f2dbb2a: merge change to remove bilingual param with new multidataset_datamodule
@@ -25,7 +25,6 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3"
 
 # For non-streaming model training:
 ./zipformer/train.py \
-  --bilingual 1 \
   --world-size 4 \
   --num-epochs 30 \
   --start-epoch 1 \
@@ -35,7 +34,6 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3"
 
 # For streaming model training:
 ./zipformer/train.py \
-  --bilingual 1 \
   --world-size 4 \
   --num-epochs 30 \
   --start-epoch 1 \
@@ -50,6 +48,7 @@ It supports training with:
 - transducer loss & ctc loss, with `--use-transducer True --use-ctc True`
 """
 
+
 import argparse
 import copy
 import logging
@@ -77,7 +76,6 @@ from multi_dataset import MultiDataset
 from optim import Eden, ScaledAdam
 from scaling import ScheduledFloat
 from subsampling import Conv2dSubsampling
-from tokenizer import Tokenizer
 from torch import Tensor
 from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
@@ -269,13 +267,6 @@ def get_parser():
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
 
-    parser.add_argument(
-        "--bilingual",
-        type=str2bool,
-        default=True,
-        help="Whether the model is bilingual or not. 1 = bilingual.",
-    )
-
     parser.add_argument(
         "--world-size",
         type=int,
@@ -333,7 +324,6 @@ def get_parser():
         """,
     )
 
-    # changed - not used in monolingual streaming
     parser.add_argument(
         "--bpe-model",
         type=str,
@@ -763,11 +753,9 @@ def save_checkpoint(
         copyfile(src=filename, dst=best_valid_filename)
 
 
-# fix implementation for sentencepiece_processor: spm.SentencePieceProcessor, stuff
 def compute_loss(
     params: AttributeDict,
     model: Union[nn.Module, DDP],
-    tokenizer: Tokenizer,
     sentencepiece_processor: spm.SentencePieceProcessor,
     batch: dict,
     is_training: bool,
@@ -803,11 +791,7 @@ def compute_loss(
     warm_step = params.warm_step
 
     texts = batch["supervisions"]["text"]
-    if not params.bilingual:
-        assert NotImplementedError("only bilingual training has been implemented")
-        # y = tokenizer.encode(texts, out_type=int)
-    else:
-        y = sentencepiece_processor.encode(texts, out_type=int)
+    y = sentencepiece_processor.encode(texts, out_type=int)
     y = k2.RaggedTensor(y)
 
     with torch.set_grad_enabled(is_training):
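For orientation, the text-encoding path in compute_loss now reads roughly as follows (a sketch assuming the surrounding lines are unchanged): the bilingual/monolingual branch is gone and supervision texts are always encoded with the SentencePiece model.

    # inside compute_loss(); sentencepiece_processor is the loaded BPE model
    texts = batch["supervisions"]["text"]
    y = sentencepiece_processor.encode(texts, out_type=int)
    y = k2.RaggedTensor(y)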
@@ -863,7 +847,6 @@ def compute_loss(
 def compute_validation_loss(
     params: AttributeDict,
     model: Union[nn.Module, DDP],
-    tokenizer: Tokenizer,
     sentencepiece_processor: spm.SentencePieceProcessor,
     valid_dl: torch.utils.data.DataLoader,
     world_size: int = 1,
@@ -877,7 +860,6 @@ def compute_validation_loss(
         loss, loss_info = compute_loss(
             params=params,
             model=model,
-            tokenizer=tokenizer,
             sentencepiece_processor=sentencepiece_processor,
             batch=batch,
             is_training=False,
@@ -901,7 +883,6 @@ def train_one_epoch(
     model: Union[nn.Module, DDP],
     optimizer: torch.optim.Optimizer,
     scheduler: LRSchedulerType,
-    tokenizer: Tokenizer,
     sentencepiece_processor: spm.SentencePieceProcessor,
     train_dl: torch.utils.data.DataLoader,
     valid_dl: torch.utils.data.DataLoader,
@@ -973,7 +954,6 @@ def train_one_epoch(
                 loss, loss_info = compute_loss(
                     params=params,
                     model=model,
-                    tokenizer=tokenizer,
                     sentencepiece_processor=sentencepiece_processor,
                     batch=batch,
                     is_training=True,
@@ -994,7 +974,6 @@ def train_one_epoch(
             display_and_save_batch(
                 batch,
                 params=params,
-                tokenizer=tokenizer,
                 sentencepiece_processor=sentencepiece_processor,
             )
             raise
@@ -1083,7 +1062,6 @@ def train_one_epoch(
             valid_info = compute_validation_loss(
                 params=params,
                 model=model,
-                tokenizer=tokenizer,
                 sentencepiece_processor=sentencepiece_processor,
                 valid_dl=valid_dl,
                 world_size=world_size,
@@ -1137,26 +1115,12 @@ def run(rank, world_size, args):
     device = torch.device("cuda", rank)
     logging.info(f"Device: {device}")
 
-    # Use lang_dir for further operations
-    # tokenizer = Tokenizer.load(args.lang, args.lang_type)
-
-    # sentencepiece_processor = spm.SentencePieceProcessor()
-    # sentencepiece_processor.load(params.bpe_model)
-    tokenizer = None
-    sentencepiece_processor = None
-
+    sentencepiece_processor = spm.SentencePieceProcessor()
+    sentencepiece_processor.load(params.bpe_model)
 
     # <blk> is defined in local/prepare_lang_char.py
-    if not params.bilingual:
-        assert NotImplementedError("only bilingual training has been implemented")
-        # tokenizer = Tokenizer.load(args.lang, args.lang_type)
-        # params.blank_id = tokenizer.piece_to_id("<blk>")
-        # params.vocab_size = tokenizer.get_piece_size()
-    else:
-        sentencepiece_processor = spm.SentencePieceProcessor()
-        sentencepiece_processor.load(params.bpe_model)
-        params.blank_id = sentencepiece_processor.piece_to_id("<blk>")
-        params.vocab_size = sentencepiece_processor.get_piece_size()
+    params.blank_id = sentencepiece_processor.piece_to_id("<blk>")
+    params.vocab_size = sentencepiece_processor.get_piece_size()
 
     if not params.use_transducer:
         params.ctc_loss_scale = 1.0
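After this hunk the tokenizer setup in run() is unconditional. A sketch of the resulting block, assuming nothing else in it changed:

    # run(): load the BPE model once and derive blank id / vocab size from it
    sentencepiece_processor = spm.SentencePieceProcessor()
    sentencepiece_processor.load(params.bpe_model)

    # <blk> is defined in local/prepare_lang_char.py
    params.blank_id = sentencepiece_processor.piece_to_id("<blk>")
    params.vocab_size = sentencepiece_processor.get_piece_size()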
@@ -1215,27 +1179,25 @@ def run(rank, world_size, args):
     register_inf_check_hooks(model)
 
     multidataset_datamodule = MultiDatasetAsrDataModule(args)
-    if params.bilingual:
-        multi_dataset = MultiDataset(args)
-        train_cuts = multi_dataset.train_cuts()
-    else:
-        assert NotImplementedError("only bilingual training has been implemented")
-        # train_cuts = reazonspeech_corpus.train_cuts()
+    multi_dataset = MultiDataset(args)
+    train_cuts = multi_dataset.train_cuts()
 
     def remove_short_and_long_utt(c: Cut):
-        # Keep only utterances with duration between 1 second and 20 seconds
+        # Keep only utterances with duration between 1 second and 30 seconds
         #
-        # Caution: There is a reason to select 20.0 here. Please see
+        # Caution: There is a reason to select 30.0 here. Please see
         # ../local/display_manifest_statistics.py
         #
         # You should use ../local/display_manifest_statistics.py to get
         # an utterance duration distribution for your dataset to select
         # the threshold
-        # if c.duration < 1.0 or c.duration > 30.0:
-        #     logging.warning(
-        #         f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
-        #     )
-        #     return False
+        if c.duration < 1.0 or c.duration > 30.0:
+            logging.warning(
+                f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
+            )
+            return False
 
         # In pruned RNN-T, we require that T >= S
         # where T is the number of feature frames after subsampling
@@ -1243,19 +1205,13 @@ def run(rank, world_size, args):
 
         # In ./zipformer.py, the conv module uses the following expression
         # for subsampling
-        T = ((c.num_samples - 7) // 2 + 1) // 2
-        if not params.bilingual:
-            assert NotImplementedError("only bilingual training has been implemented")
-            tokens = tokenizer.encode(c.supervisions[0].text, out_type=str)
-        else:
-            tokens = sentencepiece_processor.encode(
-                c.supervisions[0].text, out_type=str
-            )
+        T = ((c.num_frames - 7) // 2 + 1) // 2
+        tokens = sentencepiece_processor.encode(c.supervisions[0].text, out_type=str)
 
         if T < len(tokens):
             logging.warning(
                 f"Exclude cut with ID {c.id} from training. "
-                f"Number of frames (before subsampling): {c.num_samples}. "
+                f"Number of frames (before subsampling): {c.num_frames}. "
                 f"Number of frames (after subsampling): {T}. "
                 f"Text: {c.supervisions[0].text}. "
                 f"Tokens: {tokens}. "
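Put together, the cut filter now looks roughly like this (a sketch: the long comments are abridged and the trailing return True is assumed from the unchanged remainder of the function):

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 30 seconds.
        if c.duration < 1.0 or c.duration > 30.0:
            logging.warning(
                f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
            )
            return False

        # In pruned RNN-T we require T >= S, where T is the number of feature
        # frames after subsampling and S is the number of BPE tokens.
        T = ((c.num_frames - 7) // 2 + 1) // 2
        tokens = sentencepiece_processor.encode(c.supervisions[0].text, out_type=str)
        if T < len(tokens):
            logging.warning(
                f"Exclude cut with ID {c.id} from training. "
                f"Number of frames (before subsampling): {c.num_frames}. "
                f"Number of frames (after subsampling): {T}. "
            )
            return False

        return True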
@@ -1274,10 +1230,7 @@ def run(rank, world_size, args):
 
     train_cuts = train_cuts.filter(remove_short_and_long_utt)
 
-    if params.bilingual:
-        train_cuts = train_cuts.map(tokenize_and_encode_text)
-    else:
-        assert NotImplementedError("only bilingual training has been implemented")
+    train_cuts = train_cuts.map(tokenize_and_encode_text)
 
     if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
         # We only load the sampler's state dict when it loads a checkpoint
@@ -1293,12 +1246,8 @@ def run(rank, world_size, args):
         train_cuts, sampler_state_dict=sampler_state_dict
     )
 
-    if params.bilingual:
-        valid_cuts = multi_dataset.dev_cuts()
-    else:
-        assert NotImplementedError("only bilingual training has been implemented")
-        # valid_cuts = multi_dataset.dev_cuts()
+    valid_cuts = multi_dataset.dev_cuts()
 
     valid_dl = multidataset_datamodule.valid_dataloaders(valid_cuts)
 
     if not params.print_diagnostics:
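Taken together, the cut and dataloader preparation in run() reduces to an unconditional flow. A sketch, where the train_dataloaders call shape is an assumption (only its arguments appear in this hunk):

    multidataset_datamodule = MultiDatasetAsrDataModule(args)
    multi_dataset = MultiDataset(args)

    train_cuts = multi_dataset.train_cuts()
    train_cuts = train_cuts.filter(remove_short_and_long_utt)
    train_cuts = train_cuts.map(tokenize_and_encode_text)
    # assumed call shape; the diff shows only the arguments
    train_dl = multidataset_datamodule.train_dataloaders(
        train_cuts, sampler_state_dict=sampler_state_dict
    )

    valid_cuts = multi_dataset.dev_cuts()
    valid_dl = multidataset_datamodule.valid_dataloaders(valid_cuts)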
@@ -1306,7 +1255,6 @@ def run(rank, world_size, args):
             model=model,
             train_dl=train_dl,
             optimizer=optimizer,
-            tokenizer=tokenizer,
             sentencepiece_processor=sentencepiece_processor,
             params=params,
         )
@@ -1332,7 +1280,6 @@ def run(rank, world_size, args):
             model_avg=model_avg,
             optimizer=optimizer,
             scheduler=scheduler,
-            tokenizer=tokenizer,
             sentencepiece_processor=sentencepiece_processor,
             train_dl=train_dl,
             valid_dl=valid_dl,
@@ -1367,7 +1314,6 @@ def run(rank, world_size, args):
 def display_and_save_batch(
     batch: dict,
     params: AttributeDict,
-    tokenizer: Tokenizer,
     sentencepiece_processor: spm.SentencePieceProcessor,
 ) -> None:
     """Display the batch statistics and save the batch into disk.
@@ -1378,10 +1324,8 @@ def display_and_save_batch(
         for the content in it.
       params:
         Parameters for training. See :func:`get_params`.
-      tokenizer:
-        The BPE Tokenizer model.
       sentencepiece_processor:
-        The BPE SentencePieceProcessor model.
+        The BPE model.
     """
     from lhotse.utils import uuid4
 
@@ -1393,12 +1337,7 @@ def display_and_save_batch(
     features = batch["inputs"]
 
     logging.info(f"features shape: {features.shape}")
-
-    if params.bilingual:
-        y = sentencepiece_processor.encode(supervisions["text"], out_type=int)
-    else:
-        assert NotImplementedError("only bilingual training has been implemented")
-        # y = tokenizer.encode(supervisions["text"], out_type=int)
+    y = sentencepiece_processor.encode(supervisions["text"], out_type=int)
     num_tokens = sum(len(i) for i in y)
     logging.info(f"num tokens: {num_tokens}")
 
@@ -1407,7 +1346,6 @@ def scan_pessimistic_batches_for_oom(
     model: Union[nn.Module, DDP],
     train_dl: torch.utils.data.DataLoader,
     optimizer: torch.optim.Optimizer,
-    tokenizer: Tokenizer,
     sentencepiece_processor: spm.SentencePieceProcessor,
     params: AttributeDict,
 ):
@@ -1424,7 +1362,6 @@ def scan_pessimistic_batches_for_oom(
                 loss, _ = compute_loss(
                     params=params,
                     model=model,
-                    tokenizer=tokenizer,
                     sentencepiece_processor=sentencepiece_processor,
                     batch=batch,
                     is_training=True,
@@ -1443,7 +1380,6 @@ def scan_pessimistic_batches_for_oom(
             display_and_save_batch(
                 batch,
                 params=params,
-                tokenizer=tokenizer,
                 sentencepiece_processor=sentencepiece_processor,
             )
             raise
@@ -1455,7 +1391,6 @@ def scan_pessimistic_batches_for_oom(
 def main():
     parser = get_parser()
     MultiDatasetAsrDataModule.add_arguments(parser)
-    Tokenizer.add_arguments(parser)
     args = parser.parse_args()
     args.exp_dir = Path(args.exp_dir)
 