Add UL2 data sampling and pretraining #268

Status: Open. Wants to merge 103 commits into base: main.

Commits (103):
8c42ed8  Fix typo (janEbert, Dec 13, 2022)
1e95bf4  Refactor masked LM sampling style selection (janEbert, Dec 13, 2022)
78559b4  Add more masked LM sampling styles (janEbert, Dec 13, 2022)
722277a  Allow Prefix-LM style masked LM (janEbert, Dec 13, 2022)
f6a305f  Add splits string for caching w/ samples mappings (janEbert, Jul 4, 2023)
c8aeeab  Add UL2 pretraining for T5 model (janEbert, Dec 13, 2022)
31de228  Refactor span merging (janEbert, Dec 13, 2022)
891f704  Allow non-causal GPT models (janEbert, Dec 13, 2022)
cec6b9d  Support UL2 for decoder-only models (janEbert, Dec 13, 2022)
59ee081  Add custom exceptions (janEbert, Dec 14, 2022)
b1f7703  Error out on too long sequences (janEbert, Dec 14, 2022)
4b08245  Remove additional sequence truncation (janEbert, Dec 14, 2022)
0b103f6  Prefer array-from-list creation (janEbert, Dec 14, 2022)
290b474  Remove redundant imports (janEbert, Jan 3, 2023)
1f7ada6  Fix sometimes not inserting prefixes (janEbert, Jan 3, 2023)
4cef65e  Do not insert `extra_id` tokens for PrefixLM task (janEbert, Jan 3, 2023)
22c66dc  Document `max_seq_length_dec` argument (janEbert, Jan 3, 2023)
b1ea793  Skip redundant computations (janEbert, Jan 3, 2023)
4900916  Fix PrefixLM mean location (janEbert, Jan 3, 2023)
6e3d25b  Pad decoder-only inputs to same length (janEbert, Jan 3, 2023)
9872366  Fix decoder-only attention mask shape (janEbert, Jan 3, 2023)
6a4c22f  Fix `max_ngrams` for normal sampling style (janEbert, Jan 23, 2023)
9ee32b7  Do not limit `max_predictions_per_seq` (janEbert, Jan 23, 2023)
5f59d6b  Calculate and use amount of filtered tokens (janEbert, Jan 23, 2023)
12fd16f  Document normal sampling style (janEbert, Jan 23, 2023)
0570eda  Fix PrefixLM possible spans calculation (janEbert, Jan 23, 2023)
f1b81e0  Avoid mutable pointer in arguments (janEbert, Jan 23, 2023)
e027273  Allow passing callable for getting `model_type` (janEbert, Jan 23, 2023)
b0747f2  Fix getting model type (janEbert, Jan 23, 2023)
9d62d4a  Allow recognizing when UL2 is used (janEbert, Jan 23, 2023)
5befcd5  Only add UL2 tokens if using UL2 pretrain script (janEbert, Jan 23, 2023)
f0f82b2  Support UL2 tokens for all tokenizers (janEbert, Jan 23, 2023)
086d482  Add SEP token to GPT tokenizer if using UL2 (janEbert, Jan 23, 2023)
d36e362  Fix enum name (janEbert, Jan 23, 2023)
2a89915  Fix private UL2 argument default value (janEbert, Jan 23, 2023)
e84db38  Use binary search for PrefixLM first tail index (janEbert, Jan 24, 2023)
706a58f  Calculate n-gram indices lazily (janEbert, Jan 24, 2023)
758c357  Prefer list comprehensions (janEbert, Jan 24, 2023)
45f3d26  Fix undesired list mutation (janEbert, Feb 14, 2023)
0dc5bcf  Support `<extra_id>` tokens for GPT tokenizer (janEbert, Feb 14, 2023)
41c6dd9  Fix tokenizer vocab access (janEbert, Feb 14, 2023)
6f70180  Revert inheriting from `T5Dataset` (janEbert, Feb 14, 2023)
da93f93  Fix GPT tokenizer special token handling (janEbert, Feb 14, 2023)
b1a0456  Allow selectively disabling denoiser token (janEbert, Feb 14, 2023)
c54a064  Allow not replacing masks with sentinel tokens (janEbert, Feb 14, 2023)
96bd7e3  Support not adding mask tokens in span corruption (janEbert, Feb 14, 2023)
c410699  Fix expected number of added tokens (janEbert, Feb 15, 2023)
0ceaec1  Fix non-masked data (janEbert, Feb 16, 2023)
1a78e4f  Fix unclear wording (janEbert, Feb 16, 2023)
ecbafdf  Adjust code style (janEbert, Feb 17, 2023)
d77ee1e  Fix covered index skipping (janEbert, Feb 17, 2023)
c6a4346  Prepend objective token before truncating (janEbert, Feb 17, 2023)
a3d2ec8  Automatically truncate sequences for decoder-only (janEbert, Feb 17, 2023)
ce8029e  Make `build_index_mappings` public (janEbert, Feb 17, 2023)
8603d3b  Refactor getting sample (janEbert, Feb 17, 2023)
4dcdb3d  Add sample packing to T5 dataset (janEbert, Feb 17, 2023)
c6ba640  Add sample packing to UL2 dataset (janEbert, Feb 17, 2023)
5a76d30  Fix not supplying `--pack-samples` argument (janEbert, Feb 17, 2023)
28751c7  Add support for UL2R-style implementation (janEbert, Feb 17, 2023)
63f280c  Fix T5 dataset packing (janEbert, Feb 17, 2023)
78000bf  Refactor `get_sample` to return a list (janEbert, Feb 22, 2023)
8e271cb  Fix T5 sample packing (janEbert, Feb 22, 2023)
f1bbda7  Fix UL2 sample packing (janEbert, Feb 22, 2023)
ce3ca2d  Fix desired seq length (janEbert, Feb 23, 2023)
018641b  Fix padding removal (janEbert, Feb 23, 2023)
d0bb6dc  Allow packing different denoisers together (janEbert, Feb 23, 2023)
5edffee  Allow repeating UL2 prompt token when packing (janEbert, Feb 23, 2023)
6f8e283  Refactor sample packing functions (janEbert, Feb 23, 2023)
dfe0607  Repeat prompt by default when packing UL2 (janEbert, Feb 23, 2023)
a0b3741  Fix GPT tokenizer vocab size query (janEbert, Feb 24, 2023)
7eccbf4  Handle possibly empty list (janEbert, Feb 24, 2023)
909387c  Add optional UL2 normal distribution scaling (janEbert, Mar 24, 2023)
3aac113  Refactor samples dict creation (janEbert, Apr 3, 2023)
dc2afba  Move callees under caller (janEbert, Apr 3, 2023)
4976188  Refactor dummy barriers (janEbert, Apr 3, 2023)
5ad1ae7  Refactor description creation (janEbert, Apr 3, 2023)
a10706a  Allow packing only full documents (janEbert, Apr 3, 2023)
f32c001  Use full-doc packing for T5-style datasets (janEbert, Apr 3, 2023)
cea3cb8  Fix truncating packed sequences without padding (janEbert, Apr 3, 2023)
c3b4a72  Fix unconditional usage of non-causal decoder (janEbert, May 2, 2023)
d3fe05b  Fix decoder-only and no-mask-tokens seq lengths (janEbert, Jun 7, 2023)
f916fea  Omit second objective token if without mask tokens (janEbert, Jun 7, 2023)
fdf6249  Do not add separator if S-denoising (janEbert, Jun 26, 2023)
d1f04f4  Fix number of labels calculation for decoder-only (janEbert, Jun 29, 2023)
fcbb6b8  Do not automatically add <EOS> token when packing (janEbert, Jun 29, 2023)
fe4cebf  Fix BlendableDataset size calculation (janEbert, Sep 1, 2023)
a0fb576  Fix UL2 dataset creation (janEbert, Sep 22, 2023)
6cfec5c  Allow enabling `_is_ul2` anytime (janEbert, Sep 25, 2023)
f0d56f8  Remove redundant newline (janEbert, Sep 25, 2023)
e2c6b21  Fix added tokens expectation for S-denoising (janEbert, Sep 27, 2023)
553efdc  Add C-denoiser for CLM objective (janEbert, Sep 27, 2023)
e8c3766  Always remove trailing EOS inputs (janEbert, Sep 27, 2023)
89ee472  Do not add BOS token for decoder-only (janEbert, Sep 27, 2023)
87aecd1  Fix BOS token attribute name (janEbert, Sep 27, 2023)
fee3b2a  Fix BOS token retrieval (janEbert, Sep 27, 2023)
ff81ad4  Fix BOS token ID usage (janEbert, Sep 27, 2023)
c07bda3  Fix supplying token ID (janEbert, Sep 27, 2023)
dc433db  Handle token ID being zero (janEbert, Sep 27, 2023)
fd4b103  Fix causal targets (janEbert, Sep 28, 2023)
c48b29d  Fix early return values (janEbert, Sep 28, 2023)
d3bb961  Fix C-denoiser mask probability (janEbert, Sep 28, 2023)
b85d383  Fix wrong seq length being used (janEbert, Oct 13, 2023)
0fd3b63  Fix edge case resulting in empty sequences (janEbert, Jan 5, 2024)
megatron/arguments.py (84 additions, 0 deletions)

@@ -15,6 +15,8 @@

from megatron.core.transformer import TransformerConfig

from megatron.model.enums import UL2ModelType

def parse_args(extra_args_provider=None, ignore_unknown_args=False):
"""Parse all arguments."""
parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
@@ -34,6 +36,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
parser = _add_autoresume_args(parser)
parser = _add_biencoder_args(parser)
parser = _add_vision_args(parser)
parser = _add_ul2_args(parser)
parser = _add_logging_args(parser)
parser = _add_inference_args(parser)
parser = _add_transformer_engine_args(parser)
@@ -336,6 +339,17 @@ def validate_args(args, defaults={}):
if args.sequence_parallel:
args.async_tensor_model_parallel_allreduce = False

args.ul2_model_type = UL2ModelType(args.ul2_model_type)
if (
args.ul2_model_type is not UL2ModelType.encoder_decoder
and args.decoder_seq_length is not None
):
print(
f'WARNING: `--decoder_seq_length` is ignored when '
f'`--ul2-model-type` is not '
f'"{UL2ModelType.encoder_decoder.value}"!'
)

if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
if args.sequence_parallel:
raise RuntimeError(
@@ -1137,6 +1151,13 @@ def _add_data_args(parser):
help='Probability of replacing a token with mask.')
group.add_argument('--short-seq-prob', type=float, default=0.1,
help='Probability of producing a short sequence.')
group.add_argument('--no-add-mask-tokens', action='store_false',
help='Whether not to add sentinel tokens for masked '
'spans in span corruption tasks.',
dest='add_mask_tokens')
group.add_argument('--pack-samples', action='store_true',
help='Whether to pack samples in span corruption '
'datasets (T5 or UL2). GPT dataset is always packed.')
group.add_argument('--mmap-warmup', action='store_true',
help='Warm up mmap files.')
group.add_argument('--num-workers', type=int, default=2,
@@ -1302,3 +1323,66 @@ def _add_vision_args(parser):
help='warmup teacher temperature epochs')

return parser


def _add_ul2_args(parser):
group = parser.add_argument_group(title="UL2")

group.add_argument('--ul2-model-type', type=str, default='ED',
choices=['ED', 'ND', 'CD'],
help='What type of model to use for UL2 pretraining. '
'ED = encoder-decoder; ND = non-causal decoder-only; '
'CD = causal decoder-only')
group.add_argument('--ul2-denoiser-ratios', nargs='+', type=float,
default=None,
help='Probability of each denoising objective to be '
'selected. Uniform distribution by default.')
group.add_argument('--ul2-denoisers', nargs='+', type=str,
default=['R', 'R', 'S', 'X', 'X', 'X', 'X'],
choices=['R', 'S', 'X', 'C'],
help='What type of UL2 denoising objective the other '
'UL2 configurations refer to. "C" is a fully causal '
'objective with BOS as its denoiser token. Its '
'settings need to be provided but will be ignored.')
group.add_argument('--ul2-mean-span-lengths', nargs='+', type=float,
default=[3, 8, 0.25, 3, 8, 64, 64],
help='Mean length for sampling span lengths. '
'Numbers < 1 indicate a mean length of the sequence '
'length times that number.')
group.add_argument('--ul2-mask-ratios', nargs='+', type=float,
default=[0.15, 0.15, 0.25, 0.5, 0.5, 0.15, 0.5],
help='Ratio of masked token in the full sequence.')
group.add_argument('--ul2-r-denoiser-token', type=str, default='[R]',
help='What token to prepend for the UL2 R-denoising '
'objective. If empty, do not prepend a token for this '
'objective.')
group.add_argument('--ul2-s-denoiser-token', type=str, default='[S]',
help='What token to prepend for the UL2 S-denoising '
'objective. If empty, do not prepend a token for this '
'objective.')
group.add_argument('--ul2-x-denoiser-token', type=str, default='[X]',
help='What token to prepend for the UL2 X-denoising '
'objective. If empty, do not prepend a token for this '
'objective.')
group.add_argument('--ul2-scale-normal-std', action='store_true',
help='Whether to scale the standard deviation when '
'using a normal distribution for span length sampling.')
group.add_argument('--ul2-like-ul2r', action='store_true',
help='Whether to use the updated implementation as '
'described in the UL2R paper. This only changes the '
'implementation, not the objective configurations!')
group.add_argument('--ul2-pack-any', action='store_true',
help='When `--pack-samples` is also given, whether to '
'pack different denoisers into one sample. If not '
'given, the same denoiser is used for all packed '
'samples.')
group.add_argument('--ul2-pack-no-repeat-prompt', action='store_false',
help='When `--pack-samples` is also given and '
'`--ul2-pack-any` is *not* given, whether to '
'repeat the prompt token for each packed sample.',
dest='ul2_pack_repeat_prompt')
# Has to be `None` by default so it can be overridden by `defaults`
# in `validate_args` but still evaluate to `False`.
group.add_argument('--_is_ul2', help=argparse.SUPPRESS)

return parser
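The mixture-of-denoisers configuration above (denoiser types, mean span lengths, mask ratios) can be sketched as a small sampling routine. This is an illustrative sketch built only from the argparse defaults shown, not the PR's actual dataset code; `sample_denoiser` and its return shape are hypothetical.

```python
import random

# Defaults mirroring `--ul2-denoisers`, `--ul2-mean-span-lengths`,
# and `--ul2-mask-ratios` above.
DENOISERS = ['R', 'R', 'S', 'X', 'X', 'X', 'X']
MEAN_SPAN_LENGTHS = [3, 8, 0.25, 3, 8, 64, 64]
MASK_RATIOS = [0.15, 0.15, 0.25, 0.5, 0.5, 0.15, 0.5]

def sample_denoiser(seq_length, ratios=None, rng=random):
    """Pick one denoising objective for a sample.

    `ratios=None` means a uniform distribution over the configured
    denoisers, matching the `--ul2-denoiser-ratios` default.
    """
    if ratios is None:
        idx = rng.randrange(len(DENOISERS))
    else:
        idx = rng.choices(range(len(DENOISERS)), weights=ratios, k=1)[0]
    mean_span = MEAN_SPAN_LENGTHS[idx]
    # Per the help text: means < 1 are a fraction of the sequence length.
    if mean_span < 1:
        mean_span *= seq_length
    return DENOISERS[idx], mean_span, MASK_RATIOS[idx]
```

With the default lists, R- and X-denoising dominate the uniform draw (2 and 4 of 7 slots), which is why `--ul2-denoiser-ratios` exists as an override.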
megatron/core/transformer/enums.py (1 addition, 0 deletions)

@@ -23,3 +23,4 @@ class AttnType(enum.Enum):
class AttnMaskType(enum.Enum):
padding = 1
causal = 2
prefix = 3
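The new `prefix` mask type corresponds to PrefixLM-style attention: bidirectional within a prefix, causal afterwards. A minimal sketch of such a mask follows; the semantics are assumed from the enum name, and the PR's actual mask construction (and its True/False convention) may differ.

```python
import numpy as np

def prefix_lm_mask(seq_length, prefix_length):
    # True marks positions a query may NOT attend to (an assumed
    # convention, used here only for illustration).
    # Start from a strictly causal mask...
    mask = np.triu(np.ones((seq_length, seq_length), dtype=bool), k=1)
    # ...then open up full bidirectional attention inside the prefix.
    mask[:prefix_length, :prefix_length] = False
    return mask
```

For `prefix_lm_mask(6, 3)`, positions 0..2 attend to each other freely, while positions 3..5 still only see themselves and earlier tokens.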
megatron/data/bert_dataset.py (2 additions, 1 deletion)

@@ -22,7 +22,7 @@
class BertDataset(torch.utils.data.Dataset):

def __init__(self, name, indexed_dataset, data_prefix,
num_epochs, max_num_samples, masked_lm_prob,
splits_string, num_epochs, max_num_samples, masked_lm_prob,
max_seq_length, short_seq_prob, seed, binary_head):

# Params to store.
@@ -38,6 +38,7 @@ def __init__(self, name, indexed_dataset, data_prefix,
# Build the samples mapping.
self.samples_mapping = get_samples_mapping(self.indexed_dataset,
data_prefix,
splits_string,
num_epochs,
max_num_samples,
self.max_seq_length - 3, # account for added tokens
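The `bert_dataset.py` change threads `splits_string` into `get_samples_mapping`, matching the commit "Add splits string for caching w/ samples mappings": the on-disk samples-mapping cache key presumably needs to distinguish different split specifications. A hypothetical sketch of such a cache name; the real Megatron-LM filename format differs.

```python
def samples_mapping_cache_name(data_prefix, splits_string, num_epochs,
                               max_num_samples, max_seq_length, seed):
    # Hypothetical cache-key builder: every parameter that changes the
    # mapping's contents, including the splits string, goes into the name.
    key = (f'{splits_string}_{num_epochs}ep_{max_num_samples}mns_'
           f'{max_seq_length}msl_{seed}s')
    return f'{data_prefix}_{key}_samples_mapping.npy'
```

Without the splits string in the key, a run using splits `969,30,1` could silently reuse a mapping cached for `100,0,0` over the same data prefix.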