
Commit b2ec380

More consistent defaults for PaliGemma

* More consistent defaults for PaliGemma

  In general, we do not copy the hyperparameters of a specific pre-trained
  model into the init args. Do the same here for consistency. Also, use the
  smallest test models possible, so our unit testing stays reasonably fast.

* Add basic and saved-model tests

---------

Co-authored-by: divyashreepathihalli <[email protected]>

1 parent f8845ad commit b2ec380

File tree

5 files changed: +147 −106


keras_nlp/src/models/pali_gemma/pali_gemma_backbone.py (+27 −24)

```diff
@@ -60,9 +60,6 @@ class PaliGemmaBackbone(Backbone):
         intermediate_dim: int. The output dimension of the first Dense layer in
             a two-layer feedforward network for each transformer decoder block.
         head_dim: int. The size of each attention head in the mixed decoder.
-        layer_norm_epsilon: float. The epsilon value user for every layer norm
-            in all transformer blocks.
-        dropout: float. Dropout probability for the Transformer decoder blocks.
         vit_patch_size: int. The size of each square patch in the input image.
         vit_num_heads: int. The number of attention heads for the vision(image)
             transformer encoder.
@@ -76,9 +73,10 @@ class PaliGemmaBackbone(Backbone):
             `"0"` or `"none"`. Defaults to `"none"`.
         vit_classifier_activation: activation function. The activation that
             is used for final output classification in the vision transformer.
-        vit_include_rescaling: bool. To be set to `True` if input image values
-            needs to be rescaled between 0-1.
         vit_name: string. The name used for vision transformer layers.
+        layer_norm_epsilon: float. The epsilon value user for every layer norm
+            in all transformer blocks.
+        dropout: float. Dropout probability for the Transformer decoder blocks.
         dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
             for the models computations and weights. Note that some
             computations, such as softmax and layer normalization will always
@@ -99,44 +97,46 @@ class PaliGemmaBackbone(Backbone):
     # Randomly initialized PaliGemma decoder with custom config.
     model = keras_nlp.models.PaliGemmaBackbone(
         vocabulary_size=50257,
+        images_size=224,
         num_layers=12,
         num_query_heads=12,
         num_key_value_heads=1,
         hidden_dim=768,
         intermediate_dim=3072,
         head_dim=64,
+        vit_patch_size=14,
+        vit_num_heads=8,
+        vit_hidden_dim=768,
+        vit_intermediate_dim=3072,
+        vit_num_layers=2,
     )
     model(input_data)
     ```
     """
 
     def __init__(
         self,
-        vocabulary_size=257152,
-        image_size=224,
-        num_layers=18,
-        num_query_heads=8,
-        num_key_value_heads=1,
-        hidden_dim=2048,
-        intermediate_dim=32768,
-        head_dim=256,
-        layer_norm_epsilon=1e-6,
-        dropout=0,
-        vit_patch_size=14,
-        vit_num_heads=16,
-        vit_hidden_dim=1152,
-        vit_num_layers=27,
-        vit_intermediate_dim=4304,
+        vocabulary_size,
+        image_size,
+        num_layers,
+        num_query_heads,
+        num_key_value_heads,
+        hidden_dim,
+        intermediate_dim,
+        head_dim,
+        vit_patch_size,
+        vit_num_heads,
+        vit_hidden_dim,
+        vit_num_layers,
+        vit_intermediate_dim=None,  # TODO remove default
         vit_pooling=None,
         vit_classifier_activation=None,
         vit_name=None,
+        layer_norm_epsilon=1e-6,
+        dropout=0,
         dtype=None,
         **kwargs,
     ):
-        # TODO: remove these from our uploaded models.
-        kwargs.pop("vit_num_classes", None)
-        kwargs.pop("vit_include_rescaling", None)
-
         if not config.keras_3():
             raise ValueError(
                 "`PaliGemmaBackbone` requires Keras 3. Run "
@@ -159,6 +159,8 @@ def __init__(
             dtype=dtype,
             name="token_embedding",
         )
+        # TODO Remove this. Work around for previous serialization bug.
+        vit_intermediate_dim = vit_intermediate_dim or 4304
         self.vit_encoder = PaliGemmaVit(
             image_size=image_size,
             patch_size=vit_patch_size,
@@ -268,6 +270,7 @@ def get_config(self):
                 "vit_num_heads": self.vit_num_heads,
                 "vit_hidden_dim": self.vit_hidden_dim,
                 "vit_num_layers": self.vit_num_layers,
+                "vit_intermediate_dim": self.vit_intermediate_dim,
                 "vit_pooling": self.vit_pooling,
                 "vit_classifier_activation": self.vit_classifier_activation,
                 "vit_name": self.vit_name,
```

keras_nlp/src/models/pali_gemma/pali_gemma_backbone_test.py (+16 −21)

```diff
@@ -34,17 +34,12 @@ def setUp(self):
         self.batch_size = 2
         self.vocabulary_size = 256
         self.text_sequence_length = 64
-        self.image_size = 224
+        self.image_size = 16
         self.dummy_text = [
             "the quick brown fox" for _ in range(self.batch_size)
         ]
         self.dummy_images = np.random.uniform(
-            size=(
-                self.batch_size,
-                self.image_size,
-                self.image_size,
-                3,
-            )
+            size=(self.batch_size, self.image_size, self.image_size, 3)
         )
 
         proto = "gemma_test_vocab.spm"
@@ -56,19 +51,19 @@ def setUp(self):
         )
 
         self.backbone = PaliGemmaBackbone(
-            self.vocabulary_size,
-            image_size=224,
-            num_layers=27,
-            num_query_heads=16,
-            num_key_value_heads=16,
-            hidden_dim=256,
-            intermediate_dim=256,
-            head_dim=126,
-            vit_patch_size=14,
-            vit_num_heads=8,
-            vit_hidden_dim=16,
+            vocabulary_size=self.vocabulary_size,
+            image_size=self.image_size,
+            num_layers=2,
+            num_query_heads=2,
+            num_key_value_heads=1,
+            hidden_dim=8,
+            intermediate_dim=16,
+            head_dim=4,
+            vit_patch_size=4,
             vit_num_layers=2,
-            vit_intermediate_dim=8,
+            vit_num_heads=2,
+            vit_hidden_dim=8,
+            vit_intermediate_dim=16,
         )
         self.dummy_imgs = np.random.rand(
             self.batch_size, self.image_size, self.image_size, 3
@@ -99,7 +94,7 @@ def test_pali_gemma_backbone(self):
             (
                 self.batch_size,
                 self.text_sequence_length + self.backbone.image_sequence_length,
-                256,
+                8,
             ),
             output.shape,
         )
@@ -117,7 +112,7 @@ def test_pali_gemma_backbone_with_preprocessing(self):
             (
                 self.batch_size,
                 self.text_sequence_length + self.backbone.image_sequence_length,
-                256,
+                8,
             ),
             output.shape,
         )
```
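The tightened shape assertions follow directly from the smaller configuration: the trailing dimension is now `hidden_dim=8`, and the sequence axis grows by one position per image patch. A quick sketch of the arithmetic, assuming (as in a standard ViT without a class token) one embedding per non-overlapping square patch:

```python
# Values from the updated setUp() above.
batch_size = 2
text_sequence_length = 64
image_size = 16
vit_patch_size = 4
hidden_dim = 8

# One vision token per non-overlapping patch.
image_sequence_length = (image_size // vit_patch_size) ** 2  # 4 * 4 = 16

# The shape asserted by both tests: (2, 64 + 16, 8) == (2, 80, 8).
expected_shape = (
    batch_size,
    text_sequence_length + image_sequence_length,
    hidden_dim,
)
```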

keras_nlp/src/models/pali_gemma/pali_gemma_causal_lm_test.py (+56 −21)

```diff
@@ -35,18 +35,13 @@
 class PaliGemmaCausalLMTest(TestCase):
     def setUp(self):
         self.batch_size = 2
-        self.text_sequence_length = 64
-        self.image_size = 224
+        self.text_sequence_length = 16
+        self.image_size = 16
         self.dummy_text = [
             "the quick brown fox" for _ in range(self.batch_size)
         ]
         self.dummy_images = np.random.uniform(
-            size=(
-                self.batch_size,
-                self.image_size,
-                self.image_size,
-                3,
-            )
+            size=(self.batch_size, self.image_size, self.image_size, 3)
         )
 
         proto = "gemma_test_vocab.spm"
@@ -62,20 +57,60 @@ def setUp(self):
         )
 
         self.backbone = PaliGemmaBackbone(
-            self.vocabulary_size,
-            image_size=224,
-            num_layers=27,
-            num_query_heads=16,
-            num_key_value_heads=16,
-            hidden_dim=256,
-            intermediate_dim=256,
-            head_dim=126,
-            vit_patch_size=14,
-            vit_num_heads=8,
-            vit_hidden_dim=16,
+            vocabulary_size=self.vocabulary_size,
+            image_size=self.image_size,
+            num_layers=2,
+            num_query_heads=2,
+            num_key_value_heads=1,
+            hidden_dim=8,
+            intermediate_dim=16,
+            head_dim=4,
+            vit_patch_size=4,
             vit_num_layers=2,
-            vit_intermediate_dim=8,
-            vit_num_classes=512,
+            vit_num_heads=2,
+            vit_hidden_dim=8,
+            vit_intermediate_dim=16,
+        )
+        self.train_data = (
+            {
+                "images": self.dummy_images,
+                "prompts": self.dummy_text,
+                "responses": self.dummy_text,
+            },
+        )
+        self.init_kwargs = {
+            "preprocessor": self.preprocessor,
+            "backbone": self.backbone,
+        }
+
+    def test_causal_lm_basics(self):
+        self.run_task_test(
+            cls=PaliGemmaCausalLM,
+            init_kwargs=self.init_kwargs,
+            train_data=self.train_data,
+            expected_output_shape=(2, 16, 11),
+        )
+
+    @pytest.mark.large
+    def test_saved_model(self):
+        input_data = {
+            "token_ids": np.random.rand(
+                self.batch_size, self.text_sequence_length
+            ),
+            "images": self.dummy_images,
+            "padding_mask": np.ones(
+                (self.batch_size, self.text_sequence_length),
+                dtype="int32",
+            ),
+            "response_mask": np.zeros(
+                (self.batch_size, self.text_sequence_length),
+                dtype="int32",
+            ),
+        }
+        self.run_model_saving_test(
+            cls=PaliGemmaCausalLM,
+            init_kwargs=self.init_kwargs,
+            input_data=input_data,
         )
 
     def test_pali_gemma_causal_model(self):
```
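The new `test_causal_lm_basics` covers compile/fit/predict through the shared `run_task_test` helper. Its `expected_output_shape=(2, 16, 11)` reads as batch size × `text_sequence_length` × vocabulary size, where 11 is presumably the size of the small `gemma_test_vocab.spm` test vocabulary. A hedged sketch of the equivalent public-API flow; `backbone` and `preprocessor` stand in for the objects built in `setUp()` and are not redefined here:

```python
import numpy as np
import keras_nlp

# Assumes `backbone` and `preprocessor` were constructed as in setUp().
causal_lm = keras_nlp.models.PaliGemmaCausalLM(
    backbone=backbone,
    preprocessor=preprocessor,
)
# The same feature dict the test wraps in its train_data tuple; the
# preprocessor turns prompts/responses into token ids, masks, and labels.
causal_lm.fit(
    {
        "images": np.random.uniform(size=(2, 16, 16, 3)),
        "prompts": ["the quick brown fox"] * 2,
        "responses": ["the quick brown fox"] * 2,
    },
    batch_size=2,
)
```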

keras_nlp/src/models/pali_gemma/pali_gemma_vit.py (+21 −23)

```diff
@@ -18,9 +18,9 @@
 class PaliGemmaVitEmbeddings(keras.layers.Layer):
     def __init__(
         self,
+        image_size,
+        patch_size,
         hidden_dim,
-        image_size=224,
-        patch_size=14,
         num_channels=3,
         dtype=None,
         **kwargs,
@@ -286,12 +286,12 @@ def get_config(self):
 class PaliGemmaVitEncoder(keras.layers.Layer):
     def __init__(
         self,
+        patch_size,
+        image_size,
         hidden_dim,
         num_layers,
         num_heads,
         intermediate_dim,
-        patch_size,
-        image_size,
         dtype=None,
         **kwargs,
     ):
@@ -421,26 +421,24 @@ class PaliGemmaVit(keras.Model):
     """Vision Transformer (ViT) model for PaliGemma.
 
     Args:
+        image_size: int. The height/width of the image. Both height and width is
+            expected to be the same.
+        patch_size: int. The size of each square patch in the input image.
         num_heads: int. The number of attention heads for the vision(image)
             transformer encoder.
         hidden_dim: int. The size of the transformer hidden state at the end
             of each vision transformer layer.
         num_layers: int. The number of transformer layers.
         intermediate_dim: int. The output dimension of the first Dense layer in
             a two-layer feedforward network for transformer.
-        pooling: string. The encoded vision embeddings are pooled using the
-            specified polling setting. The accepted values are `"map"`, `"gap"`,
-            `"zero"` or `"none"`. Defaults to `"none"`.
         num_classes: int. The number of output classes. If this model is used
             as a image classifier, this value would correspond to the number of
             output classes.
-        image_size: int. The height/width of the image. Both height and width is
-            expected to be the same.
-        patch_size: int. The size of each square patch in the input image.
+        pooling: string. The encoded vision embeddings are pooled using the
+            specified polling setting. The accepted values are `"map"`, `"gap"`,
+            `"zero"` or `None`. Defaults to `None`.
         classifier_activation: activation fucntion. The activation that is used
             for final output classification
-        include_rescaling: bool. to be set to `True` if input image values needs
-            to be rescaled between 0-1.
         dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
             for the models computations and weights. Note that some
             computations, such as softmax and layer normalization will always
@@ -458,14 +456,14 @@ class PaliGemmaVit(keras.Model):
 
     def __init__(
         self,
-        num_heads=16,
-        hidden_dim=1152,
-        num_layers=27,
-        intermediate_dim=4304,
+        image_size,
+        patch_size,
+        num_heads,
+        hidden_dim,
+        num_layers,
+        intermediate_dim,
+        num_classes,
         pooling=None,
-        num_classes=2048,
-        image_size=None,
-        patch_size=14,
         classifier_activation=None,
         dtype=None,
         **kwargs,
@@ -475,10 +473,10 @@ def __init__(
             shape=(image_size, image_size, 3), name="images"
         )
         encoded = PaliGemmaVitEncoder(
-            hidden_dim,
-            num_layers,
-            num_heads,
-            intermediate_dim,
+            hidden_dim=hidden_dim,
+            num_layers=num_layers,
+            num_heads=num_heads,
+            intermediate_dim=intermediate_dim,
             patch_size=patch_size,
             image_size=image_size,
             dtype=dtype,
```
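Two changes land here: the ViT layers lose their model-specific defaults, and the internal `PaliGemmaVitEncoder` call switches to keyword arguments, which is what makes the parameter reordering safe; the old positional call would have silently fed `hidden_dim` into the new leading `patch_size` slot. A sketch of standalone construction under the new signature, with arbitrary small values (`PaliGemmaVit` lives under `keras_nlp.src` and is not public API):

```python
from keras_nlp.src.models.pali_gemma.pali_gemma_vit import PaliGemmaVit

# Every architecture argument is now explicit; only pooling,
# classifier_activation, and dtype keep generic defaults.
vit = PaliGemmaVit(
    image_size=16,
    patch_size=4,
    num_heads=2,
    hidden_dim=8,
    num_layers=2,
    intermediate_dim=16,
    num_classes=2,  # only meaningful when the ViT is used as a classifier
)
```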
