
Commit 37e52a8

grasskin and mattdangerw committed

Add Gemma2 to Keras (#91)

Add Gemma2 building blocks and presets.

Co-authored-by: Matt Watson <[email protected]>

Parent: b58b56e

7 files changed: +257 -9 lines

keras_nlp/src/models/gemma/gemma_attention.py

Lines changed: 50 additions & 1 deletion
@@ -28,19 +28,28 @@ def __init__(
         num_query_heads,
         num_key_value_heads,
         kernel_initializer="glorot_uniform",
+        logit_soft_cap=None,
+        use_sliding_window_attention=False,
+        sliding_window_size=4096,
+        query_head_dim_normalize=True,
         dropout=0,
         **kwargs,
     ):
         super().__init__(**kwargs)
         self.num_query_heads = num_query_heads
         self.num_key_value_heads = num_key_value_heads
         self.head_dim = head_dim
+        self.logit_soft_cap = logit_soft_cap
+        self.use_sliding_window_attention = use_sliding_window_attention
+        self.sliding_window_size = sliding_window_size
+        self.query_head_dim_normalize = query_head_dim_normalize
         self.dropout = dropout

         self._kernel_initializer = keras.initializers.get(
             clone_initializer(kernel_initializer)
         )
         self.num_key_value_groups = num_query_heads // num_key_value_heads
+        self.query_head_dim_normalize = query_head_dim_normalize

     def build(self, inputs_shape):
         self.hidden_dim = inputs_shape[-1]
@@ -114,7 +123,12 @@ def _compute_attention(
         attention_mask,
         training=False,
     ):
-        query_normalization = 1 / np.sqrt(self.head_dim)
+        if self.query_head_dim_normalize:
+            query_normalization = 1 / np.sqrt(self.head_dim)
+        else:
+            query_normalization = 1 / np.sqrt(
+                self.hidden_dim // self.num_query_heads
+            )

         q *= ops.cast(query_normalization, dtype=q.dtype)
         q_shape = ops.shape(q)
@@ -130,6 +144,38 @@ def _compute_attention(
         b, q_len, _, _, h = ops.shape(q)

         attention_logits = ops.einsum("btkgh,bskh->bkgts", q, k)
+
+        if self.logit_soft_cap is not None:
+            attention_logits = ops.divide(attention_logits, self.logit_soft_cap)
+            attention_logits = ops.multiply(
+                ops.tanh(attention_logits), self.logit_soft_cap
+            )
+
+        if self.use_sliding_window_attention:
+            all_ones = ops.ones_like(attention_mask)
+            if keras.config.backend() == "tensorflow":
+                import tensorflow as tf
+
+                sliding_window_size = ops.minimum(
+                    self.sliding_window_size - 1, q_len
+                )
+                sliding_window_size = ops.cast(
+                    sliding_window_size, dtype="int32"
+                )
+                sliding_mask = tf.linalg.band_part(
+                    all_ones, sliding_window_size - 1, sliding_window_size - 1
+                )
+                sliding_mask = ops.cast(sliding_mask, dtype="bool")
+                bool_attention_mask = ops.cast(attention_mask, dtype="bool")
+                attention_mask = tf.math.logical_and(
+                    sliding_mask, bool_attention_mask
+                )
+            else:
+                sliding_mask = ops.triu(
+                    all_ones, -1 * self.sliding_window_size + 1
+                ) * ops.tril(all_ones, self.sliding_window_size - 1)
+                attention_mask = sliding_mask * attention_mask
+
         attention_mask = attention_mask[:, None, None, :, :]
         orig_dtype = attention_logits.dtype
         attention_softmax = self.softmax(attention_logits, mask=attention_mask)
@@ -186,3 +232,6 @@ def call(
         if cache is not None:
             return attention_output, cache
         return attention_output
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
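
A note on the sliding-window branch above: outside the TensorFlow backend, the local attention band is built purely with ops.triu/ops.tril. Below is a minimal, illustrative sketch of that band construction (not the layer code itself; it uses a small 2D toy mask instead of the layer's batched attention mask, and a toy window size):

from keras import ops

seq_len = 8
sliding_window_size = 3  # toy value; the layer defaults to 4096

all_ones = ops.ones((seq_len, seq_len))
# Keep only entries within sliding_window_size - 1 positions of the diagonal.
sliding_mask = ops.triu(all_ones, -1 * sliding_window_size + 1) * ops.tril(
    all_ones, sliding_window_size - 1
)
print(ops.convert_to_numpy(sliding_mask))
# Row i is 1 only for columns i-2 .. i+2; multiplying this band into the
# causal/padding mask limits each query to a local window of keys.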

keras_nlp/src/models/gemma/gemma_backbone.py

Lines changed: 43 additions & 0 deletions
@@ -54,6 +54,21 @@ class GemmaBackbone(Backbone):
         layer_norm_epsilon: float. The epsilon value user for every layer norm
             in the transformer model.
         dropout: float. Dropout probability for the Transformer encoder.
+        query_head_dim_normalize: boolean. Whether to normalize attention with
+            head dimension or hidden_dim/num_query_heads. Gemma2 uses the
+            second option. Defaults to True.
+        use_post_ffw_norm: boolean. Whether to normalize after the feedforward
+            block. Defaults to False.
+        use_post_attention_norm: boolean. Whether to normalize after the attention
+            block. Defaults to False.
+        attention_logit_soft_cap: None or int. Soft cap for the attention logits.
+            Defaults to None.
+        final_logit_soft_cap: None or int. Soft cap for the final logits.
+            Defaults to None.
+        use_sliding_window_attention boolean. Whether to use sliding local
+            window attention. Defaults to False.
+        sliding_window_size: int. Size of the sliding local window. Defaults to
+            4096.
         dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
             for the models computations and weights. Note that some
             computations, such as softmax and layer normalization will always
@@ -93,6 +108,13 @@ def __init__(
         hidden_dim,
         intermediate_dim,
         head_dim,
+        query_head_dim_normalize=True,
+        use_post_ffw_norm=False,
+        use_post_attention_norm=False,
+        attention_logit_soft_cap=None,
+        final_logit_soft_cap=None,
+        use_sliding_window_attention=False,
+        sliding_window_size=4096,
         layer_norm_epsilon=1e-6,
         dropout=0,
         dtype=None,
@@ -114,12 +136,19 @@ def __init__(
         )
         self.transformer_layers = []
         for i in range(num_layers):
+            sliding_window = use_sliding_window_attention and (i % 2 == 0)
             layer = GemmaDecoderBlock(
                 intermediate_dim=intermediate_dim,
                 hidden_dim=hidden_dim,
                 num_query_heads=num_query_heads,
                 head_dim=head_dim,
                 num_key_value_heads=num_key_value_heads,
+                query_head_dim_normalize=query_head_dim_normalize,
+                use_post_ffw_norm=use_post_ffw_norm,
+                use_post_attention_norm=use_post_attention_norm,
+                logit_soft_cap=attention_logit_soft_cap,
+                use_sliding_window_attention=sliding_window,
+                sliding_window_size=sliding_window_size,
                 dropout=dropout,
                 dtype=dtype,
                 name=f"decoder_block_{i}",
@@ -163,6 +192,13 @@ def __init__(
         self.head_dim = head_dim
         self.layer_norm_epsilon = layer_norm_epsilon
         self.dropout = dropout
+        self.query_head_dim_normalize = query_head_dim_normalize
+        self.use_post_ffw_norm = use_post_ffw_norm
+        self.use_post_attention_norm = use_post_attention_norm
+        self.attention_logit_soft_cap = attention_logit_soft_cap
+        self.final_logit_soft_cap = final_logit_soft_cap
+        self.sliding_window_size = sliding_window_size
+        self.use_sliding_window_attention = use_sliding_window_attention

     def get_config(self):
         config = super().get_config()
@@ -177,6 +213,13 @@ def get_config(self):
                 "head_dim": self.head_dim,
                 "layer_norm_epsilon": self.layer_norm_epsilon,
                 "dropout": self.dropout,
+                "query_head_dim_normalize": self.query_head_dim_normalize,
+                "use_post_ffw_norm": self.use_post_ffw_norm,
+                "use_post_attention_norm": self.use_post_attention_norm,
+                "final_logit_soft_cap": self.final_logit_soft_cap,
+                "attention_logit_soft_cap": self.attention_logit_soft_cap,
+                "sliding_window_size": self.sliding_window_size,
+                "use_sliding_window_attention": self.use_sliding_window_attention,
             }
         )
         return config
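
One detail in the constructor change above is easy to miss: when use_sliding_window_attention is enabled, only every other decoder block (i % 2 == 0) actually uses the local window, so local and global attention layers interleave. A small plain-Python illustration of the resulting pattern (not library code; num_layers is an arbitrary example value):

use_sliding_window_attention = True
num_layers = 6  # arbitrary example value

attention_pattern = [
    "local" if (use_sliding_window_attention and i % 2 == 0) else "global"
    for i in range(num_layers)
]
print(attention_pattern)
# ['local', 'global', 'local', 'global', 'local', 'global']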

keras_nlp/src/models/gemma/gemma_backbone_test.py

Lines changed: 50 additions & 8 deletions
@@ -22,13 +22,13 @@
 class GemmaBackboneTest(TestCase):
     def setUp(self):
         self.init_kwargs = {
-            "vocabulary_size": 256128,
+            "vocabulary_size": 20,
             "num_layers": 2,
-            "num_query_heads": 8,
-            "num_key_value_heads": 8,
-            "hidden_dim": 128,
-            "intermediate_dim": 256,
-            "head_dim": 128,
+            "num_query_heads": 4,
+            "num_key_value_heads": 1,
+            "hidden_dim": 16,
+            "intermediate_dim": 32,
+            "head_dim": 4,
             "layer_norm_epsilon": 1e-6,
         }
         self.input_data = {
@@ -41,7 +41,7 @@ def test_backbone_basics(self):
             cls=GemmaBackbone,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output_shape=(2, 5, 128),
+            expected_output_shape=(2, 5, 16),
         )

     @pytest.mark.large
@@ -82,7 +82,7 @@ def test_all_presets(self):

     def test_architecture_characteristics(self):
         model = GemmaBackbone(**self.init_kwargs)
-        self.assertEqual(model.count_params(), 33931904)
+        self.assertEqual(model.count_params(), 3216)
         self.assertEqual(len(model.layers), 6)

     def test_distribution(self):
@@ -169,3 +169,45 @@ def test_distribution_with_lora(self):
                )
            if "attention/value/lora_kernel_b" in w.path:
                self.assertEqual(tuple(w.value.sharding.spec), (None, None))
+
+
+@pytest.mark.keras_3_only
+class Gemma2BackboneTest(TestCase):
+    def setUp(self):
+        self.init_kwargs = {
+            "vocabulary_size": 20,  # 256128
+            "num_layers": 2,  # 46
+            "num_query_heads": 4,  # 32
+            "num_key_value_heads": 2,  # 16
+            "hidden_dim": 16,  # 4608
+            "intermediate_dim": 32,  # 73728
+            "head_dim": 4,  # 128
+            "sliding_window_size": 5,  # 4096
+            "attention_logit_soft_cap": 50,
+            "final_logit_soft_cap": 30,
+            "layer_norm_epsilon": 1e-6,
+            "query_head_dim_normalize": False,
+            "use_post_ffw_norm": True,
+            "use_post_attention_norm": True,
+            "use_sliding_window_attention": True,
+        }
+        self.input_data = {
+            "token_ids": ops.ones((2, 10), dtype="int32"),
+            "padding_mask": ops.ones((2, 10), dtype="int32"),
+        }
+
+    def test_backbone_basics(self):
+        self.run_backbone_test(
+            cls=GemmaBackbone,
+            init_kwargs=self.init_kwargs,
+            input_data=self.input_data,
+            expected_output_shape=(2, 10, 16),
+        )
+
+    @pytest.mark.large
+    def test_saved_model(self):
+        self.run_model_saving_test(
+            cls=GemmaBackbone,
+            init_kwargs=self.init_kwargs,
+            input_data=self.input_data,
+        )
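
For readers who want to poke at the new options outside the test harness, here is a hedged standalone sketch of what Gemma2BackboneTest exercises. It assumes the public keras_nlp.models.GemmaBackbone export plus the keyword arguments added in this commit; the forward pass returns a (batch, sequence, hidden_dim) tensor:

import keras_nlp
from keras import ops

# Tiny Gemma 2 style configuration mirroring the test's init_kwargs.
backbone = keras_nlp.models.GemmaBackbone(
    vocabulary_size=20,
    num_layers=2,
    num_query_heads=4,
    num_key_value_heads=2,
    hidden_dim=16,
    intermediate_dim=32,
    head_dim=4,
    sliding_window_size=5,
    attention_logit_soft_cap=50,
    final_logit_soft_cap=30,
    query_head_dim_normalize=False,
    use_post_ffw_norm=True,
    use_post_attention_norm=True,
    use_sliding_window_attention=True,
)
outputs = backbone(
    {
        "token_ids": ops.ones((2, 10), dtype="int32"),
        "padding_mask": ops.ones((2, 10), dtype="int32"),
    }
)
print(outputs.shape)  # (2, 10, 16)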

keras_nlp/src/models/gemma/gemma_causal_lm.py

Lines changed: 8 additions & 0 deletions
@@ -223,9 +223,17 @@ def call_with_cache(
                 cache_update_index=cache_update_index,
             )
             caches.append(next_cache)
+
         cache = ops.stack(caches, axis=1)
         hidden_states = x = self.backbone.layer_norm(x)
         logits = self.backbone.token_embedding(x, reverse=True)
+
+        if self.backbone.final_logit_soft_cap is not None:
+            logits = ops.divide(logits, self.backbone.final_logit_soft_cap)
+            logits = ops.multiply(
+                ops.tanh(logits), self.backbone.final_logit_soft_cap
+            )
+
         return logits, hidden_states, cache

     def _build_cache(self, token_ids):
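
The block added above applies the same cap * tanh(x / cap) soft-capping that gemma_attention.py applies to attention logits, here driven by final_logit_soft_cap; it smoothly bounds logits to (-cap, cap). A minimal sketch with a hypothetical soft_cap helper (the library applies the transform inline rather than through such a helper):

from keras import ops

def soft_cap(x, cap):
    # Hypothetical helper for illustration only.
    return ops.multiply(ops.tanh(ops.divide(x, cap)), cap)

logits = ops.convert_to_tensor([-100.0, -10.0, 0.0, 10.0, 100.0])
print(ops.convert_to_numpy(soft_cap(logits, 30.0)))
# Values are squashed into roughly (-30, 30); small logits pass through
# almost unchanged, large ones saturate.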

keras_nlp/src/models/gemma/gemma_causal_lm_test.py

Lines changed: 15 additions & 0 deletions
@@ -264,3 +264,18 @@ def layer_intercept_fn_for_testing(x, i):
         # Assert shapes for info exfiltrated into the parent context.
         self.assertEqual(ops.shape(embedded_prompts), expected_embedded_shape)
         self.assertEqual(ops.shape(scores), expected_score_shape)
+
+
+class Gemma2CausalLMTest(TestCase):
+    @pytest.mark.large
+    def test_preset(self):
+        # Setup prompts, models, and associated expected shapes.
+        keras.config.set_floatx("bfloat16")
+        gemma_lm = GemmaCausalLM.from_preset(
+            "/usr/local/google/home/grasskin/gemma2/keras-nlp-private/gemma_9b_en"
+        )
+        # gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma2_9b_en")
+        gemma_lm.summary()
+        print(
+            gemma_lm.generate("what is the meaning of life?.", max_length=256)
+        )
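
The new test above loads from a hardcoded local checkout; the commented-out line shows the intended public entry point. A hedged usage sketch, assuming a published gemma2_9b_en preset as referenced in that comment:

import keras_nlp

# Assumes the gemma2_9b_en preset from the commented-out line has been published.
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma2_9b_en")
print(gemma_lm.generate("what is the meaning of life?", max_length=256))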
