Commit f3db233

fix: prob attention shape error while bs>1 (#50)
1 parent 679e349 commit f3db233

2 files changed: +10 -10 lines changed


Diff for: tests/test_models/test_informer.py

+9 -8

@@ -129,20 +129,21 @@ def test_train(self):
         predict_length = 10
         n_encoder_feature = 2
         n_decoder_feature = 3
+        batch_size = 1
 
         x_train = (
-            np.random.rand(1, train_length, 1),
-            np.random.rand(1, train_length, n_encoder_feature),
-            np.random.rand(1, predict_length, n_decoder_feature),
+            np.random.rand(batch_size, train_length, 1),
+            np.random.rand(batch_size, train_length, n_encoder_feature),
+            np.random.rand(batch_size, predict_length, n_decoder_feature),
         )
-        y_train = np.random.rand(1, predict_length, 1)  # target: (batch, predict_length, 1)
+        y_train = np.random.rand(batch_size, predict_length, 1)  # target: (batch, predict_length, 1)
 
         x_valid = (
-            np.random.rand(1, train_length, 1),
-            np.random.rand(1, train_length, n_encoder_feature),
-            np.random.rand(1, predict_length, n_decoder_feature),
+            np.random.rand(batch_size, train_length, 1),
+            np.random.rand(batch_size, train_length, n_encoder_feature),
+            np.random.rand(batch_size, predict_length, n_decoder_feature),
         )
-        y_valid = np.random.rand(1, predict_length, 1)
+        y_valid = np.random.rand(batch_size, predict_length, 1)
 
         model = AutoModel("Informer", predict_length=predict_length, custom_model_params=custom_params)
         trainer = KerasTrainer(model)
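
The test change above replaces every hard-coded batch dimension of 1 with a single `batch_size` variable, so the same shapes can be exercised with larger batches. A minimal sketch of those input shapes with a batch size above 1, using assumed values for `train_length` and the feature counts (the real test defines them earlier in `test_train`):

```python
import numpy as np

# Assumed sizes for illustration; the real test defines these earlier.
batch_size, train_length, predict_length = 4, 20, 10
n_encoder_feature, n_decoder_feature = 2, 3

# Informer inputs keep the (batch, length, features) layout for any batch size.
x_train = (
    np.random.rand(batch_size, train_length, 1),
    np.random.rand(batch_size, train_length, n_encoder_feature),
    np.random.rand(batch_size, predict_length, n_decoder_feature),
)
y_train = np.random.rand(batch_size, predict_length, 1)  # target: (batch, predict_length, 1)
print([a.shape for a in x_train], y_train.shape)
```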

Diff for: tfts/layers/attention_layer.py

+1 -2

@@ -155,10 +155,9 @@ def _prob_qk(self, q, k, sample_k, top_n):
         K_sample = tf.gather(K_sample, indx_q_seq, axis=2)
         K_sample = tf.gather(K_sample, indx_k_seq, axis=3)
 
-        Q_K_sample = tf.squeeze(tf.matmul(tf.expand_dims(q, -2), tf.einsum("...ij->...ji", K_sample)))
+        Q_K_sample = tf.squeeze(tf.matmul(tf.expand_dims(q, -2), tf.einsum("...ij->...ji", K_sample)), axis=3)
         M = tf.math.reduce_max(Q_K_sample, axis=-1) - tf.raw_ops.Div(x=tf.reduce_sum(Q_K_sample, axis=-1), y=L)
         m_top = tf.math.top_k(M, top_n, sorted=False)[1]
-        m_top = m_top[tf.newaxis, tf.newaxis] if B == 1 else m_top
 
         batch_indexes = tf.tile(tf.range(B)[:, tf.newaxis, tf.newaxis], (1, H, top_n))
         head_indexes = tf.tile(tf.range(H)[tf.newaxis, :, tf.newaxis], (B, 1, top_n))
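
Why the axis matters, as the diff reads: the matmul of `tf.expand_dims(q, -2)` with the transposed `K_sample` yields scores of shape `(batch, heads, length, 1, sample_k)`, and a bare `tf.squeeze` drops every size-1 dimension it finds, so the rank of `Q_K_sample` depended on the runtime batch and head sizes; the old `B == 1` re-expansion of `m_top` papered over one such case. Squeezing only `axis=3` removes just the singleton introduced by `expand_dims`, keeping a rank-4 result for any batch size. A standalone sketch with assumed shapes (not the tfts code itself):

```python
import tensorflow as tf

def sampled_scores(q, k_sample, squeeze_axis=None):
    # q: (batch, heads, length, dim); k_sample: (batch, heads, length, sample_k, dim)
    scores = tf.matmul(tf.expand_dims(q, -2), tf.einsum("...ij->...ji", k_sample))
    # scores: (batch, heads, length, 1, sample_k)
    return tf.squeeze(scores) if squeeze_axis is None else tf.squeeze(scores, axis=squeeze_axis)

B, H, L_q, D, sample_k = 1, 4, 32, 16, 8  # assumed sizes for illustration
q = tf.random.normal((B, H, L_q, D))
k_sample = tf.random.normal((B, H, L_q, sample_k, D))

print(sampled_scores(q, k_sample).shape)                  # (4, 32, 8): batch axis silently dropped when B == 1
print(sampled_scores(q, k_sample, squeeze_axis=3).shape)  # (1, 4, 32, 8): rank 4 for any batch size
```

With the axis pinned, `M` and the subsequent `top_k` and gather indexing always see a `(batch, heads, length)` layout, so the `B == 1` special case for `m_top` can be dropped.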
