basf
diff --git a/‎README.md
+8-5 b/‎README.md
+8-5
diff --git a/‎mambular/__version__.py
+3-1 b/‎mambular/__version__.py
+3-1
diff --git a/‎mambular/arch_utils/enode_utils.py
+305 b/‎mambular/arch_utils/enode_utils.py
+305
diff --git a/‎mambular/arch_utils/layer_utils/embedding_layer.py
+5-1 b/‎mambular/arch_utils/layer_utils/embedding_layer.py
+5-1
@@ -76,7 +76,10 @@ Mambular is a Python package that brings the power of advanced deep learning arc
 | `TabulaRNN`      | A Recurrent Neural Network for Tabular data, introduced [here](https://arxiv.org/pdf/2411.17207).                                                   |
 | `MambAttention`  | A combination between Mamba and Transformers, also introduced [here](https://arxiv.org/pdf/2411.17207).                                             |
 | `NDTF`           | A neural decision forest using soft decision trees. See [Kontschieder et al.](https://openaccess.thecvf.com/content_iccv_2015/html/Kontschieder_Deep_Neural_Decision_ICCV_2015_paper.html) for inspiration. |
-| `SAINT`          | Improve neural networs via Row Attention and Contrastive Pre-Training, introduced [here](https://arxiv.org/pdf/2106.01342).                                              |
+| `SAINT`          | Improve neural networks via Row Attention and Contrastive Pre-Training, introduced [here](https://arxiv.org/pdf/2106.01342).                        |
+| `AutoInt`        | Automatic Feature Interaction Learning via Self-Attentive Neural Networks introduced [here](https://arxiv.org/abs/1810.11921).                      |
+| `Trompt`         | Trompt: Towards a Better Deep Neural Network for Tabular Data introduced [here](https://arxiv.org/abs/2305.18446).                                  |
+
 
 
 
@@ -211,13 +214,13 @@ random_search.fit(X, y, **fit_params)
 print("Best Parameters:", random_search.best_params_)
 print("Best Score:", random_search.best_score_)
 ```
-Note, that using this, you can also optimize the preprocessing. Just use the prefix ``prepro__`` when specifying the preprocessor arguments you want to optimize:
+Note that you can also optimize the preprocessing this way. Just add the preprocessor arguments you want to optimize directly to the parameter distributions:
 ```python
 param_dist = {
     'd_model': randint(32, 128),  
     'n_layers': randint(2, 10),  
     'lr': uniform(1e-5, 1e-3),
-    "prepro__numerical_preprocessing": ["ple", "standardization", "box-cox"]
+    "numerical_preprocessing": ["ple", "standardization", "box-cox"]
 }
 
 ```
@@ -321,7 +324,7 @@ Here's how you can implement a custom model with Mambular:
    Define your custom model just as you would for an `nn.Module`. The main difference is that you will inherit from `BaseModel` and use the provided feature information to construct your layers. To integrate your model into the existing API, you only need to define the architecture and the forward pass.
 
    ```python
-   from mambular.base_models import BaseModel
+   from mambular.base_models.utils import BaseModel
    from mambular.utils.get_feature_dimensions import get_feature_dimensions
    import torch
    import torch.nn
@@ -365,7 +368,7 @@ Here's how you can implement a custom model with Mambular:
    You can build a regression, classification, or distributional regression model that can leverage all of Mambular's built-in methods by using the following:
 
    ```python
-   from mambular.models import SklearnBaseRegressor
+   from mambular.models.utils import SklearnBaseRegressor
 
    class MyRegressor(SklearnBaseRegressor):
        def __init__(self, **kwargs):
 
@@ -16,4 +16,6 @@
 #
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "1.2.1"
+
+__version__ = "1.3.0"
+
@@ -0,0 +1,305 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mambular.arch_utils.layer_utils.sparsemax import sparsemax, sparsemoid
+from .data_aware_initialization import ModuleWithInit
+from .numpy_utils import check_numpy
+import numpy as np
+from warnings import warn
+
+
+class ODSTE(ModuleWithInit):
+    """Oblivious Differentiable Sparsemax Trees (ODST) with Feature & Embedding Splitting.
+
+    The layer consumes inputs of shape ``(batch_size, in_features, embed_dim)``,
+    abbreviated ``(B, J, D)`` below. For every tree and every depth level it
+    mixes the ``J`` features with ``choice_function`` weights, re-weights the
+    ``D`` embedding coordinates, compares the result against a learned
+    threshold and combines the per-level soft decisions into leaf weights.
+
+    Parameters
+    ----------
+    in_features : int
+        Number of input features (J).
+    num_trees : int
+        Number of trees in the layer.
+    embed_dim : int
+        Embedding dimension per feature (D).
+    depth : int, optional
+        Number of split levels per tree; ``2**depth`` responses are stored per
+        tree. Default is 6.
+    tree_dim : int, optional
+        Number of response channels stored per leaf. ``forward`` sums over
+        this axis. Default is 1.
+    flatten_output : bool, optional
+        Stored on the instance and reported by ``repr``; it is not read by
+        ``forward``. Default is True.
+    choice_function : callable, optional
+        Called as ``choice_function(logits, dim=-1)`` to turn selection logits
+        into selection weights. Default is ``sparsemax``.
+    bin_function : callable, optional
+        Applied to the stacked threshold logits to obtain soft binary
+        decisions. Default is ``sparsemoid``.
+    initialize_response_ : callable, optional
+        In-place initializer applied to the response tensor.
+        Default is ``nn.init.normal_``.
+    initialize_selection_logits_ : callable, optional
+        In-place initializer applied to the feature selection logits.
+        Default is ``nn.init.uniform_``.
+    threshold_init_beta : float, optional
+        Both shape parameters of the Beta distribution from which the
+        threshold percentiles are sampled in ``initialize``. Default is 1.0.
+    threshold_init_cutoff : float, optional
+        Controls the percentile and the scaling used for the temperatures in
+        ``initialize``. Default is 1.0.
+    """
+
+    def __init__(
+        self,
+        in_features,  # J (number of features)
+        num_trees,
+        embed_dim,  # D (embedding dimension per feature)
+        depth=6,
+        tree_dim=1,
+        flatten_output=True,
+        choice_function=sparsemax,
+        bin_function=sparsemoid,
+        initialize_response_=nn.init.normal_,
+        initialize_selection_logits_=nn.init.uniform_,
+        threshold_init_beta=1.0,
+        threshold_init_cutoff=1.0,
+    ):
+        """Create the parameters of the layer; see the class docstring for the arguments."""
+        super().__init__()
+        self.depth, self.num_trees, self.tree_dim, self.flatten_output = (
+            depth,
+            num_trees,
+            tree_dim,
+            flatten_output,
+        )
+        self.choice_function, self.bin_function = choice_function, bin_function
+        self.in_features, self.embed_dim = in_features, embed_dim
+        self.threshold_init_beta, self.threshold_init_cutoff = (
+            threshold_init_beta,
+            threshold_init_cutoff,
+        )
+
+        # Response values for each leaf: (num_trees, tree_dim, D, 2**depth)
+        self.response = nn.Parameter(
+            torch.zeros([num_trees, tree_dim, embed_dim, 2**depth]), requires_grad=True
+        )
+        initialize_response_(self.response)
+
+        # Feature selection logits (choose J): (num_trees, depth, J)
+        self.feature_selection_logits = nn.Parameter(
+            torch.zeros([num_trees, depth, in_features]), requires_grad=True
+        )
+        initialize_selection_logits_(self.feature_selection_logits)
+
+        # Embedding selection logits (choose D within J): (num_trees, depth, J, D)
+        self.embedding_selection_logits = nn.Parameter(
+            torch.randn([num_trees, depth, in_features, embed_dim])
+        )
+
+        # Thresholds & temperatures (random initialization, overwritten by `initialize`)
+        self.feature_thresholds = nn.Parameter(torch.randn([num_trees, depth]))
+        self.log_temperatures = nn.Parameter(torch.randn([num_trees, depth]))
+
+        # Binary code mappings: entry [t, c] is bit t of leaf index c, stored
+        # together with its complement along the last axis -> (depth, 2**depth, 2).
+        with torch.no_grad():
+            indices = torch.arange(2**self.depth)
+            offsets = 2 ** torch.arange(self.depth)
+            bin_codes = (indices.view(1, -1) // offsets.view(-1, 1) % 2).to(
+                torch.float32
+            )
+            bin_codes_1hot = torch.stack([bin_codes, 1.0 - bin_codes], dim=-1)
+            self.bin_codes_1hot = nn.Parameter(bin_codes_1hot, requires_grad=False)
+
+    def _select_embeddings(self, x):
+        """Return the split inputs of shape (B, num_trees, depth, D) for input ``x`` of shape (B, J, D)."""
+        # Select features (J): weights of shape (num_trees, depth, J)
+        feature_selectors = self.choice_function(
+            self.feature_selection_logits, dim=-1
+        )
+        selected_features = torch.einsum("bjd,ntj->bntd", x, feature_selectors)
+
+        # Select embeddings (D): weights of shape (num_trees, depth, J, D);
+        # the einsum sums these weights over J before the element-wise product.
+        embedding_selectors = self.choice_function(
+            self.embedding_selection_logits, dim=-1
+        )
+        return torch.einsum(
+            "bntd,ntjd->bntd", selected_features, embedding_selectors
+        )
+
+    def initialize(self, x, eps=1e-6):
+        """Data-aware initialization of thresholds and log-temperatures based on input data.
+
+        NOTE(review): presumably triggered by ``ModuleWithInit`` on the first
+        call of the module -- confirm against that base class.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor of shape [batch_size, in_features, embed_dim] used for threshold initialization.
+        eps : float, optional
+            Small value added to avoid log(0) errors in temperature initialization. Default is 1e-6.
+
+        Raises
+        ------
+        ValueError
+            If ``x`` is not a 3-dimensional tensor.
+        """
+        if x.ndim != 3:
+            raise ValueError("Input tensor must have shape (batch_size, J, D)")
+
+        if x.shape[0] < 1000:
+            # Adjacent literals are concatenated verbatim, so each sentence
+            # carries its own trailing space.
+            warn(
+                "Data-aware initialization is performed on less than 1000 data points. This may cause instability. "
+                "To avoid potential problems, run this model on a data batch with at least 1000 data samples. "
+                "You can do so manually before training. Use with torch.no_grad() for memory efficiency."
+            )
+
+        with torch.no_grad():
+            selected_embeddings = self._select_embeddings(x)
+            # selected_embeddings shape: (B, num_trees, depth, D)
+
+            # Initialize thresholds using percentiles from the data: one
+            # randomly drawn percentile per (tree, depth) pair.
+            percentiles_q = 100 * np.random.beta(
+                self.threshold_init_beta,
+                self.threshold_init_beta,
+                size=[self.num_trees, self.depth],
+            )
+
+            # One row per (tree, depth) pair holding all B * D observed values.
+            reshaped_embeddings = selected_embeddings.permute(1, 2, 0, 3).reshape(
+                self.num_trees * self.depth, -1
+            )
+            thresholds = [
+                np.percentile(values, q)
+                for values, q in zip(
+                    check_numpy(reshaped_embeddings), percentiles_q.flatten()
+                )
+            ]
+            self.feature_thresholds.data[...] = torch.as_tensor(
+                thresholds,
+                dtype=selected_embeddings.dtype,
+                device=selected_embeddings.device,
+            ).view(self.num_trees, self.depth)
+
+            # Initialize temperatures based on the threshold differences
+            temperatures = np.percentile(
+                check_numpy(
+                    abs(selected_embeddings - self.feature_thresholds.unsqueeze(-1))
+                ),
+                q=100 * min(1.0, self.threshold_init_cutoff),
+                axis=0,
+            )
+
+            # Scale temperatures based on the cutoff
+            temperatures /= max(1.0, self.threshold_init_cutoff)
+
+            # Average over D to get one temperature per (tree, depth) pair.
+            self.log_temperatures.data[...] = torch.log(
+                torch.as_tensor(
+                    temperatures.mean(-1),
+                    dtype=selected_embeddings.dtype,
+                    device=selected_embeddings.device,
+                )
+                + eps
+            )
+
+    def forward(self, x):
+        """Evaluate all trees on ``x``.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor of shape [batch_size, in_features, embed_dim].
+
+        Returns
+        -------
+        torch.Tensor
+            Tensor of shape [batch_size, num_trees, embed_dim]; the leaf and
+            ``tree_dim`` axes of the response are summed out.
+
+        Raises
+        ------
+        ValueError
+            If ``x`` is not a 3-dimensional tensor.
+        """
+        if x.ndim != 3:
+            raise ValueError("Input tensor must have shape (batch_size, J, D)")
+
+        # Select feature (J) and embedding dimension (D): (B, num_trees, depth, D)
+        selected_embeddings = self._select_embeddings(x)
+
+        # Compute threshold logits, scaled by the inverse temperature
+        threshold_logits = (
+            selected_embeddings - self.feature_thresholds.unsqueeze(0).unsqueeze(-1)
+        ) * torch.exp(-self.log_temperatures.unsqueeze(0).unsqueeze(-1))
+
+        threshold_logits = torch.stack([-threshold_logits, threshold_logits], dim=-1)
+
+        # Compute binary decisions: (B, num_trees, depth, D, 2)
+        bins = self.bin_function(threshold_logits)
+
+        # Match the decisions against every leaf's binary code: (B, num_trees, depth, D, 2**depth)
+        bin_matches = torch.einsum("bntds,tcs->bntdc", bins, self.bin_codes_1hot)
+
+        # A leaf's weight is the product of its matches over the depth axis.
+        response_weights = torch.prod(bin_matches, dim=2)
+
+        # Compute final response
+        response = torch.einsum("bnds,ncds->bnd", response_weights, self.response)
+        return response
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}(in_features={self.in_features}, "
+            f"embed_dim={self.embed_dim}, num_trees={self.num_trees}, "
+            f"depth={self.depth}, tree_dim={self.tree_dim}, "
+            f"flatten_output={self.flatten_output})"
+        )
+
+
+class DenseBlock(nn.Module):
+    """Stack of `Module` layers (e.g., ODSTE) with dense feature concatenation.
+
+    Each layer receives the current feature tensor and its output is appended
+    along the feature axis (dim 1), so later layers see the original features
+    plus the outputs of the layers before them. The attention path is
+    disabled: `attention_layers` is created but stays empty.
+
+    Parameters
+    ----------
+    input_dim : int
+        Number of features (J) in the input.
+    embed_dim : int
+        Embedding dimension per feature (D), passed to every layer.
+    layer_dim : int
+        Passed to every layer as `num_trees`.
+    num_layers : int
+        Number of layers to stack in the block.
+    tree_dim : int, optional
+        Passed to every layer as `tree_dim`. Default is 1.
+    max_features : int, optional
+        Upper bound on the number of features handed to a layer.
+        Default is None (no bound).
+    input_dropout : float, optional
+        Dropout rate applied to each layer's input during training. Default is 0.0.
+    flatten_output : bool, optional
+        Stored on the instance; the layers themselves are always built with
+        `flatten_output=True`. Default is True.
+    Module : nn.Module, optional
+        Module class to use for each layer in the block. Default is `ODSTE`.
+    **kwargs : dict
+        Additional keyword arguments for `Module` instances.
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        embed_dim,
+        layer_dim,
+        num_layers,
+        tree_dim=1,
+        max_features=None,
+        input_dropout=0.0,
+        flatten_output=True,
+        Module=ODSTE,
+        **kwargs,
+    ):
+        super().__init__()
+        self.num_layers = num_layers
+        self.layer_dim = layer_dim
+        self.tree_dim = tree_dim
+        self.max_features = max_features
+        self.input_dropout = input_dropout
+        self.flatten_output = flatten_output
+
+        # Kept (empty) so the attribute exists; no attention layer is registered.
+        self.attention_layers = nn.ModuleList()
+        self.odste_layers = nn.ModuleList()
+
+        feature_cap = max_features or float("inf")
+        current_dim = input_dim
+        for _ in range(num_layers):
+            layer = Module(
+                in_features=current_dim,
+                embed_dim=embed_dim,
+                num_trees=layer_dim,
+                tree_dim=tree_dim,
+                flatten_output=True,
+                **kwargs,
+            )
+            self.odste_layers.append(layer)
+            # NOTE(review): assumes each layer appends layer_dim * tree_dim
+            # features along dim 1 -- confirm this matches the output of
+            # `Module` when tree_dim > 1.
+            current_dim = min(current_dim + layer_dim * tree_dim, feature_cap)
+
+    def forward(self, x):
+        """Run the input through every layer, concatenating each output to the features.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor of shape [batch_size, J, D].
+
+        Returns
+        -------
+        torch.Tensor
+            The input features followed by the outputs of the layers,
+            concatenated along dim 1.
+        """
+        base_count = x.shape[1]  # J (num features)
+        feature_cap = self.max_features
+
+        for layer in self.odste_layers:
+            if feature_cap is not None:
+                # Keep the original features plus only the most recent extras
+                # so the layer input never exceeds the cap.
+                extra_count = min(feature_cap, x.shape[1]) - base_count
+                if extra_count > 0:
+                    head = x[:, :base_count, :]
+                    tail = x[:, -extra_count:, :]
+                    x = torch.cat([head, tail], dim=1)
+
+            if self.training and self.input_dropout:
+                x = F.dropout(x, self.input_dropout)
+
+            # Append the layer's output as new features.
+            x = torch.cat([x, layer(x)], dim=1)
+
+        return x
@@ -156,7 +156,11 @@ def forward(self, num_features, cat_features, emb_features):
         # Process categorical embeddings
         if self.cat_embeddings and cat_features is not None:
             cat_embeddings = [
-                emb(cat_features[i]) if emb(cat_features[i]).ndim == 3 else emb(cat_features[i]).unsqueeze(1)
+                (
+                    emb(cat_features[i])
+                    if emb(cat_features[i]).ndim == 3
+                    else emb(cat_features[i]).unsqueeze(1)
+                )
                 for i, emb in enumerate(self.cat_embeddings)
             ]
Original file line number	Diff line number	Diff line change
`@@ -16,4 +16,6 @@`
`16`	`16`	`#`
`17`	`17`
`18`	`18`	`# The following line must be the last in the module, exactly as formatted:`
`19`		`-__version__ = "1.2.1"`
	`19`	`+`
	`20`	`+__version__ = "1.3.0"`
	`21`	`+`