
Commit 8e923a0

Resolve conflicts with main
2 parents d4a87e2 + e628d1e commit 8e923a0

16 files changed: +1034 -27 lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -39,12 +39,16 @@ add_library(neural-fortran
   src/nf/nf_input3d_layer_submodule.f90
   src/nf/nf_layer_constructors.f90
   src/nf/nf_layer_constructors_submodule.f90
+  src/nf/nf_layernorm.f90
+  src/nf/nf_layernorm_submodule.f90
   src/nf/nf_layer.f90
   src/nf/nf_layer_submodule.f90
   src/nf/nf_locally_connected_1d_submodule.f90
   src/nf/nf_locally_connected_1d.f90
   src/nf/nf_linear2d_layer.f90
   src/nf/nf_linear2d_layer_submodule.f90
+  src/nf/nf_embedding_layer.f90
+  src/nf/nf_embedding_layer_submodule.f90
   src/nf/nf_loss.f90
   src/nf/nf_loss_submodule.f90
   src/nf/nf_maxpool1d_layer.f90

README.md

Lines changed: 4 additions & 2 deletions
@@ -30,12 +30,14 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
 | Layer type | Constructor name | Supported input layers | Rank of output array | Forward pass | Backward pass |
 |------------|------------------|------------------------|----------------------|--------------|---------------|
 | Input | `input` | n/a | 1, 2, 3 | n/a | n/a |
+| Embedding | `embedding` | n/a | 2 | ✅ | ✅ |
 | Dense (fully-connected) | `dense` | `input1d`, `dense`, `dropout`, `flatten` | 1 | ✅ | ✅ |
 | Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ |
 | Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) |
 | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ |
-| Linear (2-d) | `linear2d` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
-| Self-attention | `self_attention` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Linear (2-d) | `linear2d` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Self-attention | `self_attention` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Layer Normalization | `layernorm` | `linear2d`, `self_attention` | 2 | ✅ | ✅ |
 | Flatten | `flatten` | `input2d`, `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ |
 | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ |
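
As a usage illustration of where the new `layernorm` constructor fits, here is a minimal sketch consistent with the supported-input columns above, assuming the two-argument `input` form that produces the rank-2 `input2d` layer listed in the table; the hyperparameters (sequence length 16, model dimension 64, 4 attention heads) are made up, and the program only builds the network:

  program layernorm_placement_sketch
    ! Sketch only: layernorm placed after self_attention and before linear2d,
    ! per the "Supported input layers" column in the table above.
    use nf, only: network, input, linear2d, self_attention, layernorm
    implicit none
    type(network) :: net

    net = network([ &
      input(16, 64), &       ! rank-2 input: (sequence_length, model_dimension)
      self_attention(4), &   ! self-attention with 4 heads
      layernorm(), &         ! layer normalization over the model dimension
      linear2d(64) &         ! linear projection back to 64 features
    ])
  end program layernorm_placement_sketch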

fpm.toml

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 name = "neural-fortran"
-version = "0.19.0"
+version = "0.20.0"
 license = "MIT"
 author = "Milan Curcic"
 maintainer = "[email protected]"

src/nf.f90

Lines changed: 15 additions & 2 deletions
@@ -3,8 +3,21 @@ module nf
   use nf_datasets_mnist, only: label_digits, load_mnist
   use nf_layer, only: layer
   use nf_layer_constructors, only: &
-    conv1d, conv2d, dense, dropout, flatten, input, linear2d, locally_connected_1d, &
-    maxpool1d, maxpool2d, reshape, reshape2d, self_attention
+    conv1d, &
+    conv2d, &
+    dense, &
+    dropout, &
+    embedding, &
+    flatten, &
+    input, &
+    layernorm, &
+    linear2d, &
+    locally_connected_1d, &
+    maxpool1d, &
+    maxpool2d, &
+    reshape, &
+    reshape2d, &
+    self_attention
   use nf_loss, only: mse, quadratic
   use nf_metrics, only: corr, maxabs
   use nf_network, only: network

src/nf/nf_embedding_layer.f90

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+module nf_embedding_layer
+
+  use nf_activation, only: activation_function
+  use nf_base_layer, only: base_layer
+
+  implicit none
+
+  private
+  public :: embedding_layer
+
+  type, extends(base_layer) :: embedding_layer
+    !! Embedding Layer
+    !! Stores inputs as a trainable lookup table. Inputs are
+    !! integer indices in a dictionary of `vocab_size`.
+    !! This layer converts them into a table of shape
+    !! (`sequence_length`, `model_dimension`)
+    integer :: sequence_length, vocab_size, model_dimension
+    integer :: positional
+
+    real, allocatable :: weights(:, :)
+    real, allocatable :: output(:, :)
+    real, allocatable :: dw(:, :) ! weight gradients
+
+  contains
+
+    procedure :: backward
+    procedure :: forward
+    procedure :: positional_trigonometric
+    procedure :: positional_absolute
+    procedure :: init
+    procedure :: get_num_params
+    procedure :: get_params
+    procedure :: get_gradients
+    procedure :: set_params
+
+  end type embedding_layer
+
+  interface embedding_layer
+    module function embedding_layer_cons(vocab_size, model_dimension, positional) result(res)
+      integer, intent(in) :: vocab_size, model_dimension
+      integer, optional :: positional
+      type(embedding_layer) :: res
+    end function embedding_layer_cons
+  end interface embedding_layer
+
+  interface
+    pure module subroutine forward(self, input)
+      !! Get vectors by indices in the dictionary
+      class(embedding_layer), intent(in out) :: self
+      integer, intent(in) :: input(:)
+    end subroutine forward
+
+    pure module subroutine backward(self, input, gradient)
+      !! Update gradients at the `input` indices:
+      !! dw_i = dw_i + d_output_i
+      class(embedding_layer), intent(in out) :: self
+      integer, intent(in) :: input(:)
+      real, intent(in) :: gradient(:, :)
+    end subroutine backward
+
+    pure module subroutine positional_trigonometric(self, pos)
+      !! Sum embedding with positional info (trigonometric, not trainable)
+      class(embedding_layer), intent(in out) :: self
+      integer, intent(in) :: pos
+    end subroutine positional_trigonometric
+
+    pure module subroutine positional_absolute(self, pos)
+      !! Sum embedding with absolute position
+      class(embedding_layer), intent(in out) :: self
+      integer, intent(in) :: pos
+    end subroutine positional_absolute
+
+    module subroutine init(self, input_shape)
+      class(embedding_layer), intent(in out) :: self
+      integer, intent(in) :: input_shape(:)
+    end subroutine init
+
+    pure module function get_num_params(self) result(num_params)
+      class(embedding_layer), intent(in) :: self
+      integer :: num_params
+    end function get_num_params
+
+    module function get_params(self) result(params)
+      class(embedding_layer), intent(in), target :: self
+      real, allocatable :: params(:)
+    end function get_params
+
+    module function get_gradients(self) result(gradients)
+      class(embedding_layer), intent(in), target :: self
+      real, allocatable :: gradients(:)
+    end function get_gradients
+
+    module subroutine set_params(self, params)
+      class(embedding_layer), intent(in out) :: self
+      real, intent(in), target :: params(:)
+    end subroutine set_params
+  end interface
+end module nf_embedding_layer
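
To make the lookup-table behavior concrete, here is a small sketch that drives the bare `embedding_layer` type directly, outside the `network` wrapper; the vocabulary size, embedding dimension, and token indices are made up for illustration:

  program embedding_lookup_demo
    use nf_embedding_layer, only: embedding_layer
    implicit none
    type(embedding_layer) :: emb
    integer :: tokens(4)

    ! A vocabulary of 10 tokens, each mapped to a 3-dimensional vector;
    ! positional encoding is left at its default (none).
    emb = embedding_layer(vocab_size=10, model_dimension=3)
    call emb % init([4])          ! sets sequence_length = 4 and allocates weights

    tokens = [3, 7, 7, 1]         ! arbitrary token indices into the vocabulary
    call emb % forward(tokens)    ! each row of output is the looked-up vector

    print *, shape(emb % output)  ! expected: 4 3
  end program embedding_lookup_demo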

src/nf/nf_embedding_layer_submodule.f90

Lines changed: 137 additions & 0 deletions

@@ -0,0 +1,137 @@
+#define NONE 0
+#define TRIGONOMETRIC 1
+#define ABSOLUTE 2
+
+submodule(nf_embedding_layer) nf_embedding_layer_submodule
+  use nf_base_layer, only: base_layer
+  implicit none
+contains
+  module function embedding_layer_cons(vocab_size, model_dimension, positional) result(res)
+    integer, intent(in) :: vocab_size, model_dimension
+    integer, optional :: positional
+    type(embedding_layer) :: res
+
+    res % vocab_size = vocab_size
+    res % model_dimension = model_dimension
+    if (.not. present(positional)) then
+      res % positional = NONE
+    else
+      res % positional = positional
+    end if
+  end function embedding_layer_cons
+
+  module subroutine init(self, input_shape)
+    class(embedding_layer), intent(in out) :: self
+    integer, intent(in) :: input_shape(:)
+
+    self % sequence_length = input_shape(1)
+
+    allocate(self % output(self % sequence_length, self % model_dimension))
+
+    allocate(self % weights(self % vocab_size, self % model_dimension))
+    self % weights = 0.1
+
+    allocate(self % dw(self % vocab_size, self % model_dimension))
+    self % dw = 0.0
+  end subroutine init
+
+  pure module subroutine forward(self, input)
+    class(embedding_layer), intent(in out) :: self
+    integer, intent(in) :: input(:)
+    integer :: i, index
+
+    do concurrent(i = 1: self % sequence_length)
+      index = input(i)
+      if (index > size(self % weights, 1)) then
+        index = 1
+      elseif (index == 0) then
+        index = 1
+      end if
+
+      self % output(i, :) = self % weights(index, :)
+
+      if (self % positional == TRIGONOMETRIC) then
+        call self % positional_trigonometric(i)
+      elseif (self % positional == ABSOLUTE) then
+        call self % positional_absolute(i)
+      end if
+    end do
+  end subroutine forward
+
+  pure module subroutine backward(self, input, gradient)
+    class(embedding_layer), intent(in out) :: self
+    integer, intent(in) :: input(:)
+    real, intent(in) :: gradient(:, :)
+    integer :: i
+
+    do concurrent(i = 1: self % sequence_length)
+      self % dw(input(i), :) = self % dw(input(i), :) + gradient(i, :)
+    end do
+  end subroutine backward
+
+  pure module subroutine positional_trigonometric(self, pos)
+    class(embedding_layer), intent(in out) :: self
+    integer, intent(in) :: pos
+    integer :: i
+    real :: theta
+
+    do concurrent(i = 1: floor(real(self % model_dimension) / 2))
+      theta = (pos - 1) / 10000 ** (real(2 * (i-1)) / self % model_dimension)
+      self % output(pos, 2 * i - 1) = self % output(pos, 2 * i - 1) + sin(theta)
+      self % output(pos, 2 * i) = self % output(pos, 2 * i) + cos(theta)
+    end do
+  end subroutine positional_trigonometric
+
+  pure module subroutine positional_absolute(self, pos)
+    class(embedding_layer), intent(in out) :: self
+    integer, intent(in) :: pos
+    integer :: i
+
+    do concurrent(i = 1: self % model_dimension)
+      self % output(pos, i) = self % output(pos, i) + pos - 1
+    end do
+  end subroutine positional_absolute
+
+  pure module function get_num_params(self) result(num_params)
+    class(embedding_layer), intent(in) :: self
+    integer :: num_params
+    num_params = self % vocab_size * self % model_dimension
+  end function get_num_params
+
+  module function get_params(self) result(params)
+    class(embedding_layer), intent(in), target :: self
+    real, allocatable :: params(:)
+    real, pointer :: w_(:) => null()
+
+    w_(1: product(shape(self % weights))) => self % weights
+    params = w_
+  end function get_params
+
+  module function get_gradients(self) result(gradients)
+    class(embedding_layer), intent(in), target :: self
+    real, allocatable :: gradients(:)
+    real, pointer :: dw_(:) => null()
+
+    dw_(1: product(shape(self % dw))) => self % dw
+    gradients = dw_
+  end function get_gradients
+
+  module subroutine set_params(self, params)
+    class(embedding_layer), intent(in out) :: self
+    real, intent(in), target :: params(:)
+
+    real, pointer :: p_(:,:) => null()
+
+    ! check if the number of parameters is correct
+    if (size(params) /= self % get_num_params()) then
+      error stop 'Error: number of parameters does not match'
+    end if
+
+    associate(n => self % vocab_size * self % model_dimension)
+      ! reshape the weights
+      p_(1:self % vocab_size, 1:self % model_dimension) => params(1 : n)
+      self % weights = p_
+    end associate
+
+  end subroutine set_params
+end submodule nf_embedding_layer_submodule
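
In equation form, the trigonometric branch above is the usual sinusoidal positional encoding, written with the zero-based position $p = \mathrm{pos} - 1$ and the pair index $i = 1, \dots, \lfloor d_\mathrm{model}/2 \rfloor$ used in the code:

$$
\theta_{p,i} = \frac{p}{10000^{\,2(i-1)/d_\mathrm{model}}}, \qquad
\mathrm{output}(\mathrm{pos},\, 2i-1) \mathrel{+}= \sin\theta_{p,i}, \qquad
\mathrm{output}(\mathrm{pos},\, 2i) \mathrel{+}= \cos\theta_{p,i}.
$$

The absolute variant simply adds the zero-based position $p$ to every component of the embedding at that position.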

src/nf/nf_layer_constructors.f90

Lines changed: 42 additions & 12 deletions
@@ -14,11 +14,16 @@ module nf_layer_constructors
     dense, &
     dropout, &
     flatten, &
-    input, locally_connected_1d, maxpool1d, &
+    input, &
     linear2d, &
+    locally_connected_1d, &
+    maxpool1d, &
     maxpool2d, &
-    reshape, reshape2d, &
-    self_attention
+    reshape, &
+    reshape2d, &
+    self_attention, &
+    embedding, &
+    layernorm
 
   interface input
 
@@ -310,15 +315,40 @@ module function linear2d(out_features) result(res)
       !! Resulting layer instance
     end function linear2d
 
-    module function self_attention(num_heads) result(res)
-      !! Rank-2 (sequence_length, out_features) self attention constructor.
-      !! sequence_length and model_dimension are determined at layer initialization, based on the
-      !! output shape of the previous layer.
-      integer, intent(in) :: num_heads
-      !! Number of attention heads
-      type(layer) :: res
-      !! Resulting layer instance
-    end function self_attention
+    module function self_attention(num_heads) result(res)
+      !! Rank-2 (sequence_length, out_features) self attention constructor.
+      !! sequence_length and model_dimension are determined at layer initialization, based on the
+      !! output shape of the previous layer.
+      integer, intent(in) :: num_heads
+      !! Number of attention heads
+      type(layer) :: res
+      !! Resulting layer instance
+    end function self_attention
+
+    module function embedding(sequence_length, vocab_size, model_dimension, positional) result(res)
+      !! Embedding layer constructor.
+      !!
+      !! This layer inputs token indices from the dictionary into the network.
+      !! It works as a trainable lookup table that converts each index into a vector.
+      !! The embedding layer must be the first layer in a network.
+      integer, intent(in) :: sequence_length
+      !! Maximum length of the input sequence
+      integer, intent(in) :: vocab_size
+      !! Size of the token vocabulary
+      integer, intent(in) :: model_dimension
+      !! Size of the target embeddings
+      integer, optional, intent(in) :: positional
+      !! Type of positional encoding to apply
+      type(layer) :: res
+    end function embedding
+
+    module function layernorm() result(res)
+      !! Layer normalization:
+      !! (x - mean(x)) / sqrt(variance(x) + eps) * gamma + beta
+      !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton (2016)`:
+      !! https://arxiv.org/abs/1607.06450v1
+      type(layer) :: res
+    end function layernorm
 
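
As a final usage note, here is a hedged sketch of calling the two new constructors at the network level; the integer code 1 for `positional` mirrors the TRIGONOMETRIC value defined in nf_embedding_layer_submodule.f90, and wiring `self_attention` directly after `embedding` is an assumption based on the constructor docstring rather than something spelled out in the README table:

  program embedding_constructor_sketch
    use nf, only: network, embedding, self_attention, layernorm
    implicit none
    type(network) :: net

    ! Sequences of 32 token indices over a 5000-word vocabulary, mapped to
    ! 128-dimensional embeddings with trigonometric positional encoding.
    net = network([ &
      embedding(32, 5000, 128, positional=1), &
      self_attention(8), &
      layernorm() &
    ])
  end program embedding_constructor_sketch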