diff --git a/.github/workflows/ci-build-manual-eval.yml b/.github/workflows/ci-build-manual-eval.yml
new file mode 100644
index 0000000000..292c12c960
--- /dev/null
+++ b/.github/workflows/ci-build-manual-eval.yml
@@ -0,0 +1,47 @@
+name: Build and push the evaluation docker image
+
+on:
+  workflow_dispatch:
+    inputs:
+      custom_tag:
+        type: string
+        description: Docker image tag
+        required: true
+        default: "latest"
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up JDK 17
+        uses: actions/setup-java@v4
+        with:
+          java-version: '17.0.10+7'
+          distribution: 'temurin'
+          cache: 'gradle'
+      - name: Build with Gradle
+        run: ./gradlew build -x test
+
+  docker-build-full:
+    needs: [ build ]
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Free up disk space
+        run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+      - uses: actions/checkout@v4
+      - name: Build and push
+        id: docker_build
+        uses: mr-smithers-excellent/docker-build-push@v5
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME_LFOPPIANO }}
+          password: ${{ secrets.DOCKERHUB_TOKEN_LFOPPIANO }}
+          image: lfoppiano/grobid-evaluation
+          registry: docker.io
+          pushImage: true
+          tags: latest, ${{ github.event.inputs.custom_tag }}
+          dockerfile: Dockerfile.evaluation
+      - name: Image digest
+        run: echo ${{ steps.docker_build.outputs.digest }}
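Note: since the workflow is triggered via workflow_dispatch, it can also be started from a terminal. A minimal sketch, assuming the GitHub CLI (gh) is installed and authenticated against this repository; the tag value is only an example:

    # trigger the manual evaluation image build with a custom image tag
    gh workflow run ci-build-manual-eval.yml -f custom_tag=0.8.2-RC1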
diff --git a/Dockerfile.evaluation b/Dockerfile.evaluation
new file mode 100644
index 0000000000..31fbcde974
--- /dev/null
+++ b/Dockerfile.evaluation
@@ -0,0 +1,63 @@
+## Grobid evaluation image
+## ------
+# https://grobid.readthedocs.io/en/latest/End-to-end-evaluation/
+# NOTE: to reproduce the exact evaluation published in the Grobid documentation, a running Biblio-glutton instance is required
+
+FROM lfoppiano/grobid:0.8.2-RC1-full as runtime
+
+# setting the locale is likely unnecessary, but done to be safe
+ENV LANG C.UTF-8
+
+USER root
+
+RUN apt-get update && \
+    apt-get -y --no-install-recommends install unzip wget
+
+WORKDIR /opt/grobid
+
+# gradle
+COPY gradle/ ./gradle/
+COPY gradlew ./
+COPY gradle.properties ./
+COPY build.gradle ./
+COPY settings.gradle ./
+
+# source
+COPY grobid-core/ ./grobid-core/
+COPY grobid-service/ ./grobid-service/
+COPY grobid-trainer/ ./grobid-trainer/
+
+# set the DL-powered configuration as the default one
+COPY grobid-home/config/grobid-evaluation.yaml grobid-home/config/config.yaml
+
+RUN rm -rf /opt/grobid/grobid-home/models/*-with_ELMo \
+    && mkdir /opt/grobid/evaluation
+
+# Download the evaluation data
+WORKDIR /opt/grobid/evaluation
+RUN wget https://zenodo.org/records/3873702/files/biorxiv-10k-test-2000.zip -O biorxiv-10k-test-2000.zip \
+    && unzip biorxiv-10k-test-2000.zip -d biorxiv-10k-test-2000 \
+    && wget https://zenodo.org/records/7708580/files/eLife_984.zip -O eLife_984.zip \
+    && unzip eLife_984.zip -d eLife_984 \
+    && wget https://zenodo.org/records/7708580/files/PLOS_1000.zip -O PLOS_1000.zip \
+    && unzip PLOS_1000.zip -d PLOS_1000 \
+    && wget https://zenodo.org/records/7708580/files/PMC_sample_1943.zip -O PMC_sample_1943.zip \
+    && unzip PMC_sample_1943.zip -d PMC_sample_1943 \
+    && rm *.zip
+
+#RUN wget -q https://zenodo.org/records/7708580/files/PMC_sample_1943.zip -O PMC_sample_1943.zip \
+#    && unzip PMC_sample_1943.zip -d PMC_sample_1943 \
+#    && rm *.zip
+
+VOLUME ["/opt/grobid/grobid-home/tmp"]
+
+WORKDIR /opt/grobid
+
+CMD ["/bin/bash", "-c", "./gradlew jatsEval -Pp2t=/opt/grobid/evaluation/PMC_sample_1943 -Prun=1 -PfileRatio=1; ./gradlew jatsEval -Pp2t=/opt/grobid/evaluation/biorxiv-10k-test-2000 -Prun=1 -PfileRatio=1; ./gradlew jatsEval -Pp2t=/opt/grobid/evaluation/eLife_984 -Prun=1 -PfileRatio=1; ./gradlew jatsEval -Pp2t=/opt/grobid/evaluation/PLOS_1000 -Prun=1 -PfileRatio=1;"]
+
+LABEL \
+    authors="The contributors" \
+    org.label-schema.name="Grobid" \
+    org.label-schema.description="Image running the Grobid end-to-end evaluation" \
+    org.label-schema.url="https://github.com/kermitt2/Grobid" \
+    org.label-schema.version=${GROBID_VERSION}
\ No newline at end of file
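Note: the image can be built and exercised locally before wiring it into the workflow. A minimal sketch, assuming it is run from the repository root and that enough disk space is available for the evaluation data downloaded at build time (the image name grobid-evaluation-local is arbitrary):

    # build the evaluation image
    docker build -f Dockerfile.evaluation -t grobid-evaluation-local .
    # run the four end-to-end evaluations defined by the default CMD
    docker run --rm --init grobid-evaluation-local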
+ #mailto: "toto@titi.tutu" + token: + # to use Crossref metadata plus service (available by subscription) + #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere" + + proxy: + # proxy to be used when doing external call to the consolidation service + host: + port: + + # CORS configuration for the GROBID web API service + corsAllowedOrigins: "*" + corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD" + corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin" + + # the actual implementation for language recognition to be used + languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory" + + # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP) + #sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory" + sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory" + + # maximum concurrency allowed to GROBID server for processing parallel requests - change it according to your CPU/GPU capacities + # for a production server running only GROBID, set the value slightly above the available number of threads of the server + # to get best performance and security + concurrency: 10 + # when the pool is full, for queries waiting for the availability of a Grobid engine, this is the maximum time wait to try + # to get an engine (in seconds) - normally never change it + poolMaxWait: 1 + + delft: + # DeLFT global parameters + # delft installation path if Deep Learning architectures are used to implement one of the sequence labeling model, + # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only featured-engineered CRF are used) + install: "../delft" + pythonVirtualEnv: + + wapiti: + # Wapiti global parameters + # number of threads for training the wapiti models (0 to use all available processors) + nbThreads: 0 + + models: + # we configure here how each sequence labeling model should be implemented + # for feature-engineered CRF, use "wapiti" and possible training parameters are window, epsilon and nbMaxIterations + # for Deep Learning, use "delft" and select the target DL architecture (see DeLFT library), the training + # parameters then depends on this selected DL architecture + + - name: "segmentation" + # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation + engine: "wapiti" + #engine: "delft" + wapiti: + # wapiti training parameters, they will be used at training time only + epsilon: 0.0000001 + window: 50 + nbMaxIterations: 2000 + delft: + # deep learning parameters + architecture: "BidLSTM_CRF_FEATURES" + useELMo: false + runtime: + # parameters used at runtime/prediction + max_sequence_length: 3000 + batch_size: 1 + training: + # parameters used for training + max_sequence_length: 3000 + batch_size: 10 + + - name: "segmentation-article-light" + engine: "wapiti" + wapiti: + # wapiti training parameters, they will be used at training time only + epsilon: 0.0000001 + window: 50 + nbMaxIterations: 2000 + + - name: "segmentation-article-light-ref" + engine: "wapiti" + wapiti: + # wapiti training parameters, they will be used at training time only + epsilon: 0.0000001 + window: 50 + nbMaxIterations: 2000 + + - name: "fulltext" + # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation + engine: "wapiti" + wapiti: + # wapiti training parameters, they will be used at training time only + 
+
+  proxy:
+    # proxy to be used when doing external calls to the consolidation service
+    host:
+    port:
+
+  # CORS configuration for the GROBID web API service
+  corsAllowedOrigins: "*"
+  corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
+  corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
+
+  # the actual implementation for language recognition to be used
+  languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
+
+  # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
+  #sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
+  sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"
+
+  # maximum concurrency allowed to the GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
+  # for a production server running only GROBID, set the value slightly above the available number of threads of the server
+  # to get the best performance and security
+  concurrency: 10
+  # when the pool is full, for queries waiting for the availability of a Grobid engine, this is the maximum time to wait
+  # for an engine (in seconds) - normally never change it
+  poolMaxWait: 1
+
+  delft:
+    # DeLFT global parameters
+    # DeLFT installation path if Deep Learning architectures are used to implement one of the sequence labeling models,
+    # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF are used)
+    install: "../delft"
+    pythonVirtualEnv:
+
+  wapiti:
+    # Wapiti global parameters
+    # number of threads for training the wapiti models (0 to use all available processors)
+    nbThreads: 0
+
+  models:
+    # we configure here how each sequence labeling model should be implemented
+    # for feature-engineered CRF, use "wapiti"; the possible training parameters are window, epsilon and nbMaxIterations
+    # for Deep Learning, use "delft" and select the target DL architecture (see the DeLFT library); the training
+    # parameters then depend on the selected DL architecture
+
+    - name: "segmentation"
+      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0000001
+        window: 50
+        nbMaxIterations: 2000
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction
+          max_sequence_length: 3000
+          batch_size: 1
+        training:
+          # parameters used for training
+          max_sequence_length: 3000
+          batch_size: 10
+
+    - name: "segmentation-article-light"
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0000001
+        window: 50
+        nbMaxIterations: 2000
+
+    - name: "segmentation-article-light-ref"
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0000001
+        window: 50
+        nbMaxIterations: 2000
+
+    - name: "fulltext"
+      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0001
+        window: 20
+        nbMaxIterations: 1500
+
+    - name: "header"
+      #engine: "wapiti"
+      engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.000001
+        window: 30
+        nbMaxIterations: 1500
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_ChainCRF_FEATURES"
+        #transformer: "allenai/scibert_scivocab_cased"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction
+          #max_sequence_length: 510
+          max_sequence_length: 3000
+          batch_size: 1
+        training:
+          # parameters used for training
+          #max_sequence_length: 510
+          #batch_size: 6
+          max_sequence_length: 3000
+          batch_size: 9
+
+    - name: "header-article-light"
+#      engine: "wapiti"
+      engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.000001
+        window: 30
+        nbMaxIterations: 1500
+      delft:
+        architecture: "BidLSTM_ChainCRF_FEATURES"
+        useELMo: false
+
+    - name: "header-article-light-ref"
+#      engine: "wapiti"
+      engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.000001
+        window: 30
+        nbMaxIterations: 1500
+      delft:
+        architecture: "BidLSTM_ChainCRF_FEATURES"
+        useELMo: false
+
+    - name: "reference-segmenter"
+      #engine: "wapiti"
+      engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 20
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_ChainCRF_FEATURES"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction (for this model, use the same max_sequence_length as for training)
+          max_sequence_length: 3000
+          batch_size: 2
+        training:
+          # parameters used for training
+          max_sequence_length: 3000
+          batch_size: 10
+
+    - name: "name-header"
+      engine: "wapiti"
+      #engine: "delft"
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+
+    - name: "name-citation"
+      engine: "wapiti"
+      #engine: "delft"
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+
+    - name: "date"
+      engine: "wapiti"
+      #engine: "delft"
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+
+    - name: "figure"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 20
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF"
+
+    - name: "table"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 20
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF"
+
+    - name: "affiliation-address"
+      #engine: "wapiti"
+      engine: "delft"
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+
+    - name: "citation"
+      #engine: "wapiti"
+      engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 50
+        nbMaxIterations: 3000
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+        #architecture: "BERT_CRF"
+        #transformer: "michiyasunaga/LinkBERT-base"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction
+          max_sequence_length: 500
+          batch_size: 30
+        training:
+          # parameters used for training
+          max_sequence_length: 500
+          batch_size: 50
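Note: each model entry above can be flipped between the feature-engineered CRF ("wapiti") and a Deep Learning implementation ("delft") by editing its engine field. To try a different combination without rebuilding the image, a modified copy of this file can be mounted over the baked-in configuration; a sketch, where my-eval-config.yaml is a hypothetical local copy of this file:

    # override the evaluation configuration at container start
    docker run --rm --init \
        -v "$(pwd)/my-eval-config.yaml:/opt/grobid/grobid-home/config/config.yaml" \
        lfoppiano/grobid-evaluation:latest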
+
+    - name: "patent-citation"
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0001
+        window: 20
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+        #architecture: "BERT_CRF"
+        runtime:
+          # parameters used at runtime/prediction
+          max_sequence_length: 800
+          batch_size: 20
+        training:
+          # parameters used for training
+          max_sequence_length: 1000
+          batch_size: 40
+
+    - name: "funding-acknowledgement"
+      #engine: "wapiti"
+      engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 50
+        nbMaxIterations: 2000
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+        #architecture: "BERT_CRF"
+        #transformer: "michiyasunaga/LinkBERT-base"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction
+          max_sequence_length: 800
+          batch_size: 20
+        training:
+          # parameters used for training
+          max_sequence_length: 500
+          batch_size: 40
+
+    - name: "copyright"
+      # at this time, we only have a DeLFT implementation;
+      # use "wapiti" if the deep learning library JNI is not available, and the model will then be ignored
+#      engine: "delft"
+      engine: "wapiti"
+      delft:
+        # deep learning parameters
+        architecture: "gru"
+        #architecture: "bert"
+        #transformer: "allenai/scibert_scivocab_cased"
+
+    - name: "license"
+      # at this time, to be active it must be DeLFT, no other implementation is available;
+      # use "wapiti" if the deep learning library JNI is not available, and the model will then be ignored
+#      engine: "delft"
+      engine: "wapiti"
+      delft:
+        # deep learning parameters
+        architecture: "gru"
+        #architecture: "bert"
+        #transformer: "allenai/scibert_scivocab_cased"
+
+  # for **service only**: how to load the models,
+  # false -> models are loaded when needed, avoiding keeping unused models in memory (only in the CRF case), but significantly
+  # slowing down the service on its first call
+  # true -> all the models are loaded into memory at server startup (default); this slows down the start of the service
+  # and unused models will take some more memory (only in the CRF case), but the server is immediately warm and ready
+  modelPreload: true
+
+server:
+  type: custom
+  applicationConnectors:
+    - type: http
+      port: 8070
+  adminConnectors:
+    - type: http
+      port: 8071
+  registerDefaultExceptionMappers: false
+  # change the following to have all http requests logged
+  requestLog:
+    appenders: []
+
+# these logging settings apply to the Grobid service usage mode
+logging:
+  level: INFO
+  loggers:
+    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
+    org.glassfish.jersey.internal: "OFF"
+    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
+  appenders:
+    - type: console
+      threshold: WARN
+      timeZone: UTC
+      # uncomment to have the logs in json format
+      #layout:
+      #  type: json
\ No newline at end of file
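Note: a full run over the four datasets is long; the fileRatio property used in the image CMD can be lowered for a quick smoke test. A minimal sketch overriding the default command (assuming the base image defines no conflicting ENTRYPOINT):

    # evaluate only 10% of the PMC sample as a smoke test
    docker run --rm --init lfoppiano/grobid-evaluation:latest \
        /bin/bash -c "./gradlew jatsEval -Pp2t=/opt/grobid/evaluation/PMC_sample_1943 -Prun=1 -PfileRatio=0.1"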