diff --git a/.github/workflows/ci-build-manual-eval.yml b/.github/workflows/ci-build-manual-eval.yml
new file mode 100644
index 0000000000..292c12c960
--- /dev/null
+++ b/.github/workflows/ci-build-manual-eval.yml
@@ -0,0 +1,47 @@
+name: Build and push the evaluation docker image
+
+on:
+  workflow_dispatch:
+    inputs:
+      custom_tag:
+        type: string
+        description: Docker image tag
+        required: true
+        default: "latest"
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up JDK 17
+        uses: actions/setup-java@v4
+        with:
+          java-version: '17.0.10+7'
+          distribution: 'temurin'
+          cache: 'gradle'
+      - name: Build with Gradle
+        run: ./gradlew build -x test
+
+  docker-build-full:
+    needs: [ build ]
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Free up disk space
+        run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+      - uses: actions/checkout@v4
+      - name: Build and push
+        id: docker_build
+        uses: mr-smithers-excellent/docker-build-push@v5
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME_LFOPPIANO }}
+          password: ${{ secrets.DOCKERHUB_TOKEN_LFOPPIANO }}
+          image: lfoppiano/grobid-evaluation
+          registry: docker.io
+          pushImage: true
+          tags: latest, ${{ github.event.inputs.custom_tag }}
+          dockerfile: Dockerfile.evaluation
+      - name: Image digest
+        run: echo ${{ steps.docker_build.outputs.digest }}
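Note: since the workflow is triggered via workflow_dispatch, it can also be started from a terminal. A minimal sketch, assuming the GitHub CLI (gh) is installed and authenticated against this repository; the tag value is only an example:

    # trigger the manual evaluation image build with a custom image tag
    gh workflow run ci-build-manual-eval.yml -f custom_tag=0.8.2-RC1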
diff --git a/Dockerfile.evaluation b/Dockerfile.evaluation
new file mode 100644
index 0000000000..31fbcde974
--- /dev/null
+++ b/Dockerfile.evaluation
@@ -0,0 +1,63 @@
+## Grobid evaluation image
+## ------
+# https://grobid.readthedocs.io/en/latest/End-to-end-evaluation/
+# NOTE: to reproduce the exact evaluation published in the Grobid documentation, a running Biblio-glutton instance is required
+
+FROM lfoppiano/grobid:0.8.2-RC1-full as runtime
+
+# setting the locale is likely unnecessary, but done to be safe
+ENV LANG C.UTF-8
+
+USER root
+
+RUN apt-get update && \
+    apt-get -y --no-install-recommends install unzip wget
+
+WORKDIR /opt/grobid
+
+# gradle
+COPY gradle/ ./gradle/
+COPY gradlew ./
+COPY gradle.properties ./
+COPY build.gradle ./
+COPY settings.gradle ./
+
+# source
+COPY grobid-core/ ./grobid-core/
+COPY grobid-service/ ./grobid-service/
+COPY grobid-trainer/ ./grobid-trainer/
+
+# set the DL-powered configuration as the default one
+COPY grobid-home/config/grobid-evaluation.yaml grobid-home/config/config.yaml
+
+RUN rm -rf /opt/grobid/grobid-home/models/*-with_ELMo \
+    && mkdir /opt/grobid/evaluation
+
+# Download the evaluation data
+WORKDIR /opt/grobid/evaluation
+RUN wget https://zenodo.org/records/3873702/files/biorxiv-10k-test-2000.zip -O biorxiv-10k-test-2000.zip \
+    && unzip biorxiv-10k-test-2000.zip -d biorxiv-10k-test-2000 \
+    && wget https://zenodo.org/records/7708580/files/eLife_984.zip -O eLife_984.zip \
+    && unzip eLife_984.zip -d eLife_984 \
+    && wget https://zenodo.org/records/7708580/files/PLOS_1000.zip -O PLOS_1000.zip \
+    && unzip PLOS_1000.zip -d PLOS_1000 \
+    && wget https://zenodo.org/records/7708580/files/PMC_sample_1943.zip -O PMC_sample_1943.zip \
+    && unzip PMC_sample_1943.zip -d PMC_sample_1943 \
+    && rm *.zip
+
+#RUN wget -q https://zenodo.org/records/7708580/files/PMC_sample_1943.zip -O PMC_sample_1943.zip \
+#    && unzip PMC_sample_1943.zip -d PMC_sample_1943 \
+#    && rm *.zip
+
+VOLUME ["/opt/grobid/grobid-home/tmp"]
+
+WORKDIR /opt/grobid
+
+CMD ["/bin/bash", "-c", "./gradlew jatsEval -Pp2t=/opt/grobid/evaluation/PMC_sample_1943 -Prun=1 -PfileRatio=1; ./gradlew jatsEval -Pp2t=/opt/grobid/evaluation/biorxiv-10k-test-2000 -Prun=1 -PfileRatio=1; ./gradlew jatsEval -Pp2t=/opt/grobid/evaluation/eLife_984 -Prun=1 -PfileRatio=1; ./gradlew jatsEval -Pp2t=/opt/grobid/evaluation/PLOS_1000 -Prun=1 -PfileRatio=1;"]
+
+LABEL \
+    authors="The contributors" \
+    org.label-schema.name="Grobid" \
+    org.label-schema.description="Image running the Grobid end-to-end evaluation" \
+    org.label-schema.url="https://github.com/kermitt2/Grobid" \
+    org.label-schema.version=${GROBID_VERSION}
\ No newline at end of file
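Note: the image can be built and exercised locally before wiring it into the workflow. A minimal sketch, assuming it is run from the repository root and that enough disk space is available for the evaluation data downloaded at build time (the image name grobid-evaluation-local is arbitrary):

    # build the evaluation image
    docker build -f Dockerfile.evaluation -t grobid-evaluation-local .
    # run the four end-to-end evaluations defined by the default CMD
    docker run --rm --init grobid-evaluation-local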
+ #mailto: "toto@titi.tutu" + token: + # to use Crossref metadata plus service (available by subscription) + #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere" + + proxy: + # proxy to be used when doing external call to the consolidation service + host: + port: + + # CORS configuration for the GROBID web API service + corsAllowedOrigins: "*" + corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD" + corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin" + + # the actual implementation for language recognition to be used + languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory" + + # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP) + #sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory" + sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory" + + # maximum concurrency allowed to GROBID server for processing parallel requests - change it according to your CPU/GPU capacities + # for a production server running only GROBID, set the value slightly above the available number of threads of the server + # to get best performance and security + concurrency: 10 + # when the pool is full, for queries waiting for the availability of a Grobid engine, this is the maximum time wait to try + # to get an engine (in seconds) - normally never change it + poolMaxWait: 1 + + delft: + # DeLFT global parameters + # delft installation path if Deep Learning architectures are used to implement one of the sequence labeling model, + # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only featured-engineered CRF are used) + install: "../delft" + pythonVirtualEnv: + + wapiti: + # Wapiti global parameters + # number of threads for training the wapiti models (0 to use all available processors) + nbThreads: 0 + + models: + # we configure here how each sequence labeling model should be implemented + # for feature-engineered CRF, use "wapiti" and possible training parameters are window, epsilon and nbMaxIterations + # for Deep Learning, use "delft" and select the target DL architecture (see DeLFT library), the training + # parameters then depends on this selected DL architecture + + - name: "segmentation" + # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation + engine: "wapiti" + #engine: "delft" + wapiti: + # wapiti training parameters, they will be used at training time only + epsilon: 0.0000001 + window: 50 + nbMaxIterations: 2000 + delft: + # deep learning parameters + architecture: "BidLSTM_CRF_FEATURES" + useELMo: false + runtime: + # parameters used at runtime/prediction + max_sequence_length: 3000 + batch_size: 1 + training: + # parameters used for training + max_sequence_length: 3000 + batch_size: 10 + + - name: "segmentation-article-light" + engine: "wapiti" + wapiti: + # wapiti training parameters, they will be used at training time only + epsilon: 0.0000001 + window: 50 + nbMaxIterations: 2000 + + - name: "segmentation-article-light-ref" + engine: "wapiti" + wapiti: + # wapiti training parameters, they will be used at training time only + epsilon: 0.0000001 + window: 50 + nbMaxIterations: 2000 + + - name: "fulltext" + # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation + engine: "wapiti" + wapiti: + # wapiti training parameters, they will be used at training time only + 
+
+  proxy:
+    # proxy to be used when doing external calls to the consolidation service
+    host:
+    port:
+
+  # CORS configuration for the GROBID web API service
+  corsAllowedOrigins: "*"
+  corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
+  corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
+
+  # the actual implementation for language recognition to be used
+  languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
+
+  # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
+  #sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
+  sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"
+
+  # maximum concurrency allowed to the GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
+  # for a production server running only GROBID, set the value slightly above the available number of threads of the server
+  # to get the best performance and security
+  concurrency: 10
+  # when the pool is full, for queries waiting for the availability of a Grobid engine, this is the maximum time to wait
+  # for an engine (in seconds) - normally never change it
+  poolMaxWait: 1
+
+  delft:
+    # DeLFT global parameters
+    # DeLFT installation path if Deep Learning architectures are used to implement one of the sequence labeling models,
+    # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF are used)
+    install: "../delft"
+    pythonVirtualEnv:
+
+  wapiti:
+    # Wapiti global parameters
+    # number of threads for training the wapiti models (0 to use all available processors)
+    nbThreads: 0
+
+  models:
+    # we configure here how each sequence labeling model should be implemented
+    # for feature-engineered CRF, use "wapiti"; the possible training parameters are window, epsilon and nbMaxIterations
+    # for Deep Learning, use "delft" and select the target DL architecture (see the DeLFT library); the training
+    # parameters then depend on the selected DL architecture
+
+    - name: "segmentation"
+      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0000001
+        window: 50
+        nbMaxIterations: 2000
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction
+          max_sequence_length: 3000
+          batch_size: 1
+        training:
+          # parameters used for training
+          max_sequence_length: 3000
+          batch_size: 10
+
+    - name: "segmentation-article-light"
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0000001
+        window: 50
+        nbMaxIterations: 2000
+
+    - name: "segmentation-article-light-ref"
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0000001
+        window: 50
+        nbMaxIterations: 2000
+
+    - name: "fulltext"
+      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0001
+        window: 20
+        nbMaxIterations: 1500
+
+    - name: "header"
+      #engine: "wapiti"
+      engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.000001
+        window: 30
+        nbMaxIterations: 1500
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_ChainCRF_FEATURES"
+        #transformer: "allenai/scibert_scivocab_cased"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction
+          #max_sequence_length: 510
+          max_sequence_length: 3000
+          batch_size: 1
+        training:
+          # parameters used for training
+          #max_sequence_length: 510
+          #batch_size: 6
+          max_sequence_length: 3000
+          batch_size: 9
+
+    - name: "header-article-light"
+#      engine: "wapiti"
+      engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.000001
+        window: 30
+        nbMaxIterations: 1500
+      delft:
+        architecture: "BidLSTM_ChainCRF_FEATURES"
+        useELMo: false
+
+    - name: "header-article-light-ref"
+#      engine: "wapiti"
+      engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.000001
+        window: 30
+        nbMaxIterations: 1500
+      delft:
+        architecture: "BidLSTM_ChainCRF_FEATURES"
+        useELMo: false
+
+    - name: "reference-segmenter"
+      #engine: "wapiti"
+      engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 20
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_ChainCRF_FEATURES"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction (for this model, use the same max_sequence_length as for training)
+          max_sequence_length: 3000
+          batch_size: 2
+        training:
+          # parameters used for training
+          max_sequence_length: 3000
+          batch_size: 10
+
+    - name: "name-header"
+      engine: "wapiti"
+      #engine: "delft"
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+
+    - name: "name-citation"
+      engine: "wapiti"
+      #engine: "delft"
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+
+    - name: "date"
+      engine: "wapiti"
+      #engine: "delft"
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+
+    - name: "figure"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 20
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF"
+
+    - name: "table"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 20
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF"
+
+    - name: "affiliation-address"
+      #engine: "wapiti"
+      engine: "delft"
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+
+    - name: "citation"
+      #engine: "wapiti"
+      engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 50
+        nbMaxIterations: 3000
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+        #architecture: "BERT_CRF"
+        #transformer: "michiyasunaga/LinkBERT-base"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction
+          max_sequence_length: 500
+          batch_size: 30
+        training:
+          # parameters used for training
+          max_sequence_length: 500
+          batch_size: 50
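Note: each model entry above can be flipped between the feature-engineered CRF ("wapiti") and a Deep Learning implementation ("delft") by editing its engine field. To try a different combination without rebuilding the image, a modified copy of this file can be mounted over the baked-in configuration; a sketch, where my-eval-config.yaml is a hypothetical local copy of this file:

    # override the evaluation configuration at container start
    docker run --rm --init \
        -v "$(pwd)/my-eval-config.yaml:/opt/grobid/grobid-home/config/config.yaml" \
        lfoppiano/grobid-evaluation:latest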
+
+    - name: "patent-citation"
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0001
+        window: 20
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+        #architecture: "BERT_CRF"
+        runtime:
+          # parameters used at runtime/prediction
+          max_sequence_length: 800
+          batch_size: 20
+        training:
+          # parameters used for training
+          max_sequence_length: 1000
+          batch_size: 40
+
+    - name: "funding-acknowledgement"
+      #engine: "wapiti"
+      engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 50
+        nbMaxIterations: 2000
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+        #architecture: "BERT_CRF"
+        #transformer: "michiyasunaga/LinkBERT-base"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction
+          max_sequence_length: 800
+          batch_size: 20
+        training:
+          # parameters used for training
+          max_sequence_length: 500
+          batch_size: 40
+
+    - name: "copyright"
+      # at this time, we only have a DeLFT implementation;
+      # use "wapiti" if the deep learning library JNI is not available, and the model will then be ignored
+#      engine: "delft"
+      engine: "wapiti"
+      delft:
+        # deep learning parameters
+        architecture: "gru"
+        #architecture: "bert"
+        #transformer: "allenai/scibert_scivocab_cased"
+
+    - name: "license"
+      # at this time, to be active it must be DeLFT, no other implementation is available;
+      # use "wapiti" if the deep learning library JNI is not available, and the model will then be ignored
+#      engine: "delft"
+      engine: "wapiti"
+      delft:
+        # deep learning parameters
+        architecture: "gru"
+        #architecture: "bert"
+        #transformer: "allenai/scibert_scivocab_cased"
+
+  # for **service only**: how to load the models,
+  # false -> models are loaded when needed, avoiding keeping unused models in memory (only in the CRF case), but significantly
+  # slowing down the service on its first call
+  # true -> all the models are loaded into memory at server startup (default); this slows down the start of the service
+  # and unused models will take some more memory (only in the CRF case), but the server is immediately warm and ready
+  modelPreload: true
+
+server:
+  type: custom
+  applicationConnectors:
+    - type: http
+      port: 8070
+  adminConnectors:
+    - type: http
+      port: 8071
+  registerDefaultExceptionMappers: false
+  # change the following to have all http requests logged
+  requestLog:
+    appenders: []
+
+# these logging settings apply to the Grobid service usage mode
+logging:
+  level: INFO
+  loggers:
+    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
+    org.glassfish.jersey.internal: "OFF"
+    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
+  appenders:
+    - type: console
+      threshold: WARN
+      timeZone: UTC
+      # uncomment to have the logs in json format
+      #layout:
+      #  type: json
\ No newline at end of file
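Note: a full run over the four datasets is long; the fileRatio property used in the image CMD can be lowered for a quick smoke test. A minimal sketch overriding the default command (assuming the base image defines no conflicting ENTRYPOINT):

    # evaluate only 10% of the PMC sample as a smoke test
    docker run --rm --init lfoppiano/grobid-evaluation:latest \
        /bin/bash -c "./gradlew jatsEval -Pp2t=/opt/grobid/evaluation/PMC_sample_1943 -Prun=1 -PfileRatio=0.1"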