Skip to content

Commit

Permalink
Merge pull request kermitt2#1202 from kermitt2/feature/segmentation-l…
Browse files Browse the repository at this point in the history
…ight

Alternative articles processing flavors
  • Loading branch information
lfoppiano authored Jan 10, 2025
2 parents f6ac80f + b6584b2 commit a6bea43
Show file tree
Hide file tree
Showing 3,090 changed files with 9,771,379 additions and 2,506 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
74 changes: 41 additions & 33 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,11 @@ subprojects {

test {
useJUnitPlatform()

testLogging.showStandardStreams = true
// enable for having separate test executor for different tests
forkEvery = 1
maxHeapSize = "1024m"
maxHeapSize = "1024m"

def libraries = ""
if (Os.isFamily(Os.FAMILY_MAC)) {
Expand All @@ -199,7 +199,7 @@ subprojects {
} else {
libraries = "${file("./grobid-home/lib/mac-64").absolutePath}"
}
} else if (Os.isFamily(Os.FAMILY_UNIX)) {
} else if (Os.isFamily(Os.FAMILY_UNIX)) {
def jepDir = rootProject.rootDir.getAbsolutePath() + "/grobid-home/lib/lin-64/jep"
libraries = jepDir
jepDir = rootProject.rootDir.getAbsolutePath() + "/grobid-home/lib/lin-64"
Expand All @@ -209,7 +209,7 @@ subprojects {
}

if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) {
jvmArgs "--add-opens", "java.base/java.util.stream=ALL-UNNAMED",
jvmArgs "--add-opens", "java.base/java.util.stream=ALL-UNNAMED",
"--add-opens", "java.base/java.io=ALL-UNNAMED", "--add-opens", "java.xml/jdk.xml.internal=ALL-UNNAMED"
}
systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries
Expand Down Expand Up @@ -351,7 +351,7 @@ project(":grobid-service") {
} else {
throw new RuntimeException("Unsupported platform!")
}

if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) {
jvmArgs "--add-opens", "java.base/java.lang=ALL-UNNAMED"
}
Expand Down Expand Up @@ -380,7 +380,7 @@ project(":grobid-service") {
distTar { duplicatesStrategy = DuplicatesStrategy.EXCLUDE }

dependencies {
implementation project(':grobid-core')
implementation project(':grobid-core')
implementation project(':grobid-trainer')

//Dropwizard
Expand All @@ -397,7 +397,7 @@ project(":grobid-service") {
implementation 'io.dropwizard.metrics:metrics-core:4.2.22'
implementation 'io.dropwizard.metrics:metrics-servlets:4.2.22'
implementation 'io.dropwizard:dropwizard-json-logging:4.0.0'

implementation "org.apache.pdfbox:pdfbox:2.0.3"
implementation "javax.activation:activation:1.1.1"
implementation "io.prometheus:simpleclient_dropwizard:0.16.0"
Expand Down Expand Up @@ -500,34 +500,38 @@ project(":grobid-trainer") {
}

def trainerTasks = [
"train_name_header" : "org.grobid.trainer.NameHeaderTrainer",
"train_name_citation" : "org.grobid.trainer.NameCitationTrainer",
"train_affiliation_address" : "org.grobid.trainer.AffiliationAddressTrainer",
// "train_header" : "org.grobid.trainer.HeaderTrainer",
"train_fulltext" : "org.grobid.trainer.FulltextTrainer",
"train_shorttext" : "org.grobid.trainer.ShorttextTrainer",
"train_figure" : "org.grobid.trainer.FigureTrainer",
"train_table" : "org.grobid.trainer.TableTrainer",
"train_citation" : "org.grobid.trainer.CitationTrainer",
"train_date" : "org.grobid.trainer.DateTrainer",
// "train_segmentation" : "org.grobid.trainer.SegmentationTrainer",
"train_reference_segmentation": "org.grobid.trainer.ReferenceSegmenterTrainer",
"train_ebook_model" : "org.grobid.trainer.EbookTrainer",
"train_patent_citation" : "org.grobid.trainer.PatentParserTrainer",
"train_name_header" : "org.grobid.trainer.NameHeaderTrainer",
"train_name_citation" : "org.grobid.trainer.NameCitationTrainer",
"train_affiliation_address" : "org.grobid.trainer.AffiliationAddressTrainer",
"train_shorttext" : "org.grobid.trainer.ShorttextTrainer",
"train_figure" : "org.grobid.trainer.FigureTrainer",
"train_table" : "org.grobid.trainer.TableTrainer",
"train_citation" : "org.grobid.trainer.CitationTrainer",
"train_date" : "org.grobid.trainer.DateTrainer",
"train_reference_segmentation" : "org.grobid.trainer.ReferenceSegmenterTrainer",
"train_ebook_model" : "org.grobid.trainer.EbookTrainer",
"train_patent_citation" : "org.grobid.trainer.PatentParserTrainer",
"train_funding_acknowledgement" : "org.grobid.trainer.FundingAcknowledgementTrainer"
]

def complexTrainerTasks = [
"train_header" : ["org.grobid.trainer.HeaderTrainer", ""],
"train_header_ietf" : ["org.grobid.trainer.HeaderTrainer", "sdo/ietf"],
"train_segmentation" : ["org.grobid.trainer.SegmentationTrainer", ""],
"train_segmentation_ietf" : ["org.grobid.trainer.SegmentationTrainer", "sdo/ietf"]
"train_header" : ["org.grobid.trainer.HeaderTrainer", ""],
"train_header_article_light" : ["org.grobid.trainer.HeaderTrainer", "article/light"],
"train_header_article_light_ref" : ["org.grobid.trainer.HeaderTrainer", "article/light-ref"],
"train_header_ietf" : ["org.grobid.trainer.HeaderTrainer", "sdo/ietf"],
"train_segmentation" : ["org.grobid.trainer.SegmentationTrainer", ""],
"train_segmentation_article_light" : ["org.grobid.trainer.SegmentationTrainer", "article/light"],
"train_segmentation_article_light_ref" : ["org.grobid.trainer.SegmentationTrainer", "article/light-ref"],
"train_segmentation_ietf" : ["org.grobid.trainer.SegmentationTrainer", "sdo/ietf"],
"train_fulltext" : ["org.grobid.trainer.FulltextTrainer", ""],
"train_fulltext_article_light" : ["org.grobid.trainer.FulltextTrainer", "article/light"],
"train_fulltext_article_light_ref" : ["org.grobid.trainer.FulltextTrainer", "article/light-ref"],
]

def libraries = ""
if (Os.isFamily(Os.FAMILY_MAC)) {
if (Os.OS_ARCH.equals("aarch64")) {
libraries = "${file("../grobid-home/lib/mac_arm-64").absolutePath}"
libraries = "${file("../grobid-home/lib/mac_arm-64").absolutePath}"
} else {
libraries = "${file("../grobid-home/lib/mac-64").absolutePath}"
}
Expand All @@ -537,13 +541,16 @@ project(":grobid-trainer") {
} else {
throw new RuntimeException("Unsupported platform!")
}

trainerTasks.each { taskName, mainClassName ->
tasks.create(name: taskName, type: JavaExec, group: 'modeltraining') {
main = mainClassName
classpath = sourceSets.main.runtimeClasspath
if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0)
if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) {
jvmArgs '-Xmx3072m', "--add-opens", "java.base/java.lang=ALL-UNNAMED"
} else {
jvmArgs '-Xmx3072m'
}
systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries
}
}
Expand All @@ -552,10 +559,11 @@ project(":grobid-trainer") {
tasks.create(name: taskName, type: JavaExec, group: 'modeltraining') {
main = mainClassNameAndArgs[0]
classpath = sourceSets.main.runtimeClasspath
if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0)
if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) {
jvmArgs '-Xmx3072m', "--add-opens", "java.base/java.lang=ALL-UNNAMED"
if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0)
jvmArgs '-Xmx3072m', "--add-opens", "java.base/java.lang=ALL-UNNAMED"
} else {
jvmArgs '-Xmx3072m'
}
args mainClassNameAndArgs[1]
}
}
Expand All @@ -574,7 +582,7 @@ project(":grobid-trainer") {
task(jatsEval, dependsOn: 'classes', type: JavaExec, group: 'modelevaluation') {
main = 'org.grobid.trainer.evaluation.EndToEndEvaluation'
classpath = sourceSets.main.runtimeClasspath
args 'nlm', getArg('p2t', '.'), getArg('run', '0'), getArg('fileRatio', '1.0')
args 'nlm', getArg('p2t', '.'), getArg('run', '0'), getArg('fileRatio', '1.0'), getArg('flavor', '')
if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) {
jvmArgs '-Xmx3072m', "--add-opens", "java.base/java.lang=ALL-UNNAMED"
} else {
Expand All @@ -586,7 +594,7 @@ project(":grobid-trainer") {
task(teiEval, dependsOn: 'classes', type: JavaExec, group: 'modelevaluation') {
main = 'org.grobid.trainer.evaluation.EndToEndEvaluation'
classpath = sourceSets.main.runtimeClasspath
args 'tei', getArg('p2t', '.'), getArg('run', '0'), getArg('fileRatio', '1.0')
args 'tei', getArg('p2t', '.'), getArg('run', '0'), getArg('fileRatio', '1.0'), getArg('flavor', '')
if(JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) {
jvmArgs '-Xmx3072m', "--add-opens", "java.base/java.lang=ALL-UNNAMED"
} else {
Expand Down
6 changes: 3 additions & 3 deletions doc/Benchmarking-biorxiv.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Evaluation on 2000 PDF preprints out of 2000 (no failure).

Runtime for processing 2000 PDF: **1713** seconds (0.85 seconds per PDF file) on Ubuntu 22.04, 16 CPU (32 threads), 128GB RAM and with a GeForce GTX 1080 Ti GPU.

Note: with CRF only models runtime is 622s (0.31 second per PDF) with 4GPU, 8 threads.
Note: with CRF-only models, runtime is 622s (0.31 seconds per PDF) with 4 CPU, 8 threads.


## Header metadata
Expand All @@ -35,14 +35,14 @@ Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0).

**Field-level results**

| label | precision | recall | f1 | support |
| label | precision | recall | f1 | support |
|--- |--- |--- |--- |--- |
| abstract | 2.36 | 2.31 | 2.34 | 1989 |
| authors | 84.3 | 83.58 | 83.94 | 1998 |
| first_author | 96.97 | 96.24 | 96.61 | 1996 |
| keywords | 58.9 | 59.95 | 59.42 | 839 |
| title | 77.77 | 76.99 | 77.38 | 1999 |
| | | | | |
| | | | | |
| **all fields (micro avg.)** | **64.95** | **64.38** | **64.66** | 8821 |
| all fields (macro avg.) | 64.06 | 63.82 | 63.94 | 8821 |

Expand Down
2 changes: 1 addition & 1 deletion doc/Benchmarking-elife.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Evaluation on 984 PDF preprints out of 984 (no failure).

Runtime for processing 984 PDF: **1131** seconds (1.15 seconds per PDF file) on Ubuntu 22.04, 16 CPU (32 threads), 128GB RAM and with a GeForce GTX 1080 Ti GPU.

Note: with CRF only models runtime is 492s (0.50 seconds per PDF) with 4GPU, 8 threads.
Note: with CRF-only models, runtime is 492s (0.50 seconds per PDF) with 4 CPU, 8 threads.



Expand Down
2 changes: 1 addition & 1 deletion doc/Benchmarking-plos.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Evaluation on 1000 PDF preprints out of 1000 (no failure).

Runtime for processing 1000 PDF: **999** seconds (0.99 seconds per PDF) on Ubuntu 22.04, 16 CPU (32 threads), 128GB RAM and with a GeForce GTX 1080 Ti GPU.

Note: with CRF only models runtime is 304s (0.30 seconds per PDF) with 4GPU, 8 threads.
Note: with CRF-only models, runtime is 304s (0.30 seconds per PDF) with 4 CPU, 8 threads.


## Header metadata
Expand Down
2 changes: 1 addition & 1 deletion doc/Benchmarking-pmc.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Evaluation on 1943 random PDF PMC files out of 1943 PDF from 1943 different jour

Runtime for processing 1943 PDF: **1467** seconds (0.75 seconds per PDF) on Ubuntu 22.04, 16 CPU (32 threads), 128GB RAM and with a GeForce GTX 1080 Ti GPU.

Note: with CRF only models, runtime is 470s (0.24 seconds per PDF) with 4GPU, 8 threads.
Note: with CRF-only models, runtime is 470s (0.24 seconds per PDF) with 4 CPU, 8 threads.



Expand Down
Loading

0 comments on commit a6bea43

Please sign in to comment.