From 587e4a2d49e4f6c980df67d6e471a09afebe52b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20D=C3=B6ring?= Date: Wed, 29 Jan 2025 14:37:20 +0100 Subject: [PATCH] Add Fury instance and mapdb serializer with tests --- .../catalogue/common/fury/FuryFactory.java | 190 ++++++++++++++++++ .../common/fury/MapDbSerializer.java | 55 +++++ .../common/fury/FuryFactoryTest.java | 166 +++++++++++++++ .../common/fury/MapDbSerializerTest.java | 78 +++++++ .../life/catalogue/es/SerdeBenchmarks.java | 115 +++++++++++ 5 files changed, 604 insertions(+) create mode 100644 api/src/main/java/life/catalogue/common/fury/FuryFactory.java create mode 100644 api/src/main/java/life/catalogue/common/fury/MapDbSerializer.java create mode 100644 api/src/test/java/life/catalogue/common/fury/FuryFactoryTest.java create mode 100644 api/src/test/java/life/catalogue/common/fury/MapDbSerializerTest.java create mode 100644 dao/src/test/java/life/catalogue/es/SerdeBenchmarks.java diff --git a/api/src/main/java/life/catalogue/common/fury/FuryFactory.java b/api/src/main/java/life/catalogue/common/fury/FuryFactory.java new file mode 100644 index 000000000..12aeef6b1 --- /dev/null +++ b/api/src/main/java/life/catalogue/common/fury/FuryFactory.java @@ -0,0 +1,190 @@ +package life.catalogue.common.fury; + +import life.catalogue.api.model.*; +import life.catalogue.api.search.NameUsageWrapper; +import life.catalogue.api.search.SimpleDecision; +import life.catalogue.api.vocab.*; +import life.catalogue.api.vocab.terms.*; +import life.catalogue.coldp.ColdpTerm; +import life.catalogue.common.date.FuzzyDate; + +import org.gbif.dwc.terms.BibTexTerm; +import org.gbif.dwc.terms.TermFactory; +import org.gbif.dwc.terms.UnknownTerm; +import org.gbif.nameparser.api.*; + +import java.net.URI; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.*; + +import org.apache.fury.Fury; +import org.apache.fury.ThreadLocalFury; +import org.apache.fury.ThreadSafeFury; + +import de.undercouch.citeproc.csl.CSLType; +import it.unimi.dsi.fastutil.ints.IntOpenHashSet; +import it.unimi.dsi.fastutil.ints.IntSet; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; + + +/** + * Creates a thread safe fury instance with pre registered API classes + */ +public class FuryFactory { + + public static final ThreadSafeFury FURY = new ThreadLocalFury(classLoader -> { + Fury f = Fury.builder() + .withLanguage(org.apache.fury.config.Language.JAVA) + .withClassLoader(classLoader) + .requireClassRegistration(false) // some non public class like JumboEnumSet cannot be registered + .build(); + return configure(f); + }); + + public static Fury configure(Fury fury) { + // clb core + fury.register(Agent.class); + fury.register(Authorship.class); + fury.register(BareName.class); + fury.register(Citation.class); + fury.register(Classification.class); + fury.register(Coordinate.class); + fury.register(DOI.class); + fury.register(Dataset.class); + fury.register(DatasetImport.class); + fury.register(Distribution.class); + fury.register(EditorialDecision.Mode.class); + fury.register(EditorialDecision.class); + fury.register(Identifier.class); + fury.register(IndexName.class); + fury.register(Media.class); + fury.register(Name.class); + fury.register(NameRelation.class); + fury.register(NameUsageWrapper.class); + fury.register(Page.class); + fury.register(ParsedName.State.class); + fury.register(ParsedName.class); + fury.register(ParsedNameUsage.class); + fury.register(Reference.class); + fury.register(Sector.Mode.class); + fury.register(Sector.class); + fury.register(SimpleDecision.class); + fury.register(SimpleName.class); + fury.register(SpeciesEstimate.class); + fury.register(SpeciesInteraction.class); + fury.register(Synonym.class); + fury.register(TaxGroup.class); + fury.register(Taxon.class); + fury.register(TaxonConceptRelation.class); + fury.register(TaxonProperty.class); + fury.register(Treatment.class); + fury.register(TypeMaterial.class); + fury.register(VerbatimRecord.class); + fury.register(VernacularName.class); + + // search + fury.register(NameUsageWrapper.class); + fury.register(SimpleDecision.class); + + // CSL classes & enums + fury.register(CslData.class); + fury.register(CslDate.class); + fury.register(CslName.class); + fury.register(CslName[].class); + fury.register(CSLType.class); + fury.register(int[][].class); + fury.register(String[].class); + + // date/time + fury.register(FuzzyDate.class); + fury.register(LocalDate.class); + fury.register(LocalDateTime.class); + + // java & commons + fury.register(int[].class); + fury.register(URI.class); + fury.register(UUID.class); + registerCollectionClasses(fury); + + // areas + fury.register(Area.class); + fury.register(AreaImpl.class); + fury.register(LonghurstArea.class); + fury.register(TdwgArea.class); + + // enums + fury.register(Country.class); + fury.register(DataFormat.class); + fury.register(DatasetOrigin.class); + fury.register(Setting.class); + fury.register(DatasetType.class); + fury.register(DistributionStatus.class); + fury.register(EnumMap.class); + fury.register(EnumSet.class); + fury.register(EstimateType.class); + fury.register(Frequency.class); + fury.register(Gender.class); + fury.register(Gazetteer.class); + fury.register(GeoTime.class); + fury.register(GeoTimeType.class); + fury.register(ImportState.class); + fury.register(Issue.class); + fury.register(JobStatus.class); + fury.register(License.class); + fury.register(Environment.class); + fury.register(MatchType.class); + fury.register(MediaType.class); + fury.register(NamePart.class); + fury.register(NameType.class); + fury.register(NomCode.class); + fury.register(NomRelType.class); + fury.register(NomStatus.class); + fury.register(Origin.class); + fury.register(Rank.class); + fury.register(Sex.class); + fury.register(SpeciesInteractionType.class); + fury.register(TaxonomicStatus.class); + fury.register(TaxonConceptRelType.class); + fury.register(TreatmentFormat.class); + fury.register(TypeStatus.class); + fury.register(InfoGroup.class); + + // term enums + TermFactory.instance().registerTermEnum(BiboOntTerm.class); + TermFactory.instance().registerTermEnum(ColdpTerm.class); + TermFactory.instance().registerTermEnum(EolDocumentTerm.class); + TermFactory.instance().registerTermEnum(EolReferenceTerm.class); + TermFactory.instance().registerTermEnum(InatTerm.class); + TermFactory.instance().registerTermEnum(TxtTreeTerm.class); + TermFactory.instance().registerTermEnum(WfoTerm.class); + for (Class cl : TermFactory.instance().listRegisteredTermEnums()) { + fury.register(cl); + } + fury.register(UnknownTerm.class); + fury.register(BibTexTerm.class); + return fury; + } + + public static void registerCollectionClasses(Fury fury) { + fury.register(ArrayList.class); + fury.register(HashMap.class); + fury.register(HashSet.class); + fury.register(EnumMap.class); + fury.register(EnumSet.class); + fury.register(LinkedHashMap.class); + fury.register(LinkedList.class); + fury.register(Collections.emptyList().getClass()); + // private class, special registration + try { + Class clazz = Class.forName("java.util.Arrays$ArrayList"); + fury.register(clazz); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + // fastutils + fury.register(IntSet.class); + fury.register(IntOpenHashSet.class); + fury.register(ObjectArrayList.class); + } +} diff --git a/api/src/main/java/life/catalogue/common/fury/MapDbSerializer.java b/api/src/main/java/life/catalogue/common/fury/MapDbSerializer.java new file mode 100644 index 000000000..e5cab8044 --- /dev/null +++ b/api/src/main/java/life/catalogue/common/fury/MapDbSerializer.java @@ -0,0 +1,55 @@ +package life.catalogue.common.fury; + +import com.esotericsoftware.kryo.Kryo; +import com.esotericsoftware.kryo.io.Input; +import com.esotericsoftware.kryo.io.Output; +import com.esotericsoftware.kryo.util.Pool; + +import org.apache.commons.lang3.NotImplementedException; +import org.mapdb.DataIO; +import org.mapdb.DataInput2; +import org.mapdb.DataOutput2; +import org.mapdb.serializer.GroupSerializerObjectArray; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +/** + * A mapDB serializer that uses fury under the hood to quickly serialize objects into the mapdb data output/input. + * + * @param the class to serialize + */ +public class MapDbSerializer extends GroupSerializerObjectArray { + private final Class clazz; + + public MapDbSerializer(Class clazz) { + this.clazz = clazz; + } + + @Override + public void serialize(DataOutput2 out, T value) throws IOException { + byte[] bytes = FuryFactory.FURY.serializeJavaObject(value); + DataIO.packInt(out, bytes.length); + out.write(bytes); + } + + @Override + public T deserialize(DataInput2 in, int available) throws IOException { + if (available == 0) return null; + int size = DataIO.unpackInt(in); + byte[] ret = new byte[size]; + in.readFully(ret); + return FuryFactory.FURY.deserializeJavaObject(ret, clazz); + } + + @Override + public boolean isTrusted() { + return true; + } + + @Override + public int compare(T first, T second) { + throw new NotImplementedException("compare should not be needed for our mapdb use"); + } + +} diff --git a/api/src/test/java/life/catalogue/common/fury/FuryFactoryTest.java b/api/src/test/java/life/catalogue/common/fury/FuryFactoryTest.java new file mode 100644 index 000000000..bb1bf2b65 --- /dev/null +++ b/api/src/test/java/life/catalogue/common/fury/FuryFactoryTest.java @@ -0,0 +1,166 @@ +package life.catalogue.common.fury; + +import life.catalogue.api.TestEntityGenerator; +import life.catalogue.api.model.*; +import life.catalogue.api.search.NameUsageWrapper; +import life.catalogue.api.vocab.*; +import life.catalogue.api.vocab.terms.*; +import life.catalogue.common.date.FuzzyDate; + +import org.gbif.dwc.terms.*; + +import java.net.URI; +import java.util.List; +import java.util.UUID; + +import org.junit.Test; + +import com.google.common.collect.Lists; + +import it.unimi.dsi.fastutil.objects.ObjectArrayList; + +import static org.junit.Assert.assertEquals; + +public class FuryFactoryTest { + + @Test + public void testName() throws Exception { + Name n = TestEntityGenerator.newName("1234567"); + assertSerde(n); + } + + @Test + public void testReference() throws Exception { + Reference r = new Reference(); + r.setId("1234"); + r.setYear(1984); + r.setDatasetKey(77); + r.setCsl(TestEntityGenerator.createCsl()); + assertSerde(r); + } + + @Test + public void testTypeMaterial() throws Exception { + var d = new TypeMaterial(); + d.setId("1234"); + d.setReferenceId("1984"); + d.setStatus(TypeStatus.ALLOLECTOTYPE); + d.setSectorKey(13); + d.setVerbatimKey(6789); + d.setSex(Sex.FEMALE); + d.setAssociatedSequences("my sequence"); + d.setCoordinate(new Coordinate(-1.7891,2.12)); + d.setAltitude("1000m"); + d.setCatalogNumber("45612"); + d.setInstitutionCode("B"); + d.setCitation("cite me like this"); + d.setCountry(Country.AFGHANISTAN); + d.setDate("my date"); + d.setHost("my host"); + d.setCollector("my collector"); + d.setNameId("nameID"); + d.setLink(URI.create("http://gbif.org/234567")); + d.setLatitude("12°31``2`"); + d.setLongitude("21°32``12`"); + assertSerde(d); + } + + @Test + public void testAreas() throws Exception { + var d = new Distribution(); + d.setId(1234); + d.setReferenceId("1984"); + d.setStatus(DistributionStatus.NATIVE); + d.setSectorKey(13); + d.setVerbatimKey(6789); + d.setArea(new AreaImpl("Berlin")); + assertSerde(d); + + d.setArea(Country.ALBANIA); + assertSerde(d); + + d.setArea(TdwgArea.of("BZN")); + assertSerde(d); + } + + @Test + public void testDataset() throws Exception { + Dataset d = TestEntityGenerator.newDataset("Unmut"); + d.setKey(1234); + d.setIssued(FuzzyDate.now()); + d.setGbifKey(UUID.randomUUID()); + d.setCreator(Agent.parse(List.of("Karl", "Frank"))); + d.setEditor(Agent.parse(List.of("Karlo", "Franko"))); + assertSerde(d); + } + + @Test + public void testUsages() throws Exception { + Taxon t = TestEntityGenerator.newTaxon("bla bla"); + assertSerde(t); + + Synonym s = TestEntityGenerator.newSynonym(t); + assertSerde(s); + } + + @Test + public void testUsageWrappers() throws Exception { + NameUsageWrapper nuw = TestEntityGenerator.newNameUsageSynonymWrapper(); + assertSerde(nuw); + + nuw = TestEntityGenerator.newNameUsageTaxonWrapper(); + assertSerde(nuw); + + nuw = TestEntityGenerator.newNameUsageBareNameWrapper(); + assertSerde(nuw); + } + + @Test + public void testVerbatim() throws Exception { + List terms = Lists.newArrayList( + DwcTerm.scientificName, DwcTerm.associatedOrganisms, DwcTerm.taxonID, + DcTerm.title, + GbifTerm.canonicalName, + IucnTerm.threatStatus, + AcefTerm.Family, + WfoTerm.ipniID, + TxtTreeTerm.content, + BiboOntTerm.journal, + EolDocumentTerm.Document, + InatTerm.lexicon, + UnknownTerm.build("http://gbif.org/abcdefg") + ); + assertSerde(terms); + + VerbatimRecord rec = TestEntityGenerator.createVerbatim(); + for (Issue issue : Issue.values()) { + rec.addIssue(issue); + } + assertSerde(rec); + } + + @Test + public void testEmptyModels() throws Exception { + assertSerde(new Taxon()); + assertSerde(new Name()); + assertSerde(new Reference()); + assertSerde(new Dataset()); + assertSerde(new DatasetImport()); + } + + + @Test + public void testFastutilList() throws Exception { + Name n = TestEntityGenerator.newName("1234567"); + var authors = n.getCombinationAuthorship().getAuthors(); + n.getCombinationAuthorship().setAuthors(new ObjectArrayList<>(authors)); + assertSerde(n); + } + + private void assertSerde(Object obj) { + byte[] bytes = FuryFactory.FURY.serialize(obj); + Object obj2 = FuryFactory.FURY.deserialize(bytes); + assertEquals(obj, obj2); + } + +} \ No newline at end of file diff --git a/api/src/test/java/life/catalogue/common/fury/MapDbSerializerTest.java b/api/src/test/java/life/catalogue/common/fury/MapDbSerializerTest.java new file mode 100644 index 000000000..fcbe3f81e --- /dev/null +++ b/api/src/test/java/life/catalogue/common/fury/MapDbSerializerTest.java @@ -0,0 +1,78 @@ +package life.catalogue.common.fury; + +import life.catalogue.api.model.VerbatimRecord; + +import org.gbif.dwc.terms.DwcTerm; + +import java.io.File; +import java.io.IOException; +import java.util.Map; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.time.StopWatch; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.mapdb.DB; +import org.mapdb.DBMaker; +import org.mapdb.Serializer; + +public class MapDbSerializerTest { + File dbf; + + @Before + public void init() throws IOException { + dbf = new File("/tmp/mapdb-test"); + // make sure mapdb parent dirs exist + if (!dbf.getParentFile().exists()) { + dbf.getParentFile().mkdirs(); + } + } + + @After + public void shutdown() { + FileUtils.deleteQuietly(dbf); + } + + @Test + public void serde() { + + DB mapDb = DBMaker + .fileDB(dbf) + .fileMmapEnableIfSupported() + .make(); + + Map verbatim = mapDb.hashMap("verbatim") + .keySerializer(Serializer.LONG) + .valueSerializer(new MapDbSerializer(VerbatimRecord.class)) + .create(); + + final int repeat = 100; + + StopWatch watch = new StopWatch(); + verbatim.put(0l, gen(0)); + + System.out.println("Writing..."); + watch.start(); + for (long x=1; x