Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ public CodecBuilder(CodegenContext ctx, TypeRef<?> beanType) {
.forEach(ctx::reserveName);
}

/**
 * Returns the codec class name associated with {@code cls}.
 *
 * <p>NOTE(review): presumably this is the name of the generated codec class for the given type;
 * the exact naming scheme is defined by the concrete builders - confirm against them.
 *
 * @param cls the class a codec name is requested for
 * @return the codec class name
 */
public abstract String codecClassName(Class<?> cls);

/** Generate codec class code. */
public abstract String genCode();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
* <li>additional binary compare, swap, and copy methods.
* <li>little-endian access.
* <li>independent read/write index.
* <li>variant int/long encoding.
* <li>varint int/long encoding.
* <li>aligned int/long encoding.
* </ul>
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,13 @@ public static RuntimeException handleReadFailed(Fory fory, Throwable t) {
}

/**
 * Accepts any number of arguments and does nothing with them; the method body is empty.
 *
 * @param args values that are discarded
 */
public static void ignore(Object... args) {}

/**
 * Rethrows {@code t} as-is, without wrapping it, by delegating to {@code throwEvadingChecks} with
 * the type argument fixed to {@link RuntimeException}.
 *
 * <p>This method never returns normally. The declared return type exists so that callers can
 * write {@code throw ExceptionUtils.throwAnyway(t);} and have the compiler treat the call site as
 * terminating.
 *
 * @param t the throwable to rethrow
 * @return never returns; the return type is only for use in a {@code throw} statement
 */
public static RuntimeException throwAnyway(Throwable t) {
// The explicit <RuntimeException> witness means no checked exception has to be declared here.
throw ExceptionUtils.<RuntimeException>throwEvadingChecks(t);
}

/**
 * Throws {@code throwable} after an unchecked cast to the caller-chosen exception type {@code E}.
 *
 * <p>{@code E} is a type variable, so the cast is unchecked (hence the {@code "unchecked"}
 * suppression) and the throwable is thrown unchanged. A caller that selects an unchecked
 * exception type for {@code E} can therefore propagate a checked exception without declaring it.
 *
 * <p>NOTE(review): {@code "TypeParameterUnusedInFormals"} looks like a static-analysis check name
 * being silenced because {@code E} appears only in the return type and throws clause - confirm.
 *
 * @param throwable the throwable to throw
 * @param <E> the exception type the compiler should assume is thrown
 * @return never returns; declared so the call can be used as the operand of {@code throw}
 * @throws E always; the instance thrown is {@code throwable} itself
 */
@SuppressWarnings({"unchecked", "TypeParameterUnusedInFormals"})
private static <E extends Throwable> E throwEvadingChecks(Throwable throwable) throws E {
throw (E) throwable;
}
}
55 changes: 52 additions & 3 deletions java/fory-format/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,61 @@ Fory row format is heavily inspired by spark tungsten row format, but with chang
- Decimal use arrow decimal format.
- Variable-size field can be inline in fixed-size region if small enough.
- Allow skipping padding by generating the Row ahead of time (AOT), so that offsets are placed in the generated code.
- Support adding fields without breaking compatibility.

The initial fory java row data structure implementation is modified from spark unsafe row/writer.
The initial Fory Java row data structure implementation is modified from Spark's unsafe row/writer.

See `Encoders.bean` Javadoc for a list of built-in supported types.

## Row Format Java

To begin using the row format from Java, start with the `Encoders` class:

```
// Many built-in types and collections are supported
public record MyRecord(int key, String value) {}

// The encoder supplier is relatively expensive to create
// It is thread-safe and should be re-used
Supplier<RowEncoder<MyRecord>> encoderFactory =
Encoders.buildBeanCodec(MyRecord.class)
.build();

// Each individual encoder is relatively cheap to create
// It is not thread-safe, but may be reused by the same thread
var encoder = encoderFactory.get();
byte[] encoded = encoder.encode(new MyRecord(42, "Test"));

MyRecord deserialized = encoder.decode(encoded);
```

## Compact Format

The default row format is cross-language compatible and alignment-padded for maximum performance.
When data size is a greater concern, the compact format provides an alternate encoding that uses
significantly less space.

Enable the compact codec on the encoder builder:

```
Supplier<RowEncoder<MyRecord>> encoderFactory =
Encoders.buildBeanCodec(MyRecord.class)
.compactEncoding()
.build();
```

Optimizations include:

- struct stores fixed-size fields (e.g. Int128, FixedSizeBinary) inline in fixed-data area without offset + size
- struct of all fixed-sized fields is itself considered fixed-size to store in other struct or array
- struct skips null bitmap if all fields are non-nullable
- struct sorts fields by fixed-size for best-effort (but not guaranteed) alignment
- struct can use less than 8 bytes for small data (int, short, etc)
- struct null bitmap stored at end of struct to borrow alignment padding if possible
- array stores fixed-size fields inline in fixed-data area without offset+size
- array header uses 4 bytes for size (since Collection and array are only int-sized) and leaves remaining 4 bytes for start of null bitmap

## Custom Type Registration

It is possible to register custom type handling and collection factories for the row format -
see Encoders.registerCustomCodec and Encoders.registerCustomCollectionFactory. For an interface,
Fory can synthesize a simple value implementation, such as the UuidType below.
Expand Down Expand Up @@ -45,7 +94,7 @@ static class UuidEncoder implements CustomCodec.MemoryBufferCodec<UUID> {
static class SortedSetOfUuidDecoder implements CustomCollectionFactory<UUID, SortedSet<UUID>> {
@Override
public SortedSet<UUID> newCollection(final int size) {
return new TreeSet<>(UnsignedUuidComparator.INSTANCE);
return new TreeSet<>();
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.fory.format.encoder;

import static org.apache.fory.type.TypeUtils.getRawType;

import java.lang.invoke.MethodHandle;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.MethodType;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Supplier;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.fory.format.row.binary.writer.BinaryArrayWriter;
import org.apache.fory.format.type.DataTypes;
import org.apache.fory.format.type.TypeInference;
import org.apache.fory.reflect.TypeRef;
import org.apache.fory.type.TypeUtils;
import org.apache.fory.util.ExceptionUtils;

/**
 * Builder that produces {@link ArrayEncoder} factories for a collection type.
 *
 * <p>The schema is inferred from the collection type when the builder is constructed, and the
 * element field is the field at index 0 of that schema.
 *
 * @param <C> the collection type handled by the encoders created through this builder
 */
public class ArrayCodecBuilder<C extends Collection<?>>
    extends BaseCodecBuilder<ArrayCodecBuilder<C>> {

  private final TypeRef<C> collectionType;
  private final Field elementField;

  ArrayCodecBuilder(final TypeRef<C> collectionType) {
    super(TypeInference.inferSchema(collectionType, false));
    this.collectionType = collectionType;
    this.elementField = DataTypes.fieldOfSchema(schema, 0);
  }

  /**
   * Builds a supplier of array encoders.
   *
   * <p>Every call to {@link Supplier#get()} creates a new {@link BinaryArrayWriter} from the
   * current codec format and wraps a newly created encoder for that writer in a {@code
   * BufferResettingArrayEncoder}.
   *
   * @return a supplier yielding a new encoder on each invocation
   */
  public Supplier<ArrayEncoder<C>> build() {
    final Function<BinaryArrayWriter, ArrayEncoder<C>> encoderForWriter = buildWithWriter();
    // Builder fields are read inside the lambda, i.e. when the supplier is invoked.
    return () -> {
      final BinaryArrayWriter arrayWriter = codecFormat.newArrayWriter(elementField);
      final ArrayEncoder<C> delegate = encoderForWriter.apply(arrayWriter);
      return new BufferResettingArrayEncoder<>(initialBufferSize, arrayWriter, delegate);
    };
  }

  /**
   * Loads the row codecs needed by the collection's bean types and returns a function that creates
   * a {@code BinaryArrayEncoder} bound to the writer passed to it.
   */
  Function<BinaryArrayWriter, ArrayEncoder<C>> buildWithWriter() {
    loadArrayInnerCodecs();
    final Function<BinaryArrayWriter, GeneratedArrayEncoder> generatedFactory =
        generatedEncoderFactory();
    return arrayWriter ->
        new BinaryArrayEncoder<>(arrayWriter, generatedFactory.apply(arrayWriter), sizeEmbedded);
  }

  /**
   * Collects the bean types reachable from the collection type and loads or generates a row codec
   * class for each of them.
   *
   * @throws IllegalArgumentException if no bean type is found
   */
  private void loadArrayInnerCodecs() {
    final Set<TypeRef<?>> beanTypes = new HashSet<>();
    Encoders.findBeanToken(collectionType, beanTypes);
    if (beanTypes.isEmpty()) {
      throw new IllegalArgumentException("can not find bean class.");
    }
    for (final TypeRef<?> beanType : beanTypes) {
      Encoders.loadOrGenRowCodecClass(getRawType(beanType), codecFormat);
    }
  }

  /**
   * Loads or generates the array codec class and returns a function that instantiates it for a
   * given writer through its {@code Object[]} constructor.
   */
  Function<BinaryArrayWriter, GeneratedArrayEncoder> generatedEncoderFactory() {
    final TypeRef<?> elementType = TypeUtils.getElementType(collectionType);
    final Class<?> arrayCodecClass =
        Encoders.loadOrGenArrayCodecClass(collectionType, elementType, codecFormat);
    final MethodHandle constructorHandle = resolveConstructorHandle(arrayCodecClass, elementType);

    return arrayWriter -> {
      // The generated constructor receives its collaborators as a single Object[] argument.
      final Object[] references = {arrayWriter.getField(), arrayWriter, fory};
      try {
        return (GeneratedArrayEncoder) constructorHandle.invokeExact(references);
      } catch (final Throwable t) {
        throw ExceptionUtils.throwAnyway(t);
      }
    };
  }

  /**
   * Resolves a method handle of type {@code (Object[]) -> GeneratedArrayEncoder} for the public
   * {@code Object[]} constructor of the generated codec class.
   *
   * @throws EncoderException if the constructor is missing or not accessible
   */
  private MethodHandle resolveConstructorHandle(
      final Class<?> arrayCodecClass, final TypeRef<?> elementType) {
    final MethodType handleType =
        MethodType.methodType(GeneratedArrayEncoder.class, Object[].class);
    try {
      final var ctor =
          arrayCodecClass.asSubclass(GeneratedArrayEncoder.class).getConstructor(Object[].class);
      return MethodHandles.lookup().unreflectConstructor(ctor).asType(handleType);
    } catch (final NoSuchMethodException | IllegalAccessException e) {
      throw new EncoderException(
          "Failed to construct array codec for "
              + collectionType
              + " with element class "
              + elementType,
          e);
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
import org.apache.fory.codegen.Expression;
import org.apache.fory.codegen.ExpressionUtils;
import org.apache.fory.format.row.binary.BinaryArray;
import org.apache.fory.format.row.binary.writer.BinaryArrayWriter;
import org.apache.fory.format.type.TypeInference;
import org.apache.fory.logging.Logger;
import org.apache.fory.logging.LoggerFactory;
Expand Down Expand Up @@ -104,13 +103,13 @@ public String genCode() {
"arrayWriter",
ROOT_ARRAY_WRITER_NAME,
"arrayWriterType",
ctx.type(BinaryArrayWriter.class),
arrayWriterType(),
"fory",
FORY_NAME,
"foryType",
ctx.type(Fory.class));
ctx.addField(ctx.type(Field.class), FIELD_NAME);
ctx.addField(ctx.type(BinaryArrayWriter.class), ROOT_ARRAY_WRITER_NAME);
ctx.addField(ctx.type(arrayWriterType()), ROOT_ARRAY_WRITER_NAME);
ctx.addField(ctx.type(Fory.class), FORY_NAME);

Expression encodeExpr = buildEncodeExpression();
Expand All @@ -136,7 +135,7 @@ public String genCode() {
@Override
public Expression buildEncodeExpression() {
Expression.Reference arrayWriter =
new Expression.Reference(ROOT_ARRAY_WRITER_NAME, arrayWriterTypeToken, false);
new Expression.Reference(ROOT_ARRAY_WRITER_NAME, arrayWriterType(), false);
Expression.ListExpression expressions = new Expression.ListExpression();

Expression.Reference inputObject =
Expand All @@ -148,7 +147,7 @@ public Expression buildEncodeExpression() {

Expression.Reference fieldExpr = new Expression.Reference(FIELD_NAME, ARROW_FIELD_TYPE, false);
Expression listExpression =
serializeForArrayByWriter(array, arrayWriter, arrayToken, fieldExpr);
serializeForArrayByWriter(array, arrayWriter, arrayToken, null, fieldExpr);

expressions.add(listExpression);

Expand Down
Loading
Loading