29 changes: 28 additions & 1 deletion docs/development/extensions-contrib/iceberg.md
@@ -46,7 +46,7 @@ The `druid-iceberg-extensions` extension relies on the existing input source con
For Druid to seamlessly talk to the Hive metastore, ensure that the Hive configuration files such as `hive-site.xml` and `core-site.xml` are available in the Druid classpath for peon processes.
You can also specify Hive properties under the `catalogProperties` object in the ingestion spec.

The `druid-iceberg-extensions` extension presently only supports HDFS, S3 and local warehouse directories.
The `druid-iceberg-extensions` extension presently supports HDFS, S3, GCS, and local warehouse directories.

### Read from HDFS warehouse

@@ -106,6 +106,33 @@ The following properties are required in the `catalogProperties`:
```
Since the Hadoop AWS connector uses the `s3a` filesystem client, specify the warehouse path with the `s3a://` protocol instead of `s3://`.

### Read from GCS warehouse

To read from a GCS warehouse, load the `druid-google-extensions` extension. Druid extracts the
data file paths from the catalog and uses `GoogleCloudStorageInputSource` to ingest these files.
Set the `type` property of the `warehouseSource` object to `google` in the ingestion spec:

```json
"warehouseSource": {
"type": "google"
}
```

For the Iceberg catalog to read its own metadata files from GCS, set `io-impl` to
`org.apache.iceberg.gcp.gcs.GCSFileIO` in `catalogProperties`:

```json
"catalogProperties": {
"io-impl": "org.apache.iceberg.gcp.gcs.GCSFileIO",
"gcs.project-id": "my-gcp-project",
"warehouse": "gs://my-bucket/warehouse"
}
```

Authentication uses Application Default Credentials (ADC). Ensure that the Druid processes
have access to valid GCP credentials, for example through a service account key file referenced by the
`GOOGLE_APPLICATION_CREDENTIALS` environment variable, Workload Identity, or the Compute Engine metadata server.
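
For reference, here is a minimal sketch of how these settings might fit together in an `iceberg` input source. It assumes the Hive catalog fields (`warehousePath`, `catalogUri`) described elsewhere on this page; the table name, namespace, bucket, project, and metastore address are placeholders:

```json
"inputSource": {
  "type": "iceberg",
  "tableName": "iceberg_table",
  "namespace": "iceberg_namespace",
  "icebergCatalog": {
    "type": "hive",
    "warehousePath": "gs://my-bucket/warehouse",
    "catalogUri": "thrift://hive-metastore:9083",
    "catalogProperties": {
      "io-impl": "org.apache.iceberg.gcp.gcs.GCSFileIO",
      "gcs.project-id": "my-gcp-project",
      "warehouse": "gs://my-bucket/warehouse"
    }
  },
  "warehouseSource": {
    "type": "google"
  }
}
```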

## Local catalog

The local catalog type can be used for catalogs configured on the local filesystem. Set the `icebergCatalog` type to `local`. You can use this catalog for demos or localized tests. It is not recommended for production use cases.
18 changes: 18 additions & 0 deletions extensions-contrib/druid-iceberg-extensions/pom.xml
@@ -230,6 +230,10 @@
<groupId>org.xerial.snappy</groupId>
<artifactId>snappy-java</artifactId>
</exclusion>
<exclusion>
<groupId>com.google.re2j</groupId>
<artifactId>re2j</artifactId>
</exclusion>
</exclusions>
</dependency>

@@ -293,6 +297,18 @@
<version>${awssdk.version}</version>
</dependency>

<!-- Iceberg GCSFileIO for reading metadata from GCS -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-gcp</artifactId>
<version>${iceberg.core.version}</version>
</dependency>
<!-- Required: iceberg-gcp declares google-cloud-storage as compileOnly (not transitive) -->
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-storage</artifactId>
<version>${com.google.cloud.storage.version}</version>
</dependency>

<dependency>
<groupId>org.apache.hive</groupId>
@@ -776,6 +792,8 @@
<ignoredUnusedDeclaredDependency>software.amazon.awssdk:glue</ignoredUnusedDeclaredDependency>
<ignoredUnusedDeclaredDependency>software.amazon.awssdk:s3</ignoredUnusedDeclaredDependency>
<ignoredUnusedDeclaredDependency>software.amazon.awssdk:sts</ignoredUnusedDeclaredDependency>
<ignoredUnusedDeclaredDependency>org.apache.iceberg:iceberg-gcp</ignoredUnusedDeclaredDependency>
<ignoredUnusedDeclaredDependency>com.google.cloud:google-cloud-storage</ignoredUnusedDeclaredDependency>
</ignoredUnusedDeclaredDependencies>
</configuration>
</plugin>
@@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.data.input.google;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import org.apache.druid.data.input.InputSourceFactory;
import org.apache.druid.data.input.impl.SplittableInputSource;
import org.apache.druid.data.input.impl.systemfield.SystemFields;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.storage.google.GoogleInputDataConfig;
import org.apache.druid.storage.google.GoogleStorage;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
import java.util.stream.Collectors;

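/**
 * {@link InputSourceFactory} for Google Cloud Storage. Given the data file paths that Druid
 * extracts from an Iceberg catalog, it builds a {@link GoogleCloudStorageInputSource} that
 * reads those objects during ingestion.
 */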
public class GoogleCloudStorageInputSourceFactory implements InputSourceFactory
{
private final GoogleStorage storage;
private final GoogleInputDataConfig inputDataConfig;

@JsonCreator
public GoogleCloudStorageInputSourceFactory(
@JacksonInject GoogleStorage storage,
@JacksonInject GoogleInputDataConfig inputDataConfig
)
{
this.storage = storage;
this.inputDataConfig = inputDataConfig;
}

@Override
public SplittableInputSource create(List<String> inputFilePaths)
{
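// Convert each catalog-provided file path into a URI, failing fast on malformed paths.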
final List<URI> uris = inputFilePaths.stream().map(chosenPath -> {
try {
return new URI(chosenPath);
}
catch (URISyntaxException e) {
throw new IAE(e, "Invalid input file path [%s]", chosenPath);
}
}).collect(Collectors.toList());

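// Only explicit URIs are passed; prefixes, objects, and objectGlob stay null, and no system fields are requested.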
return new GoogleCloudStorageInputSource(
storage,
inputDataConfig,
uris,
null,
null,
null,
SystemFields.none()
);
}
}
@@ -32,6 +32,7 @@
import com.google.inject.multibindings.MapBinder;
import org.apache.druid.data.SearchableVersionedDataFinder;
import org.apache.druid.data.input.google.GoogleCloudStorageInputSource;
import org.apache.druid.data.input.google.GoogleCloudStorageInputSourceFactory;
import org.apache.druid.guice.Binders;
import org.apache.druid.guice.JsonConfigProvider;
import org.apache.druid.guice.LazySingleton;
@@ -74,7 +75,8 @@ public void setupModule(SetupContext context)
}
},
new SimpleModule().registerSubtypes(
new NamedType(GoogleCloudStorageInputSource.class, SCHEME)
new NamedType(GoogleCloudStorageInputSource.class, SCHEME),
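// Register the warehouse source factory under the same "google" type name as the input source itself.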
new NamedType(GoogleCloudStorageInputSourceFactory.class, SCHEME)
)
);
}
@@ -0,0 +1,162 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.data.input.google;

import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.druid.data.input.InputSourceFactory;
import org.apache.druid.data.input.impl.SplittableInputSource;
import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.storage.google.GoogleInputDataConfig;
import org.apache.druid.storage.google.GoogleStorage;
import org.apache.druid.storage.google.GoogleStorageDruidModule;
import org.easymock.EasyMock;
import org.junit.Assert;
import org.junit.Test;

import java.net.URI;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class GoogleCloudStorageInputSourceFactoryTest
{
private static final GoogleStorage STORAGE = EasyMock.createMock(GoogleStorage.class);
private static final GoogleInputDataConfig INPUT_DATA_CONFIG = EasyMock.createMock(GoogleInputDataConfig.class);

private static ObjectMapper createObjectMapper()
{
final ObjectMapper mapper = new DefaultObjectMapper();
new GoogleStorageDruidModule().getJacksonModules().forEach(mapper::registerModule);
mapper.setInjectableValues(
new InjectableValues.Std()
.addValue(GoogleStorage.class, STORAGE)
.addValue(GoogleInputDataConfig.class, INPUT_DATA_CONFIG)
);
return mapper;
}

@Test
public void testCreateReturnsGoogleCloudStorageInputSource()
{
final GoogleCloudStorageInputSourceFactory factory = new GoogleCloudStorageInputSourceFactory(
STORAGE,
INPUT_DATA_CONFIG
);
final List<String> paths = Arrays.asList(
"gs://foo/bar/file.csv",
"gs://bar/foo/file2.csv"
);

final SplittableInputSource inputSource = factory.create(paths);
Assert.assertTrue(inputSource instanceof GoogleCloudStorageInputSource);
}

@Test
public void testCreateWithMultiplePaths()
{
final GoogleCloudStorageInputSourceFactory factory = new GoogleCloudStorageInputSourceFactory(
STORAGE,
INPUT_DATA_CONFIG
);
final List<String> paths = Arrays.asList(
"gs://foo/bar/file.csv",
"gs://bar/foo/file2.csv",
"gs://baz/qux/file3.txt"
);

final GoogleCloudStorageInputSource inputSource = (GoogleCloudStorageInputSource) factory.create(paths);
Assert.assertNotNull(inputSource.getUris());
Assert.assertEquals(3, inputSource.getUris().size());
Assert.assertEquals(URI.create("gs://foo/bar/file.csv"), inputSource.getUris().get(0));
Assert.assertEquals(URI.create("gs://bar/foo/file2.csv"), inputSource.getUris().get(1));
Assert.assertEquals(URI.create("gs://baz/qux/file3.txt"), inputSource.getUris().get(2));
}

@Test
public void testCreatePreservesGsScheme()
{
final GoogleCloudStorageInputSourceFactory factory = new GoogleCloudStorageInputSourceFactory(
STORAGE,
INPUT_DATA_CONFIG
);
final List<String> paths = Collections.singletonList("gs://bucket/path/to/file.csv");

final GoogleCloudStorageInputSource inputSource = (GoogleCloudStorageInputSource) factory.create(paths);
final URI uri = inputSource.getUris().get(0);
Assert.assertEquals("gs", uri.getScheme());
Assert.assertEquals("bucket", uri.getHost());
Assert.assertEquals("/path/to/file.csv", uri.getPath());
}

@Test(expected = IllegalArgumentException.class)
public void testCreateWithEmptyListThrows()
{
final GoogleCloudStorageInputSourceFactory factory = new GoogleCloudStorageInputSourceFactory(
STORAGE,
INPUT_DATA_CONFIG
);
factory.create(Collections.emptyList());
}

@Test
public void testCreateWithEncodedPaths()
{
final GoogleCloudStorageInputSourceFactory factory = new GoogleCloudStorageInputSourceFactory(
STORAGE,
INPUT_DATA_CONFIG
);
final List<String> paths = Collections.singletonList("gs://bucket/path%20with%20spaces/file.csv");

final GoogleCloudStorageInputSource inputSource = (GoogleCloudStorageInputSource) factory.create(paths);
final URI uri = inputSource.getUris().get(0);
Assert.assertEquals("gs://bucket/path%20with%20spaces/file.csv", uri.toString());
}

@Test
public void testJacksonDeserialization() throws Exception
{
final ObjectMapper mapper = createObjectMapper();

final String json = "{\"type\": \"google\"}";
final InputSourceFactory factory = mapper.readValue(json, InputSourceFactory.class);
Assert.assertTrue(factory instanceof GoogleCloudStorageInputSourceFactory);
}

@Test
public void testSerializationRoundTrip() throws Exception
{
final ObjectMapper mapper = createObjectMapper();

final GoogleCloudStorageInputSourceFactory original = new GoogleCloudStorageInputSourceFactory(
STORAGE,
INPUT_DATA_CONFIG
);

final String json = mapper.writeValueAsString(original);
Assert.assertTrue(json.contains("\"type\":\"google\""));

final GoogleCloudStorageInputSourceFactory deserialized = mapper.readValue(
json,
GoogleCloudStorageInputSourceFactory.class
);
Assert.assertNotNull(deserialized);
}
}
4 changes: 4 additions & 0 deletions website/.spelling
@@ -28,6 +28,7 @@ docusaurus
64-bit
ACL
ACLs
ADC
APIs
apache.org
AvroStorage
@@ -113,6 +114,7 @@ Float.POSITIVE_INFINITY.
ForwardedRequestCustomizer
GC
GPG
GoogleCloudStorageInputSource
GSSAPI
GUIs
GroupBy
@@ -1135,7 +1137,9 @@ POD_NAME
POD_NAMESPACE
ConfigMap
PT17S
GCP
GCS
GCSFileIO
gcs-connector
hdfs
Aotearoa