Skip to content

Commit 0b52ac4

Browse files
authored
Disable support for reading "Content-Encoding: gzip" files by default GoogleCloudDataproc#228 (GoogleCloudDataproc#232)
1 parent 6d8fa11 commit 0b52ac4

18 files changed

+309
-37
lines changed

cloudbuild/cloudbuild.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,8 @@ steps:
5858
- 'VCS_TAG=$TAG_NAME'
5959
- 'CI_BUILD_ID=$BUILD_ID'
6060

61-
# Tests take on average 17 minutes to run
62-
timeout: 1800s
61+
# Tests take on average 25 minutes to run
62+
timeout: 2400s
6363

6464
options:
6565
machineType: 'N1_HIGHCPU_32'

gcs/CHANGES.md

+9
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,15 @@
6464

6565
1. Implement Hadoop File System `append` method using GCS compose API.
6666

67+
1. Disable support for reading GZIP encoded files (HTTP header
68+
`Content-Encoding: gzip`) because processing of
69+
[GZIP encoded](https://cloud.google.com/storage/docs/transcoding#decompressive_transcoding)
70+
files is inefficient and error-prone in Hadoop and Spark.
71+
72+
This feature is configurable with the property:
73+
74+
fs.gs.inputstream.support.gzip.encoding.enable (default: false)
75+
6776
### 1.9.14 - 2019-02-13
6877

6978
1. Implement Hadoop File System `concat` method using GCS compose API.

gcs/CONFIGURATION.md

+9
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,15 @@ exist at the same path on all nodes
253253
has independently already ensured that a file exists before calling open(),
254254
then set this property to false for more efficient reads.
255255

256+
* `fs.gs.inputstream.support.gzip.encoding.enable` (default: `false`)
257+
258+
If set to `false` then reading files with GZIP content encoding (HTTP header
259+
`Content-Encoding: gzip`) will result in failure (`IOException` is thrown).
260+
261+
This feature is disabled by default because processing of
262+
[GZIP encoded](https://cloud.google.com/storage/docs/transcoding#decompressive_transcoding)
263+
files is inefficient and error-prone in Hadoop and Spark.
264+
256265
* `fs.gs.generation.read.consistency` (default: `LATEST`)
257266

258267
Determines read consistency across different generations of a Cloud Storage

gcs/conf/gcs-core-default.xml

+9
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,15 @@
572572
</description>
573573
</property>
574574

575+
<property>
576+
<name>fs.gs.inputstream.support.gzip.encoding.enable</name>
577+
<value>false</value>
578+
<description>
579+
If set to false then reading files with GZIP content encoding (HTTP header
580+
"Content-Encoding: gzip") will result in failure (`IOException` is thrown).
581+
</description>
582+
</property>
583+
575584
<property>
576585
<name>fs.gs.inputstream.inplace.seek.limit</name>
577586
<value>8388608</value>

gcs/src/main/java/com/google/cloud/hadoop/fs/gcs/GoogleHadoopFileSystemConfiguration.java

+11
Original file line numberDiff line numberDiff line change
@@ -478,6 +478,15 @@ public class GoogleHadoopFileSystemConfiguration {
478478
new GoogleHadoopFileSystemConfigurationProperty<>(
479479
"fs.gs.inputstream.fast.fail.on.not.found.enable", true);
480480

481+
/**
482+
* If false, reading a file with GZIP content encoding (HTTP header "Content-Encoding: gzip") will
483+
* result in failure (IOException is thrown).
484+
*/
485+
public static final GoogleHadoopFileSystemConfigurationProperty<Boolean>
486+
GCS_INPUT_STREAM_SUPPORT_GZIP_ENCODING_ENABLE =
487+
new GoogleHadoopFileSystemConfigurationProperty<>(
488+
"fs.gs.inputstream.support.gzip.encoding.enable", false);
489+
481490
/**
482491
* If forward seeks are within this many bytes of the current position, seeks are performed by
483492
* reading and discarding bytes in-place rather than opening a new underlying stream.
@@ -600,6 +609,8 @@ private static GoogleCloudStorageReadOptions getReadChannelOptions(Configuration
600609
return GoogleCloudStorageReadOptions.builder()
601610
.setFastFailOnNotFound(
602611
GCS_INPUT_STREAM_FAST_FAIL_ON_NOT_FOUND_ENABLE.get(config, config::getBoolean))
612+
.setSupportGzipEncoding(
613+
GCS_INPUT_STREAM_SUPPORT_GZIP_ENCODING_ENABLE.get(config, config::getBoolean))
603614
.setInplaceSeekLimit(GCS_INPUT_STREAM_INPLACE_SEEK_LIMIT.get(config, config::getLong))
604615
.setBufferSize(GCS_INPUT_STREAM_BUFFER_SIZE.get(config, config::getInt))
605616
.setFadvise(GCS_INPUT_STREAM_FADVISE.get(config, config::getEnum))

gcs/src/test/java/com/google/cloud/hadoop/fs/gcs/GoogleHadoopFileSystemConfigurationPropertyTest.java

+1
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ public class GoogleHadoopFileSystemConfigurationPropertyTest {
8888
put("fs.gs.auth.client.file", null);
8989
put("fs.gs.inputstream.buffer.size", 0);
9090
put("fs.gs.inputstream.fast.fail.on.not.found.enable", true);
91+
put("fs.gs.inputstream.support.gzip.encoding.enable", false);
9192
put("fs.gs.generation.read.consistency", GenerationReadConsistency.LATEST);
9293
put("fs.gs.outputstream.buffer.size", 8388608);
9394
put("fs.gs.outputstream.pipe.buffer.size", 1048576);

gcsio/src/main/java/com/google/cloud/hadoop/gcsio/CreateObjectOptions.java

+31-16
Original file line numberDiff line numberDiff line change
@@ -17,21 +17,23 @@
1717

1818
package com.google.cloud.hadoop.gcsio;
1919

20-
import com.google.common.base.Preconditions;
20+
import static com.google.common.base.Preconditions.checkArgument;
21+
2122
import com.google.common.collect.ImmutableMap;
2223
import java.util.Map;
2324

2425
/**
2526
* Options for creating objects in GCS.
2627
*/
2728
public class CreateObjectOptions {
28-
public static final ImmutableMap<String, byte[]> EMPTY_METADATA =
29-
ImmutableMap.<String, byte[]>of();
29+
public static final ImmutableMap<String, byte[]> EMPTY_METADATA = ImmutableMap.of();
3030
public static final String DEFAULT_CONTENT_TYPE = "application/octet-stream";
31+
public static final String DEFAULT_CONTENT_ENCODING = null;
3132
public static final CreateObjectOptions DEFAULT = new CreateObjectOptions(true);
3233

3334
private final boolean overwriteExisting;
3435
private final String contentType;
36+
private final String contentEncoding;
3537
private final Map<String, byte[]> metadata;
3638
private final boolean requireMetadataMatchForEmptyObjects;
3739

@@ -43,7 +45,7 @@ public class CreateObjectOptions {
4345
* @param overwriteExisting True to overwrite any existing objects with the same name.
4446
*/
4547
public CreateObjectOptions(boolean overwriteExisting) {
46-
this(overwriteExisting, DEFAULT_CONTENT_TYPE, EMPTY_METADATA, false);
48+
this(overwriteExisting, DEFAULT_CONTENT_TYPE, DEFAULT_CONTENT_ENCODING, EMPTY_METADATA, false);
4749
}
4850

4951
/**
@@ -56,7 +58,7 @@ public CreateObjectOptions(boolean overwriteExisting) {
5658
* @param metadata A dictionary of metadata to apply to created objects.
5759
*/
5860
public CreateObjectOptions(boolean overwriteExisting, Map<String, byte[]> metadata) {
59-
this(overwriteExisting, DEFAULT_CONTENT_TYPE, metadata, true);
61+
this(overwriteExisting, DEFAULT_CONTENT_TYPE, metadata);
6062
}
6163

6264
/**
@@ -70,30 +72,38 @@ public CreateObjectOptions(boolean overwriteExisting, Map<String, byte[]> metada
7072
*/
7173
public CreateObjectOptions(
7274
boolean overwriteExisting, String contentType, Map<String, byte[]> metadata) {
73-
this(overwriteExisting, contentType, metadata, true);
75+
this(overwriteExisting, contentType, DEFAULT_CONTENT_ENCODING, metadata, true);
7476
}
7577

7678
/**
7779
* Construct a new CreateObjectOptions with the specified metadata, content type and encoding.
7880
*
7981
* @param overwriteExisting True to overwrite any existing objects with the same name
8082
* @param contentType content-type for the created file
83+
* @param contentEncoding content-encoding for the created file
8184
* @param metadata A dictionary of metadata to apply to created objects
82-
* @param requireMetadataMatchForEmptyObjects if true, when creating an empty object and
83-
* certain types of errors occur, any existing object is checked for an exact metadata
84-
* match to the metadata in this CreateObjectOptions before accepting the creation as
85-
* successful. If false, then on error for creating empty objects, as long as an
86-
* appropriate empty object already exists, even if it holds different metadata than
87-
* provided in this CreateObjectOptions instance, it may be considered created
88-
* successfully.
85+
* @param requireMetadataMatchForEmptyObjects if true, when creating an empty object and certain
86+
* types of errors occur, any existing object is checked for an exact metadata match to the
87+
* metadata in this CreateObjectOptions before accepting the creation as successful. If false,
88+
* then on error for creating empty objects, as long as an appropriate empty object already
89+
* exists, even if it holds different metadata than provided in this CreateObjectOptions
90+
* instance, it may be considered created successfully.
8991
*/
9092
public CreateObjectOptions(
91-
boolean overwriteExisting, String contentType, Map<String, byte[]> metadata,
93+
boolean overwriteExisting,
94+
String contentType,
95+
String contentEncoding,
96+
Map<String, byte[]> metadata,
9297
boolean requireMetadataMatchForEmptyObjects) {
93-
Preconditions.checkArgument(!metadata.containsKey("Content-Type"),
94-
"The Content-Type metadata must be provided explicitly via the 'contentType' parameter");
98+
checkArgument(
99+
!metadata.containsKey("Content-Type"),
100+
"The Content-Type must be provided explicitly via the 'contentType' parameter");
101+
checkArgument(
102+
!metadata.containsKey("Content-Encoding"),
103+
"The Content-Encoding must be provided explicitly via the 'contentEncoding' parameter");
95104
this.overwriteExisting = overwriteExisting;
96105
this.contentType = contentType;
106+
this.contentEncoding = contentEncoding;
97107
this.metadata = metadata;
98108
this.requireMetadataMatchForEmptyObjects = requireMetadataMatchForEmptyObjects;
99109
}
@@ -112,6 +122,11 @@ public String getContentType() {
112122
return contentType;
113123
}
114124

125+
/** Content encoding to set when creating a file. */
126+
public String getContentEncoding() {
127+
return contentEncoding;
128+
}
129+
115130
/**
116131
* Custom metadata to apply to this object.
117132
*/

gcsio/src/main/java/com/google/cloud/hadoop/gcsio/GoogleCloudStorageImpl.java

+10-5
Original file line numberDiff line numberDiff line change
@@ -376,10 +376,12 @@ public WritableByteChannel create(final StorageResourceId resourceId, CreateObje
376376
clientRequestHelper,
377377
resourceId.getBucketName(),
378378
resourceId.getObjectName(),
379+
options.getContentType(),
380+
options.getContentEncoding(),
381+
/* kmsKeyName= */ null,
379382
storageOptions.getWriteChannelOptions(),
380383
writeConditions,
381-
rewrittenMetadata,
382-
options.getContentType()) {
384+
rewrittenMetadata) {
383385

384386
@Override
385387
public Storage.Objects.Insert createRequest(InputStreamContent inputStream)
@@ -1120,10 +1122,12 @@ public List<GoogleCloudStorageItemInfo> listBucketInfo()
11201122
*/
11211123
private Storage.Objects.Insert prepareEmptyInsert(
11221124
StorageResourceId resourceId, CreateObjectOptions createObjectOptions) throws IOException {
1123-
StorageObject object = new StorageObject();
1124-
object.setName(resourceId.getObjectName());
11251125
Map<String, String> rewrittenMetadata = encodeMetadata(createObjectOptions.getMetadata());
1126-
object.setMetadata(rewrittenMetadata);
1126+
StorageObject object =
1127+
new StorageObject()
1128+
.setName(resourceId.getObjectName())
1129+
.setMetadata(rewrittenMetadata)
1130+
.setContentEncoding(createObjectOptions.getContentEncoding());
11271131

11281132
// Ideally we'd use EmptyContent, but Storage requires an AbstractInputStreamContent and not
11291133
// just an HttpContent, so we'll just use the next easiest thing.
@@ -2004,6 +2008,7 @@ public GoogleCloudStorageItemInfo composeObjects(
20042008
.setDestination(
20052009
new StorageObject()
20062010
.setContentType(options.getContentType())
2011+
.setContentEncoding(options.getContentEncoding())
20072012
.setMetadata(encodeMetadata(options.getMetadata())))),
20082013
destination.getBucketName());
20092014

gcsio/src/main/java/com/google/cloud/hadoop/gcsio/GoogleCloudStorageReadChannel.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -809,11 +809,11 @@ protected void initMetadata(
809809
!metadataInitialized,
810810
"can not initialize metadata, it already initialized for '%s'", resourceIdString);
811811
gzipEncoded = nullToEmpty(encoding).contains(GZIP_ENCODING);
812-
if (gzipEncoded) {
813-
size = Long.MAX_VALUE;
814-
} else {
815-
size = sizeFromMetadata;
812+
if (gzipEncoded && !readOptions.getSupportGzipEncoding()) {
813+
throw new IOException(
814+
"Can't read GZIP encoded files - content encoding support is disabled.");
816815
}
816+
size = gzipEncoded ? Long.MAX_VALUE : sizeFromMetadata;
817817
randomAccess = !gzipEncoded && readOptions.getFadvise() == Fadvise.RANDOM;
818818
checkEncodingAndAccess();
819819

gcsio/src/main/java/com/google/cloud/hadoop/gcsio/GoogleCloudStorageReadOptions.java

+13-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
public abstract class GoogleCloudStorageReadOptions {
2929

3030
/** Operational modes of fadvise feature. */
31-
public static enum Fadvise {
31+
public enum Fadvise {
3232
AUTO,
3333
RANDOM,
3434
SEQUENTIAL
@@ -47,6 +47,7 @@ public enum GenerationReadConsistency {
4747
public static final int DEFAULT_BACKOFF_MAX_INTERVAL_MILLIS = 10 * 1000;
4848
public static final int DEFAULT_BACKOFF_MAX_ELAPSED_TIME_MILLIS = 2 * 60 * 1000;
4949
public static final boolean DEFAULT_FAST_FAIL_ON_NOT_FOUND = true;
50+
public static final boolean DEFAULT_SUPPORT_GZIP_ENCODING = true;
5051
public static final int DEFAULT_BUFFER_SIZE = 0;
5152
public static final long DEFAULT_INPLACE_SEEK_LIMIT = 0L;
5253
public static final Fadvise DEFAULT_FADVISE = Fadvise.SEQUENTIAL;
@@ -66,6 +67,7 @@ public static Builder builder() {
6667
.setBackoffMaxIntervalMillis(DEFAULT_BACKOFF_MAX_INTERVAL_MILLIS)
6768
.setBackoffMaxElapsedTimeMillis(DEFAULT_BACKOFF_MAX_ELAPSED_TIME_MILLIS)
6869
.setFastFailOnNotFound(DEFAULT_FAST_FAIL_ON_NOT_FOUND)
70+
.setSupportGzipEncoding(DEFAULT_SUPPORT_GZIP_ENCODING)
6971
.setBufferSize(DEFAULT_BUFFER_SIZE)
7072
.setInplaceSeekLimit(DEFAULT_INPLACE_SEEK_LIMIT)
7173
.setFadvise(DEFAULT_FADVISE)
@@ -91,6 +93,9 @@ public static Builder builder() {
9193
/** See {@link Builder#setFastFailOnNotFound}. */
9294
public abstract boolean getFastFailOnNotFound();
9395

96+
/** See {@link Builder#setSupportGzipEncoding}. */
97+
public abstract boolean getSupportGzipEncoding();
98+
9499
/** See {@link Builder#setBufferSize}. */
95100
public abstract int getBufferSize();
96101

@@ -153,6 +158,13 @@ public abstract static class Builder {
153158
*/
154159
public abstract Builder setFastFailOnNotFound(boolean fastFailOnNotFound);
155160

161+
/**
162+
* If false then reading a file with GZIP content encoding (HTTP header "Content-Encoding:
163+
* gzip") will result in failure (IOException is thrown). If true then GZIP-encoded files will
164+
* be read successfully.
165+
*/
166+
public abstract Builder setSupportGzipEncoding(boolean supportGzipEncoding);
167+
156168
/**
157169
* If set to a positive value, low-level streams will be wrapped inside a BufferedInputStream of
158170
* this size. Otherwise no buffer will be created to wrap the low-level streams. Note that the

gcsio/src/main/java/com/google/cloud/hadoop/gcsio/GoogleCloudStorageWriteChannel.java

+43
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ public class GoogleCloudStorageWriteChannel
3838
private final Storage gcs;
3939
private final String bucketName;
4040
private final String objectName;
41+
private final String contentEncoding;
4142
private final String kmsKeyName;
4243
private final ObjectWriteConditions writeConditions;
4344
private final Map<String, String> metadata;
@@ -127,13 +128,53 @@ public GoogleCloudStorageWriteChannel(
127128
* @param writeConditions conditions on which write should be allowed to continue
128129
* @param objectMetadata metadata to apply to the newly created object
129130
*/
131+
@Deprecated
132+
public GoogleCloudStorageWriteChannel(
133+
ExecutorService uploadThreadPool,
134+
Storage gcs,
135+
ClientRequestHelper<StorageObject> requestHelper,
136+
String bucketName,
137+
String objectName,
138+
String contentType,
139+
String kmsKeyName,
140+
AsyncWriteChannelOptions options,
141+
ObjectWriteConditions writeConditions,
142+
Map<String, String> objectMetadata) {
143+
this(
144+
uploadThreadPool,
145+
gcs,
146+
requestHelper,
147+
bucketName,
148+
objectName,
149+
contentType,
150+
/* contentEncoding= */ null,
151+
kmsKeyName,
152+
options,
153+
writeConditions,
154+
objectMetadata);
155+
}
156+
/**
157+
* Constructs an instance of GoogleCloudStorageWriteChannel.
158+
*
159+
* @param uploadThreadPool thread pool to use for running the upload operation
160+
* @param gcs storage object instance
161+
* @param requestHelper a ClientRequestHelper to set extra headers
162+
* @param bucketName name of the bucket to create object in
163+
* @param objectName name of the object to create
164+
* @param contentType content type
165+
* @param contentEncoding content encoding
166+
* @param kmsKeyName Name of Cloud KMS key to use to encrypt the newly created object
167+
* @param writeConditions conditions on which write should be allowed to continue
168+
* @param objectMetadata metadata to apply to the newly created object
169+
*/
130170
public GoogleCloudStorageWriteChannel(
131171
ExecutorService uploadThreadPool,
132172
Storage gcs,
133173
ClientRequestHelper<StorageObject> requestHelper,
134174
String bucketName,
135175
String objectName,
136176
String contentType,
177+
String contentEncoding,
137178
String kmsKeyName,
138179
AsyncWriteChannelOptions options,
139180
ObjectWriteConditions writeConditions,
@@ -146,6 +187,7 @@ public GoogleCloudStorageWriteChannel(
146187
if (contentType != null) {
147188
setContentType(contentType);
148189
}
190+
this.contentEncoding = contentEncoding;
149191
this.kmsKeyName = kmsKeyName;
150192
this.writeConditions = writeConditions;
151193
this.metadata = objectMetadata;
@@ -156,6 +198,7 @@ public Insert createRequest(InputStreamContent inputStream) throws IOException {
156198
// Create object with the given name and metadata.
157199
StorageObject object =
158200
new StorageObject()
201+
.setContentEncoding(contentEncoding)
159202
.setMetadata(metadata)
160203
.setName(objectName);
161204

gcsio/src/main/java/com/google/cloud/hadoop/gcsio/testing/InMemoryGoogleCloudStorage.java

+1
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ public synchronized WritableByteChannel create(
143143
resourceId.getObjectName(),
144144
clock.currentTimeMillis(),
145145
options.getContentType(),
146+
options.getContentEncoding(),
146147
options.getMetadata());
147148
bucketLookup.get(resourceId.getBucketName()).add(entry);
148149
return entry.getWriteChannel();

0 commit comments

Comments
 (0)