diff --git a/CHANGES.md b/CHANGES.md index ea3620903..b6c144548 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -3,7 +3,8 @@ ## Next * Issue #1290: Stopped using metadata for optimized count path * Issue #1317: Improving OpenLineage 1.24.0+ compatibility -* PR #1311 : Improve read session expired error message +* PR #1311: Improve read session expired error message +* PR #1320: Set the `temporaryGcsBucket` to default to `fs.gs.system.bucket` if it exists, removing the need to set it in Dataproc clusters. ## 0.41.0 - 2024-09-05 diff --git a/README-template.md b/README-template.md index a4ec1bd30..e84eaefbc 100644 --- a/README-template.md +++ b/README-template.md @@ -573,7 +573,8 @@ word-break:break-word The GCS bucket that temporarily holds the data before it is loaded to BigQuery. Required unless set in the Spark configuration (spark.conf.set(...)). -
Not supported by the `DIRECT` write method. +
Defaults to the `fs.gs.system.bucket` if it exists, for example on Google Cloud Dataproc clusters, starting from version 0.42.0. +
Supported only by the `INDIRECT` write method. Write @@ -583,7 +584,7 @@ word-break:break-word The GCS bucket that holds the data before it is loaded to BigQuery. If informed, the data won't be deleted after write data into BigQuery. -
Not supported by the `DIRECT` write method. +
Supported only by the `INDIRECT` write method. Write diff --git a/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/SparkBigQueryConfig.java b/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/SparkBigQueryConfig.java index fd1543d0f..e61fc298a 100644 --- a/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/SparkBigQueryConfig.java +++ b/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/SparkBigQueryConfig.java @@ -171,6 +171,8 @@ public static WriteMethod from(@Nullable String writeMethod) { public static final String BIG_NUMERIC_DEFAULT_PRECISION = "bigNumericDefaultPrecision"; public static final String BIG_NUMERIC_DEFAULT_SCALE = "bigNumericDefaultScale"; + private static final String DATAPROC_SYSTEM_BUCKET_CONFIGURATION = "fs.gs.system.bucket"; + TableId tableId; // as the config needs to be Serializable, internally it uses // com.google.common.base.Optional but externally it uses the regular java.util.Optional @@ -398,7 +400,10 @@ public static SparkBigQueryConfig from( .orNull(); config.defaultParallelism = defaultParallelism; config.temporaryGcsBucket = - stripPrefix(getAnyOption(globalOptions, options, "temporaryGcsBucket")); + stripPrefix(getAnyOption(globalOptions, options, "temporaryGcsBucket")) + .or( + com.google.common.base.Optional.fromNullable( + hadoopConfiguration.get(DATAPROC_SYSTEM_BUCKET_CONFIGURATION))); config.persistentGcsBucket = stripPrefix(getAnyOption(globalOptions, options, "persistentGcsBucket")); config.persistentGcsPath = getOption(options, "persistentGcsPath"); diff --git a/spark-bigquery-connector-common/src/test/java/com/google/cloud/spark/bigquery/SparkBigQueryConfigTest.java b/spark-bigquery-connector-common/src/test/java/com/google/cloud/spark/bigquery/SparkBigQueryConfigTest.java index 9dfbce529..c50f26fd2 100644 --- 
a/spark-bigquery-connector-common/src/test/java/com/google/cloud/spark/bigquery/SparkBigQueryConfigTest.java +++ b/spark-bigquery-connector-common/src/test/java/com/google/cloud/spark/bigquery/SparkBigQueryConfigTest.java @@ -1182,4 +1182,22 @@ public void testEnableListInferenceWithDefaultIntermediateFormat() { assertThat(config.getIntermediateFormat()) .isEqualTo(SparkBigQueryConfig.IntermediateFormat.PARQUET_LIST_INFERENCE_ENABLED); } + + @Test + public void testSystemBucketAsDefaultTemporaryGcsBucket() { + Configuration hadoopConfiguration = new Configuration(); + hadoopConfiguration.set("fs.gs.system.bucket", "foo"); + SparkBigQueryConfig config = + SparkBigQueryConfig.from( + asDataSourceOptionsMap(defaultOptions), + emptyMap, // allConf + hadoopConfiguration, + emptyMap, // customDefaults + 1, + new SQLConf(), + sparkVersion, + /* schema */ Optional.empty(), + /* tableIsMandatory */ true); + assertThat(config.getTemporaryGcsBucket()).hasValue("foo"); + } }