Add note on using larger row group size (#8745)

twitu · alamb · web-flow · commit be8a9536e6f8 · 2024-01-09T15:35:02.000-05:00
* Add note on using larger row group size

* Nit

* prettier

* prettier

* update test

---------

Co-authored-by: Andrew Lamb &lt;andrew@nerdnetworks.org&gt;
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
@@ -350,7 +350,9 @@ config_namespace! {
         /// default parquet writer setting
         pub max_statistics_size: Option<usize>, default = None
 
-        /// Sets maximum number of rows in a row group
+        /// Target maximum number of rows in each row group (defaults to 1M
+        /// rows). Writing larger row groups requires more memory to write, but
+        /// can get better compression and be faster to read.
         pub max_row_group_size: usize, default = 1024 * 1024
 
         /// Sets "created by" property
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -242,7 +242,7 @@ datafusion.execution.parquet.dictionary_enabled NULL Sets if dictionary encoding
 datafusion.execution.parquet.dictionary_page_size_limit 1048576 Sets best effort maximum dictionary page size, in bytes
 datafusion.execution.parquet.enable_page_index true If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded.
 datafusion.execution.parquet.encoding NULL Sets default encoding for any column Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting
-datafusion.execution.parquet.max_row_group_size 1048576 Sets maximum number of rows in a row group
+datafusion.execution.parquet.max_row_group_size 1048576 Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read.
 datafusion.execution.parquet.max_statistics_size NULL Sets max statistics size for any column. If NULL, uses default parquet writer setting
 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
@@ -63,7 +63,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
 | datafusion.execution.parquet.dictionary_page_size_limit                 | 1048576                   | Sets best effort maximum dictionary page size, in bytes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
 | datafusion.execution.parquet.statistics_enabled                         | NULL                      | Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                           |
 | datafusion.execution.parquet.max_statistics_size                        | NULL                      | Sets max statistics size for any column. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-| datafusion.execution.parquet.max_row_group_size                         | 1048576                   | Sets maximum number of rows in a row group                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| datafusion.execution.parquet.max_row_group_size                         | 1048576                   | Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read.                                                                                                                                                                                                                                                                                                                                                                                                                   |
 | datafusion.execution.parquet.created_by                                 | datafusion version 34.0.0 | Sets "created by" property                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
 | datafusion.execution.parquet.column_index_truncate_length               | NULL                      | Sets column index truncate length                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
 | datafusion.execution.parquet.data_page_row_count_limit                  | 18446744073709551615      | Sets best effort maximum number of rows in data page                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
diff --git a/docs/source/user-guide/sql/write_options.md b/docs/source/user-guide/sql/write_options.md
@@ -100,21 +100,21 @@ The following options are available when writing CSV files. Note: if any unsuppo
 
 The following options are available when writing parquet files. If any unsupported option is specified an error will be raised and the query will fail. If a column specific option is specified for a column which does not exist, the option will be ignored without error. For default values, see: [Configuration Settings](https://arrow.apache.org/datafusion/user-guide/configs.html).
 
-| Option                       | Can be Column Specific? | Description                                                                                                   |
-| ---------------------------- | ----------------------- | ------------------------------------------------------------------------------------------------------------- |
-| COMPRESSION                  | Yes                     | Sets the compression codec and if applicable compression level to use                                         |
-| MAX_ROW_GROUP_SIZE           | No                      | Sets the maximum number of rows that can be encoded in a single row group                                     |
-| DATA_PAGESIZE_LIMIT          | No                      | Sets the best effort maximum page size in bytes                                                               |
-| WRITE_BATCH_SIZE             | No                      | Maximum number of rows written for each column in a single batch                                              |
-| WRITER_VERSION               | No                      | Parquet writer version (1.0 or 2.0)                                                                           |
-| DICTIONARY_PAGE_SIZE_LIMIT   | No                      | Sets best effort maximum dictionary page size in bytes                                                        |
-| CREATED_BY                   | No                      | Sets the "created by" property in the parquet file                                                            |
-| COLUMN_INDEX_TRUNCATE_LENGTH | No                      | Sets the max length of min/max value fields in the column index.                                              |
-| DATA_PAGE_ROW_COUNT_LIMIT    | No                      | Sets best effort maximum number of rows in a data page.                                                       |
-| BLOOM_FILTER_ENABLED         | Yes                     | Sets whether a bloom filter should be written into the file.                                                  |
-| ENCODING                     | Yes                     | Sets the encoding that should be used (e.g. PLAIN or RLE)                                                     |
-| DICTIONARY_ENABLED           | Yes                     | Sets if dictionary encoding is enabled. Use this instead of ENCODING to set dictionary encoding.              |
-| STATISTICS_ENABLED           | Yes                     | Sets if statistics are enabled at PAGE or ROW_GROUP level.                                                    |
-| MAX_STATISTICS_SIZE          | Yes                     | Sets the maximum size in bytes that statistics can take up.                                                   |
-| BLOOM_FILTER_FPP             | Yes                     | Sets the false positive probability (fpp) for the bloom filter. Implicitly sets BLOOM_FILTER_ENABLED to true. |
-| BLOOM_FILTER_NDV             | Yes                     | Sets the number of distinct values (ndv) for the bloom filter. Implicitly sets bloom_filter_enabled to true.  |
+| Option                       | Can be Column Specific? | Description                                                                                                                         |
+| ---------------------------- | ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
+| COMPRESSION                  | Yes                     | Sets the compression codec and if applicable compression level to use                                                               |
+| MAX_ROW_GROUP_SIZE           | No                      | Sets the maximum number of rows that can be encoded in a single row group. Larger row groups require more memory to write and read. |
+| DATA_PAGESIZE_LIMIT          | No                      | Sets the best effort maximum page size in bytes                                                                                     |
+| WRITE_BATCH_SIZE             | No                      | Maximum number of rows written for each column in a single batch                                                                    |
+| WRITER_VERSION               | No                      | Parquet writer version (1.0 or 2.0)                                                                                                 |
+| DICTIONARY_PAGE_SIZE_LIMIT   | No                      | Sets best effort maximum dictionary page size in bytes                                                                              |
+| CREATED_BY                   | No                      | Sets the "created by" property in the parquet file                                                                                  |
+| COLUMN_INDEX_TRUNCATE_LENGTH | No                      | Sets the max length of min/max value fields in the column index.                                                                    |
+| DATA_PAGE_ROW_COUNT_LIMIT    | No                      | Sets best effort maximum number of rows in a data page.                                                                             |
+| BLOOM_FILTER_ENABLED         | Yes                     | Sets whether a bloom filter should be written into the file.                                                                        |
+| ENCODING                     | Yes                     | Sets the encoding that should be used (e.g. PLAIN or RLE)                                                                           |
+| DICTIONARY_ENABLED           | Yes                     | Sets if dictionary encoding is enabled. Use this instead of ENCODING to set dictionary encoding.                                    |
+| STATISTICS_ENABLED           | Yes                     | Sets if statistics are enabled at PAGE or ROW_GROUP level.                                                                          |
+| MAX_STATISTICS_SIZE          | Yes                     | Sets the maximum size in bytes that statistics can take up.                                                                         |
+| BLOOM_FILTER_FPP             | Yes                     | Sets the false positive probability (fpp) for the bloom filter. Implicitly sets BLOOM_FILTER_ENABLED to true.                       |
+| BLOOM_FILTER_NDV             | Yes                     | Sets the number of distinct values (ndv) for the bloom filter. Implicitly sets bloom_filter_enabled to true.                        |