Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.function.Supplier;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
Expand Down Expand Up @@ -238,7 +238,7 @@
* @param input : the operator right before FS in the insert overwrite statement
* @throws HiveException
*/
private void replaceSelectOperatorProcess(SelectOperator operator, Operator<? extends OperatorDesc> input)

Check failure on line 241 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsAutoGatherContext.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this method to reduce its Cognitive Complexity from 22 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ0HS0VkBqQHq5MYh9vW&open=AZ0HS0VkBqQHq5MYh9vW&pullRequest=6374
throws HiveException {
RowSchema selRS = operator.getSchema();
List<ColumnInfo> signature = new ArrayList<>();
Expand All @@ -256,14 +256,32 @@
// |

// 1. deal with non-partition columns
Map<String, Integer> columnNameToIndex = new HashMap<>();
List<ColumnInfo> selRSSig = selRS.getSignature();
for (int i = 0; i < selRSSig.size(); i++) {
columnNameToIndex.putIfAbsent(selRSSig.get(i).getAlias(), i);
}
for (int i = 0; i < this.columns.size(); i++) {

Check warning on line 264 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsAutoGatherContext.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Reduce the total number of break and continue statements in this loop to use at most one.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ0HS0VkBqQHq5MYh9vV&open=AZ0HS0VkBqQHq5MYh9vV&pullRequest=6374
ColumnInfo col = columns.get(i);
ObjectInspector objectInspector = col.getObjectInspector();
if (objectInspector == null) {
continue;
}
boolean columnSupported = isColumnSupported(objectInspector.getCategory(), col::getType);
if (!columnSupported) {
continue;
}

Integer selRSIdx = columnNameToIndex.get(this.columns.get(i).getName());
if (selRSIdx == null) {
continue;
}
ExprNodeDesc exprNodeDesc = new ExprNodeColumnDesc(col);
colList.add(exprNodeDesc);
String internalName = selRS.getColumnNames().get(i);
String internalName = selRS.getColumnNames().get(selRSIdx);
columnNames.add(internalName);
columnExprMap.put(internalName, exprNodeDesc);
signature.add(selRS.getSignature().get(i));
signature.add(selRSSig.get(selRSIdx));
}
// if there is any partition column (in static partition or dynamic
// partition or mixed case)
Expand All @@ -280,7 +298,7 @@
}
exprNodeDesc = new ExprNodeConstantDesc(partSpec.get(partColName));
TypeInfo srcType = exprNodeDesc.getTypeInfo();
TypeInfo destType = selRS.getSignature().get(this.columns.size() + i).getType();
TypeInfo destType = selRSSig.get(this.columns.size() + i).getType();
if (!srcType.equals(destType)) {
// This may be possible when srcType is string but destType is integer
exprNodeDesc = ExprNodeTypeCheck.getExprNodeDefaultExprProcessor()
Expand All @@ -292,7 +310,7 @@
dynamicPartBegin++;
ColumnInfo col = columns.get(this.columns.size() + dynamicPartBegin);
TypeInfo srcType = col.getType();
TypeInfo destType = selRS.getSignature().get(this.columns.size() + i).getType();
TypeInfo destType = selRSSig.get(this.columns.size() + i).getType();
exprNodeDesc = new ExprNodeColumnDesc(col);
if (!srcType.equals(destType)) {
exprNodeDesc = ExprNodeTypeCheck.getExprNodeDefaultExprProcessor()
Expand All @@ -303,7 +321,7 @@
String internalName = selRS.getColumnNames().get(this.columns.size() + i);
columnNames.add(internalName);
columnExprMap.put(internalName, exprNodeDesc);
signature.add(selRS.getSignature().get(this.columns.size() + i));
signature.add(selRSSig.get(this.columns.size() + i));
}
operator.setConf(new SelectDesc(colList, columnNames));
operator.setColumnExprMap(columnExprMap);
Expand All @@ -319,36 +337,61 @@
return isInsertInto;
}

public static boolean canRunAutogatherStats(Operator curr) {
/**
 * Returns whether a column with the given inspector category and type can have
 * column statistics computed for it.
 *
 * <p>Only primitive categories are eligible. The {@link TypeInfo} is obtained lazily
 * through the supplier so callers with non-primitive columns never need to build one.
 *
 * @param category the ObjectInspector category of the column
 * @param typeInfoSupplier lazily supplies the column's TypeInfo; only invoked when
 *        {@code category} is PRIMITIVE
 * @return true if column statistics can be gathered for this column type
 */
public static boolean isColumnSupported(ObjectInspector.Category category, Supplier<TypeInfo> typeInfoSupplier) {
  if (category != ObjectInspector.Category.PRIMITIVE) {
    // Complex categories (struct, map, list, union) have no column stats support.
    return false;
  }
  PrimitiveTypeInfo primitiveType = (PrimitiveTypeInfo) typeInfoSupplier.get();
  switch (primitiveType.getPrimitiveCategory()) {
    case BOOLEAN:
    case BYTE:
    case SHORT:
    case INT:
    case LONG:
    case TIMESTAMP:
    case FLOAT:
    case DOUBLE:
    case STRING:
    case CHAR:
    case VARCHAR:
    case BINARY:
    case DECIMAL:
    case DATE:
      return true;
    default:
      // Any other primitive category is not supported for column statistics.
      return false;
  }
}

public static boolean canRunAutogatherStats(Table destinationTable, Operator curr) {

Check warning on line 366 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsAutoGatherContext.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Provide the parametrized type for this generic.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ0Lpa1D1t5mclAij6gI&open=AZ0Lpa1D1t5mclAij6gI&pullRequest=6374
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add the parametrized type for the Operator parameters during the next update of the PR.

if (destinationTable.isNonNative() && destinationTable.getStorageHandler().supportsPartitioning()) {
// On partitioned tables, the partition key is needed to store the stats.
// However, external tables (e.g. stored by iceberg) may not define partition keys,
// i.e., org.apache.hadoop.hive.ql.metadata.Table.getPartitionKeys() returns null.
// So keep the same behavior as before HIVE-29432, and only run stats autogather if all columns are supported.
return areAllColumnsSupported(curr);
}
return isAnyColumnSupported(curr);
}

private static boolean areAllColumnsSupported(Operator curr) {

Check warning on line 377 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsAutoGatherContext.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Provide the parametrized type for this generic.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ0Lpa1D1t5mclAij6gJ&open=AZ0Lpa1D1t5mclAij6gJ&pullRequest=6374
// check the ObjectInspector
for (ColumnInfo cinfo : curr.getSchema().getSignature()) {
if (cinfo.getIsVirtualCol()) {
return false;
} else if (cinfo.getObjectInspector().getCategory() != ObjectInspector.Category.PRIMITIVE) {
if (cinfo.getIsVirtualCol() || !isColumnSupported(cinfo.getObjectInspector().getCategory(), cinfo::getType)) {
return false;
} else {
switch (((PrimitiveTypeInfo) cinfo.getType()).getPrimitiveCategory()) {
case BOOLEAN:
case BYTE:
case SHORT:
case INT:
case LONG:
case TIMESTAMP:
case FLOAT:
case DOUBLE:
case STRING:
case CHAR:
case VARCHAR:
case BINARY:
case DECIMAL:
case DATE:
break;
default:
return false;
}
}
}
return true;
}

private static boolean isAnyColumnSupported(Operator curr) {

Check warning on line 387 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsAutoGatherContext.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Provide the parametrized type for this generic.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ0Lpa1D1t5mclAij6gK&open=AZ0Lpa1D1t5mclAij6gK&pullRequest=6374
// check the ObjectInspector
for (ColumnInfo cinfo : curr.getSchema().getSignature()) {
if (!cinfo.getIsVirtualCol() && isColumnSupported(cinfo.getObjectInspector().getCategory(), cinfo::getType)) {
return true;
}
}
return false;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
import org.apache.hadoop.hive.ql.stats.ColStatsProcessor.ColumnStatsField;
import org.apache.hadoop.hive.ql.stats.ColStatsProcessor.ColumnStatsType;
import org.apache.hadoop.hive.ql.stats.StatsUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
Expand Down Expand Up @@ -103,11 +102,27 @@ private boolean shouldRewrite(ASTNode tree) {
return rwt;
}

/**
 * Collects the names of the table's columns whose types support column statistics;
 * columns with unsupported types (e.g. complex types) are filtered out.
 *
 * @param tbl the table whose columns are examined
 * @return names of the columns eligible for column statistics
 */
private static List<String> getColumnNames(Table tbl) {
  List<String> statsColumns = new ArrayList<>();
  for (FieldSchema field : tbl.getCols()) {
    // Parse the declared type string once; the supplier hands the parsed TypeInfo
    // to the support check only when the category is primitive.
    TypeInfo fieldType = TypeInfoUtils.getTypeInfoFromTypeString(field.getType());
    if (!ColumnStatsAutoGatherContext.isColumnSupported(fieldType.getCategory(), () -> fieldType)) {
      continue;
    }
    statsColumns.add(field.getName());
  }
  return statsColumns;
}

private List<String> getColumnName(ASTNode tree) throws SemanticException {

switch (tree.getChildCount()) {
case 2:
return Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
return getColumnNames(tbl);
case 3:
int numCols = tree.getChild(2).getChildCount();
List<String> colName = new ArrayList<>(numCols);
Expand Down Expand Up @@ -212,7 +227,8 @@ protected static List<String> getColumnTypes(Table tbl, List<String> colNames) {
if (colName.equalsIgnoreCase(col.getName())) {
String type = col.getType();
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type);
if (typeInfo.getCategory() != ObjectInspector.Category.PRIMITIVE) {
boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo);
if (!isSupported) {
logTypeWarning(colName, type);
colNames.remove(colName);
} else {
Expand Down Expand Up @@ -241,7 +257,7 @@ private String genRewrittenQuery(List<String> colNames, List<String> colTypes, H
protected static String genRewrittenQuery(Table tbl,
HiveConf conf, List<TransformSpec> partTransformSpec, Map<String, String> partSpec,
boolean isPartitionStats) {
List<String> colNames = Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
List<String> colNames = getColumnNames(tbl);
List<String> colTypes = ColumnStatsSemanticAnalyzer.getColumnTypes(tbl, colNames);
return ColumnStatsSemanticAnalyzer.genRewrittenQuery(
tbl, colNames, colTypes, conf, partTransformSpec, -1, partSpec, isPartitionStats, true);
Expand Down Expand Up @@ -733,7 +749,7 @@ static AnalyzeRewriteContext genAnalyzeRewriteContext(HiveConf conf, Table tbl)
AnalyzeRewriteContext analyzeRewrite = new AnalyzeRewriteContext();
analyzeRewrite.setTableName(tbl.getFullyQualifiedName());
analyzeRewrite.setTblLvl(!(conf.getBoolVar(ConfVars.HIVE_STATS_COLLECT_PART_LEVEL_STATS) && tbl.isPartitioned()));
List<String> colNames = Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
List<String> colNames = getColumnNames(tbl);
List<String> colTypes = getColumnTypes(tbl, colNames);
analyzeRewrite.setColName(colNames);
analyzeRewrite.setColType(colTypes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8260,8 +8260,9 @@ protected Operator genFileSinkPlan(String dest, QB qb, Operator input)
&& enableColumnStatsCollecting()
&& destinationTable != null
&& (!destinationTable.isNonNative() || destinationTable.getStorageHandler().commitInMoveTask())
&& !destTableIsTemporary && !destTableIsMaterialization
&& ColumnStatsAutoGatherContext.canRunAutogatherStats(fso)) {
&& !destTableIsTemporary
&& !destTableIsMaterialization
&& ColumnStatsAutoGatherContext.canRunAutogatherStats(destinationTable, fso)) {
if (destType == QBMetaData.DEST_TABLE) {
genAutoColumnStatsGatheringPipeline(destinationTable, partSpec, input,
qb.getParseInfo().isInsertIntoTable(destinationTable.getDbName(), destinationTable.getTableName(),
Expand Down
11 changes: 11 additions & 0 deletions ql/src/test/queries/clientpositive/stats_unsupported_type.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
set hive.stats.autogather=true;
set hive.stats.column.autogather=true;

-- create a table with a type that does not support column stats autogather
-- the columns before and after that column should still obtain statistics
CREATE TABLE test_stats1 (a int, b uniontype<int, string>, c int) STORED AS TEXTFILE;
INSERT INTO test_stats1 (a, b, c) VALUES (1, create_union(0, 2, ""), 3);
DESCRIBE FORMATTED test_stats1 a;
DESCRIBE FORMATTED test_stats1 b;
DESCRIBE FORMATTED test_stats1 c;

Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Avro could not vali
Caused by: Avro could not validate record against schema (record = {"id": 3, "value": null}) (schema = {"type":"record","name":"nullable","fields":[{"name":"id","type":"int"},{"name":"value","type":["int","double"]}]})
#### A masked pattern was here ####
]], Vertex did not succeed due to OWN_TASK_FAILURE, failedTasks:1 killedTasks:0, Vertex vertex_#ID# [Map 1] killed/failed due to:OWN_TASK_FAILURE]
DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:0
[Masked Vertex killed due to OTHER_VERTEX_FAILURE]
DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:1
FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.tez.TezTask. Vertex failed, vertexName=Map 1, vertexId=vertex_#ID#, diagnostics=[Task failed, taskId=task_#ID#, diagnostics=[TaskAttempt 0 failed, info=[Error: Error while running task ( failure ) : attempt_#ID#:java.lang.RuntimeException: java.lang.RuntimeException: Hive Runtime Error while closing operators
#### A masked pattern was here ####
Caused by: java.lang.RuntimeException: Hive Runtime Error while closing operators
Expand All @@ -73,4 +74,4 @@ Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Avro could not vali
#### A masked pattern was here ####
Caused by: Avro could not validate record against schema (record = {"id": 3, "value": null}) (schema = {"type":"record","name":"nullable","fields":[{"name":"id","type":"int"},{"name":"value","type":["int","double"]}]})
#### A masked pattern was here ####
]], Vertex did not succeed due to OWN_TASK_FAILURE, failedTasks:1 killedTasks:0, Vertex vertex_#ID# [Map 1] killed/failed due to:OWN_TASK_FAILURE]DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:0
]], Vertex did not succeed due to OWN_TASK_FAILURE, failedTasks:1 killedTasks:0, Vertex vertex_#ID# [Map 1] killed/failed due to:OWN_TASK_FAILURE][Masked Vertex killed due to OTHER_VERTEX_FAILURE]DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:1
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ Retention: 0
#### A masked pattern was here ####
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"_c2\":\"true\"}}
bucketing_version 2
numFiles 1
numRows 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ STAGE PLANS:
#### A masked pattern was here ####
Edges:
Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
Expand Down Expand Up @@ -64,6 +65,40 @@ STAGE PLANS:
output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
name: default.columnarserde_create_shortcut
Select Operator
expressions: _col3 (type: int), _col4 (type: string)
outputColumnNames: d, e
Statistics: Num rows: 11 Data size: 34628 Basic stats: COMPLETE Column stats: NONE
Group By Operator
aggregations: min(d), max(d), count(1), count(d), compute_bit_vector_hll(d), max(length(e)), avg(COALESCE(length(e),0)), count(e), compute_bit_vector_hll(e)
minReductionHashAggr: 0.99
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
null sort order:
sort order:
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary), _col5 (type: int), _col6 (type: struct<count:bigint,sum:double,input:int>), _col7 (type: bigint), _col8 (type: binary)
Reducer 3
Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
aggregations: min(VALUE._col0), max(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4), max(VALUE._col5), avg(VALUE._col6), count(VALUE._col7), compute_bit_vector_hll(VALUE._col8)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: 'LONG' (type: string), UDFToLong(_col0) (type: bigint), UDFToLong(_col1) (type: bigint), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col5,0)) (type: bigint), COALESCE(_col6,0) (type: double), (_col2 - _col7) (type: bigint), COALESCE(ndv_compute_bit_vector(_col8),0) (type: bigint), _col8 (type: binary)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

Stage: Stage-2
Dependency Collection
Expand All @@ -81,6 +116,10 @@ STAGE PLANS:
Stage: Stage-3
Stats Work
Basic Stats Work:
Column Stats Desc:
Columns: d, e
Column Types: int, string
Table: default.columnarserde_create_shortcut

PREHOOK: query: FROM src_thrift
INSERT OVERWRITE TABLE columnarserde_create_shortcut SELECT src_thrift.lint, src_thrift.lstring, src_thrift.mstringstring, src_thrift.aint, src_thrift.astring DISTRIBUTE BY 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ Retention: 0
#### A masked pattern was here ####
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c_primitive\":\"true\"}}
bucketing_version 2
numFiles 0
numRows 0
Expand Down Expand Up @@ -92,7 +92,7 @@ Retention: 0
#### A masked pattern was here ####
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c_primitive\":\"true\"}}
bucketing_version 2
numFiles 0
numRows 0
Expand Down Expand Up @@ -161,7 +161,7 @@ Retention: 0
#### A masked pattern was here ####
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\"}}
bucketing_version 2
numFiles 0
numRows 0
Expand Down
Loading
Loading