Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.function.Supplier;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
Expand Down Expand Up @@ -238,7 +238,7 @@
* @param input : the operator right before FS in the insert overwrite statement
* @throws HiveException
*/
private void replaceSelectOperatorProcess(SelectOperator operator, Operator<? extends OperatorDesc> input)

Check failure on line 241 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsAutoGatherContext.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this method to reduce its Cognitive Complexity from 22 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ0HS0VkBqQHq5MYh9vW&open=AZ0HS0VkBqQHq5MYh9vW&pullRequest=6374
throws HiveException {
RowSchema selRS = operator.getSchema();
List<ColumnInfo> signature = new ArrayList<>();
Expand All @@ -256,14 +256,32 @@
// |

// 1. deal with non-partition columns
Map<String, Integer> columnNameToIndex = new HashMap<>();
List<ColumnInfo> selRSSig = selRS.getSignature();
for (int i = 0; i < selRSSig.size(); i++) {
columnNameToIndex.putIfAbsent(selRSSig.get(i).getAlias(), i);
}
for (int i = 0; i < this.columns.size(); i++) {

Check warning on line 264 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsAutoGatherContext.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Reduce the total number of break and continue statements in this loop to use at most one.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ0HS0VkBqQHq5MYh9vV&open=AZ0HS0VkBqQHq5MYh9vV&pullRequest=6374
ColumnInfo col = columns.get(i);
ObjectInspector objectInspector = col.getObjectInspector();
if (objectInspector == null) {
continue;
}
boolean columnSupported = isColumnSupported(objectInspector.getCategory(), col::getType);
if (!columnSupported) {
continue;
}

Integer selRSIdx = columnNameToIndex.get(this.columns.get(i).getName());
if (selRSIdx == null) {
continue;
}
ExprNodeDesc exprNodeDesc = new ExprNodeColumnDesc(col);
colList.add(exprNodeDesc);
String internalName = selRS.getColumnNames().get(i);
String internalName = selRS.getColumnNames().get(selRSIdx);
columnNames.add(internalName);
columnExprMap.put(internalName, exprNodeDesc);
signature.add(selRS.getSignature().get(i));
signature.add(selRSSig.get(selRSIdx));
}
// if there is any partition column (in static partition or dynamic
// partition or mixed case)
Expand All @@ -280,7 +298,7 @@
}
exprNodeDesc = new ExprNodeConstantDesc(partSpec.get(partColName));
TypeInfo srcType = exprNodeDesc.getTypeInfo();
TypeInfo destType = selRS.getSignature().get(this.columns.size() + i).getType();
TypeInfo destType = selRSSig.get(this.columns.size() + i).getType();
if (!srcType.equals(destType)) {
// This may be possible when srcType is string but destType is integer
exprNodeDesc = ExprNodeTypeCheck.getExprNodeDefaultExprProcessor()
Expand All @@ -292,7 +310,7 @@
dynamicPartBegin++;
ColumnInfo col = columns.get(this.columns.size() + dynamicPartBegin);
TypeInfo srcType = col.getType();
TypeInfo destType = selRS.getSignature().get(this.columns.size() + i).getType();
TypeInfo destType = selRSSig.get(this.columns.size() + i).getType();
exprNodeDesc = new ExprNodeColumnDesc(col);
if (!srcType.equals(destType)) {
exprNodeDesc = ExprNodeTypeCheck.getExprNodeDefaultExprProcessor()
Expand All @@ -303,7 +321,7 @@
String internalName = selRS.getColumnNames().get(this.columns.size() + i);
columnNames.add(internalName);
columnExprMap.put(internalName, exprNodeDesc);
signature.add(selRS.getSignature().get(this.columns.size() + i));
signature.add(selRSSig.get(this.columns.size() + i));
}
operator.setConf(new SelectDesc(colList, columnNames));
operator.setColumnExprMap(columnExprMap);
Expand All @@ -319,36 +337,61 @@
return isInsertInto;
}

public static boolean canRunAutogatherStats(Operator curr) {
/**
 * Returns whether a column with the given inspector category and type can have
 * column statistics computed for it.
 *
 * <p>Only primitive categories are eligible. The {@link TypeInfo} is obtained lazily
 * through the supplier so callers with non-primitive columns never need to build one.
 *
 * @param category the ObjectInspector category of the column
 * @param typeInfoSupplier lazily supplies the column's TypeInfo; only invoked when
 *        {@code category} is PRIMITIVE
 * @return true if column statistics can be gathered for this column type
 */
public static boolean isColumnSupported(ObjectInspector.Category category, Supplier<TypeInfo> typeInfoSupplier) {
  if (category != ObjectInspector.Category.PRIMITIVE) {
    // Complex categories (struct, map, list, union) have no column stats support.
    return false;
  }
  PrimitiveTypeInfo primitiveType = (PrimitiveTypeInfo) typeInfoSupplier.get();
  switch (primitiveType.getPrimitiveCategory()) {
    case BOOLEAN:
    case BYTE:
    case SHORT:
    case INT:
    case LONG:
    case TIMESTAMP:
    case FLOAT:
    case DOUBLE:
    case STRING:
    case CHAR:
    case VARCHAR:
    case BINARY:
    case DECIMAL:
    case DATE:
      return true;
    default:
      // Any other primitive category is not supported for column statistics.
      return false;
  }
}

public static boolean canRunAutogatherStats(Table destinationTable, Operator curr) {

Check warning on line 366 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsAutoGatherContext.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Provide the parametrized type for this generic.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ0Lpa1D1t5mclAij6gI&open=AZ0Lpa1D1t5mclAij6gI&pullRequest=6374
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add the parametrized type for the Operator parameters during the next update of the PR.

if (destinationTable.isNonNative() && destinationTable.getStorageHandler().supportsPartitioning()) {
// On partitioned tables, the partition key is needed to store the stats.
// However, external tables (e.g. stored by iceberg) may not define partition keys,
// i.e., org.apache.hadoop.hive.ql.metadata.Table.getPartitionKeys() returns null.
// So keep the same behavior as before HIVE-29432, and only run stats autogather if all columns are supported.
return areAllColumnsSupported(curr);
}
return isAnyColumnSupported(curr);
}

private static boolean areAllColumnsSupported(Operator curr) {

Check warning on line 377 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsAutoGatherContext.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Provide the parametrized type for this generic.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ0Lpa1D1t5mclAij6gJ&open=AZ0Lpa1D1t5mclAij6gJ&pullRequest=6374
// check the ObjectInspector
for (ColumnInfo cinfo : curr.getSchema().getSignature()) {
if (cinfo.getIsVirtualCol()) {
return false;
} else if (cinfo.getObjectInspector().getCategory() != ObjectInspector.Category.PRIMITIVE) {
if (cinfo.getIsVirtualCol() || !isColumnSupported(cinfo.getObjectInspector().getCategory(), cinfo::getType)) {
return false;
} else {
switch (((PrimitiveTypeInfo) cinfo.getType()).getPrimitiveCategory()) {
case BOOLEAN:
case BYTE:
case SHORT:
case INT:
case LONG:
case TIMESTAMP:
case FLOAT:
case DOUBLE:
case STRING:
case CHAR:
case VARCHAR:
case BINARY:
case DECIMAL:
case DATE:
break;
default:
return false;
}
}
}
return true;
}

private static boolean isAnyColumnSupported(Operator curr) {

Check warning on line 387 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsAutoGatherContext.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Provide the parametrized type for this generic.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ0Lpa1D1t5mclAij6gK&open=AZ0Lpa1D1t5mclAij6gK&pullRequest=6374
// check the ObjectInspector
for (ColumnInfo cinfo : curr.getSchema().getSignature()) {
if (!cinfo.getIsVirtualCol() && isColumnSupported(cinfo.getObjectInspector().getCategory(), cinfo::getType)) {
return true;
}
}
return false;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
import org.apache.hadoop.hive.ql.stats.ColStatsProcessor.ColumnStatsField;
import org.apache.hadoop.hive.ql.stats.ColStatsProcessor.ColumnStatsType;
import org.apache.hadoop.hive.ql.stats.StatsUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
Expand Down Expand Up @@ -103,11 +102,27 @@ private boolean shouldRewrite(ASTNode tree) {
return rwt;
}

/**
 * Collects the names of the table's columns whose types support column statistics;
 * columns with unsupported types (e.g. complex types) are filtered out.
 *
 * @param tbl the table whose columns are examined
 * @return names of the columns eligible for column statistics
 */
private static List<String> getColumnNames(Table tbl) {
  List<String> statsColumns = new ArrayList<>();
  for (FieldSchema field : tbl.getCols()) {
    // Parse the declared type string once; the supplier hands the parsed TypeInfo
    // to the support check only when the category is primitive.
    TypeInfo fieldType = TypeInfoUtils.getTypeInfoFromTypeString(field.getType());
    if (!ColumnStatsAutoGatherContext.isColumnSupported(fieldType.getCategory(), () -> fieldType)) {
      continue;
    }
    statsColumns.add(field.getName());
  }
  return statsColumns;
}

private List<String> getColumnName(ASTNode tree) throws SemanticException {

switch (tree.getChildCount()) {
case 2:
return Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
return getColumnNames(tbl);
case 3:
int numCols = tree.getChild(2).getChildCount();
List<String> colName = new ArrayList<>(numCols);
Expand Down Expand Up @@ -212,7 +227,8 @@ protected static List<String> getColumnTypes(Table tbl, List<String> colNames) {
if (colName.equalsIgnoreCase(col.getName())) {
String type = col.getType();
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type);
if (typeInfo.getCategory() != ObjectInspector.Category.PRIMITIVE) {
boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo);
if (!isSupported) {
logTypeWarning(colName, type);
colNames.remove(colName);
} else {
Expand Down Expand Up @@ -241,7 +257,7 @@ private String genRewrittenQuery(List<String> colNames, List<String> colTypes, H
protected static String genRewrittenQuery(Table tbl,
HiveConf conf, List<TransformSpec> partTransformSpec, Map<String, String> partSpec,
boolean isPartitionStats) {
List<String> colNames = Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
List<String> colNames = getColumnNames(tbl);
List<String> colTypes = ColumnStatsSemanticAnalyzer.getColumnTypes(tbl, colNames);
return ColumnStatsSemanticAnalyzer.genRewrittenQuery(
tbl, colNames, colTypes, conf, partTransformSpec, -1, partSpec, isPartitionStats, true);
Expand Down Expand Up @@ -733,7 +749,7 @@ static AnalyzeRewriteContext genAnalyzeRewriteContext(HiveConf conf, Table tbl)
AnalyzeRewriteContext analyzeRewrite = new AnalyzeRewriteContext();
analyzeRewrite.setTableName(tbl.getFullyQualifiedName());
analyzeRewrite.setTblLvl(!(conf.getBoolVar(ConfVars.HIVE_STATS_COLLECT_PART_LEVEL_STATS) && tbl.isPartitioned()));
List<String> colNames = Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
List<String> colNames = getColumnNames(tbl);
List<String> colTypes = getColumnTypes(tbl, colNames);
analyzeRewrite.setColName(colNames);
analyzeRewrite.setColType(colTypes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8260,8 +8260,9 @@ protected Operator genFileSinkPlan(String dest, QB qb, Operator input)
&& enableColumnStatsCollecting()
&& destinationTable != null
&& (!destinationTable.isNonNative() || destinationTable.getStorageHandler().commitInMoveTask())
&& !destTableIsTemporary && !destTableIsMaterialization
&& ColumnStatsAutoGatherContext.canRunAutogatherStats(fso)) {
&& !destTableIsTemporary
&& !destTableIsMaterialization
&& ColumnStatsAutoGatherContext.canRunAutogatherStats(destinationTable, fso)) {
if (destType == QBMetaData.DEST_TABLE) {
genAutoColumnStatsGatheringPipeline(destinationTable, partSpec, input,
qb.getParseInfo().isInsertIntoTable(destinationTable.getDbName(), destinationTable.getTableName(),
Expand Down
11 changes: 11 additions & 0 deletions ql/src/test/queries/clientpositive/stats_unsupported_type.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
set hive.stats.autogather=true;
set hive.stats.column.autogather=true;

-- create a table with a type that does not support column stats autogather
-- the columns before and after that column should still obtain statistics
CREATE TABLE test_stats1 (a int, b uniontype<int, string>, c int) STORED AS TEXTFILE;
INSERT INTO test_stats1 (a, b, c) VALUES (1, create_union(0, 2, ""), 3);
DESCRIBE FORMATTED test_stats1 a;
DESCRIBE FORMATTED test_stats1 b;
DESCRIBE FORMATTED test_stats1 c;

Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Avro could not vali
Caused by: Avro could not validate record against schema (record = {"id": 3, "value": null}) (schema = {"type":"record","name":"nullable","fields":[{"name":"id","type":"int"},{"name":"value","type":["int","double"]}]})
#### A masked pattern was here ####
]], Vertex did not succeed due to OWN_TASK_FAILURE, failedTasks:1 killedTasks:0, Vertex vertex_#ID# [Map 1] killed/failed due to:OWN_TASK_FAILURE]
DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:0
[Masked Vertex killed due to OTHER_VERTEX_FAILURE]
DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:1
FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.tez.TezTask. Vertex failed, vertexName=Map 1, vertexId=vertex_#ID#, diagnostics=[Task failed, taskId=task_#ID#, diagnostics=[TaskAttempt 0 failed, info=[Error: Error while running task ( failure ) : attempt_#ID#:java.lang.RuntimeException: java.lang.RuntimeException: Hive Runtime Error while closing operators
#### A masked pattern was here ####
Caused by: java.lang.RuntimeException: Hive Runtime Error while closing operators
Expand All @@ -73,4 +74,4 @@ Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Avro could not vali
#### A masked pattern was here ####
Caused by: Avro could not validate record against schema (record = {"id": 3, "value": null}) (schema = {"type":"record","name":"nullable","fields":[{"name":"id","type":"int"},{"name":"value","type":["int","double"]}]})
#### A masked pattern was here ####
]], Vertex did not succeed due to OWN_TASK_FAILURE, failedTasks:1 killedTasks:0, Vertex vertex_#ID# [Map 1] killed/failed due to:OWN_TASK_FAILURE]DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:0
]], Vertex did not succeed due to OWN_TASK_FAILURE, failedTasks:1 killedTasks:0, Vertex vertex_#ID# [Map 1] killed/failed due to:OWN_TASK_FAILURE][Masked Vertex killed due to OTHER_VERTEX_FAILURE]DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:1
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ Retention: 0
#### A masked pattern was here ####
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"_c2\":\"true\"}}
bucketing_version 2
numFiles 1
numRows 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ STAGE PLANS:
#### A masked pattern was here ####
Edges:
Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
Expand Down Expand Up @@ -64,6 +65,40 @@ STAGE PLANS:
output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
name: default.columnarserde_create_shortcut
Select Operator
expressions: _col3 (type: int), _col4 (type: string)
outputColumnNames: d, e
Statistics: Num rows: 11 Data size: 34628 Basic stats: COMPLETE Column stats: NONE
Group By Operator
aggregations: min(d), max(d), count(1), count(d), compute_bit_vector_hll(d), max(length(e)), avg(COALESCE(length(e),0)), count(e), compute_bit_vector_hll(e)
minReductionHashAggr: 0.99
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
null sort order:
sort order:
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary), _col5 (type: int), _col6 (type: struct<count:bigint,sum:double,input:int>), _col7 (type: bigint), _col8 (type: binary)
Reducer 3
Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
aggregations: min(VALUE._col0), max(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4), max(VALUE._col5), avg(VALUE._col6), count(VALUE._col7), compute_bit_vector_hll(VALUE._col8)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: 'LONG' (type: string), UDFToLong(_col0) (type: bigint), UDFToLong(_col1) (type: bigint), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col5,0)) (type: bigint), COALESCE(_col6,0) (type: double), (_col2 - _col7) (type: bigint), COALESCE(ndv_compute_bit_vector(_col8),0) (type: bigint), _col8 (type: binary)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

Stage: Stage-2
Dependency Collection
Expand All @@ -81,6 +116,10 @@ STAGE PLANS:
Stage: Stage-3
Stats Work
Basic Stats Work:
Column Stats Desc:
Columns: d, e
Column Types: int, string
Table: default.columnarserde_create_shortcut

PREHOOK: query: FROM src_thrift
INSERT OVERWRITE TABLE columnarserde_create_shortcut SELECT src_thrift.lint, src_thrift.lstring, src_thrift.mstringstring, src_thrift.aint, src_thrift.astring DISTRIBUTE BY 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ Retention: 0
#### A masked pattern was here ####
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c_primitive\":\"true\"}}
bucketing_version 2
numFiles 0
numRows 0
Expand Down Expand Up @@ -92,7 +92,7 @@ Retention: 0
#### A masked pattern was here ####
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c_primitive\":\"true\"}}
bucketing_version 2
numFiles 0
numRows 0
Expand Down Expand Up @@ -161,7 +161,7 @@ Retention: 0
#### A masked pattern was here ####
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\"}}
bucketing_version 2
numFiles 0
numRows 0
Expand Down
Loading
Loading