3 changes: 3 additions & 0 deletions src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h
@@ -60,6 +60,9 @@ class IDataLakeMetadata : boost::noncopyable
virtual std::optional<size_t> totalRows(ContextPtr) const { return {}; }
virtual std::optional<size_t> totalBytes(ContextPtr) const { return {}; }

virtual std::optional<String> partitionKey(ContextPtr) const { return {}; }
virtual std::optional<String> sortingKey(ContextPtr) const { return {}; }

protected:
ObjectIterator createKeysIterator(
Strings && data_files_,
207 changes: 206 additions & 1 deletion src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp
@@ -529,6 +529,197 @@ bool IcebergMetadata::update(const ContextPtr & local_context)
return previous_snapshot_schema_id != relevant_snapshot_schema_id;
}

namespace
{

using IdToName = std::unordered_map<Int32, String>;

IdToName buildIdToNameMap(const Poco::JSON::Object::Ptr & metadata_obj)
{
IdToName map;
if (!metadata_obj || !metadata_obj->has("current-schema-id") || !metadata_obj->has("schemas"))
@Enmk (Member, Sep 9, 2025):
Which docs/sample metadata object is this implementation based on? Is it just https://iceberg.apache.org/spec/?h=schema+id#table-metadata-fields, or is there anything else?

@zvonand (Collaborator, Author):
Yes, based on docs and test.

return map;

const auto current_schema_id = metadata_obj->getValue<Int32>("current-schema-id");
auto schemas = metadata_obj->getArray("schemas");
if (!schemas)
return map;

for (size_t i = 0; i < schemas->size(); ++i)

Reviewer:
Suggestion:

    for (const auto & schema : *schemas)
    {
        if (!schema->has("schema-id"))
    ....

and the same way in the other loops over JSON::Array.

@zvonand (Collaborator, Author, Aug 19, 2025):
Does not look any better: in a range-based loop the type will be wrong and will need to be converted explicitly later, so it is no better than what is already there:

    for (const auto & v : *schemas)
    {
        const auto & schema = v.extract<Poco::JSON::Object::Ptr>();

{
auto schema = schemas->getObject(i);

if (!schema || !schema->has("schema-id") || (schema->getValue<Int32>("schema-id") != current_schema_id))
continue;

if (auto fields = schema->getArray("fields"))
{
for (size_t j = 0; j < fields->size(); ++j)
{
auto f = fields->getObject(j);
if (!f || !f->has("id") || !f->has("name"))
continue;
map.emplace(f->getValue<Int32>("id"), f->getValue<String>("name"));
}
}
break;
}
return map;
}
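
As a side note to the thread above, a minimal sketch of the two iteration styles over a Poco::JSON::Array (illustrative only; iterateSchemas is a made-up name):

    #include <Poco/JSON/Array.h>
    #include <Poco/JSON/Object.h>

    void iterateSchemas(const Poco::JSON::Array::Ptr & schemas)
    {
        // Index-based, as this PR does: getObject(i) yields a typed
        // Poco::JSON::Object::Ptr directly (null if the element is not an object).
        for (size_t i = 0; i < schemas->size(); ++i)
        {
            auto schema = schemas->getObject(i);
            if (!schema || !schema->has("schema-id"))
                continue;
            // ... use schema ...
        }

        // Range-based, as suggested in review: elements come out as
        // Poco::Dynamic::Var, so an explicit extract<>() is still required
        // (and throws Poco::BadCastException if the element is not an object).
        for (const auto & v : *schemas)
        {
            const auto & schema = v.extract<Poco::JSON::Object::Ptr>();
            if (!schema->has("schema-id"))
                continue;
            // ... use schema ...
        }
    }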

String formatTransform(
const String & transform,
const Poco::JSON::Object::Ptr & field_obj,
const IdToName & id_to_name)
{
Int32 source_id = (field_obj && field_obj->has("source-id"))
? field_obj->getValue<Int32>("source-id")
: -1;

const auto it = id_to_name.find(source_id);
const String col = (it != id_to_name.end()) ? it->second : ("col_" + toString(source_id));

String base = transform;
String param;
if (const auto lpos = transform.find('['); lpos != String::npos && transform.back() == ']')
{
base = transform.substr(0, lpos);
param = transform.substr(lpos + 1, transform.size() - lpos - 2); // strip [ and ]
}

String result;
if (base == "identity")
result = col;
else if (base == "year" || base == "month" || base == "day" || base == "hour")

Reviewer:
Am I right that these transforms can't have a param?
If that's true, then the next branch (base != "void") already does the same thing.
If it's false, then the param is lost; is that correct?

@zvonand (Collaborator, Author):
> Am I right that these types can't have param?

Yes, correct.

result = base + "(" + col + ")";
else if (base != "void")
{
if (!param.empty())
result = base + "(" + param + ", " + col + ")";
else
result = base + "(" + col + ")";
}
return result;
}
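
A small usage sketch of formatTransform (the field object and the id map are hypothetical; expected results follow from the branches above):

    // Hypothetical inputs: source-id 1 resolves to column "id".
    static void demoFormatTransform()
    {
        Poco::JSON::Object::Ptr field = new Poco::JSON::Object;
        field->set("source-id", 1);
        IdToName id_to_name{{1, "id"}};

        formatTransform("identity", field, id_to_name);   // -> "id"
        formatTransform("day", field, id_to_name);        // -> "day(id)"
        formatTransform("bucket[16]", field, id_to_name); // -> "bucket(16, id)"
        formatTransform("void", field, id_to_name);       // -> "" (callers skip empty results)
        // An unknown source-id falls back to the synthetic name "col_<id>".
    }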

Poco::JSON::Array::Ptr findActivePartitionFields(const Poco::JSON::Object::Ptr & metadata_obj)
{
if (!metadata_obj)
return nullptr;

if (metadata_obj->has("partition-spec"))
return metadata_obj->getArray("partition-spec");

// If for some reason there is no partition-spec, fall back to partition-specs + default-spec-id
if (metadata_obj->has("partition-specs") && metadata_obj->has("default-spec-id"))
{
const auto default_spec_id = metadata_obj->getValue<Int32>("default-spec-id");
if (auto specs = metadata_obj->getArray("partition-specs"))
{
for (size_t i = 0; i < specs->size(); ++i)
{
auto spec = specs->getObject(i);
if (!spec || !spec->has("spec-id"))
continue;
if (spec->getValue<Int32>("spec-id") == default_spec_id)
return spec->has("fields") ? spec->getArray("fields") : nullptr;
}
}
}

return nullptr;
}

Poco::JSON::Array::Ptr findActiveSortFields(const Poco::JSON::Object::Ptr & metadata_obj)
{
if (!metadata_obj || !metadata_obj->has("default-sort-order-id") || !metadata_obj->has("sort-orders"))
return nullptr;

const auto default_sort_order_id = metadata_obj->getValue<Int32>("default-sort-order-id");
auto orders = metadata_obj->getArray("sort-orders");
if (!orders)
return nullptr;

for (size_t i = 0; i < orders->size(); ++i)
{
auto order = orders->getObject(i);
if (!order || !order->has("order-id"))
continue;
if (order->getValue<Int32>("order-id") == default_sort_order_id)
return order->has("fields") ? order->getArray("fields") : nullptr;
}
return nullptr;
}

String composeList(
const Poco::JSON::Array::Ptr & fields,
const IdToName & id_to_name,
bool lookup_sort_modifiers)
{
if (!fields || fields->size() == 0)
return {};

Strings parts;
parts.reserve(fields->size());

for (size_t i = 0; i < fields->size(); ++i)
{
auto field = fields->getObject(i);
if (!field)
continue;

const String transform = field->has("transform") ? field->getValue<String>("transform") : "identity";
String expr = formatTransform(transform, field, id_to_name);
if (expr.empty())
continue;

if (lookup_sort_modifiers)
{
if (field->has("direction"))
{
auto d = field->getValue<String>("direction");
expr += (Poco::icompare(d, "desc") == 0) ? "DESC" : "ASC";
}
if (field->has("null-order"))
{
auto n = field->getValue<String>("null-order");
expr += (Poco::icompare(n, "nulls-last") == 0) ? "NULLS LAST" : "NULLS FIRST";
}
}

parts.push_back(std::move(expr));
}

if (parts.empty())
return {};

String res;
for (size_t i = 0; i < parts.size(); ++i)
{
if (i) res += ", ";
res += parts[i];
}
return res;
}

std::pair<std::optional<String>, std::optional<String>> extractIcebergKeys(const Poco::JSON::Object::Ptr & metadata_obj)
{
std::optional<String> partition_key;
std::optional<String> sort_key;

if (metadata_obj)
{
auto id_to_name = buildIdToNameMap(metadata_obj);

partition_key = composeList(findActivePartitionFields(metadata_obj), id_to_name, /*lookup_sort_modifiers=*/ false);
sort_key = composeList(findActiveSortFields(metadata_obj), id_to_name, /*lookup_sort_modifiers=*/ true);
}

return {partition_key, sort_key};
}

}
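
For illustration, a hedged end-to-end sketch of extractIcebergKeys on a trimmed metadata document (the JSON values are made up, following the table-metadata fields of the Iceberg spec linked in the review thread; demoExtractIcebergKeys is a made-up name):

    #include <Poco/JSON/Parser.h>

    static void demoExtractIcebergKeys()
    {
        Poco::JSON::Parser parser;
        auto metadata = parser.parse(R"json({
            "current-schema-id": 0,
            "schemas": [{"schema-id": 0, "fields": [{"id": 1, "name": "id"}, {"id": 2, "name": "ts"}]}],
            "partition-spec": [{"source-id": 1, "transform": "bucket[16]"},
                               {"source-id": 2, "transform": "day"}],
            "default-sort-order-id": 1,
            "sort-orders": [{"order-id": 1, "fields": [{"source-id": 1, "transform": "identity",
                                                        "direction": "desc", "null-order": "nulls-last"}]}]
        })json").extract<Poco::JSON::Object::Ptr>();

        auto [partition_key, sorting_key] = extractIcebergKeys(metadata);
        // partition_key -> "bucket(16, id), day(ts)"
        // sorting_key   -> "idDESCNULLS LAST"; composeList appends the sort modifiers
        // without separator spaces, which matches the integration test below.
    }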

void IcebergMetadata::updateSnapshot(ContextPtr local_context, Poco::JSON::Object::Ptr metadata_object)
{
auto configuration_ptr = configuration.lock();
@@ -563,10 +754,11 @@ void IcebergMetadata::updateSnapshot(ContextPtr local_context, Poco::JSON::Object::Ptr metadata_object)
total_bytes = summary_object->getValue<Int64>(f_total_files_size);
}

auto [partition_key, sorting_key] = extractIcebergKeys(metadata_object);
relevant_snapshot = IcebergSnapshot{
getManifestList(local_context, getProperFilePathFromMetadataInfo(
snapshot->getValue<String>(f_manifest_list), configuration_ptr->getPathForRead().path, table_location, configuration_ptr->getNamespace())),
relevant_snapshot_id, total_rows, total_bytes};
relevant_snapshot_id, total_rows, total_bytes, partition_key, sorting_key};

if (!snapshot->has(f_schema_id))
throw Exception(
@@ -1011,6 +1203,19 @@ std::optional<size_t> IcebergMetadata::totalBytes(ContextPtr local_context) const
return result;
}

std::optional<String> IcebergMetadata::partitionKey(ContextPtr) const
{
SharedLockGuard lock(mutex);
return relevant_snapshot->partition_key;
}

std::optional<String> IcebergMetadata::sortingKey(ContextPtr) const
{
SharedLockGuard lock(mutex);
return relevant_snapshot->sorting_key;
}


ObjectIterator IcebergMetadata::iterate(
const ActionsDAG * filter_dag,
FileProgressCallback callback,
src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h
@@ -80,6 +80,9 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext
std::optional<size_t> totalRows(ContextPtr local_context) const override;
std::optional<size_t> totalBytes(ContextPtr local_context) const override;

std::optional<String> partitionKey(ContextPtr) const override;
std::optional<String> sortingKey(ContextPtr) const override;

protected:
ObjectIterator iterate(
const ActionsDAG * filter_dag,
2 changes: 2 additions & 0 deletions src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h
@@ -16,6 +16,8 @@ struct IcebergSnapshot
Int64 snapshot_id;
std::optional<size_t> total_rows;
std::optional<size_t> total_bytes;
std::optional<String> partition_key;
std::optional<String> sorting_key;
};

struct IcebergHistoryRecord
54 changes: 46 additions & 8 deletions src/Storages/System/StorageSystemTables.cpp
@@ -22,6 +22,8 @@
#include <QueryPipeline/Pipe.h>
#include <QueryPipeline/QueryPipelineBuilder.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/ObjectStorage/StorageObjectStorageCluster.h>
#include <Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h>
#include <Storages/SelectQueryInfo.h>
#include <Storages/StorageView.h>
#include <Storages/System/getQueriedColumnsMaskAndHeader.h>
@@ -595,18 +597,54 @@ class TablesBlockSource : public ISource
ASTPtr expression_ptr;
if (columns_mask[src_index++])
{
if (metadata_snapshot && (expression_ptr = metadata_snapshot->getPartitionKeyAST()))
res_columns[res_index++]->insert(format({context, *expression_ptr}));
else
res_columns[res_index++]->insertDefault();
bool inserted = false;
// Extract from specific DataLake metadata if suitable
if (auto * obj = dynamic_cast<StorageObjectStorageCluster *>(table.get()))
{
if (auto * dl_meta = obj->getExternalMetadata(context))
{
if (auto p = dl_meta->partitionKey(context); p.has_value())
{
res_columns[res_index++]->insert(*p);
inserted = true;
}
}

}

if (!inserted)
{
if (metadata_snapshot && (expression_ptr = metadata_snapshot->getPartitionKeyAST()))
res_columns[res_index++]->insert(format({context, *expression_ptr}));
else
res_columns[res_index++]->insertDefault();
}
}

if (columns_mask[src_index++])
{
if (metadata_snapshot && (expression_ptr = metadata_snapshot->getSortingKey().expression_list_ast))
res_columns[res_index++]->insert(format({context, *expression_ptr}));
else
res_columns[res_index++]->insertDefault();
bool inserted = false;

// Extract from specific DataLake metadata if suitable
if (auto * obj = dynamic_cast<StorageObjectStorageCluster *>(table.get()))
{
if (auto * dl_meta = obj->getExternalMetadata(context))
{
if (auto p = dl_meta->sortingKey(context); p.has_value())
{
res_columns[res_index++]->insert(*p);
inserted = true;
}
}
}

if (!inserted)
{
if (metadata_snapshot && (expression_ptr = metadata_snapshot->getSortingKey().expression_list_ast))
res_columns[res_index++]->insert(format({context, *expression_ptr}));
else
res_columns[res_index++]->insertDefault();
}
}
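
Note: in both branches above, the data-lake path only overrides the column when getExternalMetadata returns a metadata object and partitionKey/sortingKey actually yields a value; otherwise the inserted flag stays false and the pre-existing metadata_snapshot branch runs, so regular MergeTree tables behave exactly as before.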

if (columns_mask[src_index++])
46 changes: 45 additions & 1 deletion tests/integration/test_storage_iceberg/test.py
@@ -3410,6 +3410,50 @@ def execute_spark_query(query: str):

instance.query(f"SELECT * FROM {table_select_expression} ORDER BY ALL")

@pytest.mark.parametrize("storage_type", ["s3"])
def test_system_tables_partition_sorting_keys(started_cluster, storage_type):
instance = started_cluster.instances["node1"]
spark = started_cluster.spark_session

table_name = f"test_sys_tables_keys_{storage_type}_{uuid.uuid4().hex[:8]}"
fq_table = f"spark_catalog.default.{table_name}"

spark.sql(f"DROP TABLE IF EXISTS {fq_table}")
spark.sql(f"""
CREATE TABLE {fq_table} (
id INT,
ts TIMESTAMP,
payload STRING
)
USING iceberg
PARTITIONED BY (bucket(16, id), day(ts))
TBLPROPERTIES ('format-version' = '2')
""")
spark.sql(f"ALTER TABLE {fq_table} WRITE ORDERED BY (id DESC NULLS LAST, hour(ts))")
spark.sql(f"""
INSERT INTO {fq_table} VALUES
(1, timestamp'2024-01-01 10:00:00', 'a'),
(2, timestamp'2024-01-02 11:00:00', 'b'),
(NULL, timestamp'2024-01-03 12:00:00', 'c')
""")

time.sleep(2)
default_upload_directory(
started_cluster,
storage_type,
f"/iceberg_data/default/{table_name}/",
f"/iceberg_data/default/{table_name}/",
)

create_iceberg_table(storage_type, instance, table_name, started_cluster)

res = instance.query(f"""
SELECT partition_key, sorting_key
FROM system.tables
WHERE name = '{table_name}' FORMAT csv
""").strip().lower()

assert res == '"bucket(16, id), day(ts)","iddescnulls last, hour(ts)ascnulls first"'
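
The expected string looks odd on purpose: composeList concatenates DESC/ASC and the NULLS modifier onto the expression without separator spaces, and the test lowercases the query result, hence "iddescnulls last, hour(ts)ascnulls first" (Spark records asc / nulls-first for the hour(ts) sort field).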

@pytest.mark.parametrize("storage_type", ["local", "s3"])
def test_compressed_metadata(started_cluster, storage_type):
@@ -3447,4 +3491,4 @@ def test_compressed_metadata(started_cluster, storage_type):

create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, explicit_metadata_path="")

assert instance.query(f"SELECT * FROM {TABLE_NAME} WHERE not ignore(*)") == "1\tAlice\n2\tBob\n"