Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ set(ICEBERG_SOURCES
partition_field.cc
partition_spec.cc
partition_summary.cc
puffin/blob.cc
puffin/blob_metadata.cc
puffin/file_metadata.cc
puffin/puffin_compression_codec.cc
row/arrow_array_wrapper.cc
row/manifest_wrapper.cc
row/partition_values.cc
Expand Down Expand Up @@ -166,6 +170,7 @@ add_subdirectory(catalog)
add_subdirectory(data)
add_subdirectory(expression)
add_subdirectory(manifest)
add_subdirectory(puffin)
add_subdirectory(row)
add_subdirectory(update)
add_subdirectory(util)
Expand Down
5 changes: 5 additions & 0 deletions src/iceberg/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ iceberg_sources = files(
'partition_field.cc',
'partition_spec.cc',
'partition_summary.cc',
'puffin/blob.cc',
'puffin/blob_metadata.cc',
'puffin/file_metadata.cc',
'puffin/puffin_compression_codec.cc',
'row/arrow_array_wrapper.cc',
'row/manifest_wrapper.cc',
'row/partition_values.cc',
Expand Down Expand Up @@ -221,6 +225,7 @@ install_headers(
subdir('catalog')
subdir('expression')
subdir('manifest')
subdir('puffin')
subdir('row')
subdir('update')
subdir('util')
Expand Down
18 changes: 18 additions & 0 deletions src/iceberg/puffin/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Register this directory's public headers for installation through the
# project's iceberg_install_all_headers() helper.
# NOTE(review): the install destination is presumably derived from the
# "iceberg/puffin" argument -- confirm against the helper's definition.
iceberg_install_all_headers(iceberg/puffin)
46 changes: 46 additions & 0 deletions src/iceberg/puffin/blob.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "iceberg/puffin/blob.h"

#include <format>

#include "iceberg/util/formatter_internal.h"

namespace iceberg::puffin {

/// \brief Render a Blob as a single-line, human-readable string.
///
/// The payload bytes are never printed; only their count is reported as
/// `dataSize`. `requestedCompression` and `properties` are appended only when
/// the codec is set / the map is non-empty.
std::string ToString(const Blob& blob) {
std::string repr = "Blob[";
// Fields that are always present, emitted in declaration order.
std::format_to(std::back_inserter(repr), "type='{}',inputFields={},", blob.type,
blob.input_fields);
std::format_to(std::back_inserter(repr), "snapshotId={},sequenceNumber={},",
blob.snapshot_id, blob.sequence_number);
// Summarize the payload by its size instead of dumping the raw bytes.
std::format_to(std::back_inserter(repr), "dataSize={}", blob.data.size());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: may be add a checksum if available for fast comparison(I do understand checksum is not part of the puffin spec).

Copy link
Author

@zhaoxuan1994 zhaoxuan1994 Mar 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's an interesting idea! Currently Blob doesn't carry a checksum field (the Puffin spec doesn't include one either), so there's nothing available to display here. For equality comparison, Blob already has operator==. If we want to add checksum support down the road, it would probably be a separate discussion about the struct design itself — happy to explore that if there's a use case!

// Optional parts: each one carries its own leading comma so that the
// mandatory part above never ends with a dangling separator.
if (blob.requested_compression.has_value()) {
// Fully qualified to select the codec overload of ToString explicitly.
std::format_to(std::back_inserter(repr), ",requestedCompression={}",
iceberg::puffin::ToString(*blob.requested_compression));
}
if (!blob.properties.empty()) {
std::format_to(std::back_inserter(repr), ",properties={}", blob.properties);
}
std::format_to(std::back_inserter(repr), "]");
return repr;
}

} // namespace iceberg::puffin
64 changes: 64 additions & 0 deletions src/iceberg/puffin/blob.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

/// \file iceberg/puffin/blob.h
/// Blob data structure for Puffin files.

#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>

#include "iceberg/iceberg_export.h"
#include "iceberg/puffin/puffin_compression_codec.h"

namespace iceberg::puffin {

/// \brief A blob to be written to a Puffin file.
///
/// This represents the uncompressed blob data along with its metadata.
/// The actual compression is handled during writing.
///
/// The integer members carry a zero default initializer, so a
/// default-initialized instance (`Blob b;`) holds the same values as a
/// value-initialized one (`Blob{}`) and can safely be compared or printed.
/// The struct remains an aggregate; brace and designated initialization are
/// unaffected.
struct ICEBERG_EXPORT Blob {
  /// Type of the blob. See StandardBlobTypes for known types.
  std::string type;
  /// List of field IDs the blob was computed for.
  /// The order of items is used to compute sketches stored in the blob.
  std::vector<int32_t> input_fields;
  /// ID of the Iceberg table's snapshot the blob was computed from.
  int64_t snapshot_id = 0;
  /// Sequence number of the Iceberg table's snapshot the blob was computed from.
  int64_t sequence_number = 0;
  /// The uncompressed blob data.
  std::vector<uint8_t> data;
  /// Requested compression codec. If not set, the writer's default will be used.
  std::optional<PuffinCompressionCodec> requested_compression;
  /// Additional properties of the blob.
  std::unordered_map<std::string, std::string> properties;

  /// \brief Compare two Blobs for equality.
  ///
  /// Defaulted, i.e. member-wise over every field including the payload bytes.
  friend bool operator==(const Blob& lhs, const Blob& rhs) = default;
};

/// \brief Returns a string representation of a Blob.
///
/// The result has the form
/// `Blob[type='T',inputFields=F,snapshotId=S,sequenceNumber=Q,dataSize=N]`.
/// `,requestedCompression=C` and then `,properties=P` are inserted before the
/// closing bracket when the codec is set / the property map is non-empty.
/// The payload is summarized by its size only; its bytes are not printed.
ICEBERG_EXPORT std::string ToString(const Blob& blob);

} // namespace iceberg::puffin
47 changes: 47 additions & 0 deletions src/iceberg/puffin/blob_metadata.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "iceberg/puffin/blob_metadata.h"

#include <format>

#include "iceberg/util/formatter_internal.h"

namespace iceberg::puffin {

/// \brief Render a BlobMetadata as a single-line, human-readable string.
///
/// Mandatory fields are always written; `compressionCodec` and `properties`
/// follow only when the codec is set / the map is non-empty.
std::string ToString(const BlobMetadata& blob_metadata) {
  const BlobMetadata& meta = blob_metadata;

  std::string out = "BlobMetadata[";
  // One sink for the whole function: every write through it appends to `out`.
  auto sink = std::back_inserter(out);

  // Always-present fields, in declaration order.
  std::format_to(sink, "type='{}',inputFields={},", meta.type, meta.input_fields);
  std::format_to(sink, "snapshotId={},sequenceNumber={},", meta.snapshot_id,
                 meta.sequence_number);
  std::format_to(sink, "offset={},length={}", meta.offset, meta.length);

  // Optional parts bring their own leading comma.
  if (const auto& codec = meta.compression_codec; codec.has_value()) {
    std::format_to(sink, ",compressionCodec='{}'", *codec);
  }
  if (const auto& props = meta.properties; !props.empty()) {
    std::format_to(sink, ",properties={}", props);
  }

  out += "]";
  return out;
}

} // namespace iceberg::puffin
64 changes: 64 additions & 0 deletions src/iceberg/puffin/blob_metadata.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

/// \file iceberg/puffin/blob_metadata.h
/// Blob metadata structure for Puffin files.

#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>

#include "iceberg/iceberg_export.h"

namespace iceberg::puffin {

/// \brief Metadata about a blob stored in a Puffin file.
///
/// This represents the metadata stored in the Puffin file footer,
/// including the blob's location within the file.
///
/// The integer members carry a zero default initializer, so a
/// default-initialized instance (`BlobMetadata m;`) holds the same values as a
/// value-initialized one (`BlobMetadata{}`) and can safely be compared or
/// printed. The struct remains an aggregate; brace and designated
/// initialization are unaffected.
struct ICEBERG_EXPORT BlobMetadata {
  /// Type of the blob. See StandardBlobTypes for known types.
  std::string type;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is weird that the type is declared as string. it makes it harder to understand what type of blobs are accepted but harder to catch typos.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same reasoning as StandardBlobTypes — the spec requires this to be an open string.

  /// List of field IDs the blob was computed for.
  std::vector<int32_t> input_fields;
  /// ID of the Iceberg table's snapshot the blob was computed from.
  int64_t snapshot_id = 0;
  /// Sequence number of the Iceberg table's snapshot the blob was computed from.
  int64_t sequence_number = 0;
  /// Offset in the file where the blob data starts.
  int64_t offset = 0;
  /// Length of the blob data in the file (after compression, if compressed).
  /// The blob is compressed exactly when `compression_codec` has a value.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How to check if the blob is compressed, Is it if the compression_codec is set?

Copy link
Author

@zhaoxuan1994 zhaoxuan1994 Mar 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, exactly. If compression_codec has a value, the blob is compressed; if it's std::nullopt, it's uncompressed. I'll improve the comment to make this clearer.

  int64_t length = 0;
  /// Compression codec name, or std::nullopt if uncompressed.
  /// Stored as the raw codec name rather than an enum, so a name this library
  /// does not recognize can still be held here.
  std::optional<std::string> compression_codec;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto same pattern string or enum

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BlobMetadata is a direct mapping of the Puffin footer JSON, where compression-codec is a JSON string (or absent). The Java implementation also uses @nullable String here. The conversion from string to PuffinCompressionCodec enum happens at a higher level via PuffinCompressionCodecFromName(), which returns Result<> to properly handle unknown codecs. This layering keeps the metadata faithful to the on-disk format and provides forward compatibility — if a future spec version adds a new codec, we can still deserialize the footer successfully and fail gracefully at decompression time rather than at parsing time.

  /// Additional properties of the blob.
  std::unordered_map<std::string, std::string> properties;

  /// \brief Compare two BlobMetadatas for equality.
  ///
  /// Defaulted, i.e. member-wise over every field.
  friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) = default;
};

/// \brief Returns a string representation of a BlobMetadata.
///
/// The result has the form
/// `BlobMetadata[type='T',inputFields=F,snapshotId=S,sequenceNumber=Q,offset=O,length=L]`.
/// `,compressionCodec='C'` and then `,properties=P` are inserted before the
/// closing bracket when the codec is set / the property map is non-empty.
ICEBERG_EXPORT std::string ToString(const BlobMetadata& blob_metadata);

} // namespace iceberg::puffin
45 changes: 45 additions & 0 deletions src/iceberg/puffin/file_metadata.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "iceberg/puffin/file_metadata.h"

#include <format>

#include "iceberg/util/formatter_internal.h"

namespace iceberg::puffin {

/// \brief Render a FileMetadata as a single-line, human-readable string.
///
/// Blobs are listed comma-separated inside `blobs=[...]`, each rendered by the
/// BlobMetadata overload of ToString. `properties` follows only when the map
/// is non-empty.
std::string ToString(const FileMetadata& file_metadata) {
  std::string out = "FileMetadata[";
  auto sink = std::back_inserter(out);

  out += "blobs=[";
  bool is_first = true;
  for (const auto& blob : file_metadata.blobs) {
    // Separator goes between entries only, never before the first one.
    if (!is_first) {
      out += ",";
    }
    is_first = false;
    std::format_to(sink, "{}", ToString(blob));
  }
  out += "]";

  if (const auto& props = file_metadata.properties; !props.empty()) {
    std::format_to(sink, ",properties={}", props);
  }

  out += "]";
  return out;
}

} // namespace iceberg::puffin
51 changes: 51 additions & 0 deletions src/iceberg/puffin/file_metadata.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

/// \file iceberg/puffin/file_metadata.h
/// File metadata structure for Puffin files.

#include <string>
#include <unordered_map>
#include <vector>

#include "iceberg/iceberg_export.h"
#include "iceberg/puffin/blob_metadata.h"

namespace iceberg::puffin {

/// \brief Metadata about a Puffin file.
///
/// This represents the metadata stored in the Puffin file footer,
/// including information about all blobs in the file.
///
/// Plain aggregate: both members are standard containers and are empty in a
/// default-constructed instance.
struct ICEBERG_EXPORT FileMetadata {
/// List of blob metadata for all blobs in the file.
std::vector<BlobMetadata> blobs;
/// File-level properties.
std::unordered_map<std::string, std::string> properties;

/// \brief Compare two FileMetadatas for equality.
///
/// Defaulted, i.e. member-wise: `blobs` is compared element by element in
/// order, while `properties` is compared as an unordered map.
friend bool operator==(const FileMetadata& lhs, const FileMetadata& rhs) = default;
};

/// \brief Returns a string representation of a FileMetadata.
///
/// The result has the form `FileMetadata[blobs=[B1,B2,...]]`, where each `Bi`
/// is the string form of the corresponding BlobMetadata. `,properties=P` is
/// inserted before the final closing bracket when the property map is
/// non-empty.
ICEBERG_EXPORT std::string ToString(const FileMetadata& file_metadata);

} // namespace iceberg::puffin
Loading
Loading