-
Notifications
You must be signed in to change notification settings - Fork 98
feat(puffin): add basic data structures and constants #588
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| # Licensed to the Apache Software Foundation (ASF) under one | ||
| # or more contributor license agreements. See the NOTICE file | ||
| # distributed with this work for additional information | ||
| # regarding copyright ownership. The ASF licenses this file | ||
| # to you under the Apache License, Version 2.0 (the | ||
| # "License"); you may not use this file except in compliance | ||
| # with the License. You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, | ||
| # software distributed under the License is distributed on an | ||
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| # KIND, either express or implied. See the License for the | ||
| # specific language governing permissions and limitations | ||
| # under the License. | ||
|
|
||
| # NOTE(review): iceberg_install_all_headers is defined outside this file; presumably it installs the public headers found under iceberg/puffin — confirm against the helper's definition. |
| iceberg_install_all_headers(iceberg/puffin) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| #include "iceberg/puffin/blob.h" | ||
|
|
||
| #include <format> | ||
|
|
||
| #include "iceberg/util/formatter_internal.h" | ||
|
|
||
| namespace iceberg::puffin { | ||
|
|
||
| std::string ToString(const Blob& blob) { | ||
| std::string repr = "Blob["; | ||
| std::format_to(std::back_inserter(repr), "type='{}',inputFields={},", blob.type, | ||
| blob.input_fields); | ||
| std::format_to(std::back_inserter(repr), "snapshotId={},sequenceNumber={},", | ||
| blob.snapshot_id, blob.sequence_number); | ||
| std::format_to(std::back_inserter(repr), "dataSize={}", blob.data.size()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: maybe add a checksum, if available, for fast comparison (I do understand that a checksum is not part of the Puffin spec).
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's an interesting idea! Currently Blob doesn't carry a checksum field (the Puffin spec doesn't include one either), so there's nothing available to display here. For equality comparison, Blob already has operator==. If we want to add checksum support down the road, it would probably be a separate discussion about the struct design itself — happy to explore that if there's a use case! |
||
| if (blob.requested_compression.has_value()) { | ||
| std::format_to(std::back_inserter(repr), ",requestedCompression={}", | ||
| iceberg::puffin::ToString(*blob.requested_compression)); | ||
| } | ||
| if (!blob.properties.empty()) { | ||
| std::format_to(std::back_inserter(repr), ",properties={}", blob.properties); | ||
| } | ||
| std::format_to(std::back_inserter(repr), "]"); | ||
| return repr; | ||
| } | ||
|
|
||
| } // namespace iceberg::puffin | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| #pragma once | ||
|
|
||
| /// \file iceberg/puffin/blob.h | ||
| /// Blob data structure for Puffin files. | ||
|
|
||
| #include <cstdint> | ||
| #include <optional> | ||
| #include <string> | ||
| #include <unordered_map> | ||
| #include <vector> | ||
|
|
||
| #include "iceberg/iceberg_export.h" | ||
| #include "iceberg/puffin/puffin_compression_codec.h" | ||
|
|
||
| namespace iceberg::puffin { | ||
|
|
||
| /// \brief A blob to be written to a Puffin file. | ||
| /// | ||
| /// Carries the uncompressed payload together with the metadata describing it; | ||
| /// compression is applied later, when the blob is written. | ||
| /// | ||
| /// The scalar members have default member initializers so that a | ||
| /// default-initialized Blob (`Blob b;`) holds determinate values and can be | ||
| /// compared or printed safely. The struct is still an aggregate, so aggregate | ||
| /// and designated initialization keep working unchanged. | ||
| struct ICEBERG_EXPORT Blob { | ||
|   /// Type of the blob. See StandardBlobTypes for known types. | ||
|   std::string type; | ||
|   /// List of field IDs the blob was computed for. | ||
|   /// The order of items is used to compute sketches stored in the blob. | ||
|   std::vector<int32_t> input_fields; | ||
|   /// ID of the Iceberg table's snapshot the blob was computed from. | ||
|   /// The 0 default only avoids an indeterminate value; it is a placeholder, | ||
|   /// not a documented sentinel. | ||
|   int64_t snapshot_id = 0; | ||
|   /// Sequence number of the Iceberg table's snapshot the blob was computed from. | ||
|   /// The 0 default only avoids an indeterminate value (see snapshot_id). | ||
|   int64_t sequence_number = 0; | ||
|   /// The uncompressed blob data. | ||
|   std::vector<uint8_t> data; | ||
|   /// Requested compression codec. If not set, the writer's default will be used. | ||
|   std::optional<PuffinCompressionCodec> requested_compression; | ||
|   /// Additional properties of the blob. | ||
|   std::unordered_map<std::string, std::string> properties; | ||
|
|
||
|   /// \brief Compare two Blobs for equality (member-wise, compiler-generated). | ||
|   friend bool operator==(const Blob& lhs, const Blob& rhs) = default; | ||
| }; | ||
|
|
||
| /// \brief Returns a string representation of a Blob. | ||
| ICEBERG_EXPORT std::string ToString(const Blob& blob); | ||
|
|
||
| } // namespace iceberg::puffin |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| #include "iceberg/puffin/blob_metadata.h" | ||
|
|
||
| #include <format> | ||
|
|
||
| #include "iceberg/util/formatter_internal.h" | ||
|
|
||
| namespace iceberg::puffin { | ||
|
|
||
| // Renders a BlobMetadata as "BlobMetadata[...]": the mandatory fields always | ||
| // appear, while compressionCodec and properties are emitted only when set / | ||
| // non-empty. Container fields are formatted by whatever formatters are in | ||
| // scope for them (presumably from formatter_internal.h — confirm). | ||
| std::string ToString(const BlobMetadata& blob_metadata) { | ||
|   std::string text = "BlobMetadata["; | ||
|   // One back-inserter is reused for every fragment appended below. | ||
|   auto sink = std::back_inserter(text); | ||
|   // Adjacent literals concatenate into a single compile-time format string. | ||
|   sink = std::format_to(sink, | ||
|                         "type='{}',inputFields={}," | ||
|                         "snapshotId={},sequenceNumber={}," | ||
|                         "offset={},length={}", | ||
|                         blob_metadata.type, blob_metadata.input_fields, | ||
|                         blob_metadata.snapshot_id, blob_metadata.sequence_number, | ||
|                         blob_metadata.offset, blob_metadata.length); | ||
|   if (const auto& codec = blob_metadata.compression_codec; codec.has_value()) { | ||
|     sink = std::format_to(sink, ",compressionCodec='{}'", *codec); | ||
|   } | ||
|   if (const auto& props = blob_metadata.properties; !props.empty()) { | ||
|     sink = std::format_to(sink, ",properties={}", props); | ||
|   } | ||
|   text.push_back(']'); | ||
|   return text; | ||
| } | ||
|
|
||
| } // namespace iceberg::puffin |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| #pragma once | ||
|
|
||
| /// \file iceberg/puffin/blob_metadata.h | ||
| /// Blob metadata structure for Puffin files. | ||
|
|
||
| #include <cstdint> | ||
| #include <optional> | ||
| #include <string> | ||
| #include <unordered_map> | ||
| #include <vector> | ||
|
|
||
| #include "iceberg/iceberg_export.h" | ||
|
|
||
| namespace iceberg::puffin { | ||
|
|
||
| /// \brief Metadata about a blob stored in a Puffin file. | ||
| /// | ||
| /// This represents the metadata stored in the Puffin file footer, | ||
| /// including the blob's location within the file. | ||
| struct ICEBERG_EXPORT BlobMetadata { | ||
| /// Type of the blob. See StandardBlobTypes for known types. | ||
| std::string type; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is weird that the type is declared as a string. It makes it harder to understand which types of blobs are accepted and harder to catch typos.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same reasoning as StandardBlobTypes — the spec requires this to be an open string. |
||
| /// List of field IDs the blob was computed for. | ||
| std::vector<int32_t> input_fields; | ||
| /// ID of the Iceberg table's snapshot the blob was computed from. | ||
| int64_t snapshot_id; | ||
| /// Sequence number of the Iceberg table's snapshot the blob was computed from. | ||
| int64_t sequence_number; | ||
| /// Offset in the file where the blob data starts. | ||
| int64_t offset; | ||
| /// Length of the blob data in the file (after compression, if compressed). | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How do we check whether the blob is compressed? Is it when compression_codec is set?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, exactly. If compression_codec has a value, the blob is compressed; if it's std::nullopt, the blob is uncompressed. |
||
| int64_t length; | ||
| /// Compression codec name, or std::nullopt if uncompressed. | ||
| std::optional<std::string> compression_codec; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ditto, same pattern here: should this be a string or an enum?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BlobMetadata is a direct mapping of the Puffin footer JSON, where compression-codec is a JSON string (or absent). The Java implementation also uses @Nullable String here. The conversion from string to PuffinCompressionCodec enum happens at a higher level via PuffinCompressionCodecFromName(), which returns Result<> to properly handle unknown codecs. This layering keeps the metadata faithful to the on-disk format and provides forward compatibility — if a future spec version adds a new codec, we can still deserialize the footer successfully and fail gracefully at decompression time rather than at parsing time. |
||
| /// Additional properties of the blob. | ||
| std::unordered_map<std::string, std::string> properties; | ||
|
|
||
| /// \brief Compare two BlobMetadatas for equality. | ||
| friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) = default; | ||
| }; | ||
|
|
||
| /// \brief Returns a string representation of a BlobMetadata. | ||
| ICEBERG_EXPORT std::string ToString(const BlobMetadata& blob_metadata); | ||
|
|
||
| } // namespace iceberg::puffin | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| #include "iceberg/puffin/file_metadata.h" | ||
|
|
||
| #include <format> | ||
|
|
||
| #include "iceberg/util/formatter_internal.h" | ||
|
|
||
| namespace iceberg::puffin { | ||
|
|
||
| // Renders a FileMetadata as "FileMetadata[blobs=[...]]": the blob entries are | ||
| // comma-separated, and ",properties=..." is appended only when the file-level | ||
| // properties are non-empty. | ||
| std::string ToString(const FileMetadata& file_metadata) { | ||
|   std::string text = "FileMetadata["; | ||
|   text += "blobs=["; | ||
|   auto sink = std::back_inserter(text); | ||
|   // Emit a separator before every entry except the first one. | ||
|   bool is_first = true; | ||
|   for (const auto& blob : file_metadata.blobs) { | ||
|     if (!is_first) { | ||
|       text += ","; | ||
|     } | ||
|     is_first = false; | ||
|     sink = std::format_to(sink, "{}", ToString(blob)); | ||
|   } | ||
|   text += "]"; | ||
|   if (const auto& props = file_metadata.properties; !props.empty()) { | ||
|     sink = std::format_to(sink, ",properties={}", props); | ||
|   } | ||
|   text += "]"; | ||
|   return text; | ||
| } | ||
|
|
||
| } // namespace iceberg::puffin |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,51 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| #pragma once | ||
|
|
||
| /// \file iceberg/puffin/file_metadata.h | ||
| /// File metadata structure for Puffin files. | ||
|
|
||
| #include <string> | ||
| #include <unordered_map> | ||
| #include <vector> | ||
|
|
||
| #include "iceberg/iceberg_export.h" | ||
| #include "iceberg/puffin/blob_metadata.h" | ||
|
|
||
| namespace iceberg::puffin { | ||
|
|
||
| /// \brief Footer metadata of a Puffin file. | ||
| /// | ||
| /// Groups the metadata entries of all blobs in the file with the | ||
| /// file-level key/value properties, as stored in the Puffin file footer. | ||
| struct ICEBERG_EXPORT FileMetadata { | ||
|   /// Metadata entries for the blobs contained in the file. | ||
|   std::vector<BlobMetadata> blobs; | ||
|   /// Properties that apply to the file as a whole. | ||
|   std::unordered_map<std::string, std::string> properties; | ||
|
|
||
|   /// \brief Member-wise equality of two FileMetadata values. | ||
|   friend bool operator==(const FileMetadata& a, const FileMetadata& b) = default; | ||
| }; | ||
|
|
||
| /// \brief Returns a string representation of a FileMetadata. | ||
| ICEBERG_EXPORT std::string ToString(const FileMetadata& file_metadata); | ||
|
|
||
| } // namespace iceberg::puffin |
Uh oh!
There was an error while loading. Please reload this page.