Skip to content

Commit e5d1696

Browse files
committed
Refactor the part of TreeParameters that calculates the max number of possible pointers to a new class ManifestSizeCalculator.
1 parent ba7e0fb commit e5d1696

File tree

2 files changed

+131
-89
lines changed

2 files changed

+131
-89
lines changed
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# Copyright 2024 Marc Mosko
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from .HashGroupBuilderPair import HashGroupBuilderPair
16+
from ..HashGroupBuilder import HashGroupBuilder
17+
from ..ManifestFactory import ManifestFactory
18+
from ..name_constructor.NameConstructorContext import NameConstructorContext
19+
from ..name_constructor.SchemaImpl import SchemaImpl
20+
from ..tlvs.StartSegmentId import StartSegmentId
21+
from ...core.HashValue import HashValue
22+
23+
24+
class ManifestSizeCalculator:
    """
    Decides the maximum number of pointers that can fit in a single manifest.

    It does this by building a test manifest packet and comparing its encoded length against
    `max_packet_size` to see how much space is left for additional hash pointers.

    The test manifest must have all possible fields that are used in a manifest filled in, so their
    space is accounted for.
    """

    # Used as the largest number of chunk id and manifest id and final chunk id.
    __MAX_MANIFEST_ID = 0xFFFFFF

    def __init__(self, max_packet_size: int, manifest_factory: ManifestFactory,
                 name_ctx: NameConstructorContext, total_bytes: int):
        """
        :param max_packet_size: The maximum ccnpy.Packet size (bytes)
        :param manifest_factory: Factory used to create manifests
        :param name_ctx: Name constructor context; its manifest and data schema implementations are asked
            whether they use a name id, which decides if start segment ids are included in the test manifest
        :param total_bytes: The total file bytes. We need to reserve big enough ints for leaf_size and
            subtree_size
        """
        self._max_packet_size = max_packet_size
        self._manifest_factory = manifest_factory
        self._name_ctx = name_ctx
        self._total_bytes = total_bytes

    def calculate_max_pointers(self) -> int:
        """
        Build a test manifest with a single pointer, measure how much room is left out of
        `max_packet_size`, and work out how many hash pointers fit in that room. The result is then
        validated by building a manifest with that many pointers and checking its length.

        We only put metadata and locators and things like that in the root manifest.

        :return: The number of pointers we can fit in a manifest of at most `max_packet_size` bytes
            (SHA256 HashValues)
        :raises ValueError: If a one-pointer manifest is already at or above `max_packet_size`, if the
            filled manifest exceeds `max_packet_size`, or if fewer than 2 pointers fit.
        """
        # Assume 32-byte sha256 hashes
        hv = HashValue.create_sha256(32 * [0])
        hash_value_len = len(hv)

        packet = self._build_manifest_packet(1, hv)
        length = len(packet)
        if length >= self._max_packet_size:
            raise ValueError("An empty manifest packet is %r bytes and exceeds max_size %r"
                             % (length, self._max_packet_size))

        slack = self._max_packet_size - length
        # Integer floor division: only whole hashes count.
        # +1 because we already have 1 hash in the manifest
        num_hashes = slack // hash_value_len + 1

        # Now validate that it works
        packet = self._build_manifest_packet(num_hashes, hv)
        length = len(packet)
        if length > self._max_packet_size:
            raise ValueError(
                "A filled manifest packet is %r bytes with %r hashes, a hash is %r bytes, and exceeds max_size %r" %
                (length, num_hashes, hash_value_len, self._max_packet_size))

        if num_hashes < 2:
            min_packet_size = length + hash_value_len
            raise ValueError("With max_packet_size %r there are %r hashes/manifest, must have at least 2."
                             " Minimum packet_size is %r" % (self._max_packet_size, num_hashes, min_packet_size))
        return num_hashes

    def _build_manifest_packet(self, num_hashes, hv):
        """
        Build a test manifest packet holding `num_hashes` copies of `hv`.

        :param num_hashes: Total number of pointers to place in the manifest
        :param hv: The hash value used for every pointer
        :return: The packet built by the manifest factory (its `len()` is the size being measured)
        """
        # Arbitrary choice, we put n-1 into direct and 1 into indirect
        num_direct = num_hashes - 1
        hgb = HashGroupBuilderPair(name_ctx=self._name_ctx, max_direct=num_direct, max_indirect=1)

        for _ in range(num_direct):
            hgb.prepend_direct(hv)
        hgb.prepend_indirect(hv)

        # Use the largest ids so the start segment id fields reserve their maximum space.
        if self._name_ctx.manifest_schema_impl.uses_name_id():
            indirect_start_segment_id = StartSegmentId(self.__MAX_MANIFEST_ID)
        else:
            indirect_start_segment_id = None

        if self._name_ctx.data_schema_impl.uses_name_id():
            direct_start_segment_id = StartSegmentId(SchemaImpl._MAX_CHUNK_ID)
        else:
            direct_start_segment_id = None

        # include_leaf_size and include_subtree_size might reserve too much space if we do not use those.
        hash_groups = hgb.hash_groups(include_leaf_size=True,
                                      include_subtree_size=True,
                                      indirect_start_segment_id=indirect_start_segment_id,
                                      direct_start_segment_id=direct_start_segment_id)

        return self._manifest_factory.build_packet(source=hash_groups,
                                                   node_subtree_size=self._total_bytes)

    def _build_single_hash_group(self, indirect_ptrs, direct_ptrs, nc_id):
        """
        Build one hash group containing the given indirect and direct pointers.

        :param indirect_ptrs: Iterable of pointers appended as indirect entries
        :param direct_ptrs: Iterable of pointers appended as direct entries
        :param nc_id: Passed through to `HashGroupBuilder.hash_group` as `nc_id`
        :return: The hash group produced by the builder
        """
        # We use total bytes for the subtree size and leaf size. This might end up reserving one more byte
        # than necessary if it overflows.
        hgb1 = HashGroupBuilder()
        for ptr in indirect_ptrs:
            hgb1.append_indirect(ptr, subtree_size=self._total_bytes)
        for ptr in direct_ptrs:
            hgb1.append_direct(ptr, leaf_size=self._total_bytes)

        tree_options = self._manifest_factory.tree_options()
        return hgb1.hash_group(nc_id=nc_id,
                               include_leaf_size=tree_options.add_group_leaf_size,
                               include_subtree_size=tree_options.add_group_subtree_size)

ccnpy/flic/tree/TreeParameters.py

Lines changed: 8 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
from .HashGroupBuilderPair import HashGroupBuilderPair
16+
from .ManifestSizeCalculator import ManifestSizeCalculator
1617
from .OptimizerResult import OptimizerResult
1718
from .TreeOptimizer import TreeOptimizer
1819
from ..HashGroupBuilder import HashGroupBuilder
@@ -33,7 +34,6 @@ class TreeParameters:
3334
3435
Usually, one calls `create_optimized_tree` to calculate these parameters.
3536
"""
36-
__MAX_MANIFEST_ID = 0xFFFFFF
3737

3838
@classmethod
3939
def create_optimized_tree(cls,
@@ -49,12 +49,13 @@ def create_optimized_tree(cls,
4949
max_packet_size = manifest_factory.tree_options().max_packet_size
5050
max_tree_degree = manifest_factory.tree_options().max_tree_degree
5151

52-
num_pointers_per_node = cls._calculate_max_pointers(max_packet_size=max_packet_size,
52+
num_pointers_per_node = ManifestSizeCalculator(max_packet_size=max_packet_size,
5353
manifest_factory=manifest_factory,
5454
name_ctx=name_ctx,
55-
total_bytes=file_metadata.total_bytes)
55+
total_bytes=file_metadata.total_bytes).calculate_max_pointers()
5656

5757
if num_pointers_per_node < 2:
58+
# TODO: This is not entirely true. If the app data is only 1 chunk, it could work.
5859
raise ValueError("With a max_packet_size of %r there is only %r pointers per node, must have at least 2" %
5960
(max_packet_size, num_pointers_per_node))
6061

@@ -125,94 +126,12 @@ def tree_height(self) -> int:
125126
"""
126127
return self._solution.tree_height()
127128

128-
@classmethod
129-
def _build_single_hash_group(cls, manifest_factory, indirect_ptrs, direct_ptrs, nc_id, total_bytes):
130-
# We use total bytes for the subtree size and leaf size. This might end up reserving one one more byte
131-
# than necessary if it overflows.
132-
hgb1 = HashGroupBuilder()
133-
for ptr in indirect_ptrs:
134-
hgb1.append_indirect(ptr, subtree_size=total_bytes)
135-
for ptr in direct_ptrs:
136-
hgb1.append_direct(ptr, leaf_size=total_bytes)
137-
138-
hg1 = hgb1.hash_group(nc_id=nc_id,
139-
include_leaf_size=manifest_factory.tree_options().add_group_leaf_size,
140-
include_subtree_size=manifest_factory.tree_options().add_group_subtree_size)
141-
return hg1
142-
143-
@classmethod
144-
def _build_manifest_packet(cls, manifest_factory, num_hashes, hv, name_ctx, total_bytes):
145-
# Arbitrary choise, we put n-1 into direct and 1 into indirect
146-
hgb = HashGroupBuilderPair(name_ctx=name_ctx, max_direct = num_hashes -1, max_indirect=1)
147-
148-
for hv in (num_hashes -1) * [hv]:
149-
hgb.prepend_direct(hv)
150-
hgb.prepend_indirect(hv)
151-
if name_ctx.manifest_schema_impl.uses_name_id():
152-
indirect_start_segment_id = StartSegmentId(cls.__MAX_MANIFEST_ID)
153-
else:
154-
indirect_start_segment_id = None
155-
156-
if name_ctx.data_schema_impl.uses_name_id():
157-
direct_start_segment_id = StartSegmentId(SchemaImpl._MAX_CHUNK_ID)
158-
else:
159-
direct_start_segment_id = None
160-
161-
# include_leaf_size and include_subtree_size might reserve too much space if we do not use those.
162-
hash_groups = hgb.hash_groups(include_leaf_size=True,
163-
include_subtree_size=True,
164-
indirect_start_segment_id=indirect_start_segment_id,
165-
direct_start_segment_id=direct_start_segment_id)
166-
167-
packet = manifest_factory.build_packet(source=hash_groups,
168-
node_subtree_size=total_bytes)
169-
170-
return packet
171-
172-
@classmethod
173-
def _calculate_max_pointers(cls, max_packet_size: int, manifest_factory: ManifestFactory,
174-
name_ctx: NameConstructorContext, total_bytes: int):
175-
"""
176-
Create a Manifest with the specified number of tree pointers and figure out how much space we have left
177-
out of self._max_size. Then figure out how many data pointers we can fit in.
178-
179-
We only put metadata and locators and things like that in the root manifest.
180-
181-
:param max_packet_size: The maximum ccnpy.Packet size (bytes)
182-
:param manifest_factory: Factory used to create manifests
183-
:param total_bytes: The total file bytes. We need to reserve big enough ints for leaf_size and subtree_size
184-
:return: The number of data points we can fit in a max_size nameless manifest
185-
"""
186-
# Assume 32-byte sha256 hashes
187-
hv = HashValue.create_sha256(32 * [0])
188-
hash_value_len = len(hv)
189-
packet = cls._build_manifest_packet(manifest_factory, 1, hv, name_ctx, total_bytes)
190-
length = len(packet)
191-
if length >= max_packet_size:
192-
raise ValueError("An empty manifest packet is %r bytes and exceeds max_size %r" % (length, max_packet_size))
193-
194-
slack = max_packet_size - length
195-
# +1 because we already have 1 hash in the manifest
196-
num_hashes = int(slack / hash_value_len) + 1
197-
198-
# Now validate that it works
199-
packet = cls._build_manifest_packet(manifest_factory, num_hashes, hv, name_ctx, total_bytes)
200-
length = len(packet)
201-
if length > max_packet_size:
202-
raise ValueError(
203-
"A filled manifest packet is %r bytes with %r hashes, a hash is %r bytes, and exceeds max_size %r" %
204-
(length, num_hashes, hash_value_len, max_packet_size))
205-
206-
#print("calculate_max_pointers = %r in length %r, actual length %r" % (num_hashes, max_packet_size, length))
207-
208-
if num_hashes < 2:
209-
min_packet_size = len(packet) + hash_value_len
210-
raise ValueError("With max_packet_size %r there are %r hashes/manifest, must have at least 2."
211-
" Minimum packet_size is %r" % (max_packet_size, num_hashes, min_packet_size))
212-
return num_hashes
213-
214129
@staticmethod
215130
def _optimize_tree(total_direct_nodes:int , num_pointers_per_node: int) -> OptimizerResult:
216131
to = TreeOptimizer(num_direct_nodes=total_direct_nodes,
217132
num_pointers=num_pointers_per_node)
133+
134+
# There are a few possible outputs from the tree optimizer. In general, we use
135+
# this one, as it picks the tree that fits the data well (minimizes waste), and then
136+
# from those picks one with minimum height.
218137
return to.minimize_waste_min_height()

0 commit comments

Comments
 (0)