fix[codegen]: fix overcopying of bytes in make_setter (#4419)

charles-cooper · web-flow · commit 9486a41e3291 · 2025-04-12T09:23:28.000Z
in `_complex_make_setter`, when a dynamic type (e.g. `Bytes` or
`DynArray`) is contained within a tuple, the current code generation
copies the entire buffer without checking how large it is.

note this represents a gas issue since more bytes might be copied than
is necessary, but not a correctness issue.

in comparison, the `make_setter` implementations for `Bytes`
and `DynArray` limit the copy length to the runtime sizes of the
`Bytes`/`DynArray`.

this commit disables the full buffer copy and adds a heuristic to
the `make_setter` implementations for `DynArray` and `Bytes` so that
in certain cases, they copy the full buffer instead of checking the
length.
diff --git a/tests/unit/compiler/venom/test_memmerging.py b/tests/unit/compiler/venom/test_memmerging.py
@@ -920,6 +920,21 @@ def test_memmerging_double_use():
     _check_pre_post(pre, post)
 
 
+def test_existing_mcopy_overlap_nochange():
+    """
+    Check that mcopy which already contains an overlap does not get optimized
+    """
+    if not version_check(begin="cancun"):
+        return
+
+    pre = """
+    _global:
+        mcopy 32, 33, 2
+        return %1
+    """
+    _check_no_change(pre)
+
+
 @pytest.mark.parametrize("load_opcode,copy_opcode", LOAD_COPY)
 def test_memmerging_load(load_opcode, copy_opcode):
     """
diff --git a/vyper/codegen/core.py b/vyper/codegen/core.py
@@ -188,10 +188,61 @@ def make_byte_array_copier(dst, src):
         # batch copy the bytearray (including length word) using copy_bytes
         len_ = add_ofst(get_bytearray_length(src), 32)
         max_bytes = src.typ.maxlen + 32
+
+        if _prefer_copy_maxbound_heuristic(dst, src, item_size=1):
+            len_ = max_bytes
+
+        # batch copy the bytearray, including length word
         ret = copy_bytes(dst, src, len_, max_bytes)
         return b1.resolve(ret)
 
 
+# heuristic to choose between copying the full max-bound buffer and only the runtime length
+def _prefer_copy_maxbound_heuristic(dst, src, item_size):
+    if dst.location != MEMORY:
+        return False
+
+    # a heuristic - it's cheaper to just copy the extra buffer bytes
+    # than calculate the number of bytes
+    # copy(dst, src, 32 + itemsize*load(src))
+    # =>
+    # copy(dst, src, bound)
+    # (32 + itemsize*(load(src))) costs 4*3 + 8 - 3 gas over just `bound`
+    length_calc_cost = 4 * 3 - 3
+    length_calc_cost += 8 * (item_size != 1)  # PUSH MUL
+
+    # NOTE: there is an opportunity for more optimization if this
+    # is one in a sequence of copies, since doing copy(dst, src, maxbound)
+    # allows us to fuse copies together, further saving gas (each copy
+    # costs at least 15 gas).
+
+    if _opt_codesize():
+        # if we are optimizing for codesize, we are ok with a higher
+        # gas cost before switching to copy(dst, src, <precise length>).
+        # +45 is based on vibes -- it says we are willing to burn 45
+        # gas (additional 15 words in the copy operation) at runtime to
+        # save these 5-8 bytes (depending on if itemsize is 1 or not)
+        # (DUP<src> MLOAD PUSH1 ITEMSIZE MUL PUSH1 32 ADD)
+        length_calc_cost += 45
+
+    src_bound = src.typ.memory_bytes_required
+    # 3 gas per word, minus the cost of the length word
+    # (since it is always copied, we don't include it in the marginal
+    # cost difference)
+    copy_cost = ceil32(src_bound - 32) * 3 // 32
+    if src.location in (CALLDATA, MEMORY) and copy_cost <= length_calc_cost:
+        return True
+    # threshold is 6 words of data (+ 1 length word that we need to copy anyway)
+    # dload(src) costs additional 14-20 gas depending on if `src` is a literal
+    # or not.
+    # (dload(src) expands to `codecopy(0, add(CODE_END, src), 32); mload(0)`,
+    # and we have already accounted for an `mload(ptr)`).
+    # for simplicity, skip the 14 case.
+    if src.location == DATA and copy_cost <= (20 + length_calc_cost):
+        return True
+    return False
+
+
 def bytes_data_ptr(ptr):
     if ptr.location is None:  # pragma: nocover
         raise CompilerPanic("tried to modify non-pointer type")
@@ -287,6 +338,9 @@ def _dynarray_make_setter(dst, src, hi=None):
                 n_bytes = add_ofst(_mul(count, element_size), 32)
                 max_bytes = 32 + src.typ.count * element_size
 
+                if _prefer_copy_maxbound_heuristic(dst, src, element_size):
+                    n_bytes = max_bytes
+
                 # batch copy the entire dynarray, including length word
                 ret.append(copy_bytes(dst, src, n_bytes, max_bytes))
 
@@ -1049,7 +1103,18 @@ def _complex_make_setter(left, right, hi=None):
         assert is_tuple_like(left.typ)
         keys = left.typ.tuple_keys()
 
-    if left.is_pointer and right.is_pointer and right.encoding == Encoding.VYPER:
+    # performance: if there is any dynamic data, there might be
+    # unused space between the end of the dynarray and the end of the buffer.
+    # for instance DynArray[uint256, 100] with runtime length of 5.
+    # in these cases, we recurse to dynarray make_setter which has its own
+    # heuristic for when to copy all data.
+
+    # use abi_type.is_dynamic since it is identical to the query "do any children
+    # have dynamic size"
+    has_dynamic_data = right.typ.abi_type.is_dynamic()
+    simple_encoding = right.encoding == Encoding.VYPER
+
+    if left.is_pointer and right.is_pointer and simple_encoding and not has_dynamic_data:
         # both left and right are pointers, see if we want to batch copy
         # instead of unrolling the loop.
         assert left.encoding == Encoding.VYPER