Skip to content

Commit bca73af

Browse files
edg-lJereSalo
and authored
perf(levm): use specialized PUSH1 and PUSH2 implementations (#3262)
**Motivation** According to stats from @azteca1998 PUSH2 and PUSH1 are widely used: ``` Loaded 903636264 rows (3447.10MiB) Stats (of 903636264 records): 0xf1: count= 730979 t_min= 2278 t_max=1512728 t_avg=110877.43 t_acc=81049072024 CALL 0x61: count=131856777 t_min= 136 t_max= 549032 t_avg= 189.29 t_acc=24959614846 PUSH2 0x56: count= 78745029 t_min= 170 t_max=1488792 t_avg= 243.75 t_acc=19194034756 JUMP 0x60: count= 86327863 t_min= 136 t_max= 837080 t_avg= 199.78 t_acc=17246262544 PUSH1 0x5b: count=107216057 t_min= 102 t_max= 267308 t_avg= 159.43 t_acc=17093508806 JUMPDEST 0x50: count= 86546732 t_min= 102 t_max= 353260 t_avg= 174.49 t_acc=15101132640 POP 0x57: count= 53096953 t_min= 102 t_max=1382576 t_avg= 233.40 t_acc=12393069292 JUMPI 0x81: count= 55585321 t_min= 102 t_max= 267410 t_avg= 192.79 t_acc=10716509980 DUP2 0x01: count= 56493418 t_min= 102 t_max=1431060 t_avg= 189.52 t_acc=10706399944 ADD 0x91: count= 31380921 t_min= 102 t_max= 146030 t_avg= 205.38 t_acc= 6444862520 SWAP2 ``` Furthermore i keep seeing `U256::from_big_endian` taking quite some time on samply so I made specialized PUSH1 and PUSH2 implementations that avoid that, also using fixed size arrays. 
Benchmarks: Hoodi 11k: main 9m10.471s pr 8m25.933s **Description** <!-- A clear and concise general description of the changes this PR introduces --> <!-- Link to issues: Resolves #111, Resolves #222 --> Closes #issue_number # Benchmark Results Comparison #### Benchmark Results: Factorial | Command | Mean [ms] | Min [ms] | Max [ms] | Relative | |:---|---:|---:|---:|---:| | `levm_Factorial_pr` | 634.2 ± 7.3 | 629.6 | 654.2 | 2.71 ± 0.04 | | `levm_Factorial` | 726.1 ± 5.2 | 722.5 | 740.1 | 3.11 ± 0.03 | | `levm_FactorialRecursive_pr` | 3.567 ± 0.021 | 3.541 | 3.604 | 2.22 ± 0.05 | | `levm_FactorialRecursive` | 3.828 ± 0.035 | 3.775 | 3.889 | 2.39 ± 0.03 | | `levm_Fibonacci_pr` | 629.2 ± 6.4 | 625.7 | 646.9 | 2.99 ± 0.03 | | `levm_Fibonacci` | 727.7 ± 6.5 | 722.3 | 743.9 | 3.47 ± 0.03 | | `levm_ManyHashes_pr` | 14.9 ± 0.2 | 14.7 | 15.3 | 1.70 ± 0.03 | | `levm_ManyHashes` | 16.3 ± 0.1 | 16.2 | 16.4 | 1.87 ± 0.02 | | `levm_BubbleSort_pr` | 5.065 ± 0.023 | 5.034 | 5.107 | 1.58 ± 0.01 | | `levm_BubbleSort` | 5.508 ± 0.035 | 5.489 | 5.603 | 1.71 ± 0.02 | | `levm_ERC20Transfer_pr` | 461.5 ± 1.3 | 459.7 | 463.4 | 1.87 ± 0.03 | | `levm_ERC20Transfer` | 487.9 ± 2.4 | 484.1 | 491.0 | 1.99 ± 0.01 | | `levm_ERC20Mint_pr` | 306.8 ± 8.9 | 300.1 | 328.5 | 2.22 ± 0.07 | | `levm_ERC20Mint` | 320.1 ± 1.5 | 317.9 | 322.6 | 2.31 ± 0.05 | | `levm_ERC20Approval_pr` | 1.779 ± 0.023 | 1.763 | 1.838 | 1.69 ± 0.02 | | `levm_ERC20Approval` | 1.850 ± 0.011 | 1.837 | 1.873 | 1.76 ± 0.02 | ![image](https://github.com/user-attachments/assets/8f08cb93-ac5d-4909-a15d-cf799f1ce023) According to the samply this makes op_push nearly negligible (from 30% to 0%) --------- Co-authored-by: Jeremías Salomón <[email protected]>
1 parent 939e95f commit bca73af

File tree

4 files changed

+71
-4
lines changed

4 files changed

+71
-4
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
## Perf
44

5+
### 2025-06-23
6+
7+
- Use specialized PUSH1 and PUSH2 implementations [#3262](https://github.com/lambdaclass/ethrex/pull/3262)
8+
59
### 2025-05-27
610

711
- Improved the performance of shift instructions. [2933](https://github.com/lambdaclass/ethrex/pull/2933)

crates/vm/levm/src/execution_handlers.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,10 @@ impl<'a> VM<'a> {
7575
Opcode::BLOBHASH => self.op_blobhash(),
7676
Opcode::BLOBBASEFEE => self.op_blobbasefee(),
7777
Opcode::PUSH0 => self.op_push0(),
78+
Opcode::PUSH1 => self.op_push1(),
79+
Opcode::PUSH2 => self.op_push2(),
7880
// PUSHn
79-
op if (Opcode::PUSH1..=Opcode::PUSH32).contains(&op) => {
81+
op if (Opcode::PUSH3..=Opcode::PUSH32).contains(&op) => {
8082
let n_bytes = get_n_value(op, Opcode::PUSH1)?;
8183
self.op_push(n_bytes)
8284
}

crates/vm/levm/src/opcode_handlers/bitwise_comparison.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,6 @@ pub fn checked_shift_left(value: U256, shift: U256) -> Result<U256, VMError> {
266266
Ok(result)
267267
}
268268

269-
fn u256_from_bool(value: bool) -> U256 {
270-
U256::from(u8::from(value))
269+
const fn u256_from_bool(value: bool) -> U256 {
270+
if value { U256::one() } else { U256::zero() }
271271
}

crates/vm/levm/src/opcode_handlers/push.rs

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use ethrex_common::{U256, types::Fork};
1111
// Opcodes: PUSH0, PUSH1 ... PUSH32
1212

1313
impl<'a> VM<'a> {
14-
// PUSH operation
14+
// Generic PUSH operation
1515
pub fn op_push(&mut self, n_bytes: usize) -> Result<OpcodeResult, VMError> {
1616
let current_call_frame = self.current_call_frame_mut()?;
1717
current_call_frame.increase_consumed_gas(gas_cost::PUSHN)?;
@@ -30,6 +30,42 @@ impl<'a> VM<'a> {
3030
})
3131
}
3232

33+
/// Specialized PUSH1 operation
34+
///
35+
/// We use specialized push1 and push2 implementations because they are way more frequent than the others,
36+
/// so their impact on performance is significant.
37+
/// These implementations allow using U256::from, which is considerably more performant than U256::from_big_endian
38+
pub fn op_push1(&mut self) -> Result<OpcodeResult, VMError> {
39+
let current_call_frame = self.current_call_frame_mut()?;
40+
current_call_frame.increase_consumed_gas(gas_cost::PUSHN)?;
41+
42+
let value = read_bytcode_slice_const::<1>(current_call_frame)?[0];
43+
44+
current_call_frame.stack.push(U256::from(value))?;
45+
46+
Ok(OpcodeResult::Continue {
47+
// The 1 byte that you push to the stack + 1 for the next instruction
48+
pc_increment: 2,
49+
})
50+
}
51+
52+
// Specialized PUSH2 operation
53+
pub fn op_push2(&mut self) -> Result<OpcodeResult, VMError> {
54+
let current_call_frame = self.current_call_frame_mut()?;
55+
current_call_frame.increase_consumed_gas(gas_cost::PUSHN)?;
56+
57+
let read_n_bytes = read_bytcode_slice_const::<2>(current_call_frame)?;
58+
59+
let value = u16::from_be_bytes(read_n_bytes);
60+
61+
current_call_frame.stack.push(U256::from(value))?;
62+
63+
Ok(OpcodeResult::Continue {
64+
// The 2 bytes that you push to the stack + 1 for the next instruction
65+
pc_increment: 3,
66+
})
67+
}
68+
3369
// PUSH0
3470
pub fn op_push0(&mut self) -> Result<OpcodeResult, VMError> {
3571
// [EIP-3855] - PUSH0 is only available from SHANGHAI
@@ -61,3 +97,28 @@ fn read_bytcode_slice(current_call_frame: &CallFrame, n_bytes: usize) -> Result<
6197
.get(pc_offset..pc_offset.checked_add(n_bytes).ok_or(OutOfBounds)?)
6298
.unwrap_or_default())
6399
}
100+
101+
// Like `read_bytcode_slice` but using a const generic and returning a fixed size array.
102+
fn read_bytcode_slice_const<const N: usize>(
103+
current_call_frame: &CallFrame,
104+
) -> Result<[u8; N], VMError> {
105+
let current_pc = current_call_frame.pc;
106+
let pc_offset = current_pc
107+
// Add 1 to the PC because we don't want to include the
108+
// Bytecode of the current instruction in the data we're about
109+
// to read. We only want to read the data _NEXT_ to that
110+
// bytecode
111+
.checked_add(1)
112+
.ok_or(InternalError::Overflow)?;
113+
114+
if let Some(slice) = current_call_frame
115+
.bytecode
116+
.get(pc_offset..pc_offset.checked_add(N).ok_or(OutOfBounds)?)
117+
{
118+
Ok(slice
119+
.try_into()
120+
.map_err(|_| VMError::Internal(InternalError::TypeConversion))?)
121+
} else {
122+
Ok([0; N])
123+
}
124+
}

0 commit comments

Comments
 (0)