Skip to content

Commit bca73af

Browse files
edg-lJereSalo
and authored
perf(levm): use specialized PUSH1 and PUSH2 implementations (#3262)
**Motivation** According to stats from @azteca1998 PUSH2 and PUSH1 are widely used: ``` Loaded 903636264 rows (3447.10MiB) Stats (of 903636264 records): 0xf1: count= 730979 t_min= 2278 t_max=1512728 t_avg=110877.43 t_acc=81049072024 CALL 0x61: count=131856777 t_min= 136 t_max= 549032 t_avg= 189.29 t_acc=24959614846 PUSH2 0x56: count= 78745029 t_min= 170 t_max=1488792 t_avg= 243.75 t_acc=19194034756 JUMP 0x60: count= 86327863 t_min= 136 t_max= 837080 t_avg= 199.78 t_acc=17246262544 PUSH1 0x5b: count=107216057 t_min= 102 t_max= 267308 t_avg= 159.43 t_acc=17093508806 JUMPDEST 0x50: count= 86546732 t_min= 102 t_max= 353260 t_avg= 174.49 t_acc=15101132640 POP 0x57: count= 53096953 t_min= 102 t_max=1382576 t_avg= 233.40 t_acc=12393069292 JUMPI 0x81: count= 55585321 t_min= 102 t_max= 267410 t_avg= 192.79 t_acc=10716509980 DUP2 0x01: count= 56493418 t_min= 102 t_max=1431060 t_avg= 189.52 t_acc=10706399944 ADD 0x91: count= 31380921 t_min= 102 t_max= 146030 t_avg= 205.38 t_acc= 6444862520 SWAP2 ``` Furthermore i keep seeing `U256::from_big_endian` taking quite some time on samply so I made specialized PUSH1 and PUSH2 implementations that avoid that, also using fixed size arrays. 
Benchmarks: Hoodi 11k: main 9m10.471s pr 8m25.933s **Description** <!-- A clear and concise general description of the changes this PR introduces --> <!-- Link to issues: Resolves #111, Resolves #222 --> Closes #issue_number # Benchmark Results Comparison #### Benchmark Results: Factorial | Command | Mean [ms] | Min [ms] | Max [ms] | Relative | |:---|---:|---:|---:|---:| | `levm_Factorial_pr` | 634.2 ± 7.3 | 629.6 | 654.2 | 2.71 ± 0.04 | | `levm_Factorial` | 726.1 ± 5.2 | 722.5 | 740.1 | 3.11 ± 0.03 | | `levm_FactorialRecursive_pr` | 3.567 ± 0.021 | 3.541 | 3.604 | 2.22 ± 0.05 | | `levm_FactorialRecursive` | 3.828 ± 0.035 | 3.775 | 3.889 | 2.39 ± 0.03 | | `levm_Fibonacci_pr` | 629.2 ± 6.4 | 625.7 | 646.9 | 2.99 ± 0.03 | | `levm_Fibonacci` | 727.7 ± 6.5 | 722.3 | 743.9 | 3.47 ± 0.03 | | `levm_ManyHashes_pr` | 14.9 ± 0.2 | 14.7 | 15.3 | 1.70 ± 0.03 | | `levm_ManyHashes` | 16.3 ± 0.1 | 16.2 | 16.4 | 1.87 ± 0.02 | | `levm_BubbleSort_pr` | 5.065 ± 0.023 | 5.034 | 5.107 | 1.58 ± 0.01 | | `levm_BubbleSort` | 5.508 ± 0.035 | 5.489 | 5.603 | 1.71 ± 0.02 | | `levm_ERC20Transfer_pr` | 461.5 ± 1.3 | 459.7 | 463.4 | 1.87 ± 0.03 | | `levm_ERC20Transfer` | 487.9 ± 2.4 | 484.1 | 491.0 | 1.99 ± 0.01 | | `levm_ERC20Mint_pr` | 306.8 ± 8.9 | 300.1 | 328.5 | 2.22 ± 0.07 | | `levm_ERC20Mint` | 320.1 ± 1.5 | 317.9 | 322.6 | 2.31 ± 0.05 | | `levm_ERC20Approval_pr` | 1.779 ± 0.023 | 1.763 | 1.838 | 1.69 ± 0.02 | | `levm_ERC20Approval` | 1.850 ± 0.011 | 1.837 | 1.873 | 1.76 ± 0.02 | ![image](https://github.com/user-attachments/assets/8f08cb93-ac5d-4909-a15d-cf799f1ce023) According to the samply this makes op_push nearly negligible (from 30% to 0%) --------- Co-authored-by: Jeremías Salomón <[email protected]>
1 parent 939e95f commit bca73af

File tree

4 files changed

+71
-4
lines changed

4 files changed

+71
-4
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
## Perf
44

5+
### 2025-06-23
6+
7+
- Use specialized PUSH1 and PUSH2 implementations [#3262](https://github.com/lambdaclass/ethrex/pull/3262)
8+
59
### 2025-05-27
610

711
- Improved the performance of shift instructions. [2933](https://github.com/lambdaclass/ethrex/pull/2933)

crates/vm/levm/src/execution_handlers.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,10 @@ impl<'a> VM<'a> {
7575
Opcode::BLOBHASH => self.op_blobhash(),
7676
Opcode::BLOBBASEFEE => self.op_blobbasefee(),
7777
Opcode::PUSH0 => self.op_push0(),
78+
Opcode::PUSH1 => self.op_push1(),
79+
Opcode::PUSH2 => self.op_push2(),
7880
// PUSHn
79-
op if (Opcode::PUSH1..=Opcode::PUSH32).contains(&op) => {
81+
op if (Opcode::PUSH3..=Opcode::PUSH32).contains(&op) => {
8082
let n_bytes = get_n_value(op, Opcode::PUSH1)?;
8183
self.op_push(n_bytes)
8284
}

crates/vm/levm/src/opcode_handlers/bitwise_comparison.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,6 @@ pub fn checked_shift_left(value: U256, shift: U256) -> Result<U256, VMError> {
266266
Ok(result)
267267
}
268268

269-
fn u256_from_bool(value: bool) -> U256 {
270-
U256::from(u8::from(value))
269+
const fn u256_from_bool(value: bool) -> U256 {
270+
if value { U256::one() } else { U256::zero() }
271271
}

crates/vm/levm/src/opcode_handlers/push.rs

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use ethrex_common::{U256, types::Fork};
1111
// Opcodes: PUSH0, PUSH1 ... PUSH32
1212

1313
impl<'a> VM<'a> {
14-
// PUSH operation
14+
// Generic PUSH operation
1515
pub fn op_push(&mut self, n_bytes: usize) -> Result<OpcodeResult, VMError> {
1616
let current_call_frame = self.current_call_frame_mut()?;
1717
current_call_frame.increase_consumed_gas(gas_cost::PUSHN)?;
@@ -30,6 +30,42 @@ impl<'a> VM<'a> {
3030
})
3131
}
3232

33+
/// Specialized PUSH1 operation
34+
///
35+
/// We use specialized push1 and push2 implementations because they are way more frequent than the others,
36+
/// so their impact on performance is significant.
37+
/// These implementations allow using U256::from, which is considerably more performant than U256::from_big_endian
38+
pub fn op_push1(&mut self) -> Result<OpcodeResult, VMError> {
39+
let current_call_frame = self.current_call_frame_mut()?;
40+
current_call_frame.increase_consumed_gas(gas_cost::PUSHN)?;
41+
42+
let value = read_bytcode_slice_const::<1>(current_call_frame)?[0];
43+
44+
current_call_frame.stack.push(U256::from(value))?;
45+
46+
Ok(OpcodeResult::Continue {
47+
// The 1 byte that you push to the stack + 1 for the next instruction
48+
pc_increment: 2,
49+
})
50+
}
51+
52+
// Specialized PUSH2 operation
53+
pub fn op_push2(&mut self) -> Result<OpcodeResult, VMError> {
54+
let current_call_frame = self.current_call_frame_mut()?;
55+
current_call_frame.increase_consumed_gas(gas_cost::PUSHN)?;
56+
57+
let read_n_bytes = read_bytcode_slice_const::<2>(current_call_frame)?;
58+
59+
let value = u16::from_be_bytes(read_n_bytes);
60+
61+
current_call_frame.stack.push(U256::from(value))?;
62+
63+
Ok(OpcodeResult::Continue {
64+
// The 2 bytes that you push to the stack + 1 for the next instruction
65+
pc_increment: 3,
66+
})
67+
}
68+
3369
// PUSH0
3470
pub fn op_push0(&mut self) -> Result<OpcodeResult, VMError> {
3571
// [EIP-3855] - PUSH0 is only available from SHANGHAI
@@ -61,3 +97,28 @@ fn read_bytcode_slice(current_call_frame: &CallFrame, n_bytes: usize) -> Result<
6197
.get(pc_offset..pc_offset.checked_add(n_bytes).ok_or(OutOfBounds)?)
6298
.unwrap_or_default())
6399
}
100+
101+
// Like `read_bytcode_slice` but using a const generic and returning a fixed size array.
102+
fn read_bytcode_slice_const<const N: usize>(
103+
current_call_frame: &CallFrame,
104+
) -> Result<[u8; N], VMError> {
105+
let current_pc = current_call_frame.pc;
106+
let pc_offset = current_pc
107+
// Add 1 to the PC because we don't want to include the
108+
// Bytecode of the current instruction in the data we're about
109+
// to read. We only want to read the data _NEXT_ to that
110+
// bytecode
111+
.checked_add(1)
112+
.ok_or(InternalError::Overflow)?;
113+
114+
if let Some(slice) = current_call_frame
115+
.bytecode
116+
.get(pc_offset..pc_offset.checked_add(N).ok_or(OutOfBounds)?)
117+
{
118+
Ok(slice
119+
.try_into()
120+
.map_err(|_| VMError::Internal(InternalError::TypeConversion))?)
121+
} else {
122+
Ok([0; N])
123+
}
124+
}

0 commit comments

Comments
 (0)