📉 Add SequenceSet benchmarks

nevans · nevans · commit 5f2a563ff166 · 2025-06-18T10:09:54.000-04:00
This adds a simple ruby script for measuring `ObjectSpace.memsize_of`,
and several benchmark-driver scripts for:
* `SequenceSet.new` (indirectly via `::[]`)
* `SequenceSet#slice` (aka `#[]`)
* Various set ops: `&`, `|`, `-`, `^`, `~`
* Various set predicates: `#intersect?`, `#disjoint?`, `#cover?`
* Several alternate implementations of:
  * AND — `#&` and `#intersect!`
  * NOT — `#~` and `#complement!`
  * XOR — `#^` and `#xor!`
diff --git a/benchmarks/seqset-memsize.rb b/benchmarks/seqset-memsize.rb
@@ -0,0 +1,40 @@
+# frozen_string_literal: true
+
+$LOAD_PATH.unshift "./lib"
+require "net/imap"
+require "objspace"
+
+def seqset(n, min: 1, max: (n * 1.25).to_i)
+  inputs = Array.new(n) { rand(min..max) }
+  Net::IMAP::SequenceSet[inputs]
+end
+
+def obj_tree(obj, seen: Set.new)
+  seen << obj
+  children = ObjectSpace.reachable_objects_from(obj)
+    .reject { _1 in Module or seen.include?(_1) }
+    .flat_map { obj_tree(_1, seen:) }
+  [obj, *children]
+end
+
+def memsize(obj) = obj_tree(obj).sum { ObjectSpace.memsize_of _1 }
+
+def avg(ary) = ary.sum / ary.count.to_f
+
+def print_avg(n, count: 10, **)
+  print "Average memsize of SequenceSet with %6d inputs: " % [n]
+  sizes = Array.new(count) {
+    print "."
+    memsize seqset(n, **)
+  }
+  puts "%9.1f" % [avg(sizes)]
+end
+
+# pp obj_tree(seqset(200, min: 1_000_000, max: 1_000_999)).to_h { [_1, memsize(_1)] }
+print_avg   1
+print_avg  10
+print_avg 100
+
+print_avg   1_000
+print_avg  10_000
+print_avg 100_000
diff --git a/benchmarks/sequence_set-and.yml b/benchmarks/sequence_set-and.yml
@@ -0,0 +1,76 @@
+---
+prelude: |
+  require "yaml"
+  require "net/imap"
+
+  INPUT_COUNT = Integer ENV.fetch("PROFILE_INPUT_COUNT", 1000)
+  MAX_INPUT   = Integer ENV.fetch("PROFILE_MAX_INPUT",   1400)
+  WARMUP_RUNS = Integer ENV.fetch("PROFILE_WARMUP_RUNS",  200)
+
+  SETS = Array.new(1000) {
+    Net::IMAP::SequenceSet[Array.new(INPUT_COUNT) { rand(1..MAX_INPUT) }]
+  }
+
+  # Returns a random pair from SETS. The left-hand set is a dup, so the
+  # in-place variants (and0!, and1!, and2!) called on it leave SETS untouched.
+  def sets
+    lhs, rhs = SETS.sample(2)
+    return lhs.dup, rhs
+  end
+
+  # Reopens SequenceSet to add three alternate intersection implementations
+  # (and0, and1, and2), each with an in-place "!" variant, so the benchmark
+  # entries below can compare them with the built-in operators.
+  class Net::IMAP
+    class SequenceSet
+      # Non-mutating wrappers: each runs the matching "!" variant on a dup.
+      # NOTE(review): remain_frozen is defined by the library, not here;
+      # presumably it re-freezes the result when the receiver is frozen.
+      def and0(other) remain_frozen dup.and0! other end
+      def and1(other) remain_frozen dup.and1! other end
+      def and2(other) remain_frozen dup.and2! other end
+
+      # L - ~R
+      def and0!(other)
+        modifying!
+        subtract SequenceSet.new(other).complement!
+      end
+
+      # L - (L - R)
+      def and1!(other)
+        modifying!
+        subtract dup.subtract(SequenceSet.new(other))
+      end
+
+      # TODO: add this as a public method
+      # Computes L ^ R in place as (L | R) - (R - ~L). `other` is rebound to
+      # SequenceSet.new(other) before being subtracted from, and the
+      # complement is taken on a dup of the receiver.
+      def xor!(other) # :nodoc:
+        modifying!
+        copy  = dup
+        other = SequenceSet.new(other)
+        merge(other).subtract(other.subtract(copy.complement!))
+      end
+
+      # L - (L ^ R)
+      # The xor is computed on SequenceSet.new(other), with self as the
+      # argument, before being subtracted from self.
+      def and2!(other)
+        modifying!
+        subtract SequenceSet.new(other).xor! self
+      end
+    end
+  end
+
+  # warmup (esp. for JIT)
+  # Each run takes a fresh random pair and calls the built-in operators plus
+  # and0/and1/and2, which in turn call their "!" counterparts on a dup.
+  WARMUP_RUNS.times do
+    lhs, rhs = sets
+    lhs | rhs
+    lhs & rhs
+    lhs - rhs
+    lhs ^ rhs
+    ~lhs
+    lhs.and0 rhs
+    lhs.and1 rhs
+    lhs.and2 rhs
+  end
+
+benchmark:
+  # Each label shows the formula being measured. Rows prefixed and0/and1/and2
+  # call the helper methods defined in the prelude; unprefixed rows evaluate
+  # the same formula with the built-in operators. Every row starts by drawing
+  # a fresh pair via `sets`.
+  "      L & R":       l, r = sets; l & r
+  "      L - ~R":      l, r = sets; l - ~r
+  "and0  L - ~R":      l, r = sets; l.and0  r
+  "and0! L - ~R":      l, r = sets; l.and0! r
+  "      L - (L - R)": l, r = sets; l - (l - r)
+  "and1  L - (L - R)": l, r = sets; l.and1  r
+  "and1! L - (L - R)": l, r = sets; l.and1! r
+  "      L - (L ^ R)": l, r = sets; l - (l ^ r)
+  "and2  L - (L ^ R)": l, r = sets; l.and2  r
+  "and2! L - (L ^ R)": l, r = sets; l.and2! r
diff --git a/benchmarks/sequence_set-new.yml b/benchmarks/sequence_set-new.yml
@@ -0,0 +1,80 @@
+---
+prelude: |
+  $LOAD_PATH.unshift "./lib"
+  require "net/imap"
+  SeqSet = Net::IMAP::SequenceSet
+
+  N_RAND = 100
+
+  # Returns `n` random integers drawn independently from `min..max`
+  # (duplicates possible). `min` is now honoured; it was previously accepted
+  # but the lower bound was hard-coded to 1.
+  def rand_nums(n, min: 1, max: (n * 1.25).to_i) = Array.new(n) { rand(min..max) }
+  # Elements of a SeqSet built from rand_nums(...), in shuffled order.
+  def rand_entries(...)
+    SeqSet[rand_nums(...)].elements.shuffle
+  end
+
+  # String form of a SeqSet built from rand_nums(...), with its
+  # comma-separated entries shuffled.
+  def rand_string(...)
+    SeqSet[rand_nums(...)].string.split(",").shuffle.join(",")
+  end
+
+  # Builds `n_rand` shuffled sequence-set strings, each generated from `n`
+  # random numbers. Extra keywords are forwarded to rand_string.
+  def build_string_inputs(n, n_rand, **)
+    Array.new(n_rand) do
+      rand_string(n, **)
+    end
+  end
+
+  # Builds `n_rand` shuffled element lists, each generated from `n` random
+  # numbers. Extra keywords are forwarded to rand_entries.
+  def build_int_inputs(n, n_rand, **)
+    Array.new(n_rand) do
+      rand_entries(n, **)
+    end
+  end
+
+  # Locals shared with the benchmark entries: each entry's prelude assigns
+  # `inputs`, and each script advances `i` to cycle through them.
+  inputs = nil
+  i = 0
+
+  # warm up, especially for YJIT
+  # Builds a SeqSet from integers, then another from the shuffled string form
+  # of the first, so both input kinds are exercised.
+  300.times do
+    ints   = rand_nums(1000)
+    seqset = SeqSet[ints]
+    string = seqset.string.split(?,).shuffle.join(?,)
+    SeqSet[string]
+  end
+
+benchmark:
+  # Each entry builds N_RAND shuffled inputs of the given size in its prelude,
+  # then times SeqSet[...] while cycling through them. The loop count inside
+  # each script shrinks as n grows (10_000 iterations at n=10 down to 10 at
+  # n=10,000).
+
+  - name:    n=10 ints
+    prelude: inputs = build_int_inputs 10, N_RAND
+    script:  10_000.times do SeqSet[inputs[i = (i+1) % N_RAND]] end
+
+  - name:    n=10 string
+    prelude: inputs = build_string_inputs 10, N_RAND
+    script:  10_000.times do SeqSet[inputs[i = (i+1) % N_RAND]] end
+
+  - name:    n=100 ints
+    prelude: inputs = build_int_inputs 100, N_RAND
+    script:  1_000.times do SeqSet[inputs[i = (i+1) % N_RAND]] end
+
+  - name:    n=100 string
+    prelude: inputs = build_string_inputs 100, N_RAND
+    script:  1_000.times do SeqSet[inputs[i = (i+1) % N_RAND]] end
+
+  - name:    n=1000 ints
+    prelude: inputs = build_int_inputs 1000, N_RAND
+    script:  100.times do SeqSet[inputs[i = (i+1) % N_RAND]] end
+
+  - name:    n=1000 string
+    prelude: inputs = build_string_inputs 1000, N_RAND
+    script:  100.times do SeqSet[inputs[i = (i+1) % N_RAND]] end
+
+  - name:    n=10,000 ints
+    prelude: inputs = build_int_inputs 10_000, N_RAND
+    script:  10.times do SeqSet[inputs[i = (i+1) % N_RAND]] end
+
+  - name:    n=10,000 string
+    prelude: inputs = build_string_inputs 10_000, N_RAND
+    script:  10.times do SeqSet[inputs[i = (i+1) % N_RAND]] end
+
+  # Only N_RAND / 2 inputs are built at this size, so the index must wrap at
+  # N_RAND / 2 too; wrapping at N_RAND reads past the end of `inputs` and
+  # passes nil to SeqSet[] on half of the iterations.
+  - name:    n=100,000 ints
+    prelude: inputs = build_int_inputs 100_000, N_RAND / 2
+    script:  SeqSet[inputs[i = (i+1) % (N_RAND / 2)]]
+
+  - name:    n=100,000 string
+    prelude: inputs = build_string_inputs 100_000, N_RAND / 2
+    script:  SeqSet[inputs[i = (i+1) % (N_RAND / 2)]]
+
+#   - name:    n=1,000,000 ints
+#     prelude: inputs = build_int_inputs 1_000_000
+#     script:  SeqSet[inputs[i = (i+1) % N_RAND]]
+
+#   - name:    n=10,000,000 ints
+#     prelude: inputs = build_int_inputs 10_000_000
+#     script:  SeqSet[inputs[i = (i+1) % N_RAND]]
diff --git a/benchmarks/sequence_set-not.yml b/benchmarks/sequence_set-not.yml
@@ -0,0 +1,85 @@
+---
+prelude: |
+  require "yaml"
+  require "net/imap"
+
+  INPUT_COUNT = Integer ENV.fetch("PROFILE_INPUT_COUNT", 1000)
+  MAX_INPUT   = Integer ENV.fetch("PROFILE_MAX_INPUT",   1400)
+  WARMUP_RUNS = Integer ENV.fetch("PROFILE_WARMUP_RUNS",  200)
+
+  SETS = Array.new(1000) {
+    Net::IMAP::SequenceSet[Array.new(INPUT_COUNT) { rand(1..MAX_INPUT) }]
+  }
+
+  # Reopens SequenceSet to add alternate complement implementations, so the
+  # benchmark entries below can compare them with the built-in #~.
+  class Net::IMAP
+    class SequenceSet
+      # Non-mutating wrapper around orig_not!, run on a dup.
+      def orig_not
+        remain_frozen dup.orig_not!
+      end
+
+      # 0.5.8 implementation
+      def orig_not!
+        return replace(self.class.full) if empty?
+        return clear                    if full?
+        flat = @tuples.flat_map { [_1 - 1, _2 + 1] }
+        if flat.first < 1         then flat.shift else flat.unshift 1        end
+        if STAR_INT   < flat.last then flat.pop   else flat.push    STAR_INT end
+        @tuples = flat.each_slice(2).to_a
+        normalize!
+      end
+
+      # enumerator based
+      # Builds the complement in a brand-new SequenceSet.
+      def enum_not
+        result = SequenceSet.new
+        each_tuple_complement do |min, max| result.tuples << [min, max] end
+        remain_frozen result
+      end
+
+      # enumerator based
+      # Non-mutating wrapper around enum_not!, run on a dup.
+      def enum_not_2
+        remain_frozen dup.enum_not!
+      end
+
+      # Replaces the receiver's tuples with their complement and returns self.
+      #
+      # The complement is collected in full before @tuples is touched.
+      # each_tuple_complement reads the current tuples lazily (through
+      # `tuples`, `min` and `max`), so writing each yielded pair back into
+      # @tuples during iteration made later steps read already-replaced
+      # entries and gave a wrong result whenever the set did not start at 1.
+      # NOTE(review): this assumes `tuples`, `min` and `max` read the live
+      # @tuples array, as `result.tuples <<` in enum_not suggests.
+      def enum_not!
+        @tuples = each_tuple_complement.to_a
+        self
+      end
+
+      private
+
+      # Yields the [min, max] bounds of every gap between (and around) the
+      # receiver's tuples, in ascending order; returns an Enumerator when no
+      # block is given.
+      def each_tuple_complement
+        return to_enum(__method__) unless block_given?
+        if    full?  then # no yield
+        elsif empty? then yield 1, STAR_INT
+        else
+          yield 1, min - 1 unless min <= 1
+          tuples.each_cons(2) do |(_, a), (b,_)| yield a+1, b-1 end
+          yield max + 1, STAR_INT unless max == STAR_INT
+        end
+        nil
+      end
+
+    end
+  end
+
+  # warmup (esp. for JIT)
+  # Exercises every variant that appears in the benchmark list, including
+  # enum_not_2, which was previously benchmarked without being warmed up.
+  WARMUP_RUNS.times do
+    ~SETS.sample
+    SETS.sample.orig_not
+    SETS.sample.enum_not
+    SETS.sample.enum_not_2
+    SETS.sample.dup.orig_not!
+    SETS.sample.dup.enum_not!
+  end
+
+benchmark:
+  # "~set" rows call a non-mutating variant on a set sampled from SETS;
+  # "~dup" rows call the in-place variant on a dup of the sampled set.
+  "      ~set":        ~SETS.sample
+  "0.5.8 ~set":         SETS.sample.orig_not
+  "enum  ~set":         SETS.sample.enum_not
+  "enum2 ~set":         SETS.sample.enum_not_2
+  "0.5.8 ~dup":         SETS.sample.dup.orig_not!
+  "enum  ~dup":         SETS.sample.dup.enum_not!
diff --git a/benchmarks/sequence_set-ops.yml b/benchmarks/sequence_set-ops.yml
@@ -0,0 +1,34 @@
+---
+prelude: |
+  require "yaml"
+  require "net/imap"
+
+  INPUT_COUNT = Integer ENV.fetch("PROFILE_INPUT_COUNT", 1000)
+  MAX_INPUT   = Integer ENV.fetch("PROFILE_MAX_INPUT",   1400)
+  WARMUP_RUNS = Integer ENV.fetch("PROFILE_WARMUP_RUNS",  200)
+
+  SETS = Array.new(1000) {
+    Net::IMAP::SequenceSet[Array.new(INPUT_COUNT) { rand(1..MAX_INPUT) }]
+  }
+
+  # Returns a random pair from SETS as [dup of the left set, right set].
+  def sets
+    left, right = SETS.sample(2)
+    return left.dup, right
+  end
+
+  # warmup (esp. for JIT)
+  # Uses WARMUP_RUNS, so PROFILE_WARMUP_RUNS takes effect here as it does in
+  # the other SequenceSet benchmarks; the count was previously hard-coded.
+  WARMUP_RUNS.times do
+    lhs, rhs = sets
+    lhs | rhs
+    lhs & rhs
+    lhs - rhs
+    lhs ^ rhs
+    ~lhs
+  end
+
+benchmark:
+  # One row per built-in set operator; every row draws a fresh pair via
+  # `sets` (complement uses only the left-hand set).
+  union:        l, r = sets; l | r
+  intersection: l, r = sets; l & r
+  difference:   l, r = sets; l - r
+  xor:          l, r = sets; l ^ r
+  complement:   l, _ = sets; ~l
diff --git a/benchmarks/sequence_set-predicates.yml b/benchmarks/sequence_set-predicates.yml
@@ -0,0 +1,32 @@
+---
+prelude: |
+  require "yaml"
+  require "net/imap"
+
+  INPUT_COUNT = Integer ENV.fetch("PROFILE_INPUT_COUNT", 1000)
+  MAX_INPUT   = Integer ENV.fetch("PROFILE_MAX_INPUT",   1400)
+  WARMUP_RUNS = Integer ENV.fetch("PROFILE_WARMUP_RUNS",  200)
+
+  SETS = Array.new(1000) {
+    Net::IMAP::SequenceSet[Array.new(INPUT_COUNT) { rand(1..MAX_INPUT) }]
+  }
+
+  # Returns a random pair from SETS; the first element is a dup of the
+  # sampled left-hand set, the second is the sampled right-hand set itself.
+  def sets
+    first, second = SETS.sample(2)
+    [first.dup, second]
+  end
+
+  # warmup (esp. for JIT)
+  # Warms up the predicates that are actually measured below; the previous
+  # warmup only called the set operators and never the predicates.
+  WARMUP_RUNS.times do
+    lhs, rhs = sets
+    lhs.intersect? rhs
+    lhs.disjoint? rhs
+    lhs.cover? rhs
+  end
+
+benchmark:
+  # Every row draws a fresh pair via `sets`, which dups the left-hand set, so
+  # the cost of that dup is included in each measurement.
+  intersect?:   l, r = sets; l.intersect? r
+  disjoint?:    l, r = sets; l.disjoint? r
+  cover?:       l, r = sets; l.cover? r
diff --git a/benchmarks/sequence_set-slice.yml b/benchmarks/sequence_set-slice.yml
diff --git a/benchmarks/sequence_set-xor.yml b/benchmarks/sequence_set-xor.yml