apache · ding-young · Jul 18, 2025 · Jul 22, 2025 · Jul 23, 2025 · Jul 25, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
@@ -33,18 +33,21 @@ workspace = true
 ci = []
 default = ["mimalloc"]
 snmalloc = ["snmalloc-rs"]
+mimalloc_extended = ["libmimalloc-sys/extended"]
 
 [dependencies]
 arrow = { workspace = true }
 datafusion = { workspace = true, default-features = true }
 datafusion-common = { workspace = true, default-features = true }
 env_logger = { workspace = true }
 futures = { workspace = true }
+libmimalloc-sys = { version = "0.1", optional = true }
 log = { workspace = true }
 mimalloc = { version = "0.1", optional = true, default-features = false }
 object_store = { workspace = true }
 parquet = { workspace = true, default-features = true }
 rand = { workspace = true }
+regex.workspace = true
 serde = { version = "1.0.219", features = ["derive"] }
 serde_json = { workspace = true }
 snmalloc-rs = { version = "0.3", optional = true }

diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -283,6 +283,7 @@ This will produce output like:
 └──────────────┴──────────────┴──────────────┴───────────────┘
 ```
 
+
 # Benchmark Runner
 
 The `dfbench` program contains subcommands to run the various
@@ -321,6 +322,66 @@ FLAGS:
 ...
 ```
 
+# Profiling Memory Stats for each benchmark query
+The `mem_profile` program wraps benchmark execution to measure memory usage statistics, such as peak RSS. It runs each benchmark query in a separate subprocess, capturing the child process’s stdout to print structured output.
+
+The subcommands supported by `mem_profile` are a subset of those in `dfbench`.
+Currently supported benchmarks include: Clickbench, H2o, Imdb, SortTpch, and Tpch.
+
+Before running benchmarks, `mem_profile` automatically compiles the benchmark binary (`dfbench`) using `cargo build`. Note that the build profile used for `dfbench` is not tied to the profile used for running `mem_profile` itself. You can explicitly specify the desired build profile for `dfbench` using the `--bench-profile` option (e.g. `release-nonlto`). By prebuilding the binary and running each query in a separate process, `mem_profile` ensures accurate memory statistics.
+
+Currently, `mem_profile` only supports `mimalloc` as the memory allocator, since it relies on `mimalloc`'s API to collect memory statistics.
+
+Because it runs the compiled binary directly from the target directory, make sure your working directory is the top-level `datafusion/` directory, where the `target/` directory is also located.
+
+The benchmark subcommand (e.g., `tpch`) and all following arguments are passed directly to `dfbench`. Be sure to specify `--bench-profile` before the benchmark subcommand. 
+
+Example: 
+```shell
+datafusion$ cargo run --profile release-nonlto --bin mem_profile -- --bench-profile release-nonlto tpch --path benchmarks/data/tpch_sf1 --partitions 4 --format parquet
+```
+Example Output:
+```
+Query     Time (ms)     Peak RSS  Peak Commit  Major Page Faults
+----------------------------------------------------------------
+1            503.42     283.4 MB       3.0 GB                  0
+2            431.09     240.7 MB       3.0 GB                  0
+3            594.28     350.1 MB       3.0 GB                  0
+4            468.90     462.4 MB       3.0 GB                  0
+5            653.58     385.4 MB       3.0 GB                  0
+6            296.79     247.3 MB       2.0 GB                  0
+7            662.32     652.4 MB       3.0 GB                  0
+8            702.48     396.0 MB       3.0 GB                  0
+9            774.21     611.5 MB       3.0 GB                  0
+10           733.62     397.9 MB       3.0 GB                  0
+11           271.71     209.6 MB       3.0 GB                  0
+12           512.60     212.5 MB       2.0 GB                  0
+13           507.83     381.5 MB       2.0 GB                  0
+14           420.89     313.5 MB       3.0 GB                  0
+15           539.97     288.0 MB       2.0 GB                  0
+16           370.91     229.8 MB       3.0 GB                  0
+17           758.33     467.0 MB       2.0 GB                  0
+18          1112.32     638.9 MB       3.0 GB                  0
+19           712.72     280.9 MB       2.0 GB                  0
+20           620.64     402.9 MB       2.9 GB                  0
+21           971.63     388.9 MB       2.9 GB                  0
+22           404.50     164.8 MB       2.0 GB                  0
+```
+
+## Reported Metrics
+When running benchmarks, `mem_profile` collects several memory-related statistics using the mimalloc API:
+
+- Peak RSS (Resident Set Size): 
+The maximum amount of physical memory used by the process.
+This is a process-level metric collected via OS-specific mechanisms and is not mimalloc-specific.
+
+- Peak Commit:
+The peak amount of memory committed by the allocator (i.e., virtual memory backed by physical memory or swap, as opposed to address space that is merely reserved).
+This is mimalloc-specific. It gives a more allocator-aware view of memory usage than RSS.
+
+- Major Page Faults:
+The number of major page faults triggered during execution.
+This metric is obtained from the operating system and is not mimalloc-specific.
 # Writing a new benchmark
 
 ## Creating or downloading data outside of the benchmark