Merge pull request #244 from trishullab/george

GeorgeTsoukalas · web-flow · commit a43321fef415 · 2024-11-07T11:13:32.000-06:00
Update miscellaneous data
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# PutnamBench
+# [PutnamBench](https://arxiv.org/abs/2407.11214)
 
 <p align="center">
     <a href="https://trishullab.github.io/PutnamBench/leaderboard.html"><img src="https://img.shields.io/badge/%F0%9F%8F%86-leaderboard-%23ff8811"></a>
@@ -32,9 +32,9 @@ We also report the number of problems in a certain category. Note that some prob
 | Number Theory    | 108            | 
 | Geometry         | 69             |
 | Linear Algebra   | 51             | 
-| Abstract Algebra | 28             |  
 | Combinatorics    | 29             | 
-| Probability      | 10              | 
+| Abstract Algebra | 28             |  
+| Probability      | 10             | 
 | Set Theory       | 8              | 
 
 ## Citation
diff --git a/docs/index.html b/docs/index.html
@@ -176,8 +176,8 @@ <h2 class="title is-3">Abstract</h2>
             We present PutnamBench, a new multilingual benchmark for evaluating the ability of neural theorem-provers to solve competition mathematics problems.    
            </p> 
           <p>
-            PutnamBench consists of 1697 hand-constructed formalizations of problems sourced from the William Lowell Putnam Mathematical Competition, the premier undergraduate-level mathematics competition in North America. 
-            All the theorems, of which there are 640, have formalizations in Lean 4 and Isabelle; a substantial subset also has Coq formalizations.
+            PutnamBench consists of 1696 hand-constructed formalizations of problems sourced from the William Lowell Putnam Mathematical Competition, the premier undergraduate-level mathematics competition in North America. 
+            There are 644 problems formalized in Lean 4, 640 formalized in Isabelle, and 412 formalized in Coq.
           </p>
           <p>
             Proving the theorems requires significant problem-solving ability and proficiency in a broad range of topics taught in undergraduate mathematics courses. We use PutnamBench to evaluate several established neural and symbolic theorem-provers. 
diff --git a/docs/leaderboard.html b/docs/leaderboard.html
@@ -134,7 +134,7 @@ <h3 class="fw-light text-nowrap">
       <!-- <div id="chart" style="width: 100%; height: 600px"></div> -->
       <div class="container-fluid d-flex flex-row flex-nowrap">
         <div class="container-fluid d-flex flex-column align-items-center">
-          <label for="lean" style="font-size: 14px;" class="text-success mb-3">Lean (out of 640)</label>
+          <label for="lean" style="font-size: 14px;" class="text-success mb-3">Lean (out of 644)</label>
           <table
             id="lean"
             class="table table-responsive table-striped table-bordered flex-shrink-1 border border-success border-3"
@@ -148,7 +148,7 @@ <h3 class="fw-light text-nowrap">
           ></table>
         </div>
                 <div class="container-fluid d-flex flex-column align-items-center">
-          <label for="coq" style="font-size: 14px;" class="text-success mb-3">Coq (out of 417)</label>
+          <label for="coq" style="font-size: 14px;" class="text-success mb-3">Coq (out of 412)</label>
           <table
             id="coq"
             class="table table-responsive table-striped table-bordered flex-shrink-1 border border-danger border-3"
diff --git a/docs/results.json b/docs/results.json
@@ -7,7 +7,7 @@
       },
       "size": 0,
       "condensed-compute-budget": "pass@10",
-      "note": "pass@10 w/ T=0.1"
+      "note": "pass@10 w/ T=0.1; 1971 B1, 1986 B1, 1995 B1, 2001 A1."
     },
     "GPT-4o": {
       "link": "https://openai.com/index/hello-gpt-4o/",
@@ -19,7 +19,7 @@
       },
       "size": 0,
       "condensed-compute-budget": "pass@10",
-      "note": "pass@10 w/ T=0.7"
+      "note": "pass@10 w/ T=0.7; 1988 B1."
     },
     "COPRA (GPT-4o)": {
       "link": "https://arxiv.org/abs/2310.04353",
@@ -30,7 +30,7 @@
       },
       "size": 0,
       "condensed-compute-budget": "pass@1",
-      "note": "pass@1 w/ T=0.0"
+      "note": "pass@1 w/ T=0.0; 1988 B1."
     },
     "CoqHammer": {
       "link": "https://coqhammer.github.io/",
@@ -50,7 +50,7 @@
       },
       "size": 0,
       "condensed-compute-budget": "pass@1",
-      "note": "pass@1 w/ t=120s"
+      "note": "pass@1 w/ t=120s; 1971 B1, 2001 A1, 2012 A2."
     },
     "ReProver w/ retrieval": {
       "link": "https://arxiv.org/abs/2306.15626",
@@ -100,6 +100,16 @@
       },
       "size": 7,
       "condensed-compute-budget": "pass@596",
-      "note": "7 hour cumulative online proof search on 256 GPUs"
+      "note": "7 hour cumulative online proof search on 256 GPUs; 1977 A3, 1986 A1, 1986 A2, 1986 B1, 1988 B1, 1988 B2, 2001 A1. Commit: 8aaccf8"
+    },
+    "InternLM2.5-StepProver": {
+      "link": "https://arxiv.org/pdf/2410.15700",
+      "open-data": "FULL",
+      "num-solved": {
+        "lean-wsolution": 6
+      },
+      "size": 8.8,
+      "condensed-compute-budget": "pass@2x32x600",
+      "note": "pass @ 2 x 32 states x 600 state expansions proof search; 1977 A3, 1986 A1, 1986 B1, 1988 B1, 1988 B2, 2001 A1. Commit: b377431"
     }
 }