Skip to content

Commit cb651f2

Browse files
committed
update codemmlu leaderboard
1 parent 957ef04 commit cb651f2

File tree

1 file changed

+227
-58
lines changed

1 file changed

+227
-58
lines changed

leaderboards/codemmlu/results.json

Lines changed: 227 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -4,129 +4,298 @@
44
"open-data": "None",
55
"pass@1": {
66
"instruct": null,
7-
"complete": 41.7
7+
"complete": 38.73
88
},
9-
"prompted": false,
9+
"prompted": true,
1010
"size": 34,
11-
"direct_complete": true,
11+
"direct_complete": false,
1212
"lazy": false,
1313
"elo_mle": 942
1414
},
15-
"CodeLlama-13B-Python": {
16-
"link": "https://huggingface.co/codellama/CodeLlama-13b-hf",
15+
"Meta-Llama-3-70B": {
16+
"link": "https://huggingface.co/meta-llama/Meta-Llama-3-70B",
1717
"open-data": "None",
1818
"pass@1": {
1919
"instruct": null,
20-
"complete": 40.0
20+
"complete": 48.98
2121
},
2222
"prompted": false,
23-
"size": 13,
24-
"direct_complete": true,
23+
"size": 70,
24+
"direct_complete": false,
25+
"lazy": false,
26+
"elo_mle": 874
27+
},
28+
"Meta-Llama-3-70B-Instruct": {
29+
"link": "https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct",
30+
"open-data": "None",
31+
"pass@1": {
32+
"instruct": null,
33+
"complete": 62.45
34+
},
35+
"prompted": true,
36+
"size": 70,
37+
"direct_complete": false,
38+
"lazy": false,
39+
"elo_mle": 874
40+
},
41+
"Meta-Llama-3.1-70B-Instruct": {
42+
"link": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
43+
"open-data": "None",
44+
"pass@1": {
45+
"instruct": null,
46+
"complete": 60
47+
},
48+
"prompted": true,
49+
"size": 70,
50+
"direct_complete": false,
2551
"lazy": false,
2652
"elo_mle": 874
2753
},
28-
"CodeQwen1.5-7B": {
29-
"link": "https://huggingface.co/Qwen/CodeQwen1.5-7B",
54+
"Meta-Llama-3.1-70B": {
55+
"link": "https://huggingface.co/meta-llama/Llama-3.1-70B",
3056
"open-data": "None",
3157
"pass@1": {
3258
"instruct": null,
33-
"complete": 31.8
59+
"complete": 37.56
3460
},
3561
"prompted": false,
62+
"size": 70,
63+
"direct_complete": false,
64+
"lazy": false,
65+
"elo_mle": 874
66+
},
67+
"Mistral-7B-Instruct-v0.3": {
68+
"link": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3",
69+
"open-data": "None",
70+
"pass@1": {
71+
"instruct": null,
72+
"complete": 43.33
73+
},
74+
"prompted": true,
3675
"size": 7,
37-
"direct_complete": true,
76+
"direct_complete": false,
3877
"lazy": false,
39-
"elo_mle": 1056
78+
"elo_mle": 874
4079
},
41-
"DeepSeek-Coder-33B-Base": {
42-
"link": "https://huggingface.co/deepseek-ai/deepseek-coder-33b-base",
80+
"Mixtral-8x7B-Instruct-v0.1": {
81+
"link": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1",
4382
"open-data": "None",
4483
"pass@1": {
4584
"instruct": null,
46-
"complete": 33.5
85+
"complete": 42.96
4786
},
48-
"prompted": false,
49-
"size": 33,
50-
"direct_complete": true,
87+
"prompted": true,
88+
"size": 7,
89+
"direct_complete": false,
90+
"lazy": false,
91+
"elo_mle": 874
92+
},
93+
"Codestral-22B-v0.1": {
94+
"link": "https://huggingface.co/mistralai/Codestral-22B-v0.1",
95+
"open-data": "None",
96+
"pass@1": {
97+
"instruct": null,
98+
"complete": 47.6
99+
},
100+
"prompted": true,
101+
"size": 22,
102+
"direct_complete": false,
51103
"lazy": false,
52-
"elo_mle": 1064
104+
"elo_mle": 874
53105
},
54-
"StarCoder2-15B": {
55-
"link": "https://huggingface.co/bigcode/starcoder2-15b",
56-
"open-data": "Full",
106+
"Phi-3-medium-128k-instruct": {
107+
"link": "https://huggingface.co/microsoft/Phi-3-medium-128k-instruct",
108+
"open-data": "None",
57109
"pass@1": {
58110
"instruct": null,
59-
"complete": 28.2
111+
"complete": 48.03
60112
},
61-
"prompted": false,
62-
"size": 15,
63-
"direct_complete": true,
113+
"prompted": true,
114+
"size": 14,
115+
"direct_complete": false,
64116
"lazy": false,
65-
"elo_mle": 960
117+
"elo_mle": 874
66118
},
67-
"DeepSeek-Coder-6.7B-Base": {
68-
"link": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base",
119+
"Phi-3-mini-128k-instruct": {
120+
"link": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct",
69121
"open-data": "None",
70122
"pass@1": {
71123
"instruct": null,
72-
"complete": 28.4
124+
"complete": 37.93
73125
},
74-
"prompted": false,
75-
"size": 6.7,
76-
"direct_complete": true,
126+
"prompted": true,
127+
"size": 3.8,
128+
"direct_complete": false,
77129
"lazy": false,
78-
"elo_mle": 1002
130+
"elo_mle": 874
79131
},
80-
"DeepSeek-Coder-33B-Instruct": {
81-
"link": "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
132+
"Qwen2-57B-A14B-Instruct": {
133+
"link": "https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct",
82134
"open-data": "None",
83135
"pass@1": {
84136
"instruct": null,
85-
"complete": 33.5
137+
"complete": 46.34
86138
},
87139
"prompted": true,
88-
"size": 33,
140+
"size": 57,
89141
"direct_complete": false,
90142
"lazy": false,
91-
"elo_mle": 1129
143+
"elo_mle": 874
92144
},
93-
"Yi-1.5-34B": {
94-
"link": "https://huggingface.co/01-ai/Yi-1.5-34B",
145+
"CodeQwen1.5-7B-Chat": {
146+
"link": "https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat",
95147
"open-data": "None",
96148
"pass@1": {
97149
"instruct": null,
98-
"complete": 34.9
150+
"complete": 49.82
99151
},
100-
"prompted": false,
152+
"prompted": true,
153+
"size": 7,
154+
"direct_complete": false,
155+
"lazy": false,
156+
"elo_mle": 874
157+
},
158+
"Yi-1.5-34B-Chat": {
159+
"link": "https://huggingface.co/01-ai/Yi-1.5-34B-Chat",
160+
"open-data": "None",
161+
"pass@1": {
162+
"instruct": null,
163+
"complete": 49.39
164+
},
165+
"prompted": true,
101166
"size": 34,
102-
"direct_complete": true,
167+
"direct_complete": false,
168+
"lazy": false,
169+
"elo_mle": 874
170+
},
171+
"Yi-1.5-9B-Chat": {
172+
"link": "https://huggingface.co/01-ai/Yi-1.5-9B-Chat",
173+
"open-data": "None",
174+
"pass@1": {
175+
"instruct": null,
176+
"complete": 47.23
177+
},
178+
"prompted": true,
179+
"size": 9,
180+
"direct_complete": false,
181+
"lazy": false,
182+
"elo_mle": 874
183+
},
184+
"DeepSeek-coder-7b-instruct-v1.5": {
185+
"link": "https://huggingface.co/deepseek-ai/deepseek-coder-7b-instruct-v1.5",
186+
"open-data": "None",
187+
"pass@1": {
188+
"instruct": null,
189+
"complete": 41.21
190+
},
191+
"prompted": true,
192+
"size": 7,
193+
"direct_complete": false,
103194
"lazy": false,
104-
"elo_mle": 978
195+
"elo_mle": 874
105196
},
106-
"OpenCodeInterpreter-DS-33B": {
107-
"link": "https://huggingface.co/m-a-p/OpenCodeInterpreter-DS-33B",
108-
"open-data": "Partial",
197+
"DeepSeek-coder-33b-instruct": {
198+
"link": "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
199+
"open-data": "None",
109200
"pass@1": {
110201
"instruct": null,
111-
"complete": 31.0
202+
"complete": 36.6
112203
},
113204
"prompted": true,
114205
"size": 33,
115-
"direct_complete": true,
206+
"direct_complete": false,
116207
"lazy": false,
117-
"elo_mle": 1131
208+
"elo_mle": 874
118209
},
119-
"To be updated": {
120-
"link": "",
210+
"DeepSeek-moe-16b-chat": {
211+
"link": "https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat",
121212
"open-data": "None",
122213
"pass@1": {
123214
"instruct": null,
124-
"complete": 0
215+
"complete": 31.01
125216
},
126-
"prompted": false,
217+
"prompted": true,
218+
"size": 16.4,
219+
"direct_complete": false,
220+
"lazy": false,
221+
"elo_mle": 874
222+
},
223+
"DeepSeek-Coder-V2-Lite-Instruct": {
224+
"link": "https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
225+
"open-data": "None",
226+
"pass@1": {
227+
"instruct": null,
228+
"complete": 46.51
229+
},
230+
"prompted": true,
231+
"size": 16,
232+
"direct_complete": false,
233+
"lazy": false,
234+
"elo_mle": 874
235+
},
236+
"InternLM2-5-20b-chat": {
237+
"link": "https://huggingface.co/internlm/internlm2_5-20b-chat",
238+
"open-data": "None",
239+
"pass@1": {
240+
"instruct": null,
241+
"complete": 44.89
242+
},
243+
"prompted": true,
244+
"size": 20,
245+
"direct_complete": false,
246+
"lazy": false,
247+
"elo_mle": 874
248+
},
249+
"StarCoder2-15b-instruct-v0.1": {
250+
"link": "https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1",
251+
"open-data": "None",
252+
"pass@1": {
253+
"instruct": null,
254+
"complete": 47.94
255+
},
256+
"prompted": true,
127257
"size": 15,
128-
"direct_complete": true,
258+
"direct_complete": false,
259+
"lazy": false,
260+
"elo_mle": 874
261+
},
262+
"Claude-3-sonnet@20240229": {
263+
"link": "",
264+
"open-data": "None",
265+
"pass@1": {
266+
"instruct": null,
267+
"complete": 53.97
268+
},
269+
"prompted": true,
270+
"size": "None",
271+
"direct_complete": false,
272+
"lazy": false,
273+
"elo_mle": 874
274+
},
275+
"GPT-4o-2024-05-13": {
276+
"link": "",
277+
"open-data": "None",
278+
"pass@1": {
279+
"instruct": null,
280+
"complete": 67
281+
},
282+
"prompted": true,
283+
"size": "None",
284+
"direct_complete": false,
285+
"lazy": false,
286+
"elo_mle": 874
287+
},
288+
"GPT-3.5-turbo-0613": {
289+
"link": "",
290+
"open-data": "None",
291+
"pass@1": {
292+
"instruct": null,
293+
"complete": 51.7
294+
},
295+
"prompted": true,
296+
"size": "None",
297+
"direct_complete": false,
129298
"lazy": false,
130-
"elo_mle": 960
299+
"elo_mle": 874
131300
}
132301
}

0 commit comments

Comments
 (0)