Spaces:
Running
Running
liaojiajia
committed on
Commit
·
6add05a
1
Parent(s):
cd01d35
updated scores
Browse files
- src/detail_math_score.json +236 -236
- src/detail_results.csv +117 -117
- src/overall_math_score.json +48 -48
- src/overall_results.csv +33 -33
- src/record.csv +33 -33
src/detail_math_score.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"time": "2025-
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"gpt-3.5-turbo": {
|
@@ -2041,43 +2041,43 @@
|
|
2041 |
"Eval Date": "2025/1/7"
|
2042 |
},
|
2043 |
"gsm8k": {
|
2044 |
-
"Score":
|
2045 |
-
"Pass rate": 0.
|
2046 |
-
"Cost($)":
|
2047 |
"Framework": "",
|
2048 |
"X-shot": "8",
|
2049 |
"Samples": 1319,
|
2050 |
-
"All tokens":
|
2051 |
-
"Total input tokens":
|
2052 |
-
"Average input tokens":
|
2053 |
-
"Total output tokens":
|
2054 |
-
"Average output tokens":
|
2055 |
},
|
2056 |
"AQuA": {
|
2057 |
-
"Score": 66
|
2058 |
-
"Pass rate": 0.
|
2059 |
-
"Cost($)": 0.
|
2060 |
"Framework": "",
|
2061 |
"X-shot": "0",
|
2062 |
"Samples": 254,
|
2063 |
-
"All tokens":
|
2064 |
-
"Total input tokens":
|
2065 |
-
"Average input tokens":
|
2066 |
-
"Total output tokens":
|
2067 |
-
"Average output tokens":
|
2068 |
},
|
2069 |
"MATH-500": {
|
2070 |
-
"Score":
|
2071 |
"Pass rate": 1.0,
|
2072 |
-
"Cost($)": 1.
|
2073 |
"Framework": "",
|
2074 |
"X-shot": "4",
|
2075 |
"Samples": 500,
|
2076 |
-
"All tokens":
|
2077 |
-
"Total input tokens":
|
2078 |
-
"Average input tokens":
|
2079 |
-
"Total output tokens":
|
2080 |
-
"Average output tokens":
|
2081 |
}
|
2082 |
},
|
2083 |
"Doubao-lite-32k": {
|
@@ -2087,43 +2087,43 @@
|
|
2087 |
"Eval Date": "2025/1/7"
|
2088 |
},
|
2089 |
"gsm8k": {
|
2090 |
-
"Score":
|
2091 |
"Pass rate": 0.9992,
|
2092 |
-
"Cost($)": 0.
|
2093 |
"Framework": "",
|
2094 |
"X-shot": "8",
|
2095 |
"Samples": 1319,
|
2096 |
-
"All tokens":
|
2097 |
-
"Total input tokens":
|
2098 |
-
"Average input tokens":
|
2099 |
-
"Total output tokens":
|
2100 |
-
"Average output tokens":
|
2101 |
},
|
2102 |
"AQuA": {
|
2103 |
-
"Score":
|
2104 |
-
"Pass rate": 0.
|
2105 |
-
"Cost($)": 0.
|
2106 |
"Framework": "",
|
2107 |
"X-shot": "0",
|
2108 |
"Samples": 254,
|
2109 |
-
"All tokens":
|
2110 |
-
"Total input tokens":
|
2111 |
-
"Average input tokens":
|
2112 |
-
"Total output tokens":
|
2113 |
-
"Average output tokens":
|
2114 |
},
|
2115 |
"MATH-500": {
|
2116 |
-
"Score":
|
2117 |
-
"Pass rate":
|
2118 |
-
"Cost($)": 0.
|
2119 |
"Framework": "",
|
2120 |
"X-shot": "4",
|
2121 |
"Samples": 500,
|
2122 |
-
"All tokens":
|
2123 |
-
"Total input tokens":
|
2124 |
-
"Average input tokens":
|
2125 |
-
"Total output tokens":
|
2126 |
-
"Average output tokens":
|
2127 |
}
|
2128 |
},
|
2129 |
"gpt-4o": {
|
@@ -2133,43 +2133,43 @@
|
|
2133 |
"Eval Date": "2025/1/22"
|
2134 |
},
|
2135 |
"gsm8k": {
|
2136 |
-
"Score":
|
2137 |
-
"Pass rate": 0
|
2138 |
-
"Cost($)":
|
2139 |
"Framework": "",
|
2140 |
"X-shot": "8",
|
2141 |
"Samples": 1319,
|
2142 |
-
"All tokens":
|
2143 |
-
"Total input tokens":
|
2144 |
-
"Average input tokens":
|
2145 |
-
"Total output tokens":
|
2146 |
-
"Average output tokens":
|
2147 |
},
|
2148 |
"AQuA": {
|
2149 |
-
"Score":
|
2150 |
-
"Pass rate": 0.
|
2151 |
-
"Cost($)":
|
2152 |
"Framework": "",
|
2153 |
"X-shot": "0",
|
2154 |
"Samples": 254,
|
2155 |
-
"All tokens":
|
2156 |
-
"Total input tokens":
|
2157 |
-
"Average input tokens":
|
2158 |
-
"Total output tokens":
|
2159 |
-
"Average output tokens":
|
2160 |
},
|
2161 |
"MATH-500": {
|
2162 |
-
"Score":
|
2163 |
"Pass rate": 1.0,
|
2164 |
-
"Cost($)":
|
2165 |
"Framework": "",
|
2166 |
"X-shot": "4",
|
2167 |
"Samples": 500,
|
2168 |
-
"All tokens":
|
2169 |
-
"Total input tokens":
|
2170 |
-
"Average input tokens":
|
2171 |
-
"Total output tokens":
|
2172 |
-
"Average output tokens":
|
2173 |
}
|
2174 |
},
|
2175 |
"Qwen2.5-72B-Instruct": {
|
@@ -2179,43 +2179,43 @@
|
|
2179 |
"Eval Date": "2025/1/22"
|
2180 |
},
|
2181 |
"gsm8k": {
|
2182 |
-
"Score":
|
2183 |
"Pass rate": 1.0,
|
2184 |
-
"Cost($)":
|
2185 |
"Framework": "",
|
2186 |
"X-shot": "8",
|
2187 |
"Samples": 1319,
|
2188 |
-
"All tokens":
|
2189 |
-
"Total input tokens":
|
2190 |
-
"Average input tokens":
|
2191 |
-
"Total output tokens":
|
2192 |
-
"Average output tokens":
|
2193 |
},
|
2194 |
"AQuA": {
|
2195 |
-
"Score": 85.
|
2196 |
-
"Pass rate": 0.
|
2197 |
-
"Cost($)":
|
2198 |
"Framework": "",
|
2199 |
"X-shot": "0",
|
2200 |
"Samples": 254,
|
2201 |
-
"All tokens":
|
2202 |
-
"Total input tokens":
|
2203 |
-
"Average input tokens":
|
2204 |
-
"Total output tokens":
|
2205 |
-
"Average output tokens":
|
2206 |
},
|
2207 |
"MATH-500": {
|
2208 |
-
"Score":
|
2209 |
"Pass rate": 1.0,
|
2210 |
-
"Cost($)":
|
2211 |
"Framework": "",
|
2212 |
"X-shot": "4",
|
2213 |
"Samples": 500,
|
2214 |
-
"All tokens":
|
2215 |
-
"Total input tokens":
|
2216 |
-
"Average input tokens":
|
2217 |
-
"Total output tokens":
|
2218 |
-
"Average output tokens":
|
2219 |
}
|
2220 |
},
|
2221 |
"Llama-3.3-70B-Instruct": {
|
@@ -2225,43 +2225,43 @@
|
|
2225 |
"Eval Date": "2025/1/22"
|
2226 |
},
|
2227 |
"gsm8k": {
|
2228 |
-
"Score": 95.
|
2229 |
"Pass rate": 1.0,
|
2230 |
-
"Cost($)":
|
2231 |
"Framework": "",
|
2232 |
"X-shot": "8",
|
2233 |
"Samples": 1319,
|
2234 |
-
"All tokens":
|
2235 |
-
"Total input tokens":
|
2236 |
-
"Average input tokens":
|
2237 |
-
"Total output tokens":
|
2238 |
-
"Average output tokens":
|
2239 |
},
|
2240 |
"AQuA": {
|
2241 |
-
"Score":
|
2242 |
-
"Pass rate": 0.
|
2243 |
-
"Cost($)":
|
2244 |
"Framework": "",
|
2245 |
"X-shot": "0",
|
2246 |
"Samples": 254,
|
2247 |
-
"All tokens":
|
2248 |
-
"Total input tokens":
|
2249 |
-
"Average input tokens":
|
2250 |
-
"Total output tokens":
|
2251 |
-
"Average output tokens":
|
2252 |
},
|
2253 |
"MATH-500": {
|
2254 |
-
"Score":
|
2255 |
"Pass rate": 1.0,
|
2256 |
-
"Cost($)":
|
2257 |
"Framework": "",
|
2258 |
"X-shot": "4",
|
2259 |
"Samples": 500,
|
2260 |
-
"All tokens":
|
2261 |
-
"Total input tokens":
|
2262 |
-
"Average input tokens":
|
2263 |
-
"Total output tokens":
|
2264 |
-
"Average output tokens":
|
2265 |
}
|
2266 |
},
|
2267 |
"Qwen2.5-7B-Instruct": {
|
@@ -2271,43 +2271,43 @@
|
|
2271 |
"Eval Date": "2025/1/22"
|
2272 |
},
|
2273 |
"gsm8k": {
|
2274 |
-
"Score":
|
2275 |
"Pass rate": 1.0,
|
2276 |
"Cost($)": 0.0,
|
2277 |
"Framework": "",
|
2278 |
"X-shot": "8",
|
2279 |
"Samples": 1319,
|
2280 |
-
"All tokens":
|
2281 |
-
"Total input tokens":
|
2282 |
-
"Average input tokens":
|
2283 |
-
"Total output tokens":
|
2284 |
-
"Average output tokens":
|
2285 |
},
|
2286 |
"AQuA": {
|
2287 |
-
"Score": 79.
|
2288 |
"Pass rate": 1.0,
|
2289 |
"Cost($)": 0.0,
|
2290 |
"Framework": "",
|
2291 |
"X-shot": "0",
|
2292 |
"Samples": 254,
|
2293 |
-
"All tokens":
|
2294 |
-
"Total input tokens":
|
2295 |
-
"Average input tokens":
|
2296 |
-
"Total output tokens":
|
2297 |
-
"Average output tokens":
|
2298 |
},
|
2299 |
"MATH-500": {
|
2300 |
-
"Score":
|
2301 |
"Pass rate": 1.0,
|
2302 |
"Cost($)": 0.0,
|
2303 |
"Framework": "",
|
2304 |
"X-shot": "4",
|
2305 |
"Samples": 500,
|
2306 |
-
"All tokens":
|
2307 |
-
"Total input tokens":
|
2308 |
-
"Average input tokens":
|
2309 |
-
"Total output tokens":
|
2310 |
-
"Average output tokens":
|
2311 |
}
|
2312 |
},
|
2313 |
"Llama-3.1-8B-Instruct": {
|
@@ -2317,43 +2317,43 @@
|
|
2317 |
"Eval Date": "2025/1/22"
|
2318 |
},
|
2319 |
"gsm8k": {
|
2320 |
-
"Score":
|
2321 |
-
"Pass rate": 0.
|
2322 |
"Cost($)": 0.0,
|
2323 |
"Framework": "",
|
2324 |
"X-shot": "8",
|
2325 |
"Samples": 1319,
|
2326 |
-
"All tokens":
|
2327 |
-
"Total input tokens":
|
2328 |
-
"Average input tokens":
|
2329 |
-
"Total output tokens":
|
2330 |
-
"Average output tokens":
|
2331 |
},
|
2332 |
"AQuA": {
|
2333 |
"Score": 59.45,
|
2334 |
-
"Pass rate": 0.
|
2335 |
"Cost($)": 0.0,
|
2336 |
"Framework": "",
|
2337 |
"X-shot": "0",
|
2338 |
"Samples": 254,
|
2339 |
-
"All tokens":
|
2340 |
-
"Total input tokens":
|
2341 |
-
"Average input tokens":
|
2342 |
-
"Total output tokens":
|
2343 |
-
"Average output tokens":
|
2344 |
},
|
2345 |
"MATH-500": {
|
2346 |
-
"Score":
|
2347 |
-
"Pass rate":
|
2348 |
"Cost($)": 0.0,
|
2349 |
"Framework": "",
|
2350 |
"X-shot": "4",
|
2351 |
"Samples": 500,
|
2352 |
-
"All tokens":
|
2353 |
-
"Total input tokens":
|
2354 |
-
"Average input tokens":
|
2355 |
-
"Total output tokens":
|
2356 |
-
"Average output tokens":
|
2357 |
}
|
2358 |
},
|
2359 |
"Internllm2_5-7B": {
|
@@ -2363,43 +2363,43 @@
|
|
2363 |
"Eval Date": "2025/1/22"
|
2364 |
},
|
2365 |
"gsm8k": {
|
2366 |
-
"Score":
|
2367 |
-
"Pass rate": 0.
|
2368 |
"Cost($)": 0.0,
|
2369 |
"Framework": "",
|
2370 |
"X-shot": "8",
|
2371 |
"Samples": 1319,
|
2372 |
-
"All tokens":
|
2373 |
-
"Total input tokens":
|
2374 |
-
"Average input tokens":
|
2375 |
-
"Total output tokens":
|
2376 |
-
"Average output tokens":
|
2377 |
},
|
2378 |
"AQuA": {
|
2379 |
-
"Score":
|
2380 |
-
"Pass rate": 0.
|
2381 |
"Cost($)": 0.0,
|
2382 |
"Framework": "",
|
2383 |
"X-shot": "0",
|
2384 |
"Samples": 254,
|
2385 |
-
"All tokens":
|
2386 |
-
"Total input tokens":
|
2387 |
-
"Average input tokens":
|
2388 |
-
"Total output tokens":
|
2389 |
-
"Average output tokens":
|
2390 |
},
|
2391 |
"MATH-500": {
|
2392 |
-
"Score": 9.
|
2393 |
-
"Pass rate":
|
2394 |
"Cost($)": 0.0,
|
2395 |
"Framework": "",
|
2396 |
"X-shot": "4",
|
2397 |
"Samples": 500,
|
2398 |
-
"All tokens":
|
2399 |
-
"Total input tokens":
|
2400 |
-
"Average input tokens":
|
2401 |
-
"Total output tokens":
|
2402 |
-
"Average output tokens":
|
2403 |
}
|
2404 |
},
|
2405 |
"Qwen2-1.5B-Instruct": {
|
@@ -2409,43 +2409,43 @@
|
|
2409 |
"Eval Date": "2025/1/22"
|
2410 |
},
|
2411 |
"gsm8k": {
|
2412 |
-
"Score":
|
2413 |
-
"Pass rate": 0.
|
2414 |
"Cost($)": 0.0,
|
2415 |
"Framework": "",
|
2416 |
"X-shot": "8",
|
2417 |
"Samples": 1319,
|
2418 |
-
"All tokens":
|
2419 |
-
"Total input tokens":
|
2420 |
-
"Average input tokens":
|
2421 |
-
"Total output tokens":
|
2422 |
-
"Average output tokens":
|
2423 |
},
|
2424 |
"AQuA": {
|
2425 |
-
"Score":
|
2426 |
-
"Pass rate": 0.
|
2427 |
"Cost($)": 0.0,
|
2428 |
"Framework": "",
|
2429 |
"X-shot": "0",
|
2430 |
"Samples": 254,
|
2431 |
-
"All tokens":
|
2432 |
-
"Total input tokens":
|
2433 |
-
"Average input tokens":
|
2434 |
-
"Total output tokens":
|
2435 |
-
"Average output tokens":
|
2436 |
},
|
2437 |
"MATH-500": {
|
2438 |
-
"Score":
|
2439 |
-
"Pass rate": 0.
|
2440 |
"Cost($)": 0.0,
|
2441 |
"Framework": "",
|
2442 |
"X-shot": "4",
|
2443 |
"Samples": 500,
|
2444 |
-
"All tokens":
|
2445 |
-
"Total input tokens":
|
2446 |
-
"Average input tokens":
|
2447 |
-
"Total output tokens":
|
2448 |
-
"Average output tokens":
|
2449 |
}
|
2450 |
},
|
2451 |
"Qwen2-0.5B-Instruct": {
|
@@ -2455,43 +2455,43 @@
|
|
2455 |
"Eval Date": "2025/1/22"
|
2456 |
},
|
2457 |
"gsm8k": {
|
2458 |
-
"Score":
|
2459 |
-
"Pass rate": 0.
|
2460 |
"Cost($)": 0.0,
|
2461 |
"Framework": "",
|
2462 |
"X-shot": "8",
|
2463 |
"Samples": 1319,
|
2464 |
-
"All tokens":
|
2465 |
-
"Total input tokens":
|
2466 |
-
"Average input tokens":
|
2467 |
-
"Total output tokens":
|
2468 |
-
"Average output tokens":
|
2469 |
},
|
2470 |
"AQuA": {
|
2471 |
-
"Score":
|
2472 |
-
"Pass rate": 0.
|
2473 |
"Cost($)": 0.0,
|
2474 |
"Framework": "",
|
2475 |
"X-shot": "0",
|
2476 |
"Samples": 254,
|
2477 |
-
"All tokens":
|
2478 |
-
"Total input tokens":
|
2479 |
-
"Average input tokens":
|
2480 |
-
"Total output tokens":
|
2481 |
-
"Average output tokens":
|
2482 |
},
|
2483 |
"MATH-500": {
|
2484 |
-
"Score":
|
2485 |
-
"Pass rate":
|
2486 |
"Cost($)": 0.0,
|
2487 |
"Framework": "",
|
2488 |
"X-shot": "4",
|
2489 |
"Samples": 500,
|
2490 |
-
"All tokens":
|
2491 |
-
"Total input tokens":
|
2492 |
-
"Average input tokens":
|
2493 |
-
"Total output tokens":
|
2494 |
-
"Average output tokens":
|
2495 |
}
|
2496 |
},
|
2497 |
"deepseek-r1:1.5b": {
|
@@ -2501,43 +2501,43 @@
|
|
2501 |
"Eval Date": "2025/2/10"
|
2502 |
},
|
2503 |
"gsm8k": {
|
2504 |
-
"Score":
|
2505 |
-
"Pass rate": 0.
|
2506 |
"Cost($)": 0.0,
|
2507 |
"Framework": "",
|
2508 |
"X-shot": "8",
|
2509 |
"Samples": 1319,
|
2510 |
-
"All tokens":
|
2511 |
-
"Total input tokens":
|
2512 |
-
"Average input tokens":
|
2513 |
-
"Total output tokens":
|
2514 |
-
"Average output tokens":
|
2515 |
},
|
2516 |
"AQuA": {
|
2517 |
-
"Score":
|
2518 |
-
"Pass rate": 0.
|
2519 |
"Cost($)": 0.0,
|
2520 |
"Framework": "",
|
2521 |
"X-shot": "0",
|
2522 |
"Samples": 254,
|
2523 |
-
"All tokens":
|
2524 |
-
"Total input tokens":
|
2525 |
-
"Average input tokens":
|
2526 |
-
"Total output tokens":
|
2527 |
-
"Average output tokens":
|
2528 |
},
|
2529 |
"MATH-500": {
|
2530 |
-
"Score":
|
2531 |
-
"Pass rate":
|
2532 |
"Cost($)": 0.0,
|
2533 |
"Framework": "",
|
2534 |
"X-shot": "4",
|
2535 |
"Samples": 500,
|
2536 |
-
"All tokens":
|
2537 |
-
"Total input tokens":
|
2538 |
-
"Average input tokens":
|
2539 |
-
"Total output tokens":
|
2540 |
-
"Average output tokens":
|
2541 |
}
|
2542 |
}
|
2543 |
},
|
|
|
1 |
{
|
2 |
+
"time": "2025-03-05 13:15:02",
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"gpt-3.5-turbo": {
|
|
|
2041 |
"Eval Date": "2025/1/7"
|
2042 |
},
|
2043 |
"gsm8k": {
|
2044 |
+
"Score": 69.29,
|
2045 |
+
"Pass rate": 0.9879,
|
2046 |
+
"Cost($)": 2.5203,
|
2047 |
"Framework": "",
|
2048 |
"X-shot": "8",
|
2049 |
"Samples": 1319,
|
2050 |
+
"All tokens": 2277249,
|
2051 |
+
"Total input tokens": 895571,
|
2052 |
+
"Average input tokens": 679,
|
2053 |
+
"Total output tokens": 1381678,
|
2054 |
+
"Average output tokens": 1048
|
2055 |
},
|
2056 |
"AQuA": {
|
2057 |
+
"Score": 58.66,
|
2058 |
+
"Pass rate": 0.9252,
|
2059 |
+
"Cost($)": 0.3277,
|
2060 |
"Framework": "",
|
2061 |
"X-shot": "0",
|
2062 |
"Samples": 254,
|
2063 |
+
"All tokens": 237066,
|
2064 |
+
"Total input tokens": 27906,
|
2065 |
+
"Average input tokens": 110,
|
2066 |
+
"Total output tokens": 209160,
|
2067 |
+
"Average output tokens": 823
|
2068 |
},
|
2069 |
"MATH-500": {
|
2070 |
+
"Score": 40.8,
|
2071 |
"Pass rate": 1.0,
|
2072 |
+
"Cost($)": 1.2308,
|
2073 |
"Framework": "",
|
2074 |
"X-shot": "4",
|
2075 |
"Samples": 500,
|
2076 |
+
"All tokens": 1050819,
|
2077 |
+
"Total input tokens": 345411,
|
2078 |
+
"Average input tokens": 691,
|
2079 |
+
"Total output tokens": 705408,
|
2080 |
+
"Average output tokens": 1411
|
2081 |
}
|
2082 |
},
|
2083 |
"Doubao-lite-32k": {
|
|
|
2087 |
"Eval Date": "2025/1/7"
|
2088 |
},
|
2089 |
"gsm8k": {
|
2090 |
+
"Score": 91.58,
|
2091 |
"Pass rate": 0.9992,
|
2092 |
+
"Cost($)": 0.1118,
|
2093 |
"Framework": "",
|
2094 |
"X-shot": "8",
|
2095 |
"Samples": 1319,
|
2096 |
+
"All tokens": 1835891,
|
2097 |
+
"Total input tokens": 942182,
|
2098 |
+
"Average input tokens": 714,
|
2099 |
+
"Total output tokens": 893709,
|
2100 |
+
"Average output tokens": 678
|
2101 |
},
|
2102 |
"AQuA": {
|
2103 |
+
"Score": 76.37,
|
2104 |
+
"Pass rate": 0.9173,
|
2105 |
+
"Cost($)": 0.0279,
|
2106 |
"Framework": "",
|
2107 |
"X-shot": "0",
|
2108 |
"Samples": 254,
|
2109 |
+
"All tokens": 356839,
|
2110 |
+
"Total input tokens": 31703,
|
2111 |
+
"Average input tokens": 125,
|
2112 |
+
"Total output tokens": 325136,
|
2113 |
+
"Average output tokens": 1280
|
2114 |
},
|
2115 |
"MATH-500": {
|
2116 |
+
"Score": 65.8,
|
2117 |
+
"Pass rate": 0.998,
|
2118 |
+
"Cost($)": 0.0734,
|
2119 |
"Framework": "",
|
2120 |
"X-shot": "4",
|
2121 |
"Samples": 500,
|
2122 |
+
"All tokens": 1078003,
|
2123 |
+
"Total input tokens": 362390,
|
2124 |
+
"Average input tokens": 725,
|
2125 |
+
"Total output tokens": 715613,
|
2126 |
+
"Average output tokens": 1431
|
2127 |
}
|
2128 |
},
|
2129 |
"gpt-4o": {
|
|
|
2133 |
"Eval Date": "2025/1/22"
|
2134 |
},
|
2135 |
"gsm8k": {
|
2136 |
+
"Score": 94.77,
|
2137 |
+
"Pass rate": 1.0,
|
2138 |
+
"Cost($)": 18.2044,
|
2139 |
"Framework": "",
|
2140 |
"X-shot": "8",
|
2141 |
"Samples": 1319,
|
2142 |
+
"All tokens": 2491605,
|
2143 |
+
"Total input tokens": 894889,
|
2144 |
+
"Average input tokens": 678,
|
2145 |
+
"Total output tokens": 1596716,
|
2146 |
+
"Average output tokens": 1211
|
2147 |
},
|
2148 |
"AQuA": {
|
2149 |
+
"Score": 85.83,
|
2150 |
+
"Pass rate": 0.9921,
|
2151 |
+
"Cost($)": 5.2456,
|
2152 |
"Framework": "",
|
2153 |
"X-shot": "0",
|
2154 |
"Samples": 254,
|
2155 |
+
"All tokens": 545431,
|
2156 |
+
"Total input tokens": 27829,
|
2157 |
+
"Average input tokens": 110,
|
2158 |
+
"Total output tokens": 517602,
|
2159 |
+
"Average output tokens": 2038
|
2160 |
},
|
2161 |
"MATH-500": {
|
2162 |
+
"Score": 74.6,
|
2163 |
"Pass rate": 1.0,
|
2164 |
+
"Cost($)": 12.3611,
|
2165 |
"Framework": "",
|
2166 |
"X-shot": "4",
|
2167 |
"Samples": 500,
|
2168 |
+
"All tokens": 1495125,
|
2169 |
+
"Total input tokens": 345347,
|
2170 |
+
"Average input tokens": 691,
|
2171 |
+
"Total output tokens": 1149778,
|
2172 |
+
"Average output tokens": 2300
|
2173 |
}
|
2174 |
},
|
2175 |
"Qwen2.5-72B-Instruct": {
|
|
|
2179 |
"Eval Date": "2025/1/22"
|
2180 |
},
|
2181 |
"gsm8k": {
|
2182 |
+
"Score": 94.77,
|
2183 |
"Pass rate": 1.0,
|
2184 |
+
"Cost($)": 4.045,
|
2185 |
"Framework": "",
|
2186 |
"X-shot": "8",
|
2187 |
"Samples": 1319,
|
2188 |
+
"All tokens": 7175258,
|
2189 |
+
"Total input tokens": 5370360,
|
2190 |
+
"Average input tokens": 4072,
|
2191 |
+
"Total output tokens": 1804898,
|
2192 |
+
"Average output tokens": 1368
|
2193 |
},
|
2194 |
"AQuA": {
|
2195 |
+
"Score": 85.43,
|
2196 |
+
"Pass rate": 0.9685,
|
2197 |
+
"Cost($)": 0.4186,
|
2198 |
"Framework": "",
|
2199 |
"X-shot": "0",
|
2200 |
"Samples": 254,
|
2201 |
+
"All tokens": 742552,
|
2202 |
+
"Total input tokens": 137990,
|
2203 |
+
"Average input tokens": 543,
|
2204 |
+
"Total output tokens": 604562,
|
2205 |
+
"Average output tokens": 2380
|
2206 |
},
|
2207 |
"MATH-500": {
|
2208 |
+
"Score": 79.8,
|
2209 |
"Pass rate": 1.0,
|
2210 |
+
"Cost($)": 1.8504,
|
2211 |
"Framework": "",
|
2212 |
"X-shot": "4",
|
2213 |
"Samples": 500,
|
2214 |
+
"All tokens": 3282349,
|
2215 |
+
"Total input tokens": 1775395,
|
2216 |
+
"Average input tokens": 3551,
|
2217 |
+
"Total output tokens": 1506954,
|
2218 |
+
"Average output tokens": 3014
|
2219 |
}
|
2220 |
},
|
2221 |
"Llama-3.3-70B-Instruct": {
|
|
|
2225 |
"Eval Date": "2025/1/22"
|
2226 |
},
|
2227 |
"gsm8k": {
|
2228 |
+
"Score": 95.22,
|
2229 |
"Pass rate": 1.0,
|
2230 |
+
"Cost($)": 3.7895,
|
2231 |
"Framework": "",
|
2232 |
"X-shot": "8",
|
2233 |
"Samples": 1319,
|
2234 |
+
"All tokens": 6722014,
|
2235 |
+
"Total input tokens": 5295585,
|
2236 |
+
"Average input tokens": 4015,
|
2237 |
+
"Total output tokens": 1426429,
|
2238 |
+
"Average output tokens": 1081
|
2239 |
},
|
2240 |
"AQuA": {
|
2241 |
+
"Score": 84.65,
|
2242 |
+
"Pass rate": 0.9961,
|
2243 |
+
"Cost($)": 0.4438,
|
2244 |
"Framework": "",
|
2245 |
"X-shot": "0",
|
2246 |
"Samples": 254,
|
2247 |
+
"All tokens": 787312,
|
2248 |
+
"Total input tokens": 175050,
|
2249 |
+
"Average input tokens": 689,
|
2250 |
+
"Total output tokens": 612262,
|
2251 |
+
"Average output tokens": 2410
|
2252 |
},
|
2253 |
"MATH-500": {
|
2254 |
+
"Score": 72.4,
|
2255 |
"Pass rate": 1.0,
|
2256 |
+
"Cost($)": 1.7845,
|
2257 |
"Framework": "",
|
2258 |
"X-shot": "4",
|
2259 |
"Samples": 500,
|
2260 |
+
"All tokens": 3165511,
|
2261 |
+
"Total input tokens": 1797045,
|
2262 |
+
"Average input tokens": 3594,
|
2263 |
+
"Total output tokens": 1368466,
|
2264 |
+
"Average output tokens": 2737
|
2265 |
}
|
2266 |
},
|
2267 |
"Qwen2.5-7B-Instruct": {
|
|
|
2271 |
"Eval Date": "2025/1/22"
|
2272 |
},
|
2273 |
"gsm8k": {
|
2274 |
+
"Score": 90.98,
|
2275 |
"Pass rate": 1.0,
|
2276 |
"Cost($)": 0.0,
|
2277 |
"Framework": "",
|
2278 |
"X-shot": "8",
|
2279 |
"Samples": 1319,
|
2280 |
+
"All tokens": 7259943,
|
2281 |
+
"Total input tokens": 5580524,
|
2282 |
+
"Average input tokens": 4231,
|
2283 |
+
"Total output tokens": 1679419,
|
2284 |
+
"Average output tokens": 1273
|
2285 |
},
|
2286 |
"AQuA": {
|
2287 |
+
"Score": 79.53,
|
2288 |
"Pass rate": 1.0,
|
2289 |
"Cost($)": 0.0,
|
2290 |
"Framework": "",
|
2291 |
"X-shot": "0",
|
2292 |
"Samples": 254,
|
2293 |
+
"All tokens": 745410,
|
2294 |
+
"Total input tokens": 177972,
|
2295 |
+
"Average input tokens": 701,
|
2296 |
+
"Total output tokens": 567438,
|
2297 |
+
"Average output tokens": 2234
|
2298 |
},
|
2299 |
"MATH-500": {
|
2300 |
+
"Score": 71.2,
|
2301 |
"Pass rate": 1.0,
|
2302 |
"Cost($)": 0.0,
|
2303 |
"Framework": "",
|
2304 |
"X-shot": "4",
|
2305 |
"Samples": 500,
|
2306 |
+
"All tokens": 3155475,
|
2307 |
+
"Total input tokens": 1855922,
|
2308 |
+
"Average input tokens": 3712,
|
2309 |
+
"Total output tokens": 1299553,
|
2310 |
+
"Average output tokens": 2599
|
2311 |
}
|
2312 |
},
|
2313 |
"Llama-3.1-8B-Instruct": {
|
|
|
2317 |
"Eval Date": "2025/1/22"
|
2318 |
},
|
2319 |
"gsm8k": {
|
2320 |
+
"Score": 54.36,
|
2321 |
+
"Pass rate": 0.9985,
|
2322 |
"Cost($)": 0.0,
|
2323 |
"Framework": "",
|
2324 |
"X-shot": "8",
|
2325 |
"Samples": 1319,
|
2326 |
+
"All tokens": 10956434,
|
2327 |
+
"Total input tokens": 5136762,
|
2328 |
+
"Average input tokens": 3894,
|
2329 |
+
"Total output tokens": 5819672,
|
2330 |
+
"Average output tokens": 4412
|
2331 |
},
|
2332 |
"AQuA": {
|
2333 |
"Score": 59.45,
|
2334 |
+
"Pass rate": 0.9567,
|
2335 |
"Cost($)": 0.0,
|
2336 |
"Framework": "",
|
2337 |
"X-shot": "0",
|
2338 |
"Samples": 254,
|
2339 |
+
"All tokens": 690077,
|
2340 |
+
"Total input tokens": 145108,
|
2341 |
+
"Average input tokens": 571,
|
2342 |
+
"Total output tokens": 544969,
|
2343 |
+
"Average output tokens": 2146
|
2344 |
},
|
2345 |
"MATH-500": {
|
2346 |
+
"Score": 19.8,
|
2347 |
+
"Pass rate": 0.998,
|
2348 |
"Cost($)": 0.0,
|
2349 |
"Framework": "",
|
2350 |
"X-shot": "4",
|
2351 |
"Samples": 500,
|
2352 |
+
"All tokens": 3490834,
|
2353 |
+
"Total input tokens": 1734545,
|
2354 |
+
"Average input tokens": 3469,
|
2355 |
+
"Total output tokens": 1756289,
|
2356 |
+
"Average output tokens": 3513
|
2357 |
}
|
2358 |
},
|
2359 |
"Internllm2_5-7B": {
|
|
|
2363 |
"Eval Date": "2025/1/22"
|
2364 |
},
|
2365 |
"gsm8k": {
|
2366 |
+
"Score": 44.66,
|
2367 |
+
"Pass rate": 0.9181,
|
2368 |
"Cost($)": 0.0,
|
2369 |
"Framework": "",
|
2370 |
"X-shot": "8",
|
2371 |
"Samples": 1319,
|
2372 |
+
"All tokens": 8162499,
|
2373 |
+
"Total input tokens": 5847761,
|
2374 |
+
"Average input tokens": 4433,
|
2375 |
+
"Total output tokens": 2314738,
|
2376 |
+
"Average output tokens": 1755
|
2377 |
},
|
2378 |
"AQuA": {
|
2379 |
+
"Score": 38.58,
|
2380 |
+
"Pass rate": 0.9724,
|
2381 |
"Cost($)": 0.0,
|
2382 |
"Framework": "",
|
2383 |
"X-shot": "0",
|
2384 |
"Samples": 254,
|
2385 |
+
"All tokens": 879671,
|
2386 |
+
"Total input tokens": 264557,
|
2387 |
+
"Average input tokens": 1042,
|
2388 |
+
"Total output tokens": 615114,
|
2389 |
+
"Average output tokens": 2422
|
2390 |
},
|
2391 |
"MATH-500": {
|
2392 |
+
"Score": 9.2,
|
2393 |
+
"Pass rate": 0.974,
|
2394 |
"Cost($)": 0.0,
|
2395 |
"Framework": "",
|
2396 |
"X-shot": "4",
|
2397 |
"Samples": 500,
|
2398 |
+
"All tokens": 3249876,
|
2399 |
+
"Total input tokens": 1994983,
|
2400 |
+
"Average input tokens": 3990,
|
2401 |
+
"Total output tokens": 1254893,
|
2402 |
+
"Average output tokens": 2510
|
2403 |
}
|
2404 |
},
|
2405 |
"Qwen2-1.5B-Instruct": {
|
|
|
2409 |
"Eval Date": "2025/1/22"
|
2410 |
},
|
2411 |
"gsm8k": {
|
2412 |
+
"Score": 8.19,
|
2413 |
+
"Pass rate": 0.6876,
|
2414 |
"Cost($)": 0.0,
|
2415 |
"Framework": "",
|
2416 |
"X-shot": "8",
|
2417 |
"Samples": 1319,
|
2418 |
+
"All tokens": 7386453,
|
2419 |
+
"Total input tokens": 5439568,
|
2420 |
+
"Average input tokens": 4124,
|
2421 |
+
"Total output tokens": 1946885,
|
2422 |
+
"Average output tokens": 1476
|
2423 |
},
|
2424 |
"AQuA": {
|
2425 |
+
"Score": 10.63,
|
2426 |
+
"Pass rate": 0.5157,
|
2427 |
"Cost($)": 0.0,
|
2428 |
"Framework": "",
|
2429 |
"X-shot": "0",
|
2430 |
"Samples": 254,
|
2431 |
+
"All tokens": 701980,
|
2432 |
+
"Total input tokens": 151410,
|
2433 |
+
"Average input tokens": 596,
|
2434 |
+
"Total output tokens": 550570,
|
2435 |
+
"Average output tokens": 2168
|
2436 |
},
|
2437 |
"MATH-500": {
|
2438 |
+
"Score": 2.0,
|
2439 |
+
"Pass rate": 0.894,
|
2440 |
"Cost($)": 0.0,
|
2441 |
"Framework": "",
|
2442 |
"X-shot": "4",
|
2443 |
"Samples": 500,
|
2444 |
+
"All tokens": 3139024,
|
2445 |
+
"Total input tokens": 1805170,
|
2446 |
+
"Average input tokens": 3610,
|
2447 |
+
"Total output tokens": 1333854,
|
2448 |
+
"Average output tokens": 2668
|
2449 |
}
|
2450 |
},
|
2451 |
"Qwen2-0.5B-Instruct": {
|
|
|
2455 |
"Eval Date": "2025/1/22"
|
2456 |
},
|
2457 |
"gsm8k": {
|
2458 |
+
"Score": 4.17,
|
2459 |
+
"Pass rate": 0.9447,
|
2460 |
"Cost($)": 0.0,
|
2461 |
"Framework": "",
|
2462 |
"X-shot": "8",
|
2463 |
"Samples": 1319,
|
2464 |
+
"All tokens": 7478767,
|
2465 |
+
"Total input tokens": 5441962,
|
2466 |
+
"Average input tokens": 4126,
|
2467 |
+
"Total output tokens": 2036805,
|
2468 |
+
"Average output tokens": 1544
|
2469 |
},
|
2470 |
"AQuA": {
|
2471 |
+
"Score": 17.32,
|
2472 |
+
"Pass rate": 0.8228,
|
2473 |
"Cost($)": 0.0,
|
2474 |
"Framework": "",
|
2475 |
"X-shot": "0",
|
2476 |
"Samples": 254,
|
2477 |
+
"All tokens": 753913,
|
2478 |
+
"Total input tokens": 150787,
|
2479 |
+
"Average input tokens": 594,
|
2480 |
+
"Total output tokens": 603126,
|
2481 |
+
"Average output tokens": 2375
|
2482 |
},
|
2483 |
"MATH-500": {
|
2484 |
+
"Score": 2.2,
|
2485 |
+
"Pass rate": 0.988,
|
2486 |
"Cost($)": 0.0,
|
2487 |
"Framework": "",
|
2488 |
"X-shot": "4",
|
2489 |
"Samples": 500,
|
2490 |
+
"All tokens": 2797682,
|
2491 |
+
"Total input tokens": 1808691,
|
2492 |
+
"Average input tokens": 3617,
|
2493 |
+
"Total output tokens": 988991,
|
2494 |
+
"Average output tokens": 1978
|
2495 |
}
|
2496 |
},
|
2497 |
"deepseek-r1:1.5b": {
|
|
|
2501 |
"Eval Date": "2025/2/10"
|
2502 |
},
|
2503 |
"gsm8k": {
|
2504 |
+
"Score": 69.07,
|
2505 |
+
"Pass rate": 0.9879,
|
2506 |
"Cost($)": 0.0,
|
2507 |
"Framework": "",
|
2508 |
"X-shot": "8",
|
2509 |
"Samples": 1319,
|
2510 |
+
"All tokens": 10029684,
|
2511 |
+
"Total input tokens": 5407357,
|
2512 |
+
"Average input tokens": 4100,
|
2513 |
+
"Total output tokens": 4622327,
|
2514 |
+
"Average output tokens": 3504
|
2515 |
},
|
2516 |
"AQuA": {
|
2517 |
+
"Score": 57.87,
|
2518 |
+
"Pass rate": 0.7402,
|
2519 |
"Cost($)": 0.0,
|
2520 |
"Framework": "",
|
2521 |
"X-shot": "0",
|
2522 |
"Samples": 254,
|
2523 |
+
"All tokens": 2132111,
|
2524 |
+
"Total input tokens": 144710,
|
2525 |
+
"Average input tokens": 570,
|
2526 |
+
"Total output tokens": 1987401,
|
2527 |
+
"Average output tokens": 7824
|
2528 |
},
|
2529 |
"MATH-500": {
|
2530 |
+
"Score": 46.8,
|
2531 |
+
"Pass rate": 0.992,
|
2532 |
"Cost($)": 0.0,
|
2533 |
"Framework": "",
|
2534 |
"X-shot": "4",
|
2535 |
"Samples": 500,
|
2536 |
+
"All tokens": 13968168,
|
2537 |
+
"Total input tokens": 1858874,
|
2538 |
+
"Average input tokens": 3718,
|
2539 |
+
"Total output tokens": 12109294,
|
2540 |
+
"Average output tokens": 24219
|
2541 |
}
|
2542 |
}
|
2543 |
},
|
src/detail_results.csv
CHANGED
@@ -1,24 +1,24 @@
|
|
1 |
Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
|
2 |
-
1,
|
3 |
-
2,CoT,AQuA,
|
4 |
-
3,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.
|
5 |
-
4,
|
6 |
-
5,
|
7 |
-
6,
|
8 |
-
7,
|
9 |
-
8,
|
10 |
-
9,CoT,AQuA,
|
11 |
-
10,
|
12 |
11,ToT,AQuA,gpt-4o,2025/1/22,81.5,0.9921,0,8.5295,,254,2613607,2347538,9242,266069,1048
|
13 |
-
12,
|
14 |
-
13,
|
15 |
-
14,
|
16 |
-
15,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,79.
|
17 |
-
16,
|
18 |
-
17,
|
19 |
-
18,
|
20 |
-
19,
|
21 |
-
20,
|
22 |
21,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0,1.1453,,254,133752,25631,101,108121,426
|
23 |
22,PoT,AQuA,gpt-4o,2025/1/22,75.2,1.0,0,1.6087,,254,327908,222717,877,105191,414
|
24 |
23,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.2,1.0,0,0.1645,,254,291764,249215,981,42549,168
|
@@ -28,14 +28,14 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
|
|
28 |
27,CoT,AQuA,deepseek-r1:1.5b,2025/1/23,71.65,0.9685,0,0.0,,254,333072,26413,104,306659,1207
|
29 |
28,IO,AQuA,deepseek-r1:1.5b,2025/1/22,68.9,0.9488,0,0.0,,254,351767,26667,105,325100,1280
|
30 |
29,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0,0.0,,254,313728,264517,1041,49211,194
|
31 |
-
30,
|
32 |
-
31,
|
33 |
-
32,CoT,AQuA,
|
34 |
-
33,
|
35 |
-
34,
|
36 |
-
35,
|
37 |
-
36,SC-CoT,AQuA,
|
38 |
-
37,
|
39 |
38,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0,2.304,,254,692096,615589,2424,76507,301
|
40 |
39,ToT,AQuA,gpt-3.5-turbo,2025/1/7,57.09,0.9961,0,1.1513,,254,2001396,1850767,7286,150629,593
|
41 |
40,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0,0.0,,254,4340821,3764723,14822,576098,2268
|
@@ -48,8 +48,8 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
|
|
48 |
47,ToT,AQuA,Doubao-lite-32k,2025/1/7,45.28,0.7402,0,0.0881,,254,2000550,1850249,7284,150301,592
|
49 |
48,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0,0.0,,254,4428801,3592039,14142,836762,3294
|
50 |
49,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0,0.0,,254,110040,30477,120,79563,313
|
51 |
-
50,
|
52 |
-
51,
|
53 |
52,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0,0.0,,254,290914,240613,947,50301,198
|
54 |
53,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0,0.0,,254,301962,233505,919,68457,270
|
55 |
54,ToT,AQuA,Internllm2_5-7B,2025/1/22,35.83,0.9961,0,0.0,,254,4734560,4263136,16784,471424,1856
|
@@ -62,109 +62,109 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
|
|
62 |
61,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0,0.0,,254,5072004,4555858,17936,516146,2032
|
63 |
62,ToT,AQuA,deepseek-r1:1.5b,2025/2/10,24.8,0.5551,0,0.0,,254,794512,605028,2382,189484,746
|
64 |
63,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0,0.0,,254,7170087,6344167,24977,825920,3252
|
65 |
-
64,
|
66 |
-
65,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,
|
67 |
-
66,
|
68 |
1,CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,80.2,1.0,4,0.349,,500,619015,338549,677,280466,561
|
69 |
-
2,SC-CoT,MATH-500,
|
70 |
-
3,SC-CoT,MATH-500,
|
71 |
-
4,CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,
|
72 |
-
5,
|
73 |
-
6,CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,
|
74 |
-
7,IO,MATH-500,
|
75 |
-
8,CoT,MATH-500,
|
76 |
-
9,
|
77 |
-
10,
|
78 |
-
11,
|
79 |
-
12,
|
80 |
-
13,
|
81 |
-
14,
|
82 |
-
15,CoT,MATH-500,
|
83 |
-
16,
|
84 |
-
17,
|
85 |
-
18,ReAct-Pro*,MATH-500,
|
86 |
-
19,
|
87 |
-
20,
|
88 |
-
21,
|
89 |
-
22,
|
90 |
-
23,PoT,MATH-500,
|
91 |
-
24,IO,MATH-500,
|
92 |
-
25,
|
93 |
-
26,
|
94 |
-
27,
|
95 |
-
28,
|
96 |
-
29,
|
97 |
-
30,
|
98 |
-
31,
|
99 |
-
32,
|
100 |
33,ReAct-Pro*,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,28.8,1.0,4,0.0,,500,8763629,7486706,14973,1276923,2554
|
101 |
34,PoT,MATH-500,gpt-3.5-turbo,2025/1/7,28.8,0.838,4,0.168,,500,271916,239902,480,32014,64
|
102 |
-
35,
|
103 |
-
36,
|
104 |
-
37,
|
105 |
-
38,ReAct-Pro*,MATH-500,
|
106 |
-
39,
|
107 |
-
40,
|
108 |
41,IO,MATH-500,gpt-3.5-turbo,2025/1/7,17.2,1.0,4,0.2436,,500,265625,154881,310,110744,221
|
109 |
42,CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,15.2,1.0,4,0.0,,500,536377,349049,698,187328,375
|
110 |
43,PoT,MATH-500,Internllm2_5-7B,2025/1/22,15.0,0.324,4,0.0,,500,368709,247883,496,120826,242
|
111 |
44,ReAct-Pro*,MATH-500,Internllm2_5-7B,2025/1/22,14.8,1.0,4,0.0,,500,14186105,11831496,23663,2354609,4709
|
112 |
45,ToT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,10.8,1.0,4,9.0421,,500,16039361,15657730,31315,381631,763
|
113 |
-
46,
|
114 |
-
47,
|
115 |
48,ReAct-Pro*,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,8.2,1.0,4,0.0,,500,8987061,8430774,16862,556287,1113
|
116 |
49,IO,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,7.0,1.0,4,0.0,,500,413878,158777,318,255101,510
|
117 |
50,CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,6.2,1.0,4,0.0,,500,549188,349049,698,200139,400
|
118 |
-
51,
|
119 |
-
52,
|
120 |
-
53,
|
121 |
-
54,
|
122 |
-
55,ToT,MATH-500,Llama-3.
|
123 |
-
56,ToT,MATH-500,
|
124 |
-
57,ToT,MATH-500,
|
125 |
-
58,
|
126 |
-
59,PoT,MATH-500,
|
127 |
-
60,
|
128 |
61,ToT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.972,4,0.0,,500,4535000,4408000,8816,127000,254
|
129 |
62,ReAct-Pro*,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.6,1.0,4,0.0,,500,19442440,18137392,36275,1305048,2610
|
130 |
63,ToT,MATH-500,deepseek-r1:1.5b,2025/2/10,0.4,0.716,4,0.0,,500,1941500,1831000,3662,110500,221
|
131 |
64,ToT,MATH-500,Internllm2_5-7B,2025/1/22,0.2,0.99,4,0.0,,500,8350500,7515000,15030,835500,1671
|
132 |
65,PoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,4,0.0,,500,437202,253549,507,183653,367
|
133 |
66,ToT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.962,4,0.0,,500,5996500,5590500,11181,406000,812
|
134 |
-
1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.
|
135 |
-
2,CoT,gsm8k,gpt-4o,2025/1/22,94.
|
136 |
-
3,CoT,gsm8k,
|
137 |
-
4,
|
138 |
-
5,
|
139 |
-
6,
|
140 |
-
7,
|
141 |
-
8,
|
142 |
-
9,
|
143 |
-
10,
|
144 |
-
11,
|
145 |
-
12,
|
146 |
-
13,CoT,gsm8k,
|
147 |
-
14,
|
148 |
-
15,
|
149 |
-
16,
|
150 |
-
17,ReAct-Pro*,gsm8k,
|
151 |
-
18,
|
152 |
19,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8,0.4899,,1319,869060,555340,421,313720,238
|
153 |
20,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8,0.0,,1319,1290805,1046008,793,244797,186
|
154 |
21,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8,0.2512,,1319,5998639,5862016,4444,136623,104
|
155 |
22,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8,0.0,,1319,14850914,14355752,10884,495162,375
|
156 |
-
23,
|
157 |
-
24,
|
158 |
-
25,CoT,gsm8k,
|
159 |
-
26,
|
160 |
-
27,
|
161 |
-
28,
|
162 |
-
29,
|
163 |
-
30,
|
164 |
-
31,
|
165 |
-
32,
|
166 |
-
33,
|
167 |
-
34,CoT,gsm8k,deepseek-r1:1.5b,2025/
|
168 |
35,ToT,gsm8k,gpt-3.5-turbo,2025/1/7,67.93,0.997,8,9.1707,,1319,16727175,15920037,12070,807138,612
|
169 |
36,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8,0.0,,1319,22835767,21044978,15955,1790789,1358
|
170 |
37,ToT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,65.05,0.9196,8,0.0,,1319,16432102,15554967,11793,877135,665
|
@@ -174,8 +174,8 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
|
|
174 |
41,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8,0.0,,1319,887913,596229,452,291684,221
|
175 |
42,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8,0.0,,1319,1745429,550941,418,1194488,906
|
176 |
43,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8,0.0,,1319,1218525,1032818,783,185707,141
|
177 |
-
44,SC-CoT,gsm8k,
|
178 |
-
45,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,
|
179 |
46,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8,0.0,,1319,1391111,1147538,870,243573,185
|
180 |
47,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8,0.0,,1319,1324949,1136843,862,188106,143
|
181 |
48,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8,0.3328,,1319,586553,546990,415,39563,30
|
@@ -191,9 +191,9 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
|
|
191 |
58,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8,0.0,,1319,736996,568530,431,168466,128
|
192 |
59,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8,0.0,,1319,834897,568116,431,266781,202
|
193 |
60,PoT,gsm8k,deepseek-r1:1.5b,2025/2/10,11.9,0.1744,8,0.0,,1319,1954509,1138872,863,815637,618
|
194 |
-
61,
|
195 |
-
62,
|
196 |
-
63,
|
197 |
64,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8,0.0,,1319,55392611,52431343,39751,2961268,2245
|
198 |
-
65,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,
|
199 |
66,ToT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,8,0.0,,1319,0,0,0,0,0
|
|
|
1 |
Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
|
2 |
+
1,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0,0.0808,,254,143289,25143,99,118146,465
|
3 |
+
2,SC-CoT,AQuA,gpt-4o,2025/1/22,85.83,0.9921,0,5.2456,,254,545431,27829,110,517602,2038
|
4 |
+
3,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.43,0.9685,0,0.4186,,254,742552,137990,543,604562,2380
|
5 |
+
4,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,84.65,0.9961,0,0.4438,,254,787312,175050,689,612262,2410
|
6 |
+
5,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0,0.0742,,254,131604,25397,100,106207,418
|
7 |
+
6,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0,0.0927,,254,164389,32555,128,131834,519
|
8 |
+
7,ToT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.07,1.0,0,2.9404,,254,5215848,4735188,18642,480660,1892
|
9 |
+
8,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0,0.0798,,254,141567,32809,129,108758,428
|
10 |
+
9,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0,0.0066,,254,94577,27978,110,66599,262
|
11 |
+
10,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0,1.0417,,254,123017,25123,99,97894,385
|
12 |
11,ToT,AQuA,gpt-4o,2025/1/22,81.5,0.9921,0,8.5295,,254,2613607,2347538,9242,266069,1048
|
13 |
+
12,ToT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,81.1,0.9921,0,3.7389,,254,6632255,6371642,25085,260613,1026
|
14 |
+
13,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0,0.0,,254,149736,33017,130,116719,460
|
15 |
+
14,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0,0.1746,,254,309799,240735,948,69064,272
|
16 |
+
15,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,79.53,1.0,0,0.0,,254,745410,177972,701,567438,2234
|
17 |
+
16,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0,0.0058,,254,87742,33058,130,54684,215
|
18 |
+
17,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0,0.768,,254,1362379,1119143,4406,243236,958
|
19 |
+
18,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9843,0,0.0,,254,137771,33271,131,104500,411
|
20 |
+
19,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.56,0.9606,0,0.0445,,254,1032841,977890,3850,54951,216
|
21 |
+
20,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,76.37,0.9173,0,0.0279,,254,356839,31703,125,325136,1280
|
22 |
21,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0,1.1453,,254,133752,25631,101,108121,426
|
23 |
22,PoT,AQuA,gpt-4o,2025/1/22,75.2,1.0,0,1.6087,,254,327908,222717,877,105191,414
|
24 |
23,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.2,1.0,0,0.1645,,254,291764,249215,981,42549,168
|
|
|
28 |
27,CoT,AQuA,deepseek-r1:1.5b,2025/1/23,71.65,0.9685,0,0.0,,254,333072,26413,104,306659,1207
|
29 |
28,IO,AQuA,deepseek-r1:1.5b,2025/1/22,68.9,0.9488,0,0.0,,254,351767,26667,105,325100,1280
|
30 |
29,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0,0.0,,254,313728,264517,1041,49211,194
|
31 |
+
30,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0,0.4928,,254,903587,862614,3396,40973,161
|
32 |
+
31,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0,0.0957,,254,80793,25447,100,55346,218
|
33 |
+
32,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0,0.0,,254,144435,32555,128,111880,440
|
34 |
+
33,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0,0.1748,,254,266654,225162,886,41492,163
|
35 |
+
34,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.45,0.9567,0,0.0,,254,690077,145108,571,544969,2146
|
36 |
+
35,ToT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.06,1.0,0,0.0,,254,5739684,4896222,19276,843462,3321
|
37 |
+
36,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,58.66,0.9252,0,0.3277,,254,237066,27906,110,209160,823
|
38 |
+
37,SC-CoT,AQuA,deepseek-r1:1.5b,2025/2/10,57.87,0.7402,0,0.0,,254,2132111,144710,570,1987401,7824
|
39 |
38,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0,2.304,,254,692096,615589,2424,76507,301
|
40 |
39,ToT,AQuA,gpt-3.5-turbo,2025/1/7,57.09,0.9961,0,1.1513,,254,2001396,1850767,7286,150629,593
|
41 |
40,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0,0.0,,254,4340821,3764723,14822,576098,2268
|
|
|
48 |
47,ToT,AQuA,Doubao-lite-32k,2025/1/7,45.28,0.7402,0,0.0881,,254,2000550,1850249,7284,150301,592
|
49 |
48,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0,0.0,,254,4428801,3592039,14142,836762,3294
|
50 |
49,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0,0.0,,254,110040,30477,120,79563,313
|
51 |
+
50,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0,0.038,,254,42471,25701,101,16770,66
|
52 |
+
51,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,38.58,0.9724,0,0.0,,254,879671,264557,1042,615114,2422
|
53 |
52,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0,0.0,,254,290914,240613,947,50301,198
|
54 |
53,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0,0.0,,254,301962,233505,919,68457,270
|
55 |
54,ToT,AQuA,Internllm2_5-7B,2025/1/22,35.83,0.9961,0,0.0,,254,4734560,4263136,16784,471424,1856
|
|
|
62 |
61,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0,0.0,,254,5072004,4555858,17936,516146,2032
|
63 |
62,ToT,AQuA,deepseek-r1:1.5b,2025/2/10,24.8,0.5551,0,0.0,,254,794512,605028,2382,189484,746
|
64 |
63,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0,0.0,,254,7170087,6344167,24977,825920,3252
|
65 |
+
64,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0,0.0,,254,322281,258867,1019,63414,250
|
66 |
+
65,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.8228,0,0.0,,254,753913,150787,594,603126,2375
|
67 |
+
66,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,10.63,0.5157,0,0.0,,254,701980,151410,596,550570,2168
|
68 |
1,CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,80.2,1.0,4,0.349,,500,619015,338549,677,280466,561
|
69 |
+
2,SC-CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,79.8,1.0,4,1.8504,,500,3282349,1775395,3551,1506954,3014
|
70 |
+
3,SC-CoT,MATH-500,gpt-4o,2025/1/22,74.6,1.0,4,12.3611,,500,1495125,345347,691,1149778,2300
|
71 |
+
4,SC-CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,72.4,1.0,4,1.7845,,500,3165511,1797045,3594,1368466,2737
|
72 |
+
5,CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,71.2,1.0,4,0.3463,,500,614221,342879,686,271342,543
|
73 |
+
6,SC-CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,71.2,1.0,4,0.0,,500,3155475,1855922,3712,1299553,2599
|
74 |
+
7,IO,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,70.2,1.0,4,0.2506,,500,444591,169549,339,275042,550
|
75 |
+
8,CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,69.8,1.0,4,0.0,,500,617204,354049,708,263155,526
|
76 |
+
9,IO,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,69.4,1.0,4,0.2386,,500,423216,155879,312,267337,535
|
77 |
+
10,CoT,MATH-500,gpt-4o,2025/1/22,68.0,1.0,4,3.0569,,500,552688,329332,659,223356,447
|
78 |
+
11,SC-CoT,MATH-500,Doubao-lite-32k,2025/1/7,65.8,0.998,4,0.0734,,500,1078003,362390,725,715613,1431
|
79 |
+
12,ReAct-Pro*,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,64.6,1.0,4,3.1806,,500,5641879,5223611,10447,418268,837
|
80 |
+
13,ReAct-Pro*,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,62.8,1.0,4,3.4541,,500,6127117,5747268,11495,379849,760
|
81 |
+
14,IO,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,59.4,1.0,4,0.0,,500,411362,169549,339,241813,484
|
82 |
+
15,CoT,MATH-500,Doubao-lite-32k,2025/1/7,59.0,1.0,4,0.0255,,500,479941,336370,673,143571,287
|
83 |
+
16,ReAct-Pro*,MATH-500,gpt-4o,2025/1/22,54.0,1.0,4,17.7735,,500,6153255,5834537,11669,318718,637
|
84 |
+
17,CoT,MATH-500,deepseek-r1:1.5b,2025/1/23,49.4,1.0,4,0.0,,500,1199129,341549,683,857580,1715
|
85 |
+
18,ReAct-Pro*,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,48.8,1.0,4,0.0,,500,4990240,4646708,9293,343532,687
|
86 |
+
19,ReAct-Pro*,MATH-500,Doubao-lite-32k,2025/1/7,47.2,1.0,4,0.186,,500,4388666,4234620,8469,154046,308
|
87 |
+
20,PoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,47.2,0.822,4,0.233,,500,413372,242549,485,170823,342
|
88 |
+
21,SC-CoT,MATH-500,deepseek-r1:1.5b,2025/2/10,46.8,0.992,4,0.0,,500,13968168,1858874,3718,12109294,24219
|
89 |
+
22,CoT,MATH-500,Internllm2_5-7B,2025/1/22,46.6,1.0,4,0.0,,500,546774,332883,666,213891,428
|
90 |
+
23,PoT,MATH-500,gpt-4o,2025/1/22,46.2,0.864,4,1.5994,,500,340960,241357,483,99603,199
|
91 |
+
24,IO,MATH-500,deepseek-r1:1.5b,2025/1/22,43.8,1.0,4,0.0,,500,1022548,157049,314,865499,1731
|
92 |
+
25,PoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,42.6,0.802,4,0.2839,,500,503596,253879,508,249717,499
|
93 |
+
26,IO,MATH-500,gpt-4o,2025/1/22,41.8,1.0,4,2.7907,,500,394447,153832,308,240615,481
|
94 |
+
27,SC-CoT,MATH-500,gpt-3.5-turbo,2025/1/7,40.8,1.0,4,1.2308,,500,1050819,345411,691,705408,1411
|
95 |
+
28,CoT,MATH-500,gpt-3.5-turbo,2025/1/7,39.8,1.0,4,0.3189,,500,432196,329381,659,102815,206
|
96 |
+
29,PoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,39.6,0.744,4,0.0,,500,408812,258549,517,150263,301
|
97 |
+
30,IO,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,38.6,1.0,4,0.0,,500,503934,155563,311,348371,697
|
98 |
+
31,IO,MATH-500,Doubao-lite-32k,2025/1/7,37.4,1.0,4,0.0187,,500,311730,166870,334,144860,290
|
99 |
+
32,PoT,MATH-500,Doubao-lite-32k,2025/1/7,32.6,0.68,4,0.0144,,500,303148,254377,509,48771,98
|
100 |
33,ReAct-Pro*,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,28.8,1.0,4,0.0,,500,8763629,7486706,14973,1276923,2554
|
101 |
34,PoT,MATH-500,gpt-3.5-turbo,2025/1/7,28.8,0.838,4,0.168,,500,271916,239902,480,32014,64
|
102 |
+
35,CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.8,1.0,4,0.0,,500,625568,342879,686,282689,565
|
103 |
+
36,PoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.4,0.684,4,0.0,,500,462271,253879,508,208392,417
|
104 |
+
37,ReAct-Pro*,MATH-500,deepseek-r1:1.5b,2025/2/10,24.4,1.0,4,0.0,,500,30177348,20729970,41460,9447378,18895
|
105 |
+
38,ReAct-Pro*,MATH-500,gpt-3.5-turbo,2025/1/7,23.8,1.0,4,2.0406,,500,3832714,3708461,7417,124253,249
|
106 |
+
39,IO,MATH-500,Internllm2_5-7B,2025/1/22,22.8,1.0,4,0.0,,500,467888,201883,404,266005,532
|
107 |
+
40,SC-CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,19.8,0.998,4,0.0,,500,3490834,1734545,3469,1756289,3513
|
108 |
41,IO,MATH-500,gpt-3.5-turbo,2025/1/7,17.2,1.0,4,0.2436,,500,265625,154881,310,110744,221
|
109 |
42,CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,15.2,1.0,4,0.0,,500,536377,349049,698,187328,375
|
110 |
43,PoT,MATH-500,Internllm2_5-7B,2025/1/22,15.0,0.324,4,0.0,,500,368709,247883,496,120826,242
|
111 |
44,ReAct-Pro*,MATH-500,Internllm2_5-7B,2025/1/22,14.8,1.0,4,0.0,,500,14186105,11831496,23663,2354609,4709
|
112 |
45,ToT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,10.8,1.0,4,9.0421,,500,16039361,15657730,31315,381631,763
|
113 |
+
46,ToT,MATH-500,gpt-3.5-turbo,2025/1/7,9.8,1.0,4,5.2914,,500,10001767,9711244,19422,290523,581
|
114 |
+
47,SC-CoT,MATH-500,Internllm2_5-7B,2025/1/22,9.2,0.974,4,0.0,,500,3249876,1994983,3990,1254893,2510
|
115 |
48,ReAct-Pro*,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,8.2,1.0,4,0.0,,500,8987061,8430774,16862,556287,1113
|
116 |
49,IO,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,7.0,1.0,4,0.0,,500,413878,158777,318,255101,510
|
117 |
50,CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,6.2,1.0,4,0.0,,500,549188,349049,698,200139,400
|
118 |
+
51,ToT,MATH-500,gpt-4o,2025/1/22,3.2,1.0,4,40.8094,,500,15242432,14881985,29764,360447,721
|
119 |
+
52,IO,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,2.6,1.0,4,0.0,,500,429330,159049,318,270281,541
|
120 |
+
53,SC-CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,2.2,0.988,4,0.0,,500,2797682,1808691,3617,988991,1978
|
121 |
+
54,SC-CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,2.0,0.894,4,0.0,,500,3139024,1805170,3610,1333854,2668
|
122 |
+
55,ToT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,1.8,0.908,4,0.0,,500,9035000,7729000,15458,1306000,2612
|
123 |
+
56,ToT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,1.4,0.698,4,8.2699,,500,14669500,14099500,28199,570000,1140
|
124 |
+
57,ToT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,1.4,0.916,4,0.0,,500,10167500,9749000,19498,418500,837
|
125 |
+
58,ToT,MATH-500,Doubao-lite-32k,2025/1/7,1.2,0.942,4,0.2371,,500,5564500,5338500,10677,226000,452
|
126 |
+
59,PoT,MATH-500,deepseek-r1:1.5b,2025/2/10,1.0,0.016,4,0.0,,500,1031067,245549,491,785518,1571
|
127 |
+
60,PoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.022,4,0.0,,500,786870,248509,497,538361,1077
|
128 |
61,ToT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.972,4,0.0,,500,4535000,4408000,8816,127000,254
|
129 |
62,ReAct-Pro*,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.6,1.0,4,0.0,,500,19442440,18137392,36275,1305048,2610
|
130 |
63,ToT,MATH-500,deepseek-r1:1.5b,2025/2/10,0.4,0.716,4,0.0,,500,1941500,1831000,3662,110500,221
|
131 |
64,ToT,MATH-500,Internllm2_5-7B,2025/1/22,0.2,0.99,4,0.0,,500,8350500,7515000,15030,835500,1671
|
132 |
65,PoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,4,0.0,,500,437202,253549,507,183653,367
|
133 |
66,ToT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.962,4,0.0,,500,5996500,5590500,11181,406000,812
|
134 |
+
1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.22,1.0,8,3.7895,,1319,6722014,5295585,4015,1426429,1081
|
135 |
+
2,SC-CoT,gsm8k,gpt-4o,2025/1/22,94.77,1.0,8,18.2044,,1319,2491605,894889,678,1596716,1211
|
136 |
+
3,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,94.77,1.0,8,4.045,,1319,7175258,5370360,4072,1804898,1368
|
137 |
+
4,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8,4.5367,,1319,1165166,948668,719,216498,164
|
138 |
+
5,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8,0.687,,1319,1218665,990168,751,228497,173
|
139 |
+
6,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8,4.2166,,1319,1247912,1101672,835,146240,111
|
140 |
+
7,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8,0.7195,,1319,1276252,1005119,762,271133,206
|
141 |
+
8,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8,0.7054,,1319,1251210,1106682,839,144528,110
|
142 |
+
9,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8,0.4709,,1319,835275,583916,443,251359,191
|
143 |
+
10,ToT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,91.89,1.0,8,20.8753,,1319,37029687,35096810,26609,1932877,1465
|
144 |
+
11,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,91.58,0.9992,8,0.1118,,1319,1835891,942182,714,893709,678
|
145 |
+
12,ToT,gsm8k,gpt-4o,2025/1/22,91.13,1.0,8,86.8581,,1319,30769735,29445237,22324,1324498,1004
|
146 |
+
13,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,90.98,1.0,8,0.0,,1319,7259943,5580524,4231,1679419,1273
|
147 |
+
14,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8,0.0558,,1319,1201820,1042095,790,159725,121
|
148 |
+
15,ToT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,88.88,1.0,8,23.5911,,1319,41847148,40435361,30656,1411787,1070
|
149 |
+
16,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8,3.3463,,1319,741446,542416,411,199030,151
|
150 |
+
17,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8,10.1124,,1319,17937864,17038928,12918,898936,682
|
151 |
+
18,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8,10.5479,,1319,18710437,18160983,13769,549454,417
|
152 |
19,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8,0.4899,,1319,869060,555340,421,313720,238
|
153 |
20,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8,0.0,,1319,1290805,1046008,793,244797,186
|
154 |
21,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8,0.2512,,1319,5998639,5862016,4444,136623,104
|
155 |
22,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8,0.0,,1319,14850914,14355752,10884,495162,375
|
156 |
+
23,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8,0.0576,,1319,1288055,1170038,887,118017,89
|
157 |
+
24,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8,0.6788,,1319,1088041,953242,723,134799,102
|
158 |
+
25,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8,0.0,,1319,1202163,968163,734,234000,177
|
159 |
+
26,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8,0.6902,,1319,1187080,1090418,827,96662,73
|
160 |
+
27,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8,0.0,,1319,1248329,990168,751,258161,196
|
161 |
+
28,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8,3.4633,,1319,6646286,6506164,4933,140122,106
|
162 |
+
29,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8,0.9736,,1319,1727044,1126025,854,601019,456
|
163 |
+
30,ToT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,72.21,0.9901,8,0.0,,1319,31657319,20196528,15312,11460791,8689
|
164 |
+
31,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8,0.0354,,1319,740483,617377,468,123106,93
|
165 |
+
32,CoT,gsm8k,deepseek-r1:1.5b,2025/1/23,70.66,0.9977,8,0.0,,1319,2090625,1011714,767,1078911,818
|
166 |
+
33,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,69.29,0.9879,8,2.5203,,1319,2277249,895571,679,1381678,1048
|
167 |
+
34,SC-CoT,gsm8k,deepseek-r1:1.5b,2025/2/10,69.07,0.9879,8,0.0,,1319,10029684,5407357,4100,4622327,3504
|
168 |
35,ToT,gsm8k,gpt-3.5-turbo,2025/1/7,67.93,0.997,8,9.1707,,1319,16727175,15920037,12070,807138,612
|
169 |
36,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8,0.0,,1319,22835767,21044978,15955,1790789,1358
|
170 |
37,ToT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,65.05,0.9196,8,0.0,,1319,16432102,15554967,11793,877135,665
|
|
|
174 |
41,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8,0.0,,1319,887913,596229,452,291684,221
|
175 |
42,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8,0.0,,1319,1745429,550941,418,1194488,906
|
176 |
43,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8,0.0,,1319,1218525,1032818,783,185707,141
|
177 |
+
44,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,54.36,0.9985,8,0.0,,1319,10956434,5136762,3894,5819672,4412
|
178 |
+
45,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,44.66,0.9181,8,0.0,,1319,8162499,5847761,4433,2314738,1755
|
179 |
46,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8,0.0,,1319,1391111,1147538,870,243573,185
|
180 |
47,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8,0.0,,1319,1324949,1136843,862,188106,143
|
181 |
48,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8,0.3328,,1319,586553,546990,415,39563,30
|
|
|
191 |
58,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8,0.0,,1319,736996,568530,431,168466,128
|
192 |
59,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8,0.0,,1319,834897,568116,431,266781,202
|
193 |
60,PoT,gsm8k,deepseek-r1:1.5b,2025/2/10,11.9,0.1744,8,0.0,,1319,1954509,1138872,863,815637,618
|
194 |
+
61,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8,0.0,,1319,1113728,679302,515,434426,329
|
195 |
+
62,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.63,0.1691,8,0.0,,1319,1389135,1151528,873,237607,180
|
196 |
+
63,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,8.19,0.6876,8,0.0,,1319,7386453,5439568,4124,1946885,1476
|
197 |
64,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8,0.0,,1319,55392611,52431343,39751,2961268,2245
|
198 |
+
65,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,4.17,0.9447,8,0.0,,1319,7478767,5441962,4126,2036805,1544
|
199 |
66,ToT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,8,0.0,,1319,0,0,0,0,0
|
src/overall_math_score.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"time": "2025-
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"META": {
|
@@ -84,16 +84,16 @@
|
|
84 |
"Eval Date": "2025/1/7"
|
85 |
},
|
86 |
"gsm8k": {
|
87 |
-
"Score":
|
88 |
-
"Cost($)":
|
89 |
},
|
90 |
"AQuA": {
|
91 |
-
"Score": 66
|
92 |
-
"Cost($)": 0.
|
93 |
},
|
94 |
"MATH-500": {
|
95 |
-
"Score":
|
96 |
-
"Cost($)": 1.
|
97 |
}
|
98 |
},
|
99 |
"ToT": {
|
@@ -198,16 +198,16 @@
|
|
198 |
"Eval Date": "2025/1/7"
|
199 |
},
|
200 |
"gsm8k": {
|
201 |
-
"Score":
|
202 |
-
"Cost($)": 0.
|
203 |
},
|
204 |
"AQuA": {
|
205 |
-
"Score":
|
206 |
-
"Cost($)": 0.
|
207 |
},
|
208 |
"MATH-500": {
|
209 |
-
"Score":
|
210 |
-
"Cost($)": 0.
|
211 |
}
|
212 |
},
|
213 |
"ToT-Doubao-lite-32k": {
|
@@ -312,16 +312,16 @@
|
|
312 |
"Eval Date": "2025/1/22"
|
313 |
},
|
314 |
"gsm8k": {
|
315 |
-
"Score":
|
316 |
-
"Cost($)":
|
317 |
},
|
318 |
"AQuA": {
|
319 |
-
"Score":
|
320 |
-
"Cost($)":
|
321 |
},
|
322 |
"MATH-500": {
|
323 |
-
"Score":
|
324 |
-
"Cost($)":
|
325 |
}
|
326 |
},
|
327 |
"ToT-gpt-4o": {
|
@@ -426,16 +426,16 @@
|
|
426 |
"Eval Date": "2025/1/22"
|
427 |
},
|
428 |
"gsm8k": {
|
429 |
-
"Score":
|
430 |
-
"Cost($)":
|
431 |
},
|
432 |
"AQuA": {
|
433 |
-
"Score": 85.
|
434 |
-
"Cost($)":
|
435 |
},
|
436 |
"MATH-500": {
|
437 |
-
"Score":
|
438 |
-
"Cost($)":
|
439 |
}
|
440 |
},
|
441 |
"ToT-Qwen2.5-72B-Instruct": {
|
@@ -540,16 +540,16 @@
|
|
540 |
"Eval Date": "2025/1/22"
|
541 |
},
|
542 |
"gsm8k": {
|
543 |
-
"Score": 95.
|
544 |
-
"Cost($)":
|
545 |
},
|
546 |
"AQuA": {
|
547 |
-
"Score":
|
548 |
-
"Cost($)":
|
549 |
},
|
550 |
"MATH-500": {
|
551 |
-
"Score":
|
552 |
-
"Cost($)":
|
553 |
}
|
554 |
},
|
555 |
"ToT-Llama-3.3-70B-Instruct": {
|
@@ -654,15 +654,15 @@
|
|
654 |
"Eval Date": "2025/1/22"
|
655 |
},
|
656 |
"gsm8k": {
|
657 |
-
"Score":
|
658 |
"Cost($)": 0.0
|
659 |
},
|
660 |
"AQuA": {
|
661 |
-
"Score": 79.
|
662 |
"Cost($)": 0.0
|
663 |
},
|
664 |
"MATH-500": {
|
665 |
-
"Score":
|
666 |
"Cost($)": 0.0
|
667 |
}
|
668 |
},
|
@@ -768,7 +768,7 @@
|
|
768 |
"Eval Date": "2025/1/22"
|
769 |
},
|
770 |
"gsm8k": {
|
771 |
-
"Score":
|
772 |
"Cost($)": 0.0
|
773 |
},
|
774 |
"AQuA": {
|
@@ -776,7 +776,7 @@
|
|
776 |
"Cost($)": 0.0
|
777 |
},
|
778 |
"MATH-500": {
|
779 |
-
"Score":
|
780 |
"Cost($)": 0.0
|
781 |
}
|
782 |
},
|
@@ -882,15 +882,15 @@
|
|
882 |
"Eval Date": "2025/1/22"
|
883 |
},
|
884 |
"gsm8k": {
|
885 |
-
"Score":
|
886 |
"Cost($)": 0.0
|
887 |
},
|
888 |
"AQuA": {
|
889 |
-
"Score":
|
890 |
"Cost($)": 0.0
|
891 |
},
|
892 |
"MATH-500": {
|
893 |
-
"Score": 9.
|
894 |
"Cost($)": 0.0
|
895 |
}
|
896 |
},
|
@@ -996,15 +996,15 @@
|
|
996 |
"Eval Date": "2025/1/22"
|
997 |
},
|
998 |
"gsm8k": {
|
999 |
-
"Score":
|
1000 |
"Cost($)": 0.0
|
1001 |
},
|
1002 |
"AQuA": {
|
1003 |
-
"Score":
|
1004 |
"Cost($)": 0.0
|
1005 |
},
|
1006 |
"MATH-500": {
|
1007 |
-
"Score":
|
1008 |
"Cost($)": 0.0
|
1009 |
}
|
1010 |
},
|
@@ -1110,15 +1110,15 @@
|
|
1110 |
"Eval Date": "2025/1/22"
|
1111 |
},
|
1112 |
"gsm8k": {
|
1113 |
-
"Score":
|
1114 |
"Cost($)": 0.0
|
1115 |
},
|
1116 |
"AQuA": {
|
1117 |
-
"Score":
|
1118 |
"Cost($)": 0.0
|
1119 |
},
|
1120 |
"MATH-500": {
|
1121 |
-
"Score":
|
1122 |
"Cost($)": 0.0
|
1123 |
}
|
1124 |
},
|
@@ -1224,15 +1224,15 @@
|
|
1224 |
"Eval Date": "2025/2/10"
|
1225 |
},
|
1226 |
"gsm8k": {
|
1227 |
-
"Score":
|
1228 |
"Cost($)": 0.0
|
1229 |
},
|
1230 |
"AQuA": {
|
1231 |
-
"Score":
|
1232 |
"Cost($)": 0.0
|
1233 |
},
|
1234 |
"MATH-500": {
|
1235 |
-
"Score":
|
1236 |
"Cost($)": 0.0
|
1237 |
}
|
1238 |
},
|
|
|
1 |
{
|
2 |
+
"time": "2025-03-05 13:15:02",
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"META": {
|
|
|
84 |
"Eval Date": "2025/1/7"
|
85 |
},
|
86 |
"gsm8k": {
|
87 |
+
"Score": 69.29,
|
88 |
+
"Cost($)": 2.5203
|
89 |
},
|
90 |
"AQuA": {
|
91 |
+
"Score": 58.66,
|
92 |
+
"Cost($)": 0.3277
|
93 |
},
|
94 |
"MATH-500": {
|
95 |
+
"Score": 40.8,
|
96 |
+
"Cost($)": 1.2308
|
97 |
}
|
98 |
},
|
99 |
"ToT": {
|
|
|
198 |
"Eval Date": "2025/1/7"
|
199 |
},
|
200 |
"gsm8k": {
|
201 |
+
"Score": 91.58,
|
202 |
+
"Cost($)": 0.1118
|
203 |
},
|
204 |
"AQuA": {
|
205 |
+
"Score": 76.37,
|
206 |
+
"Cost($)": 0.0279
|
207 |
},
|
208 |
"MATH-500": {
|
209 |
+
"Score": 65.8,
|
210 |
+
"Cost($)": 0.0734
|
211 |
}
|
212 |
},
|
213 |
"ToT-Doubao-lite-32k": {
|
|
|
312 |
"Eval Date": "2025/1/22"
|
313 |
},
|
314 |
"gsm8k": {
|
315 |
+
"Score": 94.77,
|
316 |
+
"Cost($)": 18.2044
|
317 |
},
|
318 |
"AQuA": {
|
319 |
+
"Score": 85.83,
|
320 |
+
"Cost($)": 5.2456
|
321 |
},
|
322 |
"MATH-500": {
|
323 |
+
"Score": 74.6,
|
324 |
+
"Cost($)": 12.3611
|
325 |
}
|
326 |
},
|
327 |
"ToT-gpt-4o": {
|
|
|
426 |
"Eval Date": "2025/1/22"
|
427 |
},
|
428 |
"gsm8k": {
|
429 |
+
"Score": 94.77,
|
430 |
+
"Cost($)": 4.045
|
431 |
},
|
432 |
"AQuA": {
|
433 |
+
"Score": 85.43,
|
434 |
+
"Cost($)": 0.4186
|
435 |
},
|
436 |
"MATH-500": {
|
437 |
+
"Score": 79.8,
|
438 |
+
"Cost($)": 1.8504
|
439 |
}
|
440 |
},
|
441 |
"ToT-Qwen2.5-72B-Instruct": {
|
|
|
540 |
"Eval Date": "2025/1/22"
|
541 |
},
|
542 |
"gsm8k": {
|
543 |
+
"Score": 95.22,
|
544 |
+
"Cost($)": 3.7895
|
545 |
},
|
546 |
"AQuA": {
|
547 |
+
"Score": 84.65,
|
548 |
+
"Cost($)": 0.4438
|
549 |
},
|
550 |
"MATH-500": {
|
551 |
+
"Score": 72.4,
|
552 |
+
"Cost($)": 1.7845
|
553 |
}
|
554 |
},
|
555 |
"ToT-Llama-3.3-70B-Instruct": {
|
|
|
654 |
"Eval Date": "2025/1/22"
|
655 |
},
|
656 |
"gsm8k": {
|
657 |
+
"Score": 90.98,
|
658 |
"Cost($)": 0.0
|
659 |
},
|
660 |
"AQuA": {
|
661 |
+
"Score": 79.53,
|
662 |
"Cost($)": 0.0
|
663 |
},
|
664 |
"MATH-500": {
|
665 |
+
"Score": 71.2,
|
666 |
"Cost($)": 0.0
|
667 |
}
|
668 |
},
|
|
|
768 |
"Eval Date": "2025/1/22"
|
769 |
},
|
770 |
"gsm8k": {
|
771 |
+
"Score": 54.36,
|
772 |
"Cost($)": 0.0
|
773 |
},
|
774 |
"AQuA": {
|
|
|
776 |
"Cost($)": 0.0
|
777 |
},
|
778 |
"MATH-500": {
|
779 |
+
"Score": 19.8,
|
780 |
"Cost($)": 0.0
|
781 |
}
|
782 |
},
|
|
|
882 |
"Eval Date": "2025/1/22"
|
883 |
},
|
884 |
"gsm8k": {
|
885 |
+
"Score": 44.66,
|
886 |
"Cost($)": 0.0
|
887 |
},
|
888 |
"AQuA": {
|
889 |
+
"Score": 38.58,
|
890 |
"Cost($)": 0.0
|
891 |
},
|
892 |
"MATH-500": {
|
893 |
+
"Score": 9.2,
|
894 |
"Cost($)": 0.0
|
895 |
}
|
896 |
},
|
|
|
996 |
"Eval Date": "2025/1/22"
|
997 |
},
|
998 |
"gsm8k": {
|
999 |
+
"Score": 8.19,
|
1000 |
"Cost($)": 0.0
|
1001 |
},
|
1002 |
"AQuA": {
|
1003 |
+
"Score": 10.63,
|
1004 |
"Cost($)": 0.0
|
1005 |
},
|
1006 |
"MATH-500": {
|
1007 |
+
"Score": 2.0,
|
1008 |
"Cost($)": 0.0
|
1009 |
}
|
1010 |
},
|
|
|
1110 |
"Eval Date": "2025/1/22"
|
1111 |
},
|
1112 |
"gsm8k": {
|
1113 |
+
"Score": 4.17,
|
1114 |
"Cost($)": 0.0
|
1115 |
},
|
1116 |
"AQuA": {
|
1117 |
+
"Score": 17.32,
|
1118 |
"Cost($)": 0.0
|
1119 |
},
|
1120 |
"MATH-500": {
|
1121 |
+
"Score": 2.2,
|
1122 |
"Cost($)": 0.0
|
1123 |
}
|
1124 |
},
|
|
|
1224 |
"Eval Date": "2025/2/10"
|
1225 |
},
|
1226 |
"gsm8k": {
|
1227 |
+
"Score": 69.07,
|
1228 |
"Cost($)": 0.0
|
1229 |
},
|
1230 |
"AQuA": {
|
1231 |
+
"Score": 57.87,
|
1232 |
"Cost($)": 0.0
|
1233 |
},
|
1234 |
"MATH-500": {
|
1235 |
+
"Score": 46.8,
|
1236 |
"Cost($)": 0.0
|
1237 |
}
|
1238 |
},
|
src/overall_results.csv
CHANGED
@@ -1,20 +1,20 @@
|
|
1 |
Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($),MATH-500-Score,MATH-500-Cost($)
|
2 |
-
1.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,86.
|
3 |
-
2.0,
|
4 |
-
3.0,SC-CoT,
|
5 |
-
4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,
|
6 |
-
5.0,CoT,
|
7 |
-
6.0,
|
8 |
-
7.0,IO,
|
9 |
-
8.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,
|
10 |
-
9.0,
|
11 |
-
10.0,
|
12 |
-
11.0,CoT,Doubao-lite-32k,2025/1/7,77.
|
13 |
-
12.0,ReAct-Pro*,
|
14 |
-
13.0,
|
15 |
-
14.0,
|
16 |
-
15.0,PoT,
|
17 |
-
16.0,
|
18 |
17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,70.12,85.6,0.2512,77.56,0.0445,47.2,0.186
|
19 |
18.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,68.69,82.87,0.0,74.41,0.0,48.8,0.0
|
20 |
19.0,IO,gpt-4o,2025/1/22,68.60,88.4,3.3463,75.59,1.1453,41.8,2.7907
|
@@ -29,24 +29,24 @@ Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA
|
|
29 |
28.0,IO,deepseek-r1:1.5b,2025/1/22,58.95,64.14,0.0,68.9,0.0,43.8,0.0
|
30 |
29.0,ToT,Llama-3.3-70B-Instruct,2025/1/22,58.79,91.89,20.8753,83.07,2.9404,1.4,8.2699
|
31 |
30.0,ToT,gpt-4o,2025/1/22,58.61,91.13,86.8581,81.5,8.5295,3.2,40.8094
|
32 |
-
31.0,
|
33 |
-
32.0,
|
34 |
-
33.0,
|
35 |
-
34.0,PoT,
|
36 |
-
35.0,
|
37 |
-
36.0,
|
38 |
37.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,53.96,75.44,0.0,60.63,0.0,25.8,0.0
|
39 |
-
38.0,
|
40 |
-
39.0,
|
41 |
-
40.0,
|
42 |
-
41.0,
|
43 |
42.0,ToT,Qwen2.5-7B-Instruct,2025/1/22,42.52,72.21,0.0,53.94,0.0,1.4,0.0
|
44 |
43.0,ToT,Llama-3.1-8B-Instruct,2025/1/22,41.97,65.05,0.0,59.06,0.0,1.8,0.0
|
45 |
44.0,ReAct-Pro*,deepseek-r1:1.5b,2025/2/10,38.22,35.94,0.0,54.33,0.0,24.4,0.0
|
46 |
45.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,37.08,55.5,0.0,40.55,0.0,15.2,0.0
|
47 |
46.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,33.56,38.67,0.0,36.61,0.0,25.4,0.0
|
48 |
-
47.0,
|
49 |
-
48.0,
|
50 |
49.0,PoT,Internllm2_5-7B,2025/1/22,29.94,38.21,0.0,36.61,0.0,15.0,0.0
|
51 |
50.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,29.75,33.51,0.0,40.94,0.0,14.8,0.0
|
52 |
51.0,ToT,Doubao-lite-32k,2025/1/7,28.10,37.83,0.8739,45.28,0.0881,1.2,0.2371
|
@@ -60,8 +60,8 @@ Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA
|
|
60 |
59.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,16.67,18.5,0.0,30.71,0.0,0.8,0.0
|
61 |
60.0,ToT,deepseek-r1:1.5b,2025/2/10,16.11,23.12,0.0,24.8,0.0,0.4,0.0
|
62 |
61.0,IO,Qwen2-0.5B-Instruct,2025/1/22,14.83,14.71,0.0,27.17,0.0,2.6,0.0
|
63 |
-
62.0,
|
64 |
-
63.0,
|
65 |
-
64.0,
|
66 |
-
65.0,
|
67 |
-
66.0,SC-CoT,Qwen2-
|
|
|
1 |
Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($),MATH-500-Score,MATH-500-Cost($)
|
2 |
+
1.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,86.67,94.77,4.045,85.43,0.4186,79.8,1.8504
|
3 |
+
2.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,86.43,92.87,0.7195,86.22,0.0808,80.2,0.349
|
4 |
+
3.0,SC-CoT,gpt-4o,2025/1/22,85.07,94.77,18.2044,85.83,5.2456,74.6,12.3611
|
5 |
+
4.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,84.09,95.22,3.7895,84.65,0.4438,72.4,1.7845
|
6 |
+
5.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,82.86,93.93,0.687,83.46,0.0927,71.2,0.3463
|
7 |
+
6.0,CoT,gpt-4o,2025/1/22,81.59,94.09,4.5367,82.68,1.0417,68.0,3.0569
|
8 |
+
7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,81.45,92.27,0.4709,82.68,0.0798,69.4,0.2386
|
9 |
+
8.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,80.57,90.98,0.0,79.53,0.0,71.2,0.0
|
10 |
+
9.0,IO,Qwen2.5-72B-Instruct,2025/1/22,80.34,86.58,0.4899,84.25,0.0742,70.2,0.2506
|
11 |
+
10.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,78.73,85.67,0.0,80.71,0.0,69.8,0.0
|
12 |
+
11.0,SC-CoT,Doubao-lite-32k,2025/1/7,77.92,91.58,0.1118,76.37,0.0279,65.8,0.0734
|
13 |
+
12.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,77.12,87.64,10.1124,79.13,0.768,64.6,3.1806
|
14 |
+
13.0,CoT,Doubao-lite-32k,2025/1/7,77.00,89.31,0.0558,82.68,0.0066,59.0,0.0255
|
15 |
+
14.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,74.43,87.26,10.5479,73.23,0.3177,62.8,3.4541
|
16 |
+
15.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,71.58,92.34,0.7054,75.2,0.1645,47.2,0.233
|
17 |
+
16.0,PoT,gpt-4o,2025/1/22,71.50,93.1,4.2166,75.2,1.6087,46.2,1.5994
|
18 |
17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,70.12,85.6,0.2512,77.56,0.0445,47.2,0.186
|
19 |
18.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,68.69,82.87,0.0,74.41,0.0,48.8,0.0
|
20 |
19.0,IO,gpt-4o,2025/1/22,68.60,88.4,3.3463,75.59,1.1453,41.8,2.7907
|
|
|
29 |
28.0,IO,deepseek-r1:1.5b,2025/1/22,58.95,64.14,0.0,68.9,0.0,43.8,0.0
|
30 |
29.0,ToT,Llama-3.3-70B-Instruct,2025/1/22,58.79,91.89,20.8753,83.07,2.9404,1.4,8.2699
|
31 |
30.0,ToT,gpt-4o,2025/1/22,58.61,91.13,86.8581,81.5,8.5295,3.2,40.8094
|
32 |
+
31.0,ReAct-Pro*,gpt-4o,2025/1/22,58.26,63.31,39.0751,57.48,2.304,54.0,17.7735
|
33 |
+
32.0,SC-CoT,deepseek-r1:1.5b,2025/2/10,57.91,69.07,0.0,57.87,0.0,46.8,0.0
|
34 |
+
33.0,SC-CoT,gpt-3.5-turbo,2025/1/7,56.25,69.29,2.5203,58.66,0.3277,40.8,1.2308
|
35 |
+
34.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,55.51,58.83,0.0,68.11,0.0,39.6,0.0
|
36 |
+
35.0,PoT,gpt-3.5-turbo,2025/1/7,55.04,76.88,0.6902,59.45,0.1748,28.8,0.168
|
37 |
+
36.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,54.43,74.91,3.4633,64.57,0.4928,23.8,2.0406
|
38 |
37.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,53.96,75.44,0.0,60.63,0.0,25.8,0.0
|
39 |
+
38.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,50.70,67.78,0.0,55.51,0.0,28.8,0.0
|
40 |
+
39.0,IO,Llama-3.1-8B-Instruct,2025/1/22,48.98,57.16,0.0,51.18,0.0,38.6,0.0
|
41 |
+
40.0,ToT,gpt-3.5-turbo,2025/1/7,44.94,67.93,9.1707,57.09,1.1513,9.8,5.2914
|
42 |
+
41.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,44.54,54.36,0.0,59.45,0.0,19.8,0.0
|
43 |
42.0,ToT,Qwen2.5-7B-Instruct,2025/1/22,42.52,72.21,0.0,53.94,0.0,1.4,0.0
|
44 |
43.0,ToT,Llama-3.1-8B-Instruct,2025/1/22,41.97,65.05,0.0,59.06,0.0,1.8,0.0
|
45 |
44.0,ReAct-Pro*,deepseek-r1:1.5b,2025/2/10,38.22,35.94,0.0,54.33,0.0,24.4,0.0
|
46 |
45.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,37.08,55.5,0.0,40.55,0.0,15.2,0.0
|
47 |
46.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,33.56,38.67,0.0,36.61,0.0,25.4,0.0
|
48 |
+
47.0,IO,gpt-3.5-turbo,2025/1/7,31.34,37.83,0.3328,38.98,0.038,17.2,0.2436
|
49 |
+
48.0,SC-CoT,Internllm2_5-7B,2025/1/22,30.81,44.66,0.0,38.58,0.0,9.2,0.0
|
50 |
49.0,PoT,Internllm2_5-7B,2025/1/22,29.94,38.21,0.0,36.61,0.0,15.0,0.0
|
51 |
50.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,29.75,33.51,0.0,40.94,0.0,14.8,0.0
|
52 |
51.0,ToT,Doubao-lite-32k,2025/1/7,28.10,37.83,0.8739,45.28,0.0881,1.2,0.2371
|
|
|
60 |
59.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,16.67,18.5,0.0,30.71,0.0,0.8,0.0
|
61 |
60.0,ToT,deepseek-r1:1.5b,2025/2/10,16.11,23.12,0.0,24.8,0.0,0.4,0.0
|
62 |
61.0,IO,Qwen2-0.5B-Instruct,2025/1/22,14.83,14.71,0.0,27.17,0.0,2.6,0.0
|
63 |
+
62.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,10.76,7.66,0.0,24.02,0.0,0.6,0.0
|
64 |
+
63.0,ToT,Qwen2-0.5B-Instruct,2025/1/22,9.97,0.0,0.0,29.92,0.0,0.0,0.0
|
65 |
+
64.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,8.98,9.63,0.0,17.32,0.0,0.0,0.0
|
66 |
+
65.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,7.90,4.17,0.0,17.32,0.0,2.2,0.0
|
67 |
+
66.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,6.94,8.19,0.0,10.63,0.0,2.0,0.0
|
src/record.csv
CHANGED
@@ -43,17 +43,17 @@ CoT,gsm8k,2025/1/22,Internllm2_5-7B,77.71,99.70,8,,"1,319","968,163",734,"234,00
|
|
43 |
CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.50,100.00,8,,"1,319","1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
44 |
CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,"1,319","1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
45 |
CoT,gsm8k,2025/1/23,deepseek-r1:1.5b,70.66,99.77,8,,"1,319","1,011,714",767,"1,078,911",818,"2,090,625",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
46 |
-
SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,
|
47 |
-
SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,
|
48 |
-
SC-CoT,gsm8k,2025/1/22,gpt-4o,
|
49 |
-
SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,
|
50 |
-
SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.
|
51 |
-
SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,
|
52 |
-
SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,
|
53 |
-
SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,
|
54 |
-
SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,
|
55 |
-
SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,
|
56 |
-
SC-CoT,gsm8k,2025/2/10,deepseek-r1:1.5b,
|
57 |
ToT,gsm8k,2025/1/7,gpt-3.5-turbo,67.93,99.70,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","15,920,037","12,070","807,138",612,"16,727,175",9.1707,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
58 |
ToT,gsm8k,2025/1/7,Doubao-lite-32k,37.83,87.34,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","19,208,597","14,563","1,065,752",808,"20,274,349",0.8739,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
59 |
ToT,gsm8k,2025/1/22,gpt-4o,91.13,100.00,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","29,445,237","22,324","1,324,498","1,004","30,769,735",86.8581,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
@@ -98,17 +98,17 @@ PoT,AQuA,2025/1/22,Internllm2_5-7B,36.61,98.82,0,,254,"233,505",919,"68,457",270
|
|
98 |
PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
99 |
PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
100 |
PoT,AQuA,2025/2/10,deepseek-r1:1.5b,54.72,97.24,0,,254,"250,690",987,"765,957","3,016","1,016,647",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
101 |
-
SC-CoT,AQuA,2025/1/22,gpt-3.5-turbo,66
|
102 |
-
SC-CoT,AQuA,2025/1/22,Doubao-lite-32k,
|
103 |
-
SC-CoT,AQuA,2025/1/22,gpt-4o,
|
104 |
-
SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.
|
105 |
-
SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,
|
106 |
-
SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,79.
|
107 |
-
SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,59.45,
|
108 |
-
SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,
|
109 |
-
SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,
|
110 |
-
SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,
|
111 |
-
SC-CoT,AQuA,2025/2/10,deepseek-r1:1.5b,
|
112 |
ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action εη¬θΏε,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
113 |
ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action εη¬θΏε,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
114 |
ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action εη¬θΏε,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
@@ -164,17 +164,17 @@ PoT,MATH-500,2025/2/10,Internllm2_5-7B,15.00,32.40,4,,500,"247,883",496,"120,826
|
|
164 |
PoT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,0.80,2.20,4,,500,"248,509",497,"538,361","1,077","786,870",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
165 |
PoT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.00,0.00,4,,500,"253,549",507,"183,653",367,"437,202",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
166 |
PoT,MATH-500,2025/2/10,deepseek-r1:1.5b,1.00,1.60,4,,500,"245,549",491,"785,518","1,571","1,031,067",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
167 |
-
SC-CoT,MATH-500,2025/2/10,gpt-3.5-turbo,
|
168 |
-
SC-CoT,MATH-500,2025/2/10,Doubao-lite-32k,
|
169 |
-
SC-CoT,MATH-500,2025/2/10,gpt-4o,
|
170 |
-
SC-CoT,MATH-500,2025/2/10,Qwen2.5-72B-Instruct,
|
171 |
-
SC-CoT,MATH-500,2025/2/10,Llama-3.3-70B-Instruct,
|
172 |
-
SC-CoT,MATH-500,2025/2/10,Qwen2.5-7B-Instruct,
|
173 |
-
SC-CoT,MATH-500,2025/2/10,Llama-3.1-8B-Instruct,
|
174 |
-
SC-CoT,MATH-500,2025/2/10,Internllm2_5-7B,9.
|
175 |
-
SC-CoT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,
|
176 |
-
SC-CoT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,
|
177 |
-
SC-CoT,MATH-500,2025/2/10,deepseek-r1:1.5b,
|
178 |
ReAct-Pro*,MATH-500,2025/2/10,gpt-3.5-turbo,23.80,100.00,4,max_steps=10,500,"3,708,461","7,417","124,253",249,"3,832,714",2.0406,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
179 |
ReAct-Pro*,MATH-500,2025/2/10,Doubao-lite-32k,47.20,100.00,4,max_steps=10,500,"4,234,620","8,469","154,046",308,"4,388,666",0.1860,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
180 |
ReAct-Pro*,MATH-500,2025/2/10,gpt-4o,54.00,100.00,4,max_steps=10,500,"5,834,537","11,669","318,718",637,"6,153,255",17.7735,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
|
|
43 |
CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.50,100.00,8,,"1,319","1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
44 |
CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,"1,319","1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
45 |
CoT,gsm8k,2025/1/23,deepseek-r1:1.5b,70.66,99.77,8,,"1,319","1,011,714",767,"1,078,911",818,"2,090,625",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
46 |
+
SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,69.29,98.79,8,"temperature=1, path_num=5","1,319","895,571",679,"1,381,678","1,048","2,277,249",2.5203,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
47 |
+
SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,91.58,99.92,8,"temperature=1, path_num=5","1,319","942,182",714,"893,709",678,"1,835,891",0.1118,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
48 |
+
SC-CoT,gsm8k,2025/1/22,gpt-4o,94.77,100.00,8,"temperature=1, path_num=5","1,319","894,889",678,"1,596,716","1,211","2,491,605",18.2044,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
49 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,94.77,100.00,8,"temperature=1, path_num=5","1,319","5,370,360","4,072","1,804,898","1,368","7,175,258",4.0450,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
50 |
+
SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.22,100.00,8,"temperature=1, path_num=5","1,319","5,295,585","4,015","1,426,429","1,081","6,722,014",3.7895,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
51 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,90.98,100.00,8,"temperature=1, path_num=5","1,319","5,580,524","4,231","1,679,419","1,273","7,259,943",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
52 |
+
SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,54.36,99.85,8,"temperature=1, path_num=5","1,319","5,136,762","3,894","5,819,672","4,412","10,956,434",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
53 |
+
SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,44.66,91.81,8,"temperature=1, path_num=5","1,319","5,847,761","4,433","2,314,738","1,755","8,162,499",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
54 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,8.19,68.76,8,"temperature=1, path_num=5","1,319","5,439,568","4,124","1,946,885","1,476","7,386,453",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
55 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,4.17,94.47,8,"temperature=1, path_num=5","1,319","5,441,962","4,126","2,036,805","1,544","7,478,767",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
56 |
+
SC-CoT,gsm8k,2025/2/10,deepseek-r1:1.5b,69.07,98.79,8,"temperature=1, path_num=5","1,319","5,407,357","4,100","4,622,327","3,504","10,029,684",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
57 |
ToT,gsm8k,2025/1/7,gpt-3.5-turbo,67.93,99.70,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","15,920,037","12,070","807,138",612,"16,727,175",9.1707,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
58 |
ToT,gsm8k,2025/1/7,Doubao-lite-32k,37.83,87.34,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","19,208,597","14,563","1,065,752",808,"20,274,349",0.8739,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
59 |
ToT,gsm8k,2025/1/22,gpt-4o,91.13,100.00,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","29,445,237","22,324","1,324,498","1,004","30,769,735",86.8581,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
|
|
98 |
PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
99 |
PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
100 |
PoT,AQuA,2025/2/10,deepseek-r1:1.5b,54.72,97.24,0,,254,"250,690",987,"765,957","3,016","1,016,647",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
101 |
+
SC-CoT,AQuA,2025/1/22,gpt-3.5-turbo,58.66,92.52,0,"temperature=1, path_num=5",254,"27,906",110,"209,160",823,"237,066",0.3277,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
102 |
+
SC-CoT,AQuA,2025/1/22,Doubao-lite-32k,76.37,91.73,0,"temperature=1, path_num=5",254,"31,703",125,"325,136","1,280","356,839",0.0279,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
103 |
+
SC-CoT,AQuA,2025/1/22,gpt-4o,85.83,99.21,0,"temperature=1, path_num=5",254,"27,829",110,"517,602","2,038","545,431",5.2456,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
104 |
+
SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.43,96.85,0,"temperature=1, path_num=5",254,"137,990",543,"604,562","2,380","742,552",0.4186,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
105 |
+
SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,84.65,99.61,0,"temperature=1, path_num=5",254,"175,050",689,"612,262","2,410","787,312",0.4438,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
106 |
+
SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,79.53,100.00,0,"temperature=1, path_num=5",254,"177,972",701,"567,438","2,234","745,410",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
107 |
+
SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,59.45,95.67,0,"temperature=1, path_num=5",254,"145,108",571,"544,969","2,146","690,077",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
108 |
+
SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,38.58,97.24,0,"temperature=1, path_num=5",254,"264,557","1,042","615,114","2,422","879,671",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
109 |
+
SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,10.63,51.57,0,"temperature=1, path_num=5",254,"151,410",596,"550,570","2,168","701,980",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
110 |
+
SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,82.28,0,"temperature=1, path_num=5",254,"150,787",594,"603,126","2,375","753,913",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
111 |
+
SC-CoT,AQuA,2025/2/10,deepseek-r1:1.5b,57.87,74.02,0,"temperature=1, path_num=5",254,"144,710",570,"1,987,401","7,824","2,132,111",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
112 |
ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action εη¬θΏε,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
113 |
ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action εη¬θΏε,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
114 |
ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action εη¬θΏε,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
|
|
164 |
PoT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,0.80,2.20,4,,500,"248,509",497,"538,361","1,077","786,870",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
165 |
PoT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.00,0.00,4,,500,"253,549",507,"183,653",367,"437,202",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
166 |
PoT,MATH-500,2025/2/10,deepseek-r1:1.5b,1.00,1.60,4,,500,"245,549",491,"785,518","1,571","1,031,067",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
167 |
+
SC-CoT,MATH-500,2025/2/10,gpt-3.5-turbo,40.80,100.00,4,"temperature=1, path_num=5",500,"345,411",691,"705,408","1,411","1,050,819",1.2308,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
168 |
+
SC-CoT,MATH-500,2025/2/10,Doubao-lite-32k,65.80,99.80,4,"temperature=1, path_num=5",500,"362,390",725,"715,613","1,431","1,078,003",0.0734,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
169 |
+
SC-CoT,MATH-500,2025/2/10,gpt-4o,74.60,100.00,4,"temperature=1, path_num=5",500,"345,347",691,"1,149,778","2,300","1,495,125",12.3611,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
170 |
+
SC-CoT,MATH-500,2025/2/10,Qwen2.5-72B-Instruct,79.80,100.00,4,"temperature=1, path_num=5",500,"1,775,395","3,551","1,506,954","3,014","3,282,349",1.8504,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
171 |
+
SC-CoT,MATH-500,2025/2/10,Llama-3.3-70B-Instruct,72.40,100.00,4,"temperature=1, path_num=5",500,"1,797,045","3,594","1,368,466","2,737","3,165,511",1.7845,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
172 |
+
SC-CoT,MATH-500,2025/2/10,Qwen2.5-7B-Instruct,71.20,100.00,4,"temperature=1, path_num=5",500,"1,855,922","3,712","1,299,553","2,599","3,155,475",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
173 |
+
SC-CoT,MATH-500,2025/2/10,Llama-3.1-8B-Instruct,19.80,99.80,4,"temperature=1, path_num=5",500,"1,734,545","3,469","1,756,289","3,513","3,490,834",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
174 |
+
SC-CoT,MATH-500,2025/2/10,Internllm2_5-7B,9.20,97.40,4,"temperature=1, path_num=5",500,"1,994,983","3,990","1,254,893","2,510","3,249,876",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
175 |
+
SC-CoT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,2.00,89.40,4,"temperature=1, path_num=5",500,"1,805,170","3,610","1,333,854","2,668","3,139,024",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
176 |
+
SC-CoT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,2.20,98.80,4,"temperature=1, path_num=5",500,"1,808,691","3,617","988,991","1,978","2,797,682",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
177 |
+
SC-CoT,MATH-500,2025/2/10,deepseek-r1:1.5b,46.80,99.20,4,"temperature=1, path_num=5",500,"1,858,874","3,718","12,109,294","24,219","13,968,168",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
178 |
ReAct-Pro*,MATH-500,2025/2/10,gpt-3.5-turbo,23.80,100.00,4,max_steps=10,500,"3,708,461","7,417","124,253",249,"3,832,714",2.0406,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
179 |
ReAct-Pro*,MATH-500,2025/2/10,Doubao-lite-32k,47.20,100.00,4,max_steps=10,500,"4,234,620","8,469","154,046",308,"4,388,666",0.1860,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
180 |
ReAct-Pro*,MATH-500,2025/2/10,gpt-4o,54.00,100.00,4,max_steps=10,500,"5,834,537","11,669","318,718",637,"6,153,255",17.7735,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|