liaojiajia commited on
Commit
6add05a
Β·
1 Parent(s): cd01d35

updated scores

Browse files
src/detail_math_score.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "time": "2025-02-11 13:22:59",
3
  "results": {
4
  "IO": {
5
  "gpt-3.5-turbo": {
@@ -2041,43 +2041,43 @@
2041
  "Eval Date": "2025/1/7"
2042
  },
2043
  "gsm8k": {
2044
- "Score": 79.91,
2045
- "Pass rate": 0.9992,
2046
- "Cost($)": 3.3938,
2047
  "Framework": "",
2048
  "X-shot": "8",
2049
  "Samples": 1319,
2050
- "All tokens": 4089612,
2051
- "Total input tokens": 2740652,
2052
- "Average input tokens": 2078,
2053
- "Total output tokens": 1348960,
2054
- "Average output tokens": 1023
2055
  },
2056
  "AQuA": {
2057
- "Score": 66.14,
2058
- "Pass rate": 0.9921,
2059
- "Cost($)": 0.7888,
2060
  "Framework": "",
2061
  "X-shot": "0",
2062
  "Samples": 254,
2063
- "All tokens": 847335,
2064
- "Total input tokens": 482192,
2065
- "Average input tokens": 1898,
2066
- "Total output tokens": 365143,
2067
- "Average output tokens": 1438
2068
  },
2069
  "MATH-500": {
2070
- "Score": 28.8,
2071
  "Pass rate": 1.0,
2072
- "Cost($)": 1.9764,
2073
  "Framework": "",
2074
  "X-shot": "4",
2075
  "Samples": 500,
2076
- "All tokens": 2238812,
2077
- "Total input tokens": 1381818,
2078
- "Average input tokens": 2764,
2079
- "Total output tokens": 856994,
2080
- "Average output tokens": 1714
2081
  }
2082
  },
2083
  "Doubao-lite-32k": {
@@ -2087,43 +2087,43 @@
2087
  "Eval Date": "2025/1/7"
2088
  },
2089
  "gsm8k": {
2090
- "Score": 87.26,
2091
  "Pass rate": 0.9992,
2092
- "Cost($)": 0.2083,
2093
  "Framework": "",
2094
  "X-shot": "8",
2095
  "Samples": 1319,
2096
- "All tokens": 3888813,
2097
- "Total input tokens": 2691714,
2098
- "Average input tokens": 2041,
2099
- "Total output tokens": 1197099,
2100
- "Average output tokens": 908
2101
  },
2102
  "AQuA": {
2103
- "Score": 81.1,
2104
- "Pass rate": 0.9724,
2105
- "Cost($)": 0.0519,
2106
  "Framework": "",
2107
  "X-shot": "0",
2108
  "Samples": 254,
2109
- "All tokens": 885986,
2110
- "Total input tokens": 503751,
2111
- "Average input tokens": 1983,
2112
- "Total output tokens": 382235,
2113
- "Average output tokens": 1505
2114
  },
2115
  "MATH-500": {
2116
- "Score": 49.2,
2117
- "Pass rate": 1.0,
2118
- "Cost($)": 0.1406,
2119
  "Framework": "",
2120
  "X-shot": "4",
2121
  "Samples": 500,
2122
- "All tokens": 2470810,
2123
- "Total input tokens": 1507651,
2124
- "Average input tokens": 3015,
2125
- "Total output tokens": 963159,
2126
- "Average output tokens": 1926
2127
  }
2128
  },
2129
  "gpt-4o": {
@@ -2133,43 +2133,43 @@
2133
  "Eval Date": "2025/1/22"
2134
  },
2135
  "gsm8k": {
2136
- "Score": 90.3,
2137
- "Pass rate": 0.9992,
2138
- "Cost($)": 31.0542,
2139
  "Framework": "",
2140
  "X-shot": "8",
2141
  "Samples": 1319,
2142
- "All tokens": 5798173,
2143
- "Total input tokens": 3590336,
2144
- "Average input tokens": 2722,
2145
- "Total output tokens": 2207837,
2146
- "Average output tokens": 1674
2147
  },
2148
  "AQuA": {
2149
- "Score": 86.61,
2150
- "Pass rate": 0.9882,
2151
- "Cost($)": 8.1485,
2152
  "Framework": "",
2153
  "X-shot": "0",
2154
  "Samples": 254,
2155
- "All tokens": 1373206,
2156
- "Total input tokens": 744478,
2157
- "Average input tokens": 2931,
2158
- "Total output tokens": 628728,
2159
- "Average output tokens": 2475
2160
  },
2161
  "MATH-500": {
2162
- "Score": 34.4,
2163
  "Pass rate": 1.0,
2164
- "Cost($)": 19.6538,
2165
  "Framework": "",
2166
  "X-shot": "4",
2167
  "Samples": 500,
2168
- "All tokens": 3455323,
2169
- "Total input tokens": 1986584,
2170
- "Average input tokens": 3973,
2171
- "Total output tokens": 1468739,
2172
- "Average output tokens": 2937
2173
  }
2174
  },
2175
  "Qwen2.5-72B-Instruct": {
@@ -2179,43 +2179,43 @@
2179
  "Eval Date": "2025/1/22"
2180
  },
2181
  "gsm8k": {
2182
- "Score": 93.86,
2183
  "Pass rate": 1.0,
2184
- "Cost($)": 5.9858,
2185
  "Framework": "",
2186
  "X-shot": "8",
2187
  "Samples": 1319,
2188
- "All tokens": 10618008,
2189
- "Total input tokens": 8136223,
2190
- "Average input tokens": 6168,
2191
- "Total output tokens": 2481785,
2192
- "Average output tokens": 1882
2193
  },
2194
  "AQuA": {
2195
- "Score": 85.04,
2196
- "Pass rate": 0.9921,
2197
- "Cost($)": 1.0348,
2198
  "Framework": "",
2199
  "X-shot": "0",
2200
  "Samples": 254,
2201
- "All tokens": 1835669,
2202
- "Total input tokens": 1051218,
2203
- "Average input tokens": 4139,
2204
- "Total output tokens": 784451,
2205
- "Average output tokens": 3088
2206
  },
2207
  "MATH-500": {
2208
- "Score": 74.0,
2209
  "Pass rate": 1.0,
2210
- "Cost($)": 3.1556,
2211
  "Framework": "",
2212
  "X-shot": "4",
2213
  "Samples": 500,
2214
- "All tokens": 5597513,
2215
- "Total input tokens": 3823997,
2216
- "Average input tokens": 7648,
2217
- "Total output tokens": 1773516,
2218
- "Average output tokens": 3547
2219
  }
2220
  },
2221
  "Llama-3.3-70B-Instruct": {
@@ -2225,43 +2225,43 @@
2225
  "Eval Date": "2025/1/22"
2226
  },
2227
  "gsm8k": {
2228
- "Score": 95.07,
2229
  "Pass rate": 1.0,
2230
- "Cost($)": 6.2005,
2231
  "Framework": "",
2232
  "X-shot": "8",
2233
  "Samples": 1319,
2234
- "All tokens": 10998794,
2235
- "Total input tokens": 8413717,
2236
- "Average input tokens": 6379,
2237
- "Total output tokens": 2585077,
2238
- "Average output tokens": 1960
2239
  },
2240
  "AQuA": {
2241
- "Score": 82.28,
2242
- "Pass rate": 0.9921,
2243
- "Cost($)": 1.0756,
2244
  "Framework": "",
2245
  "X-shot": "0",
2246
  "Samples": 254,
2247
- "All tokens": 1907924,
2248
- "Total input tokens": 1135251,
2249
- "Average input tokens": 4469,
2250
- "Total output tokens": 772673,
2251
- "Average output tokens": 3042
2252
  },
2253
  "MATH-500": {
2254
- "Score": 74.2,
2255
  "Pass rate": 1.0,
2256
- "Cost($)": 3.2239,
2257
  "Framework": "",
2258
  "X-shot": "4",
2259
  "Samples": 500,
2260
- "All tokens": 5718739,
2261
- "Total input tokens": 3959492,
2262
- "Average input tokens": 7919,
2263
- "Total output tokens": 1759247,
2264
- "Average output tokens": 3518
2265
  }
2266
  },
2267
  "Qwen2.5-7B-Instruct": {
@@ -2271,43 +2271,43 @@
2271
  "Eval Date": "2025/1/22"
2272
  },
2273
  "gsm8k": {
2274
- "Score": 91.13,
2275
  "Pass rate": 1.0,
2276
  "Cost($)": 0.0,
2277
  "Framework": "",
2278
  "X-shot": "8",
2279
  "Samples": 1319,
2280
- "All tokens": 11140985,
2281
- "Total input tokens": 8586888,
2282
- "Average input tokens": 6510,
2283
- "Total output tokens": 2554097,
2284
- "Average output tokens": 1936
2285
  },
2286
  "AQuA": {
2287
- "Score": 79.92,
2288
  "Pass rate": 1.0,
2289
  "Cost($)": 0.0,
2290
  "Framework": "",
2291
  "X-shot": "0",
2292
  "Samples": 254,
2293
- "All tokens": 1845332,
2294
- "Total input tokens": 1098280,
2295
- "Average input tokens": 4324,
2296
- "Total output tokens": 747052,
2297
- "Average output tokens": 2941
2298
  },
2299
  "MATH-500": {
2300
- "Score": 67.0,
2301
  "Pass rate": 1.0,
2302
  "Cost($)": 0.0,
2303
  "Framework": "",
2304
  "X-shot": "4",
2305
  "Samples": 500,
2306
- "All tokens": 5451484,
2307
- "Total input tokens": 3833751,
2308
- "Average input tokens": 7668,
2309
- "Total output tokens": 1617733,
2310
- "Average output tokens": 3235
2311
  }
2312
  },
2313
  "Llama-3.1-8B-Instruct": {
@@ -2317,43 +2317,43 @@
2317
  "Eval Date": "2025/1/22"
2318
  },
2319
  "gsm8k": {
2320
- "Score": 73.46,
2321
- "Pass rate": 0.9955,
2322
  "Cost($)": 0.0,
2323
  "Framework": "",
2324
  "X-shot": "8",
2325
  "Samples": 1319,
2326
- "All tokens": 11778716,
2327
- "Total input tokens": 8630514,
2328
- "Average input tokens": 6543,
2329
- "Total output tokens": 3148202,
2330
- "Average output tokens": 2387
2331
  },
2332
  "AQuA": {
2333
  "Score": 59.45,
2334
- "Pass rate": 0.9724,
2335
  "Cost($)": 0.0,
2336
  "Framework": "",
2337
  "X-shot": "0",
2338
  "Samples": 254,
2339
- "All tokens": 1651333,
2340
- "Total input tokens": 971003,
2341
- "Average input tokens": 3823,
2342
- "Total output tokens": 680330,
2343
- "Average output tokens": 2678
2344
  },
2345
  "MATH-500": {
2346
- "Score": 30.2,
2347
- "Pass rate": 1.0,
2348
  "Cost($)": 0.0,
2349
  "Framework": "",
2350
  "X-shot": "4",
2351
  "Samples": 500,
2352
- "All tokens": 5034937,
2353
- "Total input tokens": 3546673,
2354
- "Average input tokens": 7093,
2355
- "Total output tokens": 1488264,
2356
- "Average output tokens": 2977
2357
  }
2358
  },
2359
  "Internllm2_5-7B": {
@@ -2363,43 +2363,43 @@
2363
  "Eval Date": "2025/1/22"
2364
  },
2365
  "gsm8k": {
2366
- "Score": 48.22,
2367
- "Pass rate": 0.9841,
2368
  "Cost($)": 0.0,
2369
  "Framework": "",
2370
  "X-shot": "8",
2371
  "Samples": 1319,
2372
- "All tokens": 14526431,
2373
- "Total input tokens": 10678792,
2374
- "Average input tokens": 8096,
2375
- "Total output tokens": 3847639,
2376
- "Average output tokens": 2917
2377
  },
2378
  "AQuA": {
2379
- "Score": 39.37,
2380
- "Pass rate": 0.9803,
2381
  "Cost($)": 0.0,
2382
  "Framework": "",
2383
  "X-shot": "0",
2384
  "Samples": 254,
2385
- "All tokens": 2296222,
2386
- "Total input tokens": 1420494,
2387
- "Average input tokens": 5592,
2388
- "Total output tokens": 875728,
2389
- "Average output tokens": 3448
2390
  },
2391
  "MATH-500": {
2392
- "Score": 9.8,
2393
- "Pass rate": 1.0,
2394
  "Cost($)": 0.0,
2395
  "Framework": "",
2396
  "X-shot": "4",
2397
  "Samples": 500,
2398
- "All tokens": 5838466,
2399
- "Total input tokens": 4193296,
2400
- "Average input tokens": 8387,
2401
- "Total output tokens": 1645170,
2402
- "Average output tokens": 3290
2403
  }
2404
  },
2405
  "Qwen2-1.5B-Instruct": {
@@ -2409,43 +2409,43 @@
2409
  "Eval Date": "2025/1/22"
2410
  },
2411
  "gsm8k": {
2412
- "Score": 11.75,
2413
- "Pass rate": 0.9189,
2414
  "Cost($)": 0.0,
2415
  "Framework": "",
2416
  "X-shot": "8",
2417
  "Samples": 1319,
2418
- "All tokens": 12411942,
2419
- "Total input tokens": 9066115,
2420
- "Average input tokens": 6873,
2421
- "Total output tokens": 3345827,
2422
- "Average output tokens": 2537
2423
  },
2424
  "AQuA": {
2425
- "Score": 23.62,
2426
- "Pass rate": 0.9646,
2427
  "Cost($)": 0.0,
2428
  "Framework": "",
2429
  "X-shot": "0",
2430
  "Samples": 254,
2431
- "All tokens": 1775335,
2432
- "Total input tokens": 1034362,
2433
- "Average input tokens": 4072,
2434
- "Total output tokens": 740973,
2435
- "Average output tokens": 2917
2436
  },
2437
  "MATH-500": {
2438
- "Score": 3.8,
2439
- "Pass rate": 0.99,
2440
  "Cost($)": 0.0,
2441
  "Framework": "",
2442
  "X-shot": "4",
2443
  "Samples": 500,
2444
- "All tokens": 5569442,
2445
- "Total input tokens": 3832429,
2446
- "Average input tokens": 7665,
2447
- "Total output tokens": 1737013,
2448
- "Average output tokens": 3474
2449
  }
2450
  },
2451
  "Qwen2-0.5B-Instruct": {
@@ -2455,43 +2455,43 @@
2455
  "Eval Date": "2025/1/22"
2456
  },
2457
  "gsm8k": {
2458
- "Score": 1.67,
2459
- "Pass rate": 0.9469,
2460
  "Cost($)": 0.0,
2461
  "Framework": "",
2462
  "X-shot": "8",
2463
  "Samples": 1319,
2464
- "All tokens": 16465720,
2465
- "Total input tokens": 11019864,
2466
- "Average input tokens": 8355,
2467
- "Total output tokens": 5445856,
2468
- "Average output tokens": 4129
2469
  },
2470
  "AQuA": {
2471
- "Score": 22.83,
2472
- "Pass rate": 0.9724,
2473
  "Cost($)": 0.0,
2474
  "Framework": "",
2475
  "X-shot": "0",
2476
  "Samples": 254,
2477
- "All tokens": 2215091,
2478
- "Total input tokens": 1246929,
2479
- "Average input tokens": 4909,
2480
- "Total output tokens": 968162,
2481
- "Average output tokens": 3812
2482
  },
2483
  "MATH-500": {
2484
- "Score": 0.8,
2485
- "Pass rate": 1.0,
2486
  "Cost($)": 0.0,
2487
  "Framework": "",
2488
  "X-shot": "4",
2489
  "Samples": 500,
2490
- "All tokens": 6862056,
2491
- "Total input tokens": 4448663,
2492
- "Average input tokens": 8897,
2493
- "Total output tokens": 2413393,
2494
- "Average output tokens": 4827
2495
  }
2496
  },
2497
  "deepseek-r1:1.5b": {
@@ -2501,43 +2501,43 @@
2501
  "Eval Date": "2025/2/10"
2502
  },
2503
  "gsm8k": {
2504
- "Score": 55.34,
2505
- "Pass rate": 0.997,
2506
  "Cost($)": 0.0,
2507
  "Framework": "",
2508
  "X-shot": "8",
2509
  "Samples": 1319,
2510
- "All tokens": 25785865,
2511
- "Total input tokens": 14540096,
2512
- "Average input tokens": 11024,
2513
- "Total output tokens": 11245769,
2514
- "Average output tokens": 8526
2515
  },
2516
  "AQuA": {
2517
- "Score": 59.06,
2518
- "Pass rate": 0.9685,
2519
  "Cost($)": 0.0,
2520
  "Framework": "",
2521
  "X-shot": "0",
2522
  "Samples": 254,
2523
- "All tokens": 5802711,
2524
- "Total input tokens": 2547772,
2525
- "Average input tokens": 10031,
2526
- "Total output tokens": 3254939,
2527
- "Average output tokens": 12815
2528
  },
2529
  "MATH-500": {
2530
- "Score": 38.0,
2531
- "Pass rate": 1.0,
2532
  "Cost($)": 0.0,
2533
  "Framework": "",
2534
  "X-shot": "4",
2535
  "Samples": 500,
2536
- "All tokens": 14742109,
2537
- "Total input tokens": 7080559,
2538
- "Average input tokens": 14161,
2539
- "Total output tokens": 7661550,
2540
- "Average output tokens": 15323
2541
  }
2542
  }
2543
  },
 
1
  {
2
+ "time": "2025-03-05 13:15:02",
3
  "results": {
4
  "IO": {
5
  "gpt-3.5-turbo": {
 
2041
  "Eval Date": "2025/1/7"
2042
  },
2043
  "gsm8k": {
2044
+ "Score": 69.29,
2045
+ "Pass rate": 0.9879,
2046
+ "Cost($)": 2.5203,
2047
  "Framework": "",
2048
  "X-shot": "8",
2049
  "Samples": 1319,
2050
+ "All tokens": 2277249,
2051
+ "Total input tokens": 895571,
2052
+ "Average input tokens": 679,
2053
+ "Total output tokens": 1381678,
2054
+ "Average output tokens": 1048
2055
  },
2056
  "AQuA": {
2057
+ "Score": 58.66,
2058
+ "Pass rate": 0.9252,
2059
+ "Cost($)": 0.3277,
2060
  "Framework": "",
2061
  "X-shot": "0",
2062
  "Samples": 254,
2063
+ "All tokens": 237066,
2064
+ "Total input tokens": 27906,
2065
+ "Average input tokens": 110,
2066
+ "Total output tokens": 209160,
2067
+ "Average output tokens": 823
2068
  },
2069
  "MATH-500": {
2070
+ "Score": 40.8,
2071
  "Pass rate": 1.0,
2072
+ "Cost($)": 1.2308,
2073
  "Framework": "",
2074
  "X-shot": "4",
2075
  "Samples": 500,
2076
+ "All tokens": 1050819,
2077
+ "Total input tokens": 345411,
2078
+ "Average input tokens": 691,
2079
+ "Total output tokens": 705408,
2080
+ "Average output tokens": 1411
2081
  }
2082
  },
2083
  "Doubao-lite-32k": {
 
2087
  "Eval Date": "2025/1/7"
2088
  },
2089
  "gsm8k": {
2090
+ "Score": 91.58,
2091
  "Pass rate": 0.9992,
2092
+ "Cost($)": 0.1118,
2093
  "Framework": "",
2094
  "X-shot": "8",
2095
  "Samples": 1319,
2096
+ "All tokens": 1835891,
2097
+ "Total input tokens": 942182,
2098
+ "Average input tokens": 714,
2099
+ "Total output tokens": 893709,
2100
+ "Average output tokens": 678
2101
  },
2102
  "AQuA": {
2103
+ "Score": 76.37,
2104
+ "Pass rate": 0.9173,
2105
+ "Cost($)": 0.0279,
2106
  "Framework": "",
2107
  "X-shot": "0",
2108
  "Samples": 254,
2109
+ "All tokens": 356839,
2110
+ "Total input tokens": 31703,
2111
+ "Average input tokens": 125,
2112
+ "Total output tokens": 325136,
2113
+ "Average output tokens": 1280
2114
  },
2115
  "MATH-500": {
2116
+ "Score": 65.8,
2117
+ "Pass rate": 0.998,
2118
+ "Cost($)": 0.0734,
2119
  "Framework": "",
2120
  "X-shot": "4",
2121
  "Samples": 500,
2122
+ "All tokens": 1078003,
2123
+ "Total input tokens": 362390,
2124
+ "Average input tokens": 725,
2125
+ "Total output tokens": 715613,
2126
+ "Average output tokens": 1431
2127
  }
2128
  },
2129
  "gpt-4o": {
 
2133
  "Eval Date": "2025/1/22"
2134
  },
2135
  "gsm8k": {
2136
+ "Score": 94.77,
2137
+ "Pass rate": 1.0,
2138
+ "Cost($)": 18.2044,
2139
  "Framework": "",
2140
  "X-shot": "8",
2141
  "Samples": 1319,
2142
+ "All tokens": 2491605,
2143
+ "Total input tokens": 894889,
2144
+ "Average input tokens": 678,
2145
+ "Total output tokens": 1596716,
2146
+ "Average output tokens": 1211
2147
  },
2148
  "AQuA": {
2149
+ "Score": 85.83,
2150
+ "Pass rate": 0.9921,
2151
+ "Cost($)": 5.2456,
2152
  "Framework": "",
2153
  "X-shot": "0",
2154
  "Samples": 254,
2155
+ "All tokens": 545431,
2156
+ "Total input tokens": 27829,
2157
+ "Average input tokens": 110,
2158
+ "Total output tokens": 517602,
2159
+ "Average output tokens": 2038
2160
  },
2161
  "MATH-500": {
2162
+ "Score": 74.6,
2163
  "Pass rate": 1.0,
2164
+ "Cost($)": 12.3611,
2165
  "Framework": "",
2166
  "X-shot": "4",
2167
  "Samples": 500,
2168
+ "All tokens": 1495125,
2169
+ "Total input tokens": 345347,
2170
+ "Average input tokens": 691,
2171
+ "Total output tokens": 1149778,
2172
+ "Average output tokens": 2300
2173
  }
2174
  },
2175
  "Qwen2.5-72B-Instruct": {
 
2179
  "Eval Date": "2025/1/22"
2180
  },
2181
  "gsm8k": {
2182
+ "Score": 94.77,
2183
  "Pass rate": 1.0,
2184
+ "Cost($)": 4.045,
2185
  "Framework": "",
2186
  "X-shot": "8",
2187
  "Samples": 1319,
2188
+ "All tokens": 7175258,
2189
+ "Total input tokens": 5370360,
2190
+ "Average input tokens": 4072,
2191
+ "Total output tokens": 1804898,
2192
+ "Average output tokens": 1368
2193
  },
2194
  "AQuA": {
2195
+ "Score": 85.43,
2196
+ "Pass rate": 0.9685,
2197
+ "Cost($)": 0.4186,
2198
  "Framework": "",
2199
  "X-shot": "0",
2200
  "Samples": 254,
2201
+ "All tokens": 742552,
2202
+ "Total input tokens": 137990,
2203
+ "Average input tokens": 543,
2204
+ "Total output tokens": 604562,
2205
+ "Average output tokens": 2380
2206
  },
2207
  "MATH-500": {
2208
+ "Score": 79.8,
2209
  "Pass rate": 1.0,
2210
+ "Cost($)": 1.8504,
2211
  "Framework": "",
2212
  "X-shot": "4",
2213
  "Samples": 500,
2214
+ "All tokens": 3282349,
2215
+ "Total input tokens": 1775395,
2216
+ "Average input tokens": 3551,
2217
+ "Total output tokens": 1506954,
2218
+ "Average output tokens": 3014
2219
  }
2220
  },
2221
  "Llama-3.3-70B-Instruct": {
 
2225
  "Eval Date": "2025/1/22"
2226
  },
2227
  "gsm8k": {
2228
+ "Score": 95.22,
2229
  "Pass rate": 1.0,
2230
+ "Cost($)": 3.7895,
2231
  "Framework": "",
2232
  "X-shot": "8",
2233
  "Samples": 1319,
2234
+ "All tokens": 6722014,
2235
+ "Total input tokens": 5295585,
2236
+ "Average input tokens": 4015,
2237
+ "Total output tokens": 1426429,
2238
+ "Average output tokens": 1081
2239
  },
2240
  "AQuA": {
2241
+ "Score": 84.65,
2242
+ "Pass rate": 0.9961,
2243
+ "Cost($)": 0.4438,
2244
  "Framework": "",
2245
  "X-shot": "0",
2246
  "Samples": 254,
2247
+ "All tokens": 787312,
2248
+ "Total input tokens": 175050,
2249
+ "Average input tokens": 689,
2250
+ "Total output tokens": 612262,
2251
+ "Average output tokens": 2410
2252
  },
2253
  "MATH-500": {
2254
+ "Score": 72.4,
2255
  "Pass rate": 1.0,
2256
+ "Cost($)": 1.7845,
2257
  "Framework": "",
2258
  "X-shot": "4",
2259
  "Samples": 500,
2260
+ "All tokens": 3165511,
2261
+ "Total input tokens": 1797045,
2262
+ "Average input tokens": 3594,
2263
+ "Total output tokens": 1368466,
2264
+ "Average output tokens": 2737
2265
  }
2266
  },
2267
  "Qwen2.5-7B-Instruct": {
 
2271
  "Eval Date": "2025/1/22"
2272
  },
2273
  "gsm8k": {
2274
+ "Score": 90.98,
2275
  "Pass rate": 1.0,
2276
  "Cost($)": 0.0,
2277
  "Framework": "",
2278
  "X-shot": "8",
2279
  "Samples": 1319,
2280
+ "All tokens": 7259943,
2281
+ "Total input tokens": 5580524,
2282
+ "Average input tokens": 4231,
2283
+ "Total output tokens": 1679419,
2284
+ "Average output tokens": 1273
2285
  },
2286
  "AQuA": {
2287
+ "Score": 79.53,
2288
  "Pass rate": 1.0,
2289
  "Cost($)": 0.0,
2290
  "Framework": "",
2291
  "X-shot": "0",
2292
  "Samples": 254,
2293
+ "All tokens": 745410,
2294
+ "Total input tokens": 177972,
2295
+ "Average input tokens": 701,
2296
+ "Total output tokens": 567438,
2297
+ "Average output tokens": 2234
2298
  },
2299
  "MATH-500": {
2300
+ "Score": 71.2,
2301
  "Pass rate": 1.0,
2302
  "Cost($)": 0.0,
2303
  "Framework": "",
2304
  "X-shot": "4",
2305
  "Samples": 500,
2306
+ "All tokens": 3155475,
2307
+ "Total input tokens": 1855922,
2308
+ "Average input tokens": 3712,
2309
+ "Total output tokens": 1299553,
2310
+ "Average output tokens": 2599
2311
  }
2312
  },
2313
  "Llama-3.1-8B-Instruct": {
 
2317
  "Eval Date": "2025/1/22"
2318
  },
2319
  "gsm8k": {
2320
+ "Score": 54.36,
2321
+ "Pass rate": 0.9985,
2322
  "Cost($)": 0.0,
2323
  "Framework": "",
2324
  "X-shot": "8",
2325
  "Samples": 1319,
2326
+ "All tokens": 10956434,
2327
+ "Total input tokens": 5136762,
2328
+ "Average input tokens": 3894,
2329
+ "Total output tokens": 5819672,
2330
+ "Average output tokens": 4412
2331
  },
2332
  "AQuA": {
2333
  "Score": 59.45,
2334
+ "Pass rate": 0.9567,
2335
  "Cost($)": 0.0,
2336
  "Framework": "",
2337
  "X-shot": "0",
2338
  "Samples": 254,
2339
+ "All tokens": 690077,
2340
+ "Total input tokens": 145108,
2341
+ "Average input tokens": 571,
2342
+ "Total output tokens": 544969,
2343
+ "Average output tokens": 2146
2344
  },
2345
  "MATH-500": {
2346
+ "Score": 19.8,
2347
+ "Pass rate": 0.998,
2348
  "Cost($)": 0.0,
2349
  "Framework": "",
2350
  "X-shot": "4",
2351
  "Samples": 500,
2352
+ "All tokens": 3490834,
2353
+ "Total input tokens": 1734545,
2354
+ "Average input tokens": 3469,
2355
+ "Total output tokens": 1756289,
2356
+ "Average output tokens": 3513
2357
  }
2358
  },
2359
  "Internllm2_5-7B": {
 
2363
  "Eval Date": "2025/1/22"
2364
  },
2365
  "gsm8k": {
2366
+ "Score": 44.66,
2367
+ "Pass rate": 0.9181,
2368
  "Cost($)": 0.0,
2369
  "Framework": "",
2370
  "X-shot": "8",
2371
  "Samples": 1319,
2372
+ "All tokens": 8162499,
2373
+ "Total input tokens": 5847761,
2374
+ "Average input tokens": 4433,
2375
+ "Total output tokens": 2314738,
2376
+ "Average output tokens": 1755
2377
  },
2378
  "AQuA": {
2379
+ "Score": 38.58,
2380
+ "Pass rate": 0.9724,
2381
  "Cost($)": 0.0,
2382
  "Framework": "",
2383
  "X-shot": "0",
2384
  "Samples": 254,
2385
+ "All tokens": 879671,
2386
+ "Total input tokens": 264557,
2387
+ "Average input tokens": 1042,
2388
+ "Total output tokens": 615114,
2389
+ "Average output tokens": 2422
2390
  },
2391
  "MATH-500": {
2392
+ "Score": 9.2,
2393
+ "Pass rate": 0.974,
2394
  "Cost($)": 0.0,
2395
  "Framework": "",
2396
  "X-shot": "4",
2397
  "Samples": 500,
2398
+ "All tokens": 3249876,
2399
+ "Total input tokens": 1994983,
2400
+ "Average input tokens": 3990,
2401
+ "Total output tokens": 1254893,
2402
+ "Average output tokens": 2510
2403
  }
2404
  },
2405
  "Qwen2-1.5B-Instruct": {
 
2409
  "Eval Date": "2025/1/22"
2410
  },
2411
  "gsm8k": {
2412
+ "Score": 8.19,
2413
+ "Pass rate": 0.6876,
2414
  "Cost($)": 0.0,
2415
  "Framework": "",
2416
  "X-shot": "8",
2417
  "Samples": 1319,
2418
+ "All tokens": 7386453,
2419
+ "Total input tokens": 5439568,
2420
+ "Average input tokens": 4124,
2421
+ "Total output tokens": 1946885,
2422
+ "Average output tokens": 1476
2423
  },
2424
  "AQuA": {
2425
+ "Score": 10.63,
2426
+ "Pass rate": 0.5157,
2427
  "Cost($)": 0.0,
2428
  "Framework": "",
2429
  "X-shot": "0",
2430
  "Samples": 254,
2431
+ "All tokens": 701980,
2432
+ "Total input tokens": 151410,
2433
+ "Average input tokens": 596,
2434
+ "Total output tokens": 550570,
2435
+ "Average output tokens": 2168
2436
  },
2437
  "MATH-500": {
2438
+ "Score": 2.0,
2439
+ "Pass rate": 0.894,
2440
  "Cost($)": 0.0,
2441
  "Framework": "",
2442
  "X-shot": "4",
2443
  "Samples": 500,
2444
+ "All tokens": 3139024,
2445
+ "Total input tokens": 1805170,
2446
+ "Average input tokens": 3610,
2447
+ "Total output tokens": 1333854,
2448
+ "Average output tokens": 2668
2449
  }
2450
  },
2451
  "Qwen2-0.5B-Instruct": {
 
2455
  "Eval Date": "2025/1/22"
2456
  },
2457
  "gsm8k": {
2458
+ "Score": 4.17,
2459
+ "Pass rate": 0.9447,
2460
  "Cost($)": 0.0,
2461
  "Framework": "",
2462
  "X-shot": "8",
2463
  "Samples": 1319,
2464
+ "All tokens": 7478767,
2465
+ "Total input tokens": 5441962,
2466
+ "Average input tokens": 4126,
2467
+ "Total output tokens": 2036805,
2468
+ "Average output tokens": 1544
2469
  },
2470
  "AQuA": {
2471
+ "Score": 17.32,
2472
+ "Pass rate": 0.8228,
2473
  "Cost($)": 0.0,
2474
  "Framework": "",
2475
  "X-shot": "0",
2476
  "Samples": 254,
2477
+ "All tokens": 753913,
2478
+ "Total input tokens": 150787,
2479
+ "Average input tokens": 594,
2480
+ "Total output tokens": 603126,
2481
+ "Average output tokens": 2375
2482
  },
2483
  "MATH-500": {
2484
+ "Score": 2.2,
2485
+ "Pass rate": 0.988,
2486
  "Cost($)": 0.0,
2487
  "Framework": "",
2488
  "X-shot": "4",
2489
  "Samples": 500,
2490
+ "All tokens": 2797682,
2491
+ "Total input tokens": 1808691,
2492
+ "Average input tokens": 3617,
2493
+ "Total output tokens": 988991,
2494
+ "Average output tokens": 1978
2495
  }
2496
  },
2497
  "deepseek-r1:1.5b": {
 
2501
  "Eval Date": "2025/2/10"
2502
  },
2503
  "gsm8k": {
2504
+ "Score": 69.07,
2505
+ "Pass rate": 0.9879,
2506
  "Cost($)": 0.0,
2507
  "Framework": "",
2508
  "X-shot": "8",
2509
  "Samples": 1319,
2510
+ "All tokens": 10029684,
2511
+ "Total input tokens": 5407357,
2512
+ "Average input tokens": 4100,
2513
+ "Total output tokens": 4622327,
2514
+ "Average output tokens": 3504
2515
  },
2516
  "AQuA": {
2517
+ "Score": 57.87,
2518
+ "Pass rate": 0.7402,
2519
  "Cost($)": 0.0,
2520
  "Framework": "",
2521
  "X-shot": "0",
2522
  "Samples": 254,
2523
+ "All tokens": 2132111,
2524
+ "Total input tokens": 144710,
2525
+ "Average input tokens": 570,
2526
+ "Total output tokens": 1987401,
2527
+ "Average output tokens": 7824
2528
  },
2529
  "MATH-500": {
2530
+ "Score": 46.8,
2531
+ "Pass rate": 0.992,
2532
  "Cost($)": 0.0,
2533
  "Framework": "",
2534
  "X-shot": "4",
2535
  "Samples": 500,
2536
+ "All tokens": 13968168,
2537
+ "Total input tokens": 1858874,
2538
+ "Average input tokens": 3718,
2539
+ "Total output tokens": 12109294,
2540
+ "Average output tokens": 24219
2541
  }
2542
  }
2543
  },
src/detail_results.csv CHANGED
@@ -1,24 +1,24 @@
1
  Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
2
- 1,SC-CoT,AQuA,gpt-4o,2025/1/22,86.61,0.9882,0,8.1485,,254,1373206,744478,2931,628728,2475
3
- 2,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0,0.0808,,254,143289,25143,99,118146,465
4
- 3,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.04,0.9921,0,1.0348,,254,1835669,1051218,4139,784451,3088
5
- 4,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0,0.0742,,254,131604,25397,100,106207,418
6
- 5,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0,0.0927,,254,164389,32555,128,131834,519
7
- 6,ToT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.07,1.0,0,2.9404,,254,5215848,4735188,18642,480660,1892
8
- 7,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0,0.0798,,254,141567,32809,129,108758,428
9
- 8,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0,0.0066,,254,94577,27978,110,66599,262
10
- 9,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0,1.0417,,254,123017,25123,99,97894,385
11
- 10,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.28,0.9921,0,1.0756,,254,1907924,1135251,4469,772673,3042
12
  11,ToT,AQuA,gpt-4o,2025/1/22,81.5,0.9921,0,8.5295,,254,2613607,2347538,9242,266069,1048
13
- 12,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.1,0.9724,0,0.0519,,254,885986,503751,1983,382235,1505
14
- 13,ToT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,81.1,0.9921,0,3.7389,,254,6632255,6371642,25085,260613,1026
15
- 14,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0,0.0,,254,149736,33017,130,116719,460
16
- 15,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,79.92,1.0,0,0.0,,254,1845332,1098280,4324,747052,2941
17
- 16,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0,0.1746,,254,309799,240735,948,69064,272
18
- 17,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0,0.0058,,254,87742,33058,130,54684,215
19
- 18,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0,0.768,,254,1362379,1119143,4406,243236,958
20
- 19,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9843,0,0.0,,254,137771,33271,131,104500,411
21
- 20,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.56,0.9606,0,0.0445,,254,1032841,977890,3850,54951,216
22
  21,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0,1.1453,,254,133752,25631,101,108121,426
23
  22,PoT,AQuA,gpt-4o,2025/1/22,75.2,1.0,0,1.6087,,254,327908,222717,877,105191,414
24
  23,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.2,1.0,0,0.1645,,254,291764,249215,981,42549,168
@@ -28,14 +28,14 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
28
  27,CoT,AQuA,deepseek-r1:1.5b,2025/1/23,71.65,0.9685,0,0.0,,254,333072,26413,104,306659,1207
29
  28,IO,AQuA,deepseek-r1:1.5b,2025/1/22,68.9,0.9488,0,0.0,,254,351767,26667,105,325100,1280
30
  29,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0,0.0,,254,313728,264517,1041,49211,194
31
- 30,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,66.14,0.9921,0,0.7888,,254,847335,482192,1898,365143,1438
32
- 31,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0,0.4928,,254,903587,862614,3396,40973,161
33
- 32,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0,0.0957,,254,80793,25447,100,55346,218
34
- 33,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0,0.0,,254,144435,32555,128,111880,440
35
- 34,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0,0.1748,,254,266654,225162,886,41492,163
36
- 35,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.45,0.9724,0,0.0,,254,1651333,971003,3823,680330,2678
37
- 36,SC-CoT,AQuA,deepseek-r1:1.5b,2025/2/10,59.06,0.9685,0,0.0,,254,5802711,2547772,10031,3254939,12815
38
- 37,ToT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.06,1.0,0,0.0,,254,5739684,4896222,19276,843462,3321
39
  38,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0,2.304,,254,692096,615589,2424,76507,301
40
  39,ToT,AQuA,gpt-3.5-turbo,2025/1/7,57.09,0.9961,0,1.1513,,254,2001396,1850767,7286,150629,593
41
  40,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0,0.0,,254,4340821,3764723,14822,576098,2268
@@ -48,8 +48,8 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
48
  47,ToT,AQuA,Doubao-lite-32k,2025/1/7,45.28,0.7402,0,0.0881,,254,2000550,1850249,7284,150301,592
49
  48,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0,0.0,,254,4428801,3592039,14142,836762,3294
50
  49,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0,0.0,,254,110040,30477,120,79563,313
51
- 50,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,39.37,0.9803,0,0.0,,254,2296222,1420494,5592,875728,3448
52
- 51,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0,0.038,,254,42471,25701,101,16770,66
53
  52,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0,0.0,,254,290914,240613,947,50301,198
54
  53,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0,0.0,,254,301962,233505,919,68457,270
55
  54,ToT,AQuA,Internllm2_5-7B,2025/1/22,35.83,0.9961,0,0.0,,254,4734560,4263136,16784,471424,1856
@@ -62,109 +62,109 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
62
  61,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0,0.0,,254,5072004,4555858,17936,516146,2032
63
  62,ToT,AQuA,deepseek-r1:1.5b,2025/2/10,24.8,0.5551,0,0.0,,254,794512,605028,2382,189484,746
64
  63,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0,0.0,,254,7170087,6344167,24977,825920,3252
65
- 64,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,23.62,0.9646,0,0.0,,254,1775335,1034362,4072,740973,2917
66
- 65,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,22.83,0.9724,0,0.0,,254,2215091,1246929,4909,968162,3812
67
- 66,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0,0.0,,254,322281,258867,1019,63414,250
68
  1,CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,80.2,1.0,4,0.349,,500,619015,338549,677,280466,561
69
- 2,SC-CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,74.2,1.0,4,3.2239,,500,5718739,3959492,7919,1759247,3518
70
- 3,SC-CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,74.0,1.0,4,3.1556,,500,5597513,3823997,7648,1773516,3547
71
- 4,CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,71.2,1.0,4,0.3463,,500,614221,342879,686,271342,543
72
- 5,IO,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,70.2,1.0,4,0.2506,,500,444591,169549,339,275042,550
73
- 6,CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,69.8,1.0,4,0.0,,500,617204,354049,708,263155,526
74
- 7,IO,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,69.4,1.0,4,0.2386,,500,423216,155879,312,267337,535
75
- 8,CoT,MATH-500,gpt-4o,2025/1/22,68.0,1.0,4,3.0569,,500,552688,329332,659,223356,447
76
- 9,SC-CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,67.0,1.0,4,0.0,,500,5451484,3833751,7668,1617733,3235
77
- 10,ReAct-Pro*,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,64.6,1.0,4,3.1806,,500,5641879,5223611,10447,418268,837
78
- 11,ReAct-Pro*,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,62.8,1.0,4,3.4541,,500,6127117,5747268,11495,379849,760
79
- 12,IO,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,59.4,1.0,4,0.0,,500,411362,169549,339,241813,484
80
- 13,CoT,MATH-500,Doubao-lite-32k,2025/1/7,59.0,1.0,4,0.0255,,500,479941,336370,673,143571,287
81
- 14,ReAct-Pro*,MATH-500,gpt-4o,2025/1/22,54.0,1.0,4,17.7735,,500,6153255,5834537,11669,318718,637
82
- 15,CoT,MATH-500,deepseek-r1:1.5b,2025/1/23,49.4,1.0,4,0.0,,500,1199129,341549,683,857580,1715
83
- 16,SC-CoT,MATH-500,Doubao-lite-32k,2025/1/7,49.2,1.0,4,0.1406,,500,2470810,1507651,3015,963159,1926
84
- 17,ReAct-Pro*,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,48.8,1.0,4,0.0,,500,4990240,4646708,9293,343532,687
85
- 18,ReAct-Pro*,MATH-500,Doubao-lite-32k,2025/1/7,47.2,1.0,4,0.186,,500,4388666,4234620,8469,154046,308
86
- 19,PoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,47.2,0.822,4,0.233,,500,413372,242549,485,170823,342
87
- 20,CoT,MATH-500,Internllm2_5-7B,2025/1/22,46.6,1.0,4,0.0,,500,546774,332883,666,213891,428
88
- 21,PoT,MATH-500,gpt-4o,2025/1/22,46.2,0.864,4,1.5994,,500,340960,241357,483,99603,199
89
- 22,IO,MATH-500,deepseek-r1:1.5b,2025/1/22,43.8,1.0,4,0.0,,500,1022548,157049,314,865499,1731
90
- 23,PoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,42.6,0.802,4,0.2839,,500,503596,253879,508,249717,499
91
- 24,IO,MATH-500,gpt-4o,2025/1/22,41.8,1.0,4,2.7907,,500,394447,153832,308,240615,481
92
- 25,CoT,MATH-500,gpt-3.5-turbo,2025/1/7,39.8,1.0,4,0.3189,,500,432196,329381,659,102815,206
93
- 26,PoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,39.6,0.744,4,0.0,,500,408812,258549,517,150263,301
94
- 27,IO,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,38.6,1.0,4,0.0,,500,503934,155563,311,348371,697
95
- 28,SC-CoT,MATH-500,deepseek-r1:1.5b,2025/2/10,38.0,1.0,4,0.0,,500,14742109,7080559,14161,7661550,15323
96
- 29,IO,MATH-500,Doubao-lite-32k,2025/1/7,37.4,1.0,4,0.0187,,500,311730,166870,334,144860,290
97
- 30,SC-CoT,MATH-500,gpt-4o,2025/1/22,34.4,1.0,4,19.6538,,500,3455323,1986584,3973,1468739,2937
98
- 31,PoT,MATH-500,Doubao-lite-32k,2025/1/7,32.6,0.68,4,0.0144,,500,303148,254377,509,48771,98
99
- 32,SC-CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,30.2,1.0,4,0.0,,500,5034937,3546673,7093,1488264,2977
100
  33,ReAct-Pro*,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,28.8,1.0,4,0.0,,500,8763629,7486706,14973,1276923,2554
101
  34,PoT,MATH-500,gpt-3.5-turbo,2025/1/7,28.8,0.838,4,0.168,,500,271916,239902,480,32014,64
102
- 35,SC-CoT,MATH-500,gpt-3.5-turbo,2025/1/7,28.8,1.0,4,1.9764,,500,2238812,1381818,2764,856994,1714
103
- 36,CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.8,1.0,4,0.0,,500,625568,342879,686,282689,565
104
- 37,PoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.4,0.684,4,0.0,,500,462271,253879,508,208392,417
105
- 38,ReAct-Pro*,MATH-500,deepseek-r1:1.5b,2025/2/10,24.4,1.0,4,0.0,,500,30177348,20729970,41460,9447378,18895
106
- 39,ReAct-Pro*,MATH-500,gpt-3.5-turbo,2025/1/7,23.8,1.0,4,2.0406,,500,3832714,3708461,7417,124253,249
107
- 40,IO,MATH-500,Internllm2_5-7B,2025/1/22,22.8,1.0,4,0.0,,500,467888,201883,404,266005,532
108
  41,IO,MATH-500,gpt-3.5-turbo,2025/1/7,17.2,1.0,4,0.2436,,500,265625,154881,310,110744,221
109
  42,CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,15.2,1.0,4,0.0,,500,536377,349049,698,187328,375
110
  43,PoT,MATH-500,Internllm2_5-7B,2025/1/22,15.0,0.324,4,0.0,,500,368709,247883,496,120826,242
111
  44,ReAct-Pro*,MATH-500,Internllm2_5-7B,2025/1/22,14.8,1.0,4,0.0,,500,14186105,11831496,23663,2354609,4709
112
  45,ToT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,10.8,1.0,4,9.0421,,500,16039361,15657730,31315,381631,763
113
- 46,SC-CoT,MATH-500,Internllm2_5-7B,2025/1/22,9.8,1.0,4,0.0,,500,5838466,4193296,8387,1645170,3290
114
- 47,ToT,MATH-500,gpt-3.5-turbo,2025/1/7,9.8,1.0,4,5.2914,,500,10001767,9711244,19422,290523,581
115
  48,ReAct-Pro*,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,8.2,1.0,4,0.0,,500,8987061,8430774,16862,556287,1113
116
  49,IO,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,7.0,1.0,4,0.0,,500,413878,158777,318,255101,510
117
  50,CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,6.2,1.0,4,0.0,,500,549188,349049,698,200139,400
118
- 51,SC-CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,3.8,0.99,4,0.0,,500,5569442,3832429,7665,1737013,3474
119
- 52,ToT,MATH-500,gpt-4o,2025/1/22,3.2,1.0,4,40.8094,,500,15242432,14881985,29764,360447,721
120
- 53,IO,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,2.6,1.0,4,0.0,,500,429330,159049,318,270281,541
121
- 54,ToT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,1.8,0.908,4,0.0,,500,9035000,7729000,15458,1306000,2612
122
- 55,ToT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,1.4,0.698,4,8.2699,,500,14669500,14099500,28199,570000,1140
123
- 56,ToT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,1.4,0.916,4,0.0,,500,10167500,9749000,19498,418500,837
124
- 57,ToT,MATH-500,Doubao-lite-32k,2025/1/7,1.2,0.942,4,0.2371,,500,5564500,5338500,10677,226000,452
125
- 58,PoT,MATH-500,deepseek-r1:1.5b,2025/2/10,1.0,0.016,4,0.0,,500,1031067,245549,491,785518,1571
126
- 59,PoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.022,4,0.0,,500,786870,248509,497,538361,1077
127
- 60,SC-CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.8,1.0,4,0.0,,500,6862056,4448663,8897,2413393,4827
128
  61,ToT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.972,4,0.0,,500,4535000,4408000,8816,127000,254
129
  62,ReAct-Pro*,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.6,1.0,4,0.0,,500,19442440,18137392,36275,1305048,2610
130
  63,ToT,MATH-500,deepseek-r1:1.5b,2025/2/10,0.4,0.716,4,0.0,,500,1941500,1831000,3662,110500,221
131
  64,ToT,MATH-500,Internllm2_5-7B,2025/1/22,0.2,0.99,4,0.0,,500,8350500,7515000,15030,835500,1671
132
  65,PoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,4,0.0,,500,437202,253549,507,183653,367
133
  66,ToT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.962,4,0.0,,500,5996500,5590500,11181,406000,812
134
- 1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.07,1.0,8,6.2005,,1319,10998794,8413717,6379,2585077,1960
135
- 2,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8,4.5367,,1319,1165166,948668,719,216498,164
136
- 3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8,0.687,,1319,1218665,990168,751,228497,173
137
- 4,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,93.86,1.0,8,5.9858,,1319,10618008,8136223,6168,2481785,1882
138
- 5,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8,4.2166,,1319,1247912,1101672,835,146240,111
139
- 6,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8,0.7195,,1319,1276252,1005119,762,271133,206
140
- 7,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8,0.7054,,1319,1251210,1106682,839,144528,110
141
- 8,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8,0.4709,,1319,835275,583916,443,251359,191
142
- 9,ToT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,91.89,1.0,8,20.8753,,1319,37029687,35096810,26609,1932877,1465
143
- 10,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,91.13,1.0,8,0.0,,1319,11140985,8586888,6510,2554097,1936
144
- 11,ToT,gsm8k,gpt-4o,2025/1/22,91.13,1.0,8,86.8581,,1319,30769735,29445237,22324,1324498,1004
145
- 12,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.3,0.9992,8,31.0542,,1319,5798173,3590336,2722,2207837,1674
146
- 13,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8,0.0558,,1319,1201820,1042095,790,159725,121
147
- 14,ToT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,88.88,1.0,8,23.5911,,1319,41847148,40435361,30656,1411787,1070
148
- 15,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8,3.3463,,1319,741446,542416,411,199030,151
149
- 16,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8,10.1124,,1319,17937864,17038928,12918,898936,682
150
- 17,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8,10.5479,,1319,18710437,18160983,13769,549454,417
151
- 18,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,87.26,0.9992,8,0.2083,,1319,3888813,2691714,2041,1197099,908
152
  19,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8,0.4899,,1319,869060,555340,421,313720,238
153
  20,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8,0.0,,1319,1290805,1046008,793,244797,186
154
  21,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8,0.2512,,1319,5998639,5862016,4444,136623,104
155
  22,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8,0.0,,1319,14850914,14355752,10884,495162,375
156
- 23,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,79.91,0.9992,8,3.3938,,1319,4089612,2740652,2078,1348960,1023
157
- 24,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8,0.0576,,1319,1288055,1170038,887,118017,89
158
- 25,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8,0.6788,,1319,1088041,953242,723,134799,102
159
- 26,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8,0.0,,1319,1202163,968163,734,234000,177
160
- 27,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8,0.6902,,1319,1187080,1090418,827,96662,73
161
- 28,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8,0.0,,1319,1248329,990168,751,258161,196
162
- 29,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8,3.4633,,1319,6646286,6506164,4933,140122,106
163
- 30,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,73.46,0.9955,8,0.0,,1319,11778716,8630514,6543,3148202,2387
164
- 31,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8,0.9736,,1319,1727044,1126025,854,601019,456
165
- 32,ToT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,72.21,0.9901,8,0.0,,1319,31657319,20196528,15312,11460791,8689
166
- 33,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8,0.0354,,1319,740483,617377,468,123106,93
167
- 34,CoT,gsm8k,deepseek-r1:1.5b,2025/1/23,70.66,0.9977,8,0.0,,1319,2090625,1011714,767,1078911,818
168
  35,ToT,gsm8k,gpt-3.5-turbo,2025/1/7,67.93,0.997,8,9.1707,,1319,16727175,15920037,12070,807138,612
169
  36,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8,0.0,,1319,22835767,21044978,15955,1790789,1358
170
  37,ToT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,65.05,0.9196,8,0.0,,1319,16432102,15554967,11793,877135,665
@@ -174,8 +174,8 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
174
  41,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8,0.0,,1319,887913,596229,452,291684,221
175
  42,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8,0.0,,1319,1745429,550941,418,1194488,906
176
  43,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8,0.0,,1319,1218525,1032818,783,185707,141
177
- 44,SC-CoT,gsm8k,deepseek-r1:1.5b,2025/2/10,55.34,0.997,8,0.0,,1319,25785865,14540096,11024,11245769,8526
178
- 45,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,48.22,0.9841,8,0.0,,1319,14526431,10678792,8096,3847639,2917
179
  46,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8,0.0,,1319,1391111,1147538,870,243573,185
180
  47,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8,0.0,,1319,1324949,1136843,862,188106,143
181
  48,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8,0.3328,,1319,586553,546990,415,39563,30
@@ -191,9 +191,9 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
191
  58,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8,0.0,,1319,736996,568530,431,168466,128
192
  59,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8,0.0,,1319,834897,568116,431,266781,202
193
  60,PoT,gsm8k,deepseek-r1:1.5b,2025/2/10,11.9,0.1744,8,0.0,,1319,1954509,1138872,863,815637,618
194
- 61,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,11.75,0.9189,8,0.0,,1319,12411942,9066115,6873,3345827,2537
195
- 62,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8,0.0,,1319,1113728,679302,515,434426,329
196
- 63,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.63,0.1691,8,0.0,,1319,1389135,1151528,873,237607,180
197
  64,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8,0.0,,1319,55392611,52431343,39751,2961268,2245
198
- 65,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,1.67,0.9469,8,0.0,,1319,16465720,11019864,8355,5445856,4129
199
  66,ToT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,8,0.0,,1319,0,0,0,0,0
 
1
  Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
2
+ 1,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0,0.0808,,254,143289,25143,99,118146,465
3
+ 2,SC-CoT,AQuA,gpt-4o,2025/1/22,85.83,0.9921,0,5.2456,,254,545431,27829,110,517602,2038
4
+ 3,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.43,0.9685,0,0.4186,,254,742552,137990,543,604562,2380
5
+ 4,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,84.65,0.9961,0,0.4438,,254,787312,175050,689,612262,2410
6
+ 5,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0,0.0742,,254,131604,25397,100,106207,418
7
+ 6,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0,0.0927,,254,164389,32555,128,131834,519
8
+ 7,ToT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.07,1.0,0,2.9404,,254,5215848,4735188,18642,480660,1892
9
+ 8,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0,0.0798,,254,141567,32809,129,108758,428
10
+ 9,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0,0.0066,,254,94577,27978,110,66599,262
11
+ 10,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0,1.0417,,254,123017,25123,99,97894,385
12
  11,ToT,AQuA,gpt-4o,2025/1/22,81.5,0.9921,0,8.5295,,254,2613607,2347538,9242,266069,1048
13
+ 12,ToT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,81.1,0.9921,0,3.7389,,254,6632255,6371642,25085,260613,1026
14
+ 13,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0,0.0,,254,149736,33017,130,116719,460
15
+ 14,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0,0.1746,,254,309799,240735,948,69064,272
16
+ 15,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,79.53,1.0,0,0.0,,254,745410,177972,701,567438,2234
17
+ 16,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0,0.0058,,254,87742,33058,130,54684,215
18
+ 17,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0,0.768,,254,1362379,1119143,4406,243236,958
19
+ 18,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9843,0,0.0,,254,137771,33271,131,104500,411
20
+ 19,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.56,0.9606,0,0.0445,,254,1032841,977890,3850,54951,216
21
+ 20,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,76.37,0.9173,0,0.0279,,254,356839,31703,125,325136,1280
22
  21,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0,1.1453,,254,133752,25631,101,108121,426
23
  22,PoT,AQuA,gpt-4o,2025/1/22,75.2,1.0,0,1.6087,,254,327908,222717,877,105191,414
24
  23,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.2,1.0,0,0.1645,,254,291764,249215,981,42549,168
 
28
  27,CoT,AQuA,deepseek-r1:1.5b,2025/1/23,71.65,0.9685,0,0.0,,254,333072,26413,104,306659,1207
29
  28,IO,AQuA,deepseek-r1:1.5b,2025/1/22,68.9,0.9488,0,0.0,,254,351767,26667,105,325100,1280
30
  29,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0,0.0,,254,313728,264517,1041,49211,194
31
+ 30,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0,0.4928,,254,903587,862614,3396,40973,161
32
+ 31,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0,0.0957,,254,80793,25447,100,55346,218
33
+ 32,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0,0.0,,254,144435,32555,128,111880,440
34
+ 33,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0,0.1748,,254,266654,225162,886,41492,163
35
+ 34,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.45,0.9567,0,0.0,,254,690077,145108,571,544969,2146
36
+ 35,ToT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.06,1.0,0,0.0,,254,5739684,4896222,19276,843462,3321
37
+ 36,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,58.66,0.9252,0,0.3277,,254,237066,27906,110,209160,823
38
+ 37,SC-CoT,AQuA,deepseek-r1:1.5b,2025/2/10,57.87,0.7402,0,0.0,,254,2132111,144710,570,1987401,7824
39
  38,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0,2.304,,254,692096,615589,2424,76507,301
40
  39,ToT,AQuA,gpt-3.5-turbo,2025/1/7,57.09,0.9961,0,1.1513,,254,2001396,1850767,7286,150629,593
41
  40,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0,0.0,,254,4340821,3764723,14822,576098,2268
 
48
  47,ToT,AQuA,Doubao-lite-32k,2025/1/7,45.28,0.7402,0,0.0881,,254,2000550,1850249,7284,150301,592
49
  48,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0,0.0,,254,4428801,3592039,14142,836762,3294
50
  49,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0,0.0,,254,110040,30477,120,79563,313
51
+ 50,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0,0.038,,254,42471,25701,101,16770,66
52
+ 51,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,38.58,0.9724,0,0.0,,254,879671,264557,1042,615114,2422
53
  52,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0,0.0,,254,290914,240613,947,50301,198
54
  53,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0,0.0,,254,301962,233505,919,68457,270
55
  54,ToT,AQuA,Internllm2_5-7B,2025/1/22,35.83,0.9961,0,0.0,,254,4734560,4263136,16784,471424,1856
 
62
  61,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0,0.0,,254,5072004,4555858,17936,516146,2032
63
  62,ToT,AQuA,deepseek-r1:1.5b,2025/2/10,24.8,0.5551,0,0.0,,254,794512,605028,2382,189484,746
64
  63,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0,0.0,,254,7170087,6344167,24977,825920,3252
65
+ 64,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0,0.0,,254,322281,258867,1019,63414,250
66
+ 65,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.8228,0,0.0,,254,753913,150787,594,603126,2375
67
+ 66,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,10.63,0.5157,0,0.0,,254,701980,151410,596,550570,2168
68
  1,CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,80.2,1.0,4,0.349,,500,619015,338549,677,280466,561
69
+ 2,SC-CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,79.8,1.0,4,1.8504,,500,3282349,1775395,3551,1506954,3014
70
+ 3,SC-CoT,MATH-500,gpt-4o,2025/1/22,74.6,1.0,4,12.3611,,500,1495125,345347,691,1149778,2300
71
+ 4,SC-CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,72.4,1.0,4,1.7845,,500,3165511,1797045,3594,1368466,2737
72
+ 5,CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,71.2,1.0,4,0.3463,,500,614221,342879,686,271342,543
73
+ 6,SC-CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,71.2,1.0,4,0.0,,500,3155475,1855922,3712,1299553,2599
74
+ 7,IO,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,70.2,1.0,4,0.2506,,500,444591,169549,339,275042,550
75
+ 8,CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,69.8,1.0,4,0.0,,500,617204,354049,708,263155,526
76
+ 9,IO,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,69.4,1.0,4,0.2386,,500,423216,155879,312,267337,535
77
+ 10,CoT,MATH-500,gpt-4o,2025/1/22,68.0,1.0,4,3.0569,,500,552688,329332,659,223356,447
78
+ 11,SC-CoT,MATH-500,Doubao-lite-32k,2025/1/7,65.8,0.998,4,0.0734,,500,1078003,362390,725,715613,1431
79
+ 12,ReAct-Pro*,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,64.6,1.0,4,3.1806,,500,5641879,5223611,10447,418268,837
80
+ 13,ReAct-Pro*,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,62.8,1.0,4,3.4541,,500,6127117,5747268,11495,379849,760
81
+ 14,IO,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,59.4,1.0,4,0.0,,500,411362,169549,339,241813,484
82
+ 15,CoT,MATH-500,Doubao-lite-32k,2025/1/7,59.0,1.0,4,0.0255,,500,479941,336370,673,143571,287
83
+ 16,ReAct-Pro*,MATH-500,gpt-4o,2025/1/22,54.0,1.0,4,17.7735,,500,6153255,5834537,11669,318718,637
84
+ 17,CoT,MATH-500,deepseek-r1:1.5b,2025/1/23,49.4,1.0,4,0.0,,500,1199129,341549,683,857580,1715
85
+ 18,ReAct-Pro*,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,48.8,1.0,4,0.0,,500,4990240,4646708,9293,343532,687
86
+ 19,ReAct-Pro*,MATH-500,Doubao-lite-32k,2025/1/7,47.2,1.0,4,0.186,,500,4388666,4234620,8469,154046,308
87
+ 20,PoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,47.2,0.822,4,0.233,,500,413372,242549,485,170823,342
88
+ 21,SC-CoT,MATH-500,deepseek-r1:1.5b,2025/2/10,46.8,0.992,4,0.0,,500,13968168,1858874,3718,12109294,24219
89
+ 22,CoT,MATH-500,Internllm2_5-7B,2025/1/22,46.6,1.0,4,0.0,,500,546774,332883,666,213891,428
90
+ 23,PoT,MATH-500,gpt-4o,2025/1/22,46.2,0.864,4,1.5994,,500,340960,241357,483,99603,199
91
+ 24,IO,MATH-500,deepseek-r1:1.5b,2025/1/22,43.8,1.0,4,0.0,,500,1022548,157049,314,865499,1731
92
+ 25,PoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,42.6,0.802,4,0.2839,,500,503596,253879,508,249717,499
93
+ 26,IO,MATH-500,gpt-4o,2025/1/22,41.8,1.0,4,2.7907,,500,394447,153832,308,240615,481
94
+ 27,SC-CoT,MATH-500,gpt-3.5-turbo,2025/1/7,40.8,1.0,4,1.2308,,500,1050819,345411,691,705408,1411
95
+ 28,CoT,MATH-500,gpt-3.5-turbo,2025/1/7,39.8,1.0,4,0.3189,,500,432196,329381,659,102815,206
96
+ 29,PoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,39.6,0.744,4,0.0,,500,408812,258549,517,150263,301
97
+ 30,IO,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,38.6,1.0,4,0.0,,500,503934,155563,311,348371,697
98
+ 31,IO,MATH-500,Doubao-lite-32k,2025/1/7,37.4,1.0,4,0.0187,,500,311730,166870,334,144860,290
99
+ 32,PoT,MATH-500,Doubao-lite-32k,2025/1/7,32.6,0.68,4,0.0144,,500,303148,254377,509,48771,98
100
  33,ReAct-Pro*,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,28.8,1.0,4,0.0,,500,8763629,7486706,14973,1276923,2554
101
  34,PoT,MATH-500,gpt-3.5-turbo,2025/1/7,28.8,0.838,4,0.168,,500,271916,239902,480,32014,64
102
+ 35,CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.8,1.0,4,0.0,,500,625568,342879,686,282689,565
103
+ 36,PoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.4,0.684,4,0.0,,500,462271,253879,508,208392,417
104
+ 37,ReAct-Pro*,MATH-500,deepseek-r1:1.5b,2025/2/10,24.4,1.0,4,0.0,,500,30177348,20729970,41460,9447378,18895
105
+ 38,ReAct-Pro*,MATH-500,gpt-3.5-turbo,2025/1/7,23.8,1.0,4,2.0406,,500,3832714,3708461,7417,124253,249
106
+ 39,IO,MATH-500,Internllm2_5-7B,2025/1/22,22.8,1.0,4,0.0,,500,467888,201883,404,266005,532
107
+ 40,SC-CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,19.8,0.998,4,0.0,,500,3490834,1734545,3469,1756289,3513
108
  41,IO,MATH-500,gpt-3.5-turbo,2025/1/7,17.2,1.0,4,0.2436,,500,265625,154881,310,110744,221
109
  42,CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,15.2,1.0,4,0.0,,500,536377,349049,698,187328,375
110
  43,PoT,MATH-500,Internllm2_5-7B,2025/1/22,15.0,0.324,4,0.0,,500,368709,247883,496,120826,242
111
  44,ReAct-Pro*,MATH-500,Internllm2_5-7B,2025/1/22,14.8,1.0,4,0.0,,500,14186105,11831496,23663,2354609,4709
112
  45,ToT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,10.8,1.0,4,9.0421,,500,16039361,15657730,31315,381631,763
113
+ 46,ToT,MATH-500,gpt-3.5-turbo,2025/1/7,9.8,1.0,4,5.2914,,500,10001767,9711244,19422,290523,581
114
+ 47,SC-CoT,MATH-500,Internllm2_5-7B,2025/1/22,9.2,0.974,4,0.0,,500,3249876,1994983,3990,1254893,2510
115
  48,ReAct-Pro*,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,8.2,1.0,4,0.0,,500,8987061,8430774,16862,556287,1113
116
  49,IO,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,7.0,1.0,4,0.0,,500,413878,158777,318,255101,510
117
  50,CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,6.2,1.0,4,0.0,,500,549188,349049,698,200139,400
118
+ 51,ToT,MATH-500,gpt-4o,2025/1/22,3.2,1.0,4,40.8094,,500,15242432,14881985,29764,360447,721
119
+ 52,IO,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,2.6,1.0,4,0.0,,500,429330,159049,318,270281,541
120
+ 53,SC-CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,2.2,0.988,4,0.0,,500,2797682,1808691,3617,988991,1978
121
+ 54,SC-CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,2.0,0.894,4,0.0,,500,3139024,1805170,3610,1333854,2668
122
+ 55,ToT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,1.8,0.908,4,0.0,,500,9035000,7729000,15458,1306000,2612
123
+ 56,ToT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,1.4,0.698,4,8.2699,,500,14669500,14099500,28199,570000,1140
124
+ 57,ToT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,1.4,0.916,4,0.0,,500,10167500,9749000,19498,418500,837
125
+ 58,ToT,MATH-500,Doubao-lite-32k,2025/1/7,1.2,0.942,4,0.2371,,500,5564500,5338500,10677,226000,452
126
+ 59,PoT,MATH-500,deepseek-r1:1.5b,2025/2/10,1.0,0.016,4,0.0,,500,1031067,245549,491,785518,1571
127
+ 60,PoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.022,4,0.0,,500,786870,248509,497,538361,1077
128
  61,ToT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.972,4,0.0,,500,4535000,4408000,8816,127000,254
129
  62,ReAct-Pro*,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.6,1.0,4,0.0,,500,19442440,18137392,36275,1305048,2610
130
  63,ToT,MATH-500,deepseek-r1:1.5b,2025/2/10,0.4,0.716,4,0.0,,500,1941500,1831000,3662,110500,221
131
  64,ToT,MATH-500,Internllm2_5-7B,2025/1/22,0.2,0.99,4,0.0,,500,8350500,7515000,15030,835500,1671
132
  65,PoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,4,0.0,,500,437202,253549,507,183653,367
133
  66,ToT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.962,4,0.0,,500,5996500,5590500,11181,406000,812
134
+ 1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.22,1.0,8,3.7895,,1319,6722014,5295585,4015,1426429,1081
135
+ 2,SC-CoT,gsm8k,gpt-4o,2025/1/22,94.77,1.0,8,18.2044,,1319,2491605,894889,678,1596716,1211
136
+ 3,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,94.77,1.0,8,4.045,,1319,7175258,5370360,4072,1804898,1368
137
+ 4,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8,4.5367,,1319,1165166,948668,719,216498,164
138
+ 5,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8,0.687,,1319,1218665,990168,751,228497,173
139
+ 6,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8,4.2166,,1319,1247912,1101672,835,146240,111
140
+ 7,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8,0.7195,,1319,1276252,1005119,762,271133,206
141
+ 8,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8,0.7054,,1319,1251210,1106682,839,144528,110
142
+ 9,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8,0.4709,,1319,835275,583916,443,251359,191
143
+ 10,ToT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,91.89,1.0,8,20.8753,,1319,37029687,35096810,26609,1932877,1465
144
+ 11,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,91.58,0.9992,8,0.1118,,1319,1835891,942182,714,893709,678
145
+ 12,ToT,gsm8k,gpt-4o,2025/1/22,91.13,1.0,8,86.8581,,1319,30769735,29445237,22324,1324498,1004
146
+ 13,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,90.98,1.0,8,0.0,,1319,7259943,5580524,4231,1679419,1273
147
+ 14,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8,0.0558,,1319,1201820,1042095,790,159725,121
148
+ 15,ToT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,88.88,1.0,8,23.5911,,1319,41847148,40435361,30656,1411787,1070
149
+ 16,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8,3.3463,,1319,741446,542416,411,199030,151
150
+ 17,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8,10.1124,,1319,17937864,17038928,12918,898936,682
151
+ 18,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8,10.5479,,1319,18710437,18160983,13769,549454,417
152
  19,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8,0.4899,,1319,869060,555340,421,313720,238
153
  20,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8,0.0,,1319,1290805,1046008,793,244797,186
154
  21,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8,0.2512,,1319,5998639,5862016,4444,136623,104
155
  22,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8,0.0,,1319,14850914,14355752,10884,495162,375
156
+ 23,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8,0.0576,,1319,1288055,1170038,887,118017,89
157
+ 24,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8,0.6788,,1319,1088041,953242,723,134799,102
158
+ 25,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8,0.0,,1319,1202163,968163,734,234000,177
159
+ 26,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8,0.6902,,1319,1187080,1090418,827,96662,73
160
+ 27,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8,0.0,,1319,1248329,990168,751,258161,196
161
+ 28,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8,3.4633,,1319,6646286,6506164,4933,140122,106
162
+ 29,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8,0.9736,,1319,1727044,1126025,854,601019,456
163
+ 30,ToT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,72.21,0.9901,8,0.0,,1319,31657319,20196528,15312,11460791,8689
164
+ 31,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8,0.0354,,1319,740483,617377,468,123106,93
165
+ 32,CoT,gsm8k,deepseek-r1:1.5b,2025/1/23,70.66,0.9977,8,0.0,,1319,2090625,1011714,767,1078911,818
166
+ 33,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,69.29,0.9879,8,2.5203,,1319,2277249,895571,679,1381678,1048
167
+ 34,SC-CoT,gsm8k,deepseek-r1:1.5b,2025/2/10,69.07,0.9879,8,0.0,,1319,10029684,5407357,4100,4622327,3504
168
  35,ToT,gsm8k,gpt-3.5-turbo,2025/1/7,67.93,0.997,8,9.1707,,1319,16727175,15920037,12070,807138,612
169
  36,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8,0.0,,1319,22835767,21044978,15955,1790789,1358
170
  37,ToT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,65.05,0.9196,8,0.0,,1319,16432102,15554967,11793,877135,665
 
174
  41,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8,0.0,,1319,887913,596229,452,291684,221
175
  42,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8,0.0,,1319,1745429,550941,418,1194488,906
176
  43,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8,0.0,,1319,1218525,1032818,783,185707,141
177
+ 44,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,54.36,0.9985,8,0.0,,1319,10956434,5136762,3894,5819672,4412
178
+ 45,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,44.66,0.9181,8,0.0,,1319,8162499,5847761,4433,2314738,1755
179
  46,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8,0.0,,1319,1391111,1147538,870,243573,185
180
  47,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8,0.0,,1319,1324949,1136843,862,188106,143
181
  48,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8,0.3328,,1319,586553,546990,415,39563,30
 
191
  58,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8,0.0,,1319,736996,568530,431,168466,128
192
  59,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8,0.0,,1319,834897,568116,431,266781,202
193
  60,PoT,gsm8k,deepseek-r1:1.5b,2025/2/10,11.9,0.1744,8,0.0,,1319,1954509,1138872,863,815637,618
194
+ 61,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8,0.0,,1319,1113728,679302,515,434426,329
195
+ 62,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.63,0.1691,8,0.0,,1319,1389135,1151528,873,237607,180
196
+ 63,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,8.19,0.6876,8,0.0,,1319,7386453,5439568,4124,1946885,1476
197
  64,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8,0.0,,1319,55392611,52431343,39751,2961268,2245
198
+ 65,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,4.17,0.9447,8,0.0,,1319,7478767,5441962,4126,2036805,1544
199
  66,ToT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,8,0.0,,1319,0,0,0,0,0
src/overall_math_score.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "time": "2025-02-11 13:23:00",
3
  "results": {
4
  "IO": {
5
  "META": {
@@ -84,16 +84,16 @@
84
  "Eval Date": "2025/1/7"
85
  },
86
  "gsm8k": {
87
- "Score": 79.91,
88
- "Cost($)": 3.3938
89
  },
90
  "AQuA": {
91
- "Score": 66.14,
92
- "Cost($)": 0.7888
93
  },
94
  "MATH-500": {
95
- "Score": 28.8,
96
- "Cost($)": 1.9764
97
  }
98
  },
99
  "ToT": {
@@ -198,16 +198,16 @@
198
  "Eval Date": "2025/1/7"
199
  },
200
  "gsm8k": {
201
- "Score": 87.26,
202
- "Cost($)": 0.2083
203
  },
204
  "AQuA": {
205
- "Score": 81.1,
206
- "Cost($)": 0.0519
207
  },
208
  "MATH-500": {
209
- "Score": 49.2,
210
- "Cost($)": 0.1406
211
  }
212
  },
213
  "ToT-Doubao-lite-32k": {
@@ -312,16 +312,16 @@
312
  "Eval Date": "2025/1/22"
313
  },
314
  "gsm8k": {
315
- "Score": 90.3,
316
- "Cost($)": 31.0542
317
  },
318
  "AQuA": {
319
- "Score": 86.61,
320
- "Cost($)": 8.1485
321
  },
322
  "MATH-500": {
323
- "Score": 34.4,
324
- "Cost($)": 19.6538
325
  }
326
  },
327
  "ToT-gpt-4o": {
@@ -426,16 +426,16 @@
426
  "Eval Date": "2025/1/22"
427
  },
428
  "gsm8k": {
429
- "Score": 93.86,
430
- "Cost($)": 5.9858
431
  },
432
  "AQuA": {
433
- "Score": 85.04,
434
- "Cost($)": 1.0348
435
  },
436
  "MATH-500": {
437
- "Score": 74.0,
438
- "Cost($)": 3.1556
439
  }
440
  },
441
  "ToT-Qwen2.5-72B-Instruct": {
@@ -540,16 +540,16 @@
540
  "Eval Date": "2025/1/22"
541
  },
542
  "gsm8k": {
543
- "Score": 95.07,
544
- "Cost($)": 6.2005
545
  },
546
  "AQuA": {
547
- "Score": 82.28,
548
- "Cost($)": 1.0756
549
  },
550
  "MATH-500": {
551
- "Score": 74.2,
552
- "Cost($)": 3.2239
553
  }
554
  },
555
  "ToT-Llama-3.3-70B-Instruct": {
@@ -654,15 +654,15 @@
654
  "Eval Date": "2025/1/22"
655
  },
656
  "gsm8k": {
657
- "Score": 91.13,
658
  "Cost($)": 0.0
659
  },
660
  "AQuA": {
661
- "Score": 79.92,
662
  "Cost($)": 0.0
663
  },
664
  "MATH-500": {
665
- "Score": 67.0,
666
  "Cost($)": 0.0
667
  }
668
  },
@@ -768,7 +768,7 @@
768
  "Eval Date": "2025/1/22"
769
  },
770
  "gsm8k": {
771
- "Score": 73.46,
772
  "Cost($)": 0.0
773
  },
774
  "AQuA": {
@@ -776,7 +776,7 @@
776
  "Cost($)": 0.0
777
  },
778
  "MATH-500": {
779
- "Score": 30.2,
780
  "Cost($)": 0.0
781
  }
782
  },
@@ -882,15 +882,15 @@
882
  "Eval Date": "2025/1/22"
883
  },
884
  "gsm8k": {
885
- "Score": 48.22,
886
  "Cost($)": 0.0
887
  },
888
  "AQuA": {
889
- "Score": 39.37,
890
  "Cost($)": 0.0
891
  },
892
  "MATH-500": {
893
- "Score": 9.8,
894
  "Cost($)": 0.0
895
  }
896
  },
@@ -996,15 +996,15 @@
996
  "Eval Date": "2025/1/22"
997
  },
998
  "gsm8k": {
999
- "Score": 11.75,
1000
  "Cost($)": 0.0
1001
  },
1002
  "AQuA": {
1003
- "Score": 23.62,
1004
  "Cost($)": 0.0
1005
  },
1006
  "MATH-500": {
1007
- "Score": 3.8,
1008
  "Cost($)": 0.0
1009
  }
1010
  },
@@ -1110,15 +1110,15 @@
1110
  "Eval Date": "2025/1/22"
1111
  },
1112
  "gsm8k": {
1113
- "Score": 1.67,
1114
  "Cost($)": 0.0
1115
  },
1116
  "AQuA": {
1117
- "Score": 22.83,
1118
  "Cost($)": 0.0
1119
  },
1120
  "MATH-500": {
1121
- "Score": 0.8,
1122
  "Cost($)": 0.0
1123
  }
1124
  },
@@ -1224,15 +1224,15 @@
1224
  "Eval Date": "2025/2/10"
1225
  },
1226
  "gsm8k": {
1227
- "Score": 55.34,
1228
  "Cost($)": 0.0
1229
  },
1230
  "AQuA": {
1231
- "Score": 59.06,
1232
  "Cost($)": 0.0
1233
  },
1234
  "MATH-500": {
1235
- "Score": 38.0,
1236
  "Cost($)": 0.0
1237
  }
1238
  },
 
1
  {
2
+ "time": "2025-03-05 13:15:02",
3
  "results": {
4
  "IO": {
5
  "META": {
 
84
  "Eval Date": "2025/1/7"
85
  },
86
  "gsm8k": {
87
+ "Score": 69.29,
88
+ "Cost($)": 2.5203
89
  },
90
  "AQuA": {
91
+ "Score": 58.66,
92
+ "Cost($)": 0.3277
93
  },
94
  "MATH-500": {
95
+ "Score": 40.8,
96
+ "Cost($)": 1.2308
97
  }
98
  },
99
  "ToT": {
 
198
  "Eval Date": "2025/1/7"
199
  },
200
  "gsm8k": {
201
+ "Score": 91.58,
202
+ "Cost($)": 0.1118
203
  },
204
  "AQuA": {
205
+ "Score": 76.37,
206
+ "Cost($)": 0.0279
207
  },
208
  "MATH-500": {
209
+ "Score": 65.8,
210
+ "Cost($)": 0.0734
211
  }
212
  },
213
  "ToT-Doubao-lite-32k": {
 
312
  "Eval Date": "2025/1/22"
313
  },
314
  "gsm8k": {
315
+ "Score": 94.77,
316
+ "Cost($)": 18.2044
317
  },
318
  "AQuA": {
319
+ "Score": 85.83,
320
+ "Cost($)": 5.2456
321
  },
322
  "MATH-500": {
323
+ "Score": 74.6,
324
+ "Cost($)": 12.3611
325
  }
326
  },
327
  "ToT-gpt-4o": {
 
426
  "Eval Date": "2025/1/22"
427
  },
428
  "gsm8k": {
429
+ "Score": 94.77,
430
+ "Cost($)": 4.045
431
  },
432
  "AQuA": {
433
+ "Score": 85.43,
434
+ "Cost($)": 0.4186
435
  },
436
  "MATH-500": {
437
+ "Score": 79.8,
438
+ "Cost($)": 1.8504
439
  }
440
  },
441
  "ToT-Qwen2.5-72B-Instruct": {
 
540
  "Eval Date": "2025/1/22"
541
  },
542
  "gsm8k": {
543
+ "Score": 95.22,
544
+ "Cost($)": 3.7895
545
  },
546
  "AQuA": {
547
+ "Score": 84.65,
548
+ "Cost($)": 0.4438
549
  },
550
  "MATH-500": {
551
+ "Score": 72.4,
552
+ "Cost($)": 1.7845
553
  }
554
  },
555
  "ToT-Llama-3.3-70B-Instruct": {
 
654
  "Eval Date": "2025/1/22"
655
  },
656
  "gsm8k": {
657
+ "Score": 90.98,
658
  "Cost($)": 0.0
659
  },
660
  "AQuA": {
661
+ "Score": 79.53,
662
  "Cost($)": 0.0
663
  },
664
  "MATH-500": {
665
+ "Score": 71.2,
666
  "Cost($)": 0.0
667
  }
668
  },
 
768
  "Eval Date": "2025/1/22"
769
  },
770
  "gsm8k": {
771
+ "Score": 54.36,
772
  "Cost($)": 0.0
773
  },
774
  "AQuA": {
 
776
  "Cost($)": 0.0
777
  },
778
  "MATH-500": {
779
+ "Score": 19.8,
780
  "Cost($)": 0.0
781
  }
782
  },
 
882
  "Eval Date": "2025/1/22"
883
  },
884
  "gsm8k": {
885
+ "Score": 44.66,
886
  "Cost($)": 0.0
887
  },
888
  "AQuA": {
889
+ "Score": 38.58,
890
  "Cost($)": 0.0
891
  },
892
  "MATH-500": {
893
+ "Score": 9.2,
894
  "Cost($)": 0.0
895
  }
896
  },
 
996
  "Eval Date": "2025/1/22"
997
  },
998
  "gsm8k": {
999
+ "Score": 8.19,
1000
  "Cost($)": 0.0
1001
  },
1002
  "AQuA": {
1003
+ "Score": 10.63,
1004
  "Cost($)": 0.0
1005
  },
1006
  "MATH-500": {
1007
+ "Score": 2.0,
1008
  "Cost($)": 0.0
1009
  }
1010
  },
 
1110
  "Eval Date": "2025/1/22"
1111
  },
1112
  "gsm8k": {
1113
+ "Score": 4.17,
1114
  "Cost($)": 0.0
1115
  },
1116
  "AQuA": {
1117
+ "Score": 17.32,
1118
  "Cost($)": 0.0
1119
  },
1120
  "MATH-500": {
1121
+ "Score": 2.2,
1122
  "Cost($)": 0.0
1123
  }
1124
  },
 
1224
  "Eval Date": "2025/2/10"
1225
  },
1226
  "gsm8k": {
1227
+ "Score": 69.07,
1228
  "Cost($)": 0.0
1229
  },
1230
  "AQuA": {
1231
+ "Score": 57.87,
1232
  "Cost($)": 0.0
1233
  },
1234
  "MATH-500": {
1235
+ "Score": 46.8,
1236
  "Cost($)": 0.0
1237
  }
1238
  },
src/overall_results.csv CHANGED
@@ -1,20 +1,20 @@
1
  Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($),MATH-500-Score,MATH-500-Cost($)
2
- 1.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,86.43,92.87,0.7195,86.22,0.0808,80.2,0.349
3
- 2.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,84.30,93.86,5.9858,85.04,1.0348,74.0,3.1556
4
- 3.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,83.85,95.07,6.2005,82.28,1.0756,74.2,3.2239
5
- 4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,82.86,93.93,0.687,83.46,0.0927,71.2,0.3463
6
- 5.0,CoT,gpt-4o,2025/1/22,81.59,94.09,4.5367,82.68,1.0417,68.0,3.0569
7
- 6.0,IO,Llama-3.3-70B-Instruct,2025/1/22,81.45,92.27,0.4709,82.68,0.0798,69.4,0.2386
8
- 7.0,IO,Qwen2.5-72B-Instruct,2025/1/22,80.34,86.58,0.4899,84.25,0.0742,70.2,0.2506
9
- 8.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,79.35,91.13,0.0,79.92,0.0,67.0,0.0
10
- 9.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,78.73,85.67,0.0,80.71,0.0,69.8,0.0
11
- 10.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,77.12,87.64,10.1124,79.13,0.768,64.6,3.1806
12
- 11.0,CoT,Doubao-lite-32k,2025/1/7,77.00,89.31,0.0558,82.68,0.0066,59.0,0.0255
13
- 12.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,74.43,87.26,10.5479,73.23,0.3177,62.8,3.4541
14
- 13.0,SC-CoT,Doubao-lite-32k,2025/1/7,72.52,87.26,0.2083,81.1,0.0519,49.2,0.1406
15
- 14.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,71.58,92.34,0.7054,75.2,0.1645,47.2,0.233
16
- 15.0,PoT,gpt-4o,2025/1/22,71.50,93.1,4.2166,75.2,1.6087,46.2,1.5994
17
- 16.0,SC-CoT,gpt-4o,2025/1/22,70.44,90.3,31.0542,86.61,8.1485,34.4,19.6538
18
  17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,70.12,85.6,0.2512,77.56,0.0445,47.2,0.186
19
  18.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,68.69,82.87,0.0,74.41,0.0,48.8,0.0
20
  19.0,IO,gpt-4o,2025/1/22,68.60,88.4,3.3463,75.59,1.1453,41.8,2.7907
@@ -29,24 +29,24 @@ Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA
29
  28.0,IO,deepseek-r1:1.5b,2025/1/22,58.95,64.14,0.0,68.9,0.0,43.8,0.0
30
  29.0,ToT,Llama-3.3-70B-Instruct,2025/1/22,58.79,91.89,20.8753,83.07,2.9404,1.4,8.2699
31
  30.0,ToT,gpt-4o,2025/1/22,58.61,91.13,86.8581,81.5,8.5295,3.2,40.8094
32
- 31.0,SC-CoT,gpt-3.5-turbo,2025/1/7,58.28,79.91,3.3938,66.14,0.7888,28.8,1.9764
33
- 32.0,ReAct-Pro*,gpt-4o,2025/1/22,58.26,63.31,39.0751,57.48,2.304,54.0,17.7735
34
- 33.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,55.51,58.83,0.0,68.11,0.0,39.6,0.0
35
- 34.0,PoT,gpt-3.5-turbo,2025/1/7,55.04,76.88,0.6902,59.45,0.1748,28.8,0.168
36
- 35.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,54.43,74.91,3.4633,64.57,0.4928,23.8,2.0406
37
- 36.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,54.37,73.46,0.0,59.45,0.0,30.2,0.0
38
  37.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,53.96,75.44,0.0,60.63,0.0,25.8,0.0
39
- 38.0,SC-CoT,deepseek-r1:1.5b,2025/2/10,50.80,55.34,0.0,59.06,0.0,38.0,0.0
40
- 39.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,50.70,67.78,0.0,55.51,0.0,28.8,0.0
41
- 40.0,IO,Llama-3.1-8B-Instruct,2025/1/22,48.98,57.16,0.0,51.18,0.0,38.6,0.0
42
- 41.0,ToT,gpt-3.5-turbo,2025/1/7,44.94,67.93,9.1707,57.09,1.1513,9.8,5.2914
43
  42.0,ToT,Qwen2.5-7B-Instruct,2025/1/22,42.52,72.21,0.0,53.94,0.0,1.4,0.0
44
  43.0,ToT,Llama-3.1-8B-Instruct,2025/1/22,41.97,65.05,0.0,59.06,0.0,1.8,0.0
45
  44.0,ReAct-Pro*,deepseek-r1:1.5b,2025/2/10,38.22,35.94,0.0,54.33,0.0,24.4,0.0
46
  45.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,37.08,55.5,0.0,40.55,0.0,15.2,0.0
47
  46.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,33.56,38.67,0.0,36.61,0.0,25.4,0.0
48
- 47.0,SC-CoT,Internllm2_5-7B,2025/1/22,32.46,48.22,0.0,39.37,0.0,9.8,0.0
49
- 48.0,IO,gpt-3.5-turbo,2025/1/7,31.34,37.83,0.3328,38.98,0.038,17.2,0.2436
50
  49.0,PoT,Internllm2_5-7B,2025/1/22,29.94,38.21,0.0,36.61,0.0,15.0,0.0
51
  50.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,29.75,33.51,0.0,40.94,0.0,14.8,0.0
52
  51.0,ToT,Doubao-lite-32k,2025/1/7,28.10,37.83,0.8739,45.28,0.0881,1.2,0.2371
@@ -60,8 +60,8 @@ Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA
60
  59.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,16.67,18.5,0.0,30.71,0.0,0.8,0.0
61
  60.0,ToT,deepseek-r1:1.5b,2025/2/10,16.11,23.12,0.0,24.8,0.0,0.4,0.0
62
  61.0,IO,Qwen2-0.5B-Instruct,2025/1/22,14.83,14.71,0.0,27.17,0.0,2.6,0.0
63
- 62.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,13.06,11.75,0.0,23.62,0.0,3.8,0.0
64
- 63.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,10.76,7.66,0.0,24.02,0.0,0.6,0.0
65
- 64.0,ToT,Qwen2-0.5B-Instruct,2025/1/22,9.97,0.0,0.0,29.92,0.0,0.0,0.0
66
- 65.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,8.98,9.63,0.0,17.32,0.0,0.0,0.0
67
- 66.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,8.43,1.67,0.0,22.83,0.0,0.8,0.0
 
1
  Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($),MATH-500-Score,MATH-500-Cost($)
2
+ 1.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,86.67,94.77,4.045,85.43,0.4186,79.8,1.8504
3
+ 2.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,86.43,92.87,0.7195,86.22,0.0808,80.2,0.349
4
+ 3.0,SC-CoT,gpt-4o,2025/1/22,85.07,94.77,18.2044,85.83,5.2456,74.6,12.3611
5
+ 4.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,84.09,95.22,3.7895,84.65,0.4438,72.4,1.7845
6
+ 5.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,82.86,93.93,0.687,83.46,0.0927,71.2,0.3463
7
+ 6.0,CoT,gpt-4o,2025/1/22,81.59,94.09,4.5367,82.68,1.0417,68.0,3.0569
8
+ 7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,81.45,92.27,0.4709,82.68,0.0798,69.4,0.2386
9
+ 8.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,80.57,90.98,0.0,79.53,0.0,71.2,0.0
10
+ 9.0,IO,Qwen2.5-72B-Instruct,2025/1/22,80.34,86.58,0.4899,84.25,0.0742,70.2,0.2506
11
+ 10.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,78.73,85.67,0.0,80.71,0.0,69.8,0.0
12
+ 11.0,SC-CoT,Doubao-lite-32k,2025/1/7,77.92,91.58,0.1118,76.37,0.0279,65.8,0.0734
13
+ 12.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,77.12,87.64,10.1124,79.13,0.768,64.6,3.1806
14
+ 13.0,CoT,Doubao-lite-32k,2025/1/7,77.00,89.31,0.0558,82.68,0.0066,59.0,0.0255
15
+ 14.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,74.43,87.26,10.5479,73.23,0.3177,62.8,3.4541
16
+ 15.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,71.58,92.34,0.7054,75.2,0.1645,47.2,0.233
17
+ 16.0,PoT,gpt-4o,2025/1/22,71.50,93.1,4.2166,75.2,1.6087,46.2,1.5994
18
  17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,70.12,85.6,0.2512,77.56,0.0445,47.2,0.186
19
  18.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,68.69,82.87,0.0,74.41,0.0,48.8,0.0
20
  19.0,IO,gpt-4o,2025/1/22,68.60,88.4,3.3463,75.59,1.1453,41.8,2.7907
 
29
  28.0,IO,deepseek-r1:1.5b,2025/1/22,58.95,64.14,0.0,68.9,0.0,43.8,0.0
30
  29.0,ToT,Llama-3.3-70B-Instruct,2025/1/22,58.79,91.89,20.8753,83.07,2.9404,1.4,8.2699
31
  30.0,ToT,gpt-4o,2025/1/22,58.61,91.13,86.8581,81.5,8.5295,3.2,40.8094
32
+ 31.0,ReAct-Pro*,gpt-4o,2025/1/22,58.26,63.31,39.0751,57.48,2.304,54.0,17.7735
33
+ 32.0,SC-CoT,deepseek-r1:1.5b,2025/2/10,57.91,69.07,0.0,57.87,0.0,46.8,0.0
34
+ 33.0,SC-CoT,gpt-3.5-turbo,2025/1/7,56.25,69.29,2.5203,58.66,0.3277,40.8,1.2308
35
+ 34.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,55.51,58.83,0.0,68.11,0.0,39.6,0.0
36
+ 35.0,PoT,gpt-3.5-turbo,2025/1/7,55.04,76.88,0.6902,59.45,0.1748,28.8,0.168
37
+ 36.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,54.43,74.91,3.4633,64.57,0.4928,23.8,2.0406
38
  37.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,53.96,75.44,0.0,60.63,0.0,25.8,0.0
39
+ 38.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,50.70,67.78,0.0,55.51,0.0,28.8,0.0
40
+ 39.0,IO,Llama-3.1-8B-Instruct,2025/1/22,48.98,57.16,0.0,51.18,0.0,38.6,0.0
41
+ 40.0,ToT,gpt-3.5-turbo,2025/1/7,44.94,67.93,9.1707,57.09,1.1513,9.8,5.2914
42
+ 41.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,44.54,54.36,0.0,59.45,0.0,19.8,0.0
43
  42.0,ToT,Qwen2.5-7B-Instruct,2025/1/22,42.52,72.21,0.0,53.94,0.0,1.4,0.0
44
  43.0,ToT,Llama-3.1-8B-Instruct,2025/1/22,41.97,65.05,0.0,59.06,0.0,1.8,0.0
45
  44.0,ReAct-Pro*,deepseek-r1:1.5b,2025/2/10,38.22,35.94,0.0,54.33,0.0,24.4,0.0
46
  45.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,37.08,55.5,0.0,40.55,0.0,15.2,0.0
47
  46.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,33.56,38.67,0.0,36.61,0.0,25.4,0.0
48
+ 47.0,IO,gpt-3.5-turbo,2025/1/7,31.34,37.83,0.3328,38.98,0.038,17.2,0.2436
49
+ 48.0,SC-CoT,Internllm2_5-7B,2025/1/22,30.81,44.66,0.0,38.58,0.0,9.2,0.0
50
  49.0,PoT,Internllm2_5-7B,2025/1/22,29.94,38.21,0.0,36.61,0.0,15.0,0.0
51
  50.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,29.75,33.51,0.0,40.94,0.0,14.8,0.0
52
  51.0,ToT,Doubao-lite-32k,2025/1/7,28.10,37.83,0.8739,45.28,0.0881,1.2,0.2371
 
60
  59.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,16.67,18.5,0.0,30.71,0.0,0.8,0.0
61
  60.0,ToT,deepseek-r1:1.5b,2025/2/10,16.11,23.12,0.0,24.8,0.0,0.4,0.0
62
  61.0,IO,Qwen2-0.5B-Instruct,2025/1/22,14.83,14.71,0.0,27.17,0.0,2.6,0.0
63
+ 62.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,10.76,7.66,0.0,24.02,0.0,0.6,0.0
64
+ 63.0,ToT,Qwen2-0.5B-Instruct,2025/1/22,9.97,0.0,0.0,29.92,0.0,0.0,0.0
65
+ 64.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,8.98,9.63,0.0,17.32,0.0,0.0,0.0
66
+ 65.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,7.90,4.17,0.0,17.32,0.0,2.2,0.0
67
+ 66.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,6.94,8.19,0.0,10.63,0.0,2.0,0.0
src/record.csv CHANGED
@@ -43,17 +43,17 @@ CoT,gsm8k,2025/1/22,Internllm2_5-7B,77.71,99.70,8,,"1,319","968,163",734,"234,00
43
  CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.50,100.00,8,,"1,319","1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
44
  CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,"1,319","1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
45
  CoT,gsm8k,2025/1/23,deepseek-r1:1.5b,70.66,99.77,8,,"1,319","1,011,714",767,"1,078,911",818,"2,090,625",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
46
- SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,79.91,99.92,8,"temperature=1, path_num=5","1,319","2,740,652","2,078","1,348,960","1,023","4,089,612",3.3938,,,,,,,,,,,,,,,,,,,,,,,,,,,,
47
- SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,87.26,99.92,8,"temperature=1, path_num=5","1,319","2,691,714","2,041","1,197,099",908,"3,888,813",0.2083,,,,,,,,,,,,,,,,,,,,,,,,,,,,
48
- SC-CoT,gsm8k,2025/1/22,gpt-4o,90.30,99.92,8,"temperature=1, path_num=5","1,319","3,590,336","2,722","2,207,837","1,674","5,798,173",31.0542,,,,,,,,,,,,,,,,,,,,,,,,,,,,
49
- SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,93.86,100.00,8,"temperature=1, path_num=5","1,319","8,136,223","6,168","2,481,785","1,882","10,618,008",5.9858,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50
- SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.07,100.00,8,"temperature=1, path_num=5","1,319","8,413,717","6,379","2,585,077","1,960","10,998,794",6.2005,,,,,,,,,,,,,,,,,,,,,,,,,,,,
51
- SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,91.13,100.00,8,"temperature=1, path_num=5","1,319","8,586,888","6,510","2,554,097","1,936","11,140,985",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
52
- SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,73.46,99.55,8,"temperature=1, path_num=5","1,319","8,630,514","6,543","3,148,202","2,387","11,778,716",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
53
- SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,48.22,98.41,8,"temperature=1, path_num=5","1,319","10,678,792","8,096","3,847,639","2,917","14,526,431",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
54
- SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,11.75,91.89,8,"temperature=1, path_num=5","1,319","9,066,115","6,873","3,345,827","2,537","12,411,942",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
55
- SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,1.67,94.69,8,"temperature=1, path_num=5","1,319","11,019,864","8,355","5,445,856","4,129","16,465,720",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
56
- SC-CoT,gsm8k,2025/2/10,deepseek-r1:1.5b,55.34,99.70,8,"temperature=1, path_num=5","1,319","14,540,096","11,024","11,245,769","8,526","25,785,865",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
57
  ToT,gsm8k,2025/1/7,gpt-3.5-turbo,67.93,99.70,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","15,920,037","12,070","807,138",612,"16,727,175",9.1707,,,,,,,,,,,,,,,,,,,,,,,,,,,,
58
  ToT,gsm8k,2025/1/7,Doubao-lite-32k,37.83,87.34,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","19,208,597","14,563","1,065,752",808,"20,274,349",0.8739,,,,,,,,,,,,,,,,,,,,,,,,,,,,
59
  ToT,gsm8k,2025/1/22,gpt-4o,91.13,100.00,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","29,445,237","22,324","1,324,498","1,004","30,769,735",86.8581,,,,,,,,,,,,,,,,,,,,,,,,,,,,
@@ -98,17 +98,17 @@ PoT,AQuA,2025/1/22,Internllm2_5-7B,36.61,98.82,0,,254,"233,505",919,"68,457",270
98
  PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
99
  PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100
  PoT,AQuA,2025/2/10,deepseek-r1:1.5b,54.72,97.24,0,,254,"250,690",987,"765,957","3,016","1,016,647",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
101
- SC-CoT,AQuA,2025/1/22,gpt-3.5-turbo,66.14,99.21,0,"temperature=1, path_num=5",254,"482,192","1,898","365,143","1,438","847,335",0.7888,,,,,,,,,,,,,,,,,,,,,,,,,,,,
102
- SC-CoT,AQuA,2025/1/22,Doubao-lite-32k,81.10,97.24,0,"temperature=1, path_num=5",254,"503,751","1,983","382,235","1,505","885,986",0.0519,,,,,,,,,,,,,,,,,,,,,,,,,,,,
103
- SC-CoT,AQuA,2025/1/22,gpt-4o,86.61,98.82,0,"temperature=1, path_num=5",254,"744,478","2,931","628,728","2,475","1,373,206",8.1485,,,,,,,,,,,,,,,,,,,,,,,,,,,,
104
- SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.04,99.21,0,"temperature=1, path_num=5",254,"1,051,218","4,139","784,451","3,088","1,835,669",1.0348,,,,,,,,,,,,,,,,,,,,,,,,,,,,
105
- SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,82.28,99.21,0,"temperature=1, path_num=5",254,"1,135,251","4,469","772,673","3,042","1,907,924",1.0756,,,,,,,,,,,,,,,,,,,,,,,,,,,,
106
- SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,79.92,100.00,0,"temperature=1, path_num=5",254,"1,098,280","4,324","747,052","2,941","1,845,332",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
107
- SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,59.45,97.24,0,"temperature=1, path_num=5",254,"971,003","3,823","680,330","2,678","1,651,333",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
108
- SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,39.37,98.03,0,"temperature=1, path_num=5",254,"1,420,494","5,592","875,728","3,448","2,296,222",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
109
- SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,23.62,96.46,0,"temperature=1, path_num=5",254,"1,034,362","4,072","740,973","2,917","1,775,335",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
110
- SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,22.83,97.24,0,"temperature=1, path_num=5",254,"1,246,929","4,909","968,162","3,812","2,215,091",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
111
- SC-CoT,AQuA,2025/2/10,deepseek-r1:1.5b,59.06,96.85,0,"temperature=1, path_num=5",254,"2,547,772","10,031","3,254,939","12,815","5,802,711",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
112
  ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action ε•η‹¬θΏ”ε›ž,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
113
  ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action ε•η‹¬θΏ”ε›ž,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
114
  ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action ε•η‹¬θΏ”ε›ž,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
@@ -164,17 +164,17 @@ PoT,MATH-500,2025/2/10,Internllm2_5-7B,15.00,32.40,4,,500,"247,883",496,"120,826
164
  PoT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,0.80,2.20,4,,500,"248,509",497,"538,361","1,077","786,870",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
165
  PoT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.00,0.00,4,,500,"253,549",507,"183,653",367,"437,202",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
166
  PoT,MATH-500,2025/2/10,deepseek-r1:1.5b,1.00,1.60,4,,500,"245,549",491,"785,518","1,571","1,031,067",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
167
- SC-CoT,MATH-500,2025/2/10,gpt-3.5-turbo,28.80,100.00,4,"temperature=1, path_num=5",500,"1,381,818","2,764","856,994","1,714","2,238,812",1.9764,,,,,,,,,,,,,,,,,,,,,,,,,,,,
168
- SC-CoT,MATH-500,2025/2/10,Doubao-lite-32k,49.20,100.00,4,"temperature=1, path_num=5",500,"1,507,651","3,015","963,159","1,926","2,470,810",0.1406,,,,,,,,,,,,,,,,,,,,,,,,,,,,
169
- SC-CoT,MATH-500,2025/2/10,gpt-4o,34.40,100.00,4,"temperature=1, path_num=5",500,"1,986,584","3,973","1,468,739","2,937","3,455,323",19.6538,,,,,,,,,,,,,,,,,,,,,,,,,,,,
170
- SC-CoT,MATH-500,2025/2/10,Qwen2.5-72B-Instruct,74.00,100.00,4,"temperature=1, path_num=5",500,"3,823,997","7,648","1,773,516","3,547","5,597,513",3.1556,,,,,,,,,,,,,,,,,,,,,,,,,,,,
171
- SC-CoT,MATH-500,2025/2/10,Llama-3.3-70B-Instruct,74.20,100.00,4,"temperature=1, path_num=5",500,"3,959,492","7,919","1,759,247","3,518","5,718,739",3.2239,,,,,,,,,,,,,,,,,,,,,,,,,,,,
172
- SC-CoT,MATH-500,2025/2/10,Qwen2.5-7B-Instruct,67.00,100.00,4,"temperature=1, path_num=5",500,"3,833,751","7,668","1,617,733","3,235","5,451,484",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
173
- SC-CoT,MATH-500,2025/2/10,Llama-3.1-8B-Instruct,30.20,100.00,4,"temperature=1, path_num=5",500,"3,546,673","7,093","1,488,264","2,977","5,034,937",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
174
- SC-CoT,MATH-500,2025/2/10,Internllm2_5-7B,9.80,100.00,4,"temperature=1, path_num=5",500,"4,193,296","8,387","1,645,170","3,290","5,838,466",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
175
- SC-CoT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,3.80,99.00,4,"temperature=1, path_num=5",500,"3,832,429","7,665","1,737,013","3,474","5,569,442",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
176
- SC-CoT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.80,100.00,4,"temperature=1, path_num=5",500,"4,448,663","8,897","2,413,393","4,827","6,862,056",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
177
- SC-CoT,MATH-500,2025/2/10,deepseek-r1:1.5b,38.00,100.00,4,"temperature=1, path_num=5",500,"7,080,559","14,161","7,661,550","15,323","14,742,109",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
178
  ReAct-Pro*,MATH-500,2025/2/10,gpt-3.5-turbo,23.80,100.00,4,max_steps=10,500,"3,708,461","7,417","124,253",249,"3,832,714",2.0406,,,,,,,,,,,,,,,,,,,,,,,,,,,,
179
  ReAct-Pro*,MATH-500,2025/2/10,Doubao-lite-32k,47.20,100.00,4,max_steps=10,500,"4,234,620","8,469","154,046",308,"4,388,666",0.1860,,,,,,,,,,,,,,,,,,,,,,,,,,,,
180
  ReAct-Pro*,MATH-500,2025/2/10,gpt-4o,54.00,100.00,4,max_steps=10,500,"5,834,537","11,669","318,718",637,"6,153,255",17.7735,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 
43
  CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.50,100.00,8,,"1,319","1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
44
  CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,"1,319","1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
45
  CoT,gsm8k,2025/1/23,deepseek-r1:1.5b,70.66,99.77,8,,"1,319","1,011,714",767,"1,078,911",818,"2,090,625",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
46
+ SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,69.29,98.79,8,"temperature=1, path_num=5","1,319","895,571",679,"1,381,678","1,048","2,277,249",2.5203,,,,,,,,,,,,,,,,,,,,,,,,,,,,
47
+ SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,91.58,99.92,8,"temperature=1, path_num=5","1,319","942,182",714,"893,709",678,"1,835,891",0.1118,,,,,,,,,,,,,,,,,,,,,,,,,,,,
48
+ SC-CoT,gsm8k,2025/1/22,gpt-4o,94.77,100.00,8,"temperature=1, path_num=5","1,319","894,889",678,"1,596,716","1,211","2,491,605",18.2044,,,,,,,,,,,,,,,,,,,,,,,,,,,,
49
+ SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,94.77,100.00,8,"temperature=1, path_num=5","1,319","5,370,360","4,072","1,804,898","1,368","7,175,258",4.0450,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50
+ SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.22,100.00,8,"temperature=1, path_num=5","1,319","5,295,585","4,015","1,426,429","1,081","6,722,014",3.7895,,,,,,,,,,,,,,,,,,,,,,,,,,,,
51
+ SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,90.98,100.00,8,"temperature=1, path_num=5","1,319","5,580,524","4,231","1,679,419","1,273","7,259,943",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
52
+ SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,54.36,99.85,8,"temperature=1, path_num=5","1,319","5,136,762","3,894","5,819,672","4,412","10,956,434",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
53
+ SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,44.66,91.81,8,"temperature=1, path_num=5","1,319","5,847,761","4,433","2,314,738","1,755","8,162,499",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
54
+ SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,8.19,68.76,8,"temperature=1, path_num=5","1,319","5,439,568","4,124","1,946,885","1,476","7,386,453",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
55
+ SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,4.17,94.47,8,"temperature=1, path_num=5","1,319","5,441,962","4,126","2,036,805","1,544","7,478,767",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
56
+ SC-CoT,gsm8k,2025/2/10,deepseek-r1:1.5b,69.07,98.79,8,"temperature=1, path_num=5","1,319","5,407,357","4,100","4,622,327","3,504","10,029,684",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
57
  ToT,gsm8k,2025/1/7,gpt-3.5-turbo,67.93,99.70,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","15,920,037","12,070","807,138",612,"16,727,175",9.1707,,,,,,,,,,,,,,,,,,,,,,,,,,,,
58
  ToT,gsm8k,2025/1/7,Doubao-lite-32k,37.83,87.34,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","19,208,597","14,563","1,065,752",808,"20,274,349",0.8739,,,,,,,,,,,,,,,,,,,,,,,,,,,,
59
  ToT,gsm8k,2025/1/22,gpt-4o,91.13,100.00,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","29,445,237","22,324","1,324,498","1,004","30,769,735",86.8581,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 
98
  PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
99
  PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100
  PoT,AQuA,2025/2/10,deepseek-r1:1.5b,54.72,97.24,0,,254,"250,690",987,"765,957","3,016","1,016,647",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
101
+ SC-CoT,AQuA,2025/1/22,gpt-3.5-turbo,58.66,92.52,0,"temperature=1, path_num=5",254,"27,906",110,"209,160",823,"237,066",0.3277,,,,,,,,,,,,,,,,,,,,,,,,,,,,
102
+ SC-CoT,AQuA,2025/1/22,Doubao-lite-32k,76.37,91.73,0,"temperature=1, path_num=5",254,"31,703",125,"325,136","1,280","356,839",0.0279,,,,,,,,,,,,,,,,,,,,,,,,,,,,
103
+ SC-CoT,AQuA,2025/1/22,gpt-4o,85.83,99.21,0,"temperature=1, path_num=5",254,"27,829",110,"517,602","2,038","545,431",5.2456,,,,,,,,,,,,,,,,,,,,,,,,,,,,
104
+ SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.43,96.85,0,"temperature=1, path_num=5",254,"137,990",543,"604,562","2,380","742,552",0.4186,,,,,,,,,,,,,,,,,,,,,,,,,,,,
105
+ SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,84.65,99.61,0,"temperature=1, path_num=5",254,"175,050",689,"612,262","2,410","787,312",0.4438,,,,,,,,,,,,,,,,,,,,,,,,,,,,
106
+ SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,79.53,100.00,0,"temperature=1, path_num=5",254,"177,972",701,"567,438","2,234","745,410",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
107
+ SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,59.45,95.67,0,"temperature=1, path_num=5",254,"145,108",571,"544,969","2,146","690,077",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
108
+ SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,38.58,97.24,0,"temperature=1, path_num=5",254,"264,557","1,042","615,114","2,422","879,671",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
109
+ SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,10.63,51.57,0,"temperature=1, path_num=5",254,"151,410",596,"550,570","2,168","701,980",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
110
+ SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,82.28,0,"temperature=1, path_num=5",254,"150,787",594,"603,126","2,375","753,913",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
111
+ SC-CoT,AQuA,2025/2/10,deepseek-r1:1.5b,57.87,74.02,0,"temperature=1, path_num=5",254,"144,710",570,"1,987,401","7,824","2,132,111",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
112
  ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action ε•η‹¬θΏ”ε›ž,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
113
  ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action ε•η‹¬θΏ”ε›ž,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
114
  ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action ε•η‹¬θΏ”ε›ž,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
 
164
  PoT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,0.80,2.20,4,,500,"248,509",497,"538,361","1,077","786,870",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
165
  PoT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.00,0.00,4,,500,"253,549",507,"183,653",367,"437,202",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
166
  PoT,MATH-500,2025/2/10,deepseek-r1:1.5b,1.00,1.60,4,,500,"245,549",491,"785,518","1,571","1,031,067",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
167
+ SC-CoT,MATH-500,2025/2/10,gpt-3.5-turbo,40.80,100.00,4,"temperature=1, path_num=5",500,"345,411",691,"705,408","1,411","1,050,819",1.2308,,,,,,,,,,,,,,,,,,,,,,,,,,,,
168
+ SC-CoT,MATH-500,2025/2/10,Doubao-lite-32k,65.80,99.80,4,"temperature=1, path_num=5",500,"362,390",725,"715,613","1,431","1,078,003",0.0734,,,,,,,,,,,,,,,,,,,,,,,,,,,,
169
+ SC-CoT,MATH-500,2025/2/10,gpt-4o,74.60,100.00,4,"temperature=1, path_num=5",500,"345,347",691,"1,149,778","2,300","1,495,125",12.3611,,,,,,,,,,,,,,,,,,,,,,,,,,,,
170
+ SC-CoT,MATH-500,2025/2/10,Qwen2.5-72B-Instruct,79.80,100.00,4,"temperature=1, path_num=5",500,"1,775,395","3,551","1,506,954","3,014","3,282,349",1.8504,,,,,,,,,,,,,,,,,,,,,,,,,,,,
171
+ SC-CoT,MATH-500,2025/2/10,Llama-3.3-70B-Instruct,72.40,100.00,4,"temperature=1, path_num=5",500,"1,797,045","3,594","1,368,466","2,737","3,165,511",1.7845,,,,,,,,,,,,,,,,,,,,,,,,,,,,
172
+ SC-CoT,MATH-500,2025/2/10,Qwen2.5-7B-Instruct,71.20,100.00,4,"temperature=1, path_num=5",500,"1,855,922","3,712","1,299,553","2,599","3,155,475",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
173
+ SC-CoT,MATH-500,2025/2/10,Llama-3.1-8B-Instruct,19.80,99.80,4,"temperature=1, path_num=5",500,"1,734,545","3,469","1,756,289","3,513","3,490,834",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
174
+ SC-CoT,MATH-500,2025/2/10,Internllm2_5-7B,9.20,97.40,4,"temperature=1, path_num=5",500,"1,994,983","3,990","1,254,893","2,510","3,249,876",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
175
+ SC-CoT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,2.00,89.40,4,"temperature=1, path_num=5",500,"1,805,170","3,610","1,333,854","2,668","3,139,024",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
176
+ SC-CoT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,2.20,98.80,4,"temperature=1, path_num=5",500,"1,808,691","3,617","988,991","1,978","2,797,682",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
177
+ SC-CoT,MATH-500,2025/2/10,deepseek-r1:1.5b,46.80,99.20,4,"temperature=1, path_num=5",500,"1,858,874","3,718","12,109,294","24,219","13,968,168",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
178
  ReAct-Pro*,MATH-500,2025/2/10,gpt-3.5-turbo,23.80,100.00,4,max_steps=10,500,"3,708,461","7,417","124,253",249,"3,832,714",2.0406,,,,,,,,,,,,,,,,,,,,,,,,,,,,
179
  ReAct-Pro*,MATH-500,2025/2/10,Doubao-lite-32k,47.20,100.00,4,max_steps=10,500,"4,234,620","8,469","154,046",308,"4,388,666",0.1860,,,,,,,,,,,,,,,,,,,,,,,,,,,,
180
  ReAct-Pro*,MATH-500,2025/2/10,gpt-4o,54.00,100.00,4,max_steps=10,500,"5,834,537","11,669","318,718",637,"6,153,255",17.7735,,,,,,,,,,,,,,,,,,,,,,,,,,,,