liaojiajia committed on
Commit
cd01d35
·
1 Parent(s): be9cdf5

add tot and math500 scores

Browse files
app.py CHANGED
@@ -18,23 +18,23 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
18
  DATASETS.remove('META')
19
  print(DATASETS)
20
 
21
- # 确保在定义llm_options之前生成overall_table
22
  check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
23
  overall_table = generate_table(results, DEFAULT_MATH_BENCH)
24
 
25
- # 保存完整的overall_tableCSV文件
26
  csv_path_overall = os.path.join(os.getcwd(), 'src/overall_results.csv')
27
  overall_table.to_csv(csv_path_overall, index=False)
28
  print(f"Overall results saved to {csv_path_overall}")
29
 
30
- # overall_table中提取所有可能的LLM选项
31
  llm_options = list(set(row.LLM for row in overall_table.itertuples() if hasattr(row, 'LLM')))
32
 
33
  gr.Markdown(LEADERBORAD_INTRODUCTION.format(EVAL_TIME))
34
  with gr.Tabs(elem_classes='tab-buttons') as tabs:
35
  with gr.Tab(label='🏅 Open Agent Overall Math Leaderboard'):
36
  gr.Markdown(LEADERBOARD_MD['MATH_MAIN'])
37
- # 移动check_boxoverall_table的定义到这里
38
  # check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
39
  # overall_table = generate_table(results, DEFAULT_MATH_BENCH)
40
 
@@ -48,7 +48,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
48
  interactive=True,
49
  )
50
 
51
- # 新增的CheckboxGroup组件用于选择AlgorithmLLM
52
  algo_name = gr.CheckboxGroup(
53
  choices=ALGORITHMS,
54
  value=ALGORITHMS,
@@ -57,7 +57,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
57
  )
58
 
59
  llm_name = gr.CheckboxGroup(
60
- choices=llm_options, # 使用提取的llm_options
61
  value=llm_options,
62
  label='LLM',
63
  interactive=True
@@ -78,7 +78,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
78
  headers = ['Rank'] + check_box['essential'] + fields
79
  df = overall_table.copy()
80
 
81
- # 添加过滤逻辑
82
  df['flag'] = df.apply(lambda row: (
83
  row['Algorithm'] in algos and
84
  row['LLM'] in llms
@@ -107,7 +107,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
107
 
108
  return comp
109
 
110
- # 更新change事件以包含新的过滤条件
111
  checkbox_group.change(
112
  fn=filter_df,
113
  inputs=[checkbox_group, algo_name, llm_name],
@@ -135,7 +135,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
135
 
136
  table, check_box = BUILD_L2_DF(results_detail, DEFAULT_MATH_BENCH)
137
 
138
- # 保存完整的tableCSV文件
139
  csv_path_detail = os.path.join(os.getcwd(), 'src/detail_results.csv')
140
  table.to_csv(csv_path_detail, index=False)
141
  print(f"Detail results saved to {csv_path_detail}")
@@ -217,7 +217,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
217
 
218
  return comp
219
 
220
- # 为所有复选框组添加change事件
221
  checkbox_group.change(
222
  fn=filter_df2,
223
  inputs=[checkbox_group, algo_name, dataset_name, llm_name],
 
18
  DATASETS.remove('META')
19
  print(DATASETS)
20
 
21
+ # Ensure overall_table is generated before defining llm_options
22
  check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
23
  overall_table = generate_table(results, DEFAULT_MATH_BENCH)
24
 
25
+ # Save the complete overall_table as a CSV file
26
  csv_path_overall = os.path.join(os.getcwd(), 'src/overall_results.csv')
27
  overall_table.to_csv(csv_path_overall, index=False)
28
  print(f"Overall results saved to {csv_path_overall}")
29
 
30
+ # Extract all possible LLM options from overall_table
31
  llm_options = list(set(row.LLM for row in overall_table.itertuples() if hasattr(row, 'LLM')))
32
 
33
  gr.Markdown(LEADERBORAD_INTRODUCTION.format(EVAL_TIME))
34
  with gr.Tabs(elem_classes='tab-buttons') as tabs:
35
  with gr.Tab(label='🏅 Open Agent Overall Math Leaderboard'):
36
  gr.Markdown(LEADERBOARD_MD['MATH_MAIN'])
37
+ # Move the definition of check_box and overall_table here
38
  # check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
39
  # overall_table = generate_table(results, DEFAULT_MATH_BENCH)
40
 
 
48
  interactive=True,
49
  )
50
 
51
+ # New CheckboxGroup component for selecting Algorithm and LLM
52
  algo_name = gr.CheckboxGroup(
53
  choices=ALGORITHMS,
54
  value=ALGORITHMS,
 
57
  )
58
 
59
  llm_name = gr.CheckboxGroup(
60
+ choices=llm_options, # Use the extracted llm_options
61
  value=llm_options,
62
  label='LLM',
63
  interactive=True
 
78
  headers = ['Rank'] + check_box['essential'] + fields
79
  df = overall_table.copy()
80
 
81
+ # Add filtering logic
82
  df['flag'] = df.apply(lambda row: (
83
  row['Algorithm'] in algos and
84
  row['LLM'] in llms
 
107
 
108
  return comp
109
 
110
+ # Update change events to include new filtering conditions
111
  checkbox_group.change(
112
  fn=filter_df,
113
  inputs=[checkbox_group, algo_name, llm_name],
 
135
 
136
  table, check_box = BUILD_L2_DF(results_detail, DEFAULT_MATH_BENCH)
137
 
138
+ # Save the complete table as a CSV file
139
  csv_path_detail = os.path.join(os.getcwd(), 'src/detail_results.csv')
140
  table.to_csv(csv_path_detail, index=False)
141
  print(f"Detail results saved to {csv_path_detail}")
 
217
 
218
  return comp
219
 
220
+ # Add change events for all checkbox groups
221
  checkbox_group.change(
222
  fn=filter_df2,
223
  inputs=[checkbox_group, algo_name, dataset_name, llm_name],
meta_data.py CHANGED
@@ -1,9 +1,10 @@
1
  # CONSTANTS-URL
2
  OVERALL_MATH_SCORE_FILE = "src/overall_math_score.json"
3
  DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
 
4
  # CONSTANTS-TEXT
5
  LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
6
- ### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: CoT, SC-CoT, PoT, ReAct, etc. The agents are impletemented by the OpenSource Framework: [*OmAgent*](https://github.com/om-ai-lab/OmAgent)
7
 
8
  This leaderboard was last updated: {}.
9
 
@@ -11,8 +12,9 @@ To add your own agent to the leaderboard, please create a PR in [*OmAgent*](http
11
  """
12
 
13
  DEFAULT_MATH_BENCH = [
14
- 'gsm8k', 'AQuA'
15
  ]
 
16
  # The README file for each benchmark
17
  LEADERBOARD_MD = {}
18
 
@@ -42,7 +44,7 @@ LEADERBOARD_MD['MATH_DETAIL'] = f"""
42
  - Cost: The cost on each math Benchmarks (the lower the better).
43
  - Rank: The rank on each math Benchmarks (the lower the better).
44
 
45
- - default parameters: temperature=0.0
46
  - LLM prices:
47
  - gpt-3.5-turbo:
48
  - 0.5$/1M tokens (input)
@@ -53,7 +55,7 @@ LEADERBOARD_MD['MATH_DETAIL'] = f"""
53
  - gpt-4o-2024-08-06:
54
  - 2.50$ /1M input tokens (input)
55
  - 10$ /1M output tokens (output)
56
- - Qwen2.5-7B-Instruct and Llama-3.3-70B-Instruct:
57
  - Prices can be found https://cloud.siliconflow.cn/.
58
  - Other open source LLMs:
59
  - Deployed locally, please check the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository for more information.
@@ -69,7 +71,7 @@ META_FIELDS = [
69
  ]
70
 
71
  DATASETS = [
72
- 'gsm8k', 'AQuA'
73
  ]
74
 
75
  LLM = [
@@ -77,7 +79,7 @@ LLM = [
77
  ]
78
 
79
  ALGORITHMS = [
80
- 'IO', 'CoT', 'SC-CoT', 'PoT', 'ReAct-Pro*'
81
  ]
82
 
83
  CITATION_BUTTON_TEXT = r"""@misc{open-agent-leaderboard,
 
1
  # CONSTANTS-URL
2
  OVERALL_MATH_SCORE_FILE = "src/overall_math_score.json"
3
  DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
4
+
5
  # CONSTANTS-TEXT
6
  LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
7
+ ### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: CoT, SC-CoT, PoT, ReAct, ToT, etc. The agents are implemented by the OpenSource Framework: [*OmAgent*](https://github.com/om-ai-lab/OmAgent)
8
 
9
  This leaderboard was last updated: {}.
10
 
 
12
  """
13
 
14
  DEFAULT_MATH_BENCH = [
15
+ 'gsm8k', 'AQuA', 'MATH-500',
16
  ]
17
+
18
  # The README file for each benchmark
19
  LEADERBOARD_MD = {}
20
 
 
44
  - Cost: The cost on each math Benchmarks (the lower the better).
45
  - Rank: The rank on each math Benchmarks (the lower the better).
46
 
47
+ - default parameters: temperature=0.0 (except for SC-CoT)
48
  - LLM prices:
49
  - gpt-3.5-turbo:
50
  - 0.5$/1M tokens (input)
 
55
  - gpt-4o-2024-08-06:
56
  - 2.50$ /1M input tokens (input)
57
  - 10$ /1M output tokens (output)
58
+ - Qwen2.5-72B-Instruct and Llama-3.3-70B-Instruct:
59
  - Prices can be found https://cloud.siliconflow.cn/.
60
  - Other open source LLMs:
61
  - Deployed locally, please check the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository for more information.
 
71
  ]
72
 
73
  DATASETS = [
74
+ 'gsm8k', 'AQuA', 'MATH-500'
75
  ]
76
 
77
  LLM = [
 
79
  ]
80
 
81
  ALGORITHMS = [
82
+ 'IO', 'CoT', 'SC-CoT', 'PoT', 'ReAct-Pro*', 'ToT'
83
  ]
84
 
85
  CITATION_BUTTON_TEXT = r"""@misc{open-agent-leaderboard,
preprocess.py CHANGED
@@ -3,10 +3,10 @@ import json
3
  from datetime import datetime
4
 
5
  def process_csv_to_json():
6
- # 读取CSV文件
7
  df = pd.read_csv('src/record.csv')
8
 
9
- # 清理数据:删除空行,重命名列
10
  df = df.dropna(how='all')
11
  df = df.rename(columns={
12
  'dataset': 'Dataset',
@@ -25,36 +25,36 @@ def process_csv_to_json():
25
  'Average output tokens': 'Average output tokens'
26
  })
27
 
28
- # 辅助函数:处理包含逗号的数字字符串
29
  def parse_number(value):
30
- if pd.isna(value):
31
  return 0
32
- # 先移除逗号,然后转换为浮点数,最后转换为整数
33
  return int(float(str(value).replace(',', '')))
34
 
35
- # 初始化结果字典
36
  result = {
37
  "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
38
  "results": {}
39
  }
40
 
41
- # 获取所有唯一的LLM
42
  llms = df['LLM'].dropna().unique()
43
 
44
- # 遍历每个算法
45
  for algorithm in df['Algorithm'].dropna().unique():
46
  if not isinstance(algorithm, str):
47
  continue
48
 
49
  result['results'][algorithm] = {}
50
 
51
- # 对每个LLM进行处理
52
  for llm in llms:
53
  llm_data = df[(df['Algorithm'] == algorithm) & (df['LLM'] == llm)]
54
  if llm_data.empty:
55
  continue
56
 
57
- # 创建LLM对应的字典
58
  result['results'][algorithm][llm] = {
59
  'META': {
60
  'Algorithm': str(algorithm),
@@ -63,7 +63,7 @@ def process_csv_to_json():
63
  }
64
  }
65
 
66
- # 对每个数据集进行处理
67
  for dataset in df['Dataset'].dropna().unique():
68
  if not isinstance(dataset, str):
69
  continue
@@ -73,9 +73,9 @@ def process_csv_to_json():
73
  if not dataset_data.empty:
74
  data_row = dataset_data.iloc[0]
75
  result['results'][algorithm][llm][dataset] = {
76
- 'Score': round(float(data_row['Score']), 2), # 保留两位小数
77
- 'Pass rate': round(float(data_row['Pass rate']) / 100, 4), # 转换为小数并保留两位小数
78
- 'Cost($)': float(data_row['Cost($)']) if pd.notnull(data_row['Cost($)']) else 0.0,
79
  'Framework': str(data_row['Framework']) if 'Framework' in data_row and pd.notnull(data_row['Framework']) else '',
80
  'X-shot': str(data_row['X-shot']) if pd.notnull(data_row['X-shot']) else '',
81
  'Samples': parse_number(data_row['Samples']),
@@ -86,12 +86,12 @@ def process_csv_to_json():
86
  'Average output tokens': parse_number(data_row['Average output tokens'])
87
  }
88
 
89
- # 检查每个字段是否存在
90
  required_fields = ['Score', 'Pass rate', 'Cost($)', 'Framework', 'X-shot', 'Samples', 'All tokens', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens']
91
 
92
  for key, value in result['results'].items():
93
  for llm, datasets in value.items():
94
- # 检查 META 信息
95
  meta = datasets.get('META', {})
96
  if 'LLM' not in meta or 'Eval Date' not in meta:
97
  print(f"Missing META fields in algorithm '{key}' for LLM '{llm}'")
@@ -103,15 +103,15 @@ def process_csv_to_json():
103
  if missing_fields:
104
  print(f"Missing fields {missing_fields} in dataset '{dataset}' for LLM '{llm}' in algorithm '{key}'")
105
 
106
- # 保存为JSON文件
107
  with open('src/detail_math_score.json', 'w', encoding='utf-8') as f:
108
  json.dump(result, f, indent=4, ensure_ascii=False)
109
 
110
  def process_csv_to_overall_json():
111
- # 读取CSV文件
112
  df = pd.read_csv('src/record.csv')
113
 
114
- # 清理数据:删除空行,重命名列
115
  df = df.dropna(how='all')
116
  df = df.rename(columns={
117
  'dataset': 'Dataset',
@@ -121,24 +121,24 @@ def process_csv_to_overall_json():
121
  'Eval Date': 'Eval Date'
122
  })
123
 
124
- # 初始化结果字典
125
  result = {
126
  "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
127
  "results": {}
128
  }
129
 
130
- # 获取所有唯一的LLM
131
  llms = df['LLM'].dropna().unique()
132
  for llm in llms:
133
- # 处理基础算法
134
  for algorithm in df['Algorithm'].dropna().unique():
135
  if not isinstance(algorithm, str):
136
  continue
137
 
138
- # 为非gpt-3.5-turbo的模型添加后缀
139
- # 修改:为llama模型添加更多信息以确保唯一性
140
  algo_key = algorithm if llm == 'gpt-3.5-turbo' else f"{algorithm}-{llm}"
141
- # 检查该算法-LLM组合是否存在
142
  algo_data = df[(df['Algorithm'] == algorithm) & (df['LLM'] == llm)]
143
  if algo_data.empty:
144
  print(f"No data found for algorithm '{algorithm}' and LLM '{llm}'")
@@ -152,29 +152,29 @@ def process_csv_to_overall_json():
152
  }
153
  }
154
 
155
- # 处理每个数据集
156
- for dataset in ['gsm8k', 'AQuA']:
157
  dataset_data = df[(df['Algorithm'] == algorithm) &
158
  (df['Dataset'] == dataset) &
159
  (df['LLM'] == llm)]
160
  if not dataset_data.empty:
161
  result['results'][algo_key][dataset] = {
162
- "Score": float(dataset_data['Score'].iloc[0]) if pd.notnull(dataset_data['Score'].iloc[0]) else 0.0,
163
- "Cost($)": float(dataset_data['Cost($)'].iloc[0]) if pd.notnull(dataset_data['Cost($)'].iloc[0]) else 0.0
164
  }
165
  else:
166
- # 如果数据集为空,确保键存在并设置默认值
167
  result['results'][algo_key][dataset] = {
168
  "Score": 0.0,
169
  "Cost($)": 0.0
170
  }
171
 
172
 
173
- # 保存为JSON文件
174
  with open('src/overall_math_score.json', 'w', encoding='utf-8') as f:
175
  json.dump(result, f, indent=4, ensure_ascii=False)
176
 
177
  if __name__ == "__main__":
178
- # 生成两种格式的JSON文件
179
  process_csv_to_json()
180
- process_csv_to_overall_json()
 
3
  from datetime import datetime
4
 
5
  def process_csv_to_json():
6
+ # Read the CSV file
7
  df = pd.read_csv('src/record.csv')
8
 
9
+ # Clean the data: remove empty rows, rename columns
10
  df = df.dropna(how='all')
11
  df = df.rename(columns={
12
  'dataset': 'Dataset',
 
25
  'Average output tokens': 'Average output tokens'
26
  })
27
 
28
+ # Helper function: handle number strings with commas
29
  def parse_number(value):
30
+ if pd.isna(value) or value == '-':
31
  return 0
32
+ # Remove commas, convert to float, then to int
33
  return int(float(str(value).replace(',', '')))
34
 
35
+ # Initialize result dictionary
36
  result = {
37
  "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
38
  "results": {}
39
  }
40
 
41
+ # Get all unique LLMs
42
  llms = df['LLM'].dropna().unique()
43
 
44
+ # Iterate through each algorithm
45
  for algorithm in df['Algorithm'].dropna().unique():
46
  if not isinstance(algorithm, str):
47
  continue
48
 
49
  result['results'][algorithm] = {}
50
 
51
+ # Process each LLM
52
  for llm in llms:
53
  llm_data = df[(df['Algorithm'] == algorithm) & (df['LLM'] == llm)]
54
  if llm_data.empty:
55
  continue
56
 
57
+ # Create dictionary for each LLM
58
  result['results'][algorithm][llm] = {
59
  'META': {
60
  'Algorithm': str(algorithm),
 
63
  }
64
  }
65
 
66
+ # Process each dataset
67
  for dataset in df['Dataset'].dropna().unique():
68
  if not isinstance(dataset, str):
69
  continue
 
73
  if not dataset_data.empty:
74
  data_row = dataset_data.iloc[0]
75
  result['results'][algorithm][llm][dataset] = {
76
+ 'Score': round(float(data_row['Score']) if data_row['Score'] != '-' else 0, 2), # Keep two decimal places
77
+ 'Pass rate': round(float(data_row['Pass rate']) / 100, 4) if data_row['Pass rate'] != '-' else 0.0, # Convert to decimal and keep two decimal places
78
+ 'Cost($)': float(data_row['Cost($)']) if pd.notnull(data_row['Cost($)']) and data_row['Cost($)'] != '-' else 0.0,
79
  'Framework': str(data_row['Framework']) if 'Framework' in data_row and pd.notnull(data_row['Framework']) else '',
80
  'X-shot': str(data_row['X-shot']) if pd.notnull(data_row['X-shot']) else '',
81
  'Samples': parse_number(data_row['Samples']),
 
86
  'Average output tokens': parse_number(data_row['Average output tokens'])
87
  }
88
 
89
+ # Check if each field exists
90
  required_fields = ['Score', 'Pass rate', 'Cost($)', 'Framework', 'X-shot', 'Samples', 'All tokens', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens']
91
 
92
  for key, value in result['results'].items():
93
  for llm, datasets in value.items():
94
+ # Check META information
95
  meta = datasets.get('META', {})
96
  if 'LLM' not in meta or 'Eval Date' not in meta:
97
  print(f"Missing META fields in algorithm '{key}' for LLM '{llm}'")
 
103
  if missing_fields:
104
  print(f"Missing fields {missing_fields} in dataset '{dataset}' for LLM '{llm}' in algorithm '{key}'")
105
 
106
+ # Save as JSON file
107
  with open('src/detail_math_score.json', 'w', encoding='utf-8') as f:
108
  json.dump(result, f, indent=4, ensure_ascii=False)
109
 
110
  def process_csv_to_overall_json():
111
+ # Read the CSV file
112
  df = pd.read_csv('src/record.csv')
113
 
114
+ # Clean the data: remove empty rows, rename columns
115
  df = df.dropna(how='all')
116
  df = df.rename(columns={
117
  'dataset': 'Dataset',
 
121
  'Eval Date': 'Eval Date'
122
  })
123
 
124
+ # Initialize result dictionary
125
  result = {
126
  "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
127
  "results": {}
128
  }
129
 
130
+ # Get all unique LLMs
131
  llms = df['LLM'].dropna().unique()
132
  for llm in llms:
133
+ # Process base algorithms
134
  for algorithm in df['Algorithm'].dropna().unique():
135
  if not isinstance(algorithm, str):
136
  continue
137
 
138
+ # Add suffix for non-gpt-3.5-turbo models
139
+ # Modification: add more information for llama models to ensure uniqueness
140
  algo_key = algorithm if llm == 'gpt-3.5-turbo' else f"{algorithm}-{llm}"
141
+ # Check if the algorithm-LLM combination exists
142
  algo_data = df[(df['Algorithm'] == algorithm) & (df['LLM'] == llm)]
143
  if algo_data.empty:
144
  print(f"No data found for algorithm '{algorithm}' and LLM '{llm}'")
 
152
  }
153
  }
154
 
155
+ # Process each dataset
156
+ for dataset in ['gsm8k', 'AQuA', 'MATH-500']:
157
  dataset_data = df[(df['Algorithm'] == algorithm) &
158
  (df['Dataset'] == dataset) &
159
  (df['LLM'] == llm)]
160
  if not dataset_data.empty:
161
  result['results'][algo_key][dataset] = {
162
+ "Score": float(dataset_data['Score'].iloc[0]) if pd.notnull(dataset_data['Score'].iloc[0]) and dataset_data['Score'].iloc[0] != '-' else 0.0,
163
+ "Cost($)": float(dataset_data['Cost($)'].iloc[0]) if pd.notnull(dataset_data['Cost($)'].iloc[0]) and dataset_data['Cost($)'].iloc[0] != '-' else 0.0
164
  }
165
  else:
166
+ # If the dataset is empty, ensure the key exists and set default values
167
  result['results'][algo_key][dataset] = {
168
  "Score": 0.0,
169
  "Cost($)": 0.0
170
  }
171
 
172
 
173
+ # Save as JSON file
174
  with open('src/overall_math_score.json', 'w', encoding='utf-8') as f:
175
  json.dump(result, f, indent=4, ensure_ascii=False)
176
 
177
  if __name__ == "__main__":
178
+ # Generate JSON files in two formats
179
  process_csv_to_json()
180
+ process_csv_to_overall_json()
src/detail_math_score.json CHANGED
The diff for this file is too large to render. See raw diff
 
src/detail_results.csv CHANGED
@@ -1,101 +1,199 @@
1
  Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
2
- 1,SC-CoT,AQuA,gpt-4o,2025/1/22,86.61,0.9882,0.0,8.1485,,254,1373206,744478,2931,628728,2475
3
- 2,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0.0,0.0808,,254,143289,25143,99,118146,465
4
- 3,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.04,0.9921,0.0,1.0348,,254,1835669,1051218,4139,784451,3088
5
- 4,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0.0,0.0742,,254,131604,25397,100,106207,418
6
- 5,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0.0,0.0927,,254,164389,32555,128,131834,519
7
- 6,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0.0,0.0798,,254,141567,32809,129,108758,428
8
- 7,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0.0,0.0066,,254,94577,27978,110,66599,262
9
- 8,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0.0,1.0417,,254,123017,25123,99,97894,385
10
- 9,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.28,0.9921,0.0,1.0756,,254,1907924,1135251,4469,772673,3042
11
- 10,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.1,0.9724,0.0,0.0519,,254,885986,503751,1983,382235,1505
12
- 11,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0.0,0.0,,254,149736,33017,130,116719,460
13
- 12,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,79.92,1.0,0.0,0.0,,254,1845332,1098280,4324,747052,2941
14
- 13,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0.0,0.1746,,254,309799,240735,948,69064,272
15
- 14,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0.0,0.0058,,254,87742,33058,130,54684,215
16
- 15,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0.0,0.768,,254,1362379,1119143,4406,243236,958
17
- 16,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9843,0.0,0.0,,254,137771,33271,131,104500,411
18
- 17,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.56,0.9606,0.0,0.0445,,254,1032841,977890,3850,54951,216
19
- 18,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0.0,1.1453,,254,133752,25631,101,108121,426
20
- 19,PoT,AQuA,gpt-4o,2025/1/22,75.2,1.0,0.0,1.6087,,254,327908,222717,877,105191,414
21
- 20,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.2,1.0,0.0,0.1645,,254,291764,249215,981,42549,168
22
- 21,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.41,0.9921,0.0,0.0,,254,695844,564165,2221,131679,518
23
- 22,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.23,1.0,0.0,0.3177,,254,563603,441765,1739,121838,480
24
- 23,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0.0,0.0147,,254,309436,259863,1023,49573,195
25
- 24,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0.0,0.0,,254,313728,264517,1041,49211,194
26
- 25,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,66.14,0.9921,0.0,0.7888,,254,847335,482192,1898,365143,1438
27
- 26,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0.0,0.4928,,254,903587,862614,3396,40973,161
28
- 27,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0.0,0.0957,,254,80793,25447,100,55346,218
29
- 28,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0.0,0.0,,254,144435,32555,128,111880,440
30
- 29,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0.0,0.1748,,254,266654,225162,886,41492,163
31
- 30,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.45,0.9724,0.0,0.0,,254,1651333,971003,3823,680330,2678
32
- 31,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0.0,2.304,,254,692096,615589,2424,76507,301
33
- 32,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0.0,0.0,,254,4340821,3764723,14822,576098,2268
34
- 33,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.76,0.8937,0.0,0.0,,254,127520,26610,105,100910,397
35
- 34,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9882,0.0,0.0,,254,133106,26459,104,106647,420
36
- 35,IO,AQuA,Internllm2_5-7B,2025/1/22,47.64,0.9094,0.0,0.0,,254,185041,50232,198,134809,531
37
- 36,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0.0,0.0,,254,4428801,3592039,14142,836762,3294
38
- 37,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0.0,0.0,,254,110040,30477,120,79563,313
39
- 38,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,39.37,0.9803,0.0,0.0,,254,2296222,1420494,5592,875728,3448
40
- 39,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0.0,0.038,,254,42471,25701,101,16770,66
41
- 40,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0.0,0.0,,254,290914,240613,947,50301,198
42
- 41,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0.0,0.0,,254,301962,233505,919,68457,270
43
- 42,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9882,0.0,0.0,,254,117339,30477,120,86862,342
44
- 43,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.71,0.9646,0.0,0.0,,254,298475,246560,971,51915,204
45
- 44,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9764,0.0,0.0,,254,71047,27937,110,43110,170
46
- 45,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.17,0.9882,0.0,0.0,,254,110415,27937,110,82478,325
47
- 46,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0.0,0.0,,254,5072004,4555858,17936,516146,2032
48
- 47,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0.0,0.0,,254,7170087,6344167,24977,825920,3252
49
- 48,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,23.62,0.9646,0.0,0.0,,254,1775335,1034362,4072,740973,2917
50
- 49,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,22.83,0.9724,0.0,0.0,,254,2215091,1246929,4909,968162,3812
51
- 50,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0.0,0.0,,254,322281,258867,1019,63414,250
52
- 1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.07,1.0,8.0,6.2005,,1319,10998794,8413717,6379,2585077,1960
53
- 2,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8.0,4.5367,,1319,1165166,948668,719,216498,164
54
- 3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8.0,0.687,,1319,1218665,990168,751,228497,173
55
- 4,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,93.86,1.0,8.0,5.9858,,1319,10618008,8136223,6168,2481785,1882
56
- 5,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8.0,4.2166,,1319,1247912,1101672,835,146240,111
57
- 6,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8.0,0.7195,,1319,1276252,1005119,762,271133,206
58
- 7,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8.0,0.7054,,1319,1251210,1106682,839,144528,110
59
- 8,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8.0,0.4709,,1319,835275,583916,443,251359,191
60
- 9,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,91.13,1.0,8.0,0.0,,1319,11140985,8586888,6510,2554097,1936
61
- 10,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.3,0.9992,8.0,31.0542,,1319,5798173,3590336,2722,2207837,1674
62
- 11,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8.0,0.0558,,1319,1201820,1042095,790,159725,121
63
- 12,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8.0,3.3463,,1319,741446,542416,411,199030,151
64
- 13,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8.0,10.1124,,1319,17937864,17038928,12918,898936,682
65
- 14,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8.0,10.5479,,1319,18710437,18160983,13769,549454,417
66
- 15,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,87.26,0.9992,8.0,0.2083,,1319,3888813,2691714,2041,1197099,908
67
- 16,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8.0,0.4899,,1319,869060,555340,421,313720,238
68
- 17,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8.0,0.0,,1319,1290805,1046008,793,244797,186
69
- 18,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8.0,0.2512,,1319,5998639,5862016,4444,136623,104
70
- 19,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8.0,0.0,,1319,14850914,14355752,10884,495162,375
71
- 20,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,79.91,0.9992,8.0,3.3938,,1319,4089612,2740652,2078,1348960,1023
72
- 21,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8.0,0.0576,,1319,1288055,1170038,887,118017,89
73
- 22,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8.0,0.6788,,1319,1088041,953242,723,134799,102
74
- 23,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8.0,0.0,,1319,1202163,968163,734,234000,177
75
- 24,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8.0,0.6902,,1319,1187080,1090418,827,96662,73
76
- 25,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8.0,0.0,,1319,1248329,990168,751,258161,196
77
- 26,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8.0,3.4633,,1319,6646286,6506164,4933,140122,106
78
- 27,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,73.46,0.9955,8.0,0.0,,1319,11778716,8630514,6543,3148202,2387
79
- 28,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8.0,0.9736,,1319,1727044,1126025,854,601019,456
80
- 29,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8.0,0.0354,,1319,740483,617377,468,123106,93
81
- 30,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8.0,0.0,,1319,22835767,21044978,15955,1790789,1358
82
- 31,ReAct-Pro*,gsm8k,gpt-4o,2025/1/22,63.31,0.9955,8.0,39.0751,,1319,14715887,14411173,10926,304714,231
83
- 32,PoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,58.83,0.7051,8.0,0.0,,1319,1362822,1145390,868,217432,165
84
- 33,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8.0,0.0,,1319,887913,596229,452,291684,221
85
- 34,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8.0,0.0,,1319,1745429,550941,418,1194488,906
86
- 35,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8.0,0.0,,1319,1218525,1032818,783,185707,141
87
- 36,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,48.22,0.9841,8.0,0.0,,1319,14526431,10678792,8096,3847639,2917
88
- 37,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8.0,0.0,,1319,1391111,1147538,870,243573,185
89
- 38,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8.0,0.0,,1319,1324949,1136843,862,188106,143
90
- 39,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8.0,0.3328,,1319,586553,546990,415,39563,30
91
- 40,CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,35.94,0.9992,8.0,0.0,,1319,1223459,1032818,783,190641,145
92
- 41,ReAct-Pro*,gsm8k,Internllm2_5-7B,2025/1/22,33.51,0.9795,8.0,0.0,,1319,35669989,30120070,22836,5549919,4208
93
- 42,ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,24.87,0.8021,8.0,0.0,,1319,9828001,9133603,6925,694398,526
94
- 43,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.5,0.3101,8.0,0.0,,1319,1327522,1151528,873,175994,133
95
- 44,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8.0,0.0,,1319,736996,568530,431,168466,128
96
- 45,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8.0,0.0,,1319,834897,568116,431,266781,202
97
- 46,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,11.75,0.9189,8.0,0.0,,1319,12411942,9066115,6873,3345827,2537
98
- 47,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8.0,0.0,,1319,1113728,679302,515,434426,329
99
- 48,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.62,0.1691,8.0,0.0,,1319,1389135,1151528,873,237607,180
100
- 49,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8.0,0.0,,1319,55392611,52431343,39751,2961268,2245
101
- 50,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,1.67,0.9469,8.0,0.0,,1319,16465720,11019864,8355,5445856,4129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
2
+ 1,SC-CoT,AQuA,gpt-4o,2025/1/22,86.61,0.9882,0,8.1485,,254,1373206,744478,2931,628728,2475
3
+ 2,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0,0.0808,,254,143289,25143,99,118146,465
4
+ 3,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.04,0.9921,0,1.0348,,254,1835669,1051218,4139,784451,3088
5
+ 4,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0,0.0742,,254,131604,25397,100,106207,418
6
+ 5,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0,0.0927,,254,164389,32555,128,131834,519
7
+ 6,ToT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.07,1.0,0,2.9404,,254,5215848,4735188,18642,480660,1892
8
+ 7,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0,0.0798,,254,141567,32809,129,108758,428
9
+ 8,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0,0.0066,,254,94577,27978,110,66599,262
10
+ 9,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0,1.0417,,254,123017,25123,99,97894,385
11
+ 10,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.28,0.9921,0,1.0756,,254,1907924,1135251,4469,772673,3042
12
+ 11,ToT,AQuA,gpt-4o,2025/1/22,81.5,0.9921,0,8.5295,,254,2613607,2347538,9242,266069,1048
13
+ 12,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.1,0.9724,0,0.0519,,254,885986,503751,1983,382235,1505
14
+ 13,ToT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,81.1,0.9921,0,3.7389,,254,6632255,6371642,25085,260613,1026
15
+ 14,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0,0.0,,254,149736,33017,130,116719,460
16
+ 15,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,79.92,1.0,0,0.0,,254,1845332,1098280,4324,747052,2941
17
+ 16,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0,0.1746,,254,309799,240735,948,69064,272
18
+ 17,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0,0.0058,,254,87742,33058,130,54684,215
19
+ 18,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0,0.768,,254,1362379,1119143,4406,243236,958
20
+ 19,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9843,0,0.0,,254,137771,33271,131,104500,411
21
+ 20,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.56,0.9606,0,0.0445,,254,1032841,977890,3850,54951,216
22
+ 21,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0,1.1453,,254,133752,25631,101,108121,426
23
+ 22,PoT,AQuA,gpt-4o,2025/1/22,75.2,1.0,0,1.6087,,254,327908,222717,877,105191,414
24
+ 23,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.2,1.0,0,0.1645,,254,291764,249215,981,42549,168
25
+ 24,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.41,0.9921,0,0.0,,254,695844,564165,2221,131679,518
26
+ 25,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.23,1.0,0,0.3177,,254,563603,441765,1739,121838,480
27
+ 26,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0,0.0147,,254,309436,259863,1023,49573,195
28
+ 27,CoT,AQuA,deepseek-r1:1.5b,2025/1/23,71.65,0.9685,0,0.0,,254,333072,26413,104,306659,1207
29
+ 28,IO,AQuA,deepseek-r1:1.5b,2025/1/22,68.9,0.9488,0,0.0,,254,351767,26667,105,325100,1280
30
+ 29,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0,0.0,,254,313728,264517,1041,49211,194
31
+ 30,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,66.14,0.9921,0,0.7888,,254,847335,482192,1898,365143,1438
32
+ 31,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0,0.4928,,254,903587,862614,3396,40973,161
33
+ 32,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0,0.0957,,254,80793,25447,100,55346,218
34
+ 33,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0,0.0,,254,144435,32555,128,111880,440
35
+ 34,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0,0.1748,,254,266654,225162,886,41492,163
36
+ 35,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.45,0.9724,0,0.0,,254,1651333,971003,3823,680330,2678
37
+ 36,SC-CoT,AQuA,deepseek-r1:1.5b,2025/2/10,59.06,0.9685,0,0.0,,254,5802711,2547772,10031,3254939,12815
38
+ 37,ToT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.06,1.0,0,0.0,,254,5739684,4896222,19276,843462,3321
39
+ 38,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0,2.304,,254,692096,615589,2424,76507,301
40
+ 39,ToT,AQuA,gpt-3.5-turbo,2025/1/7,57.09,0.9961,0,1.1513,,254,2001396,1850767,7286,150629,593
41
+ 40,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0,0.0,,254,4340821,3764723,14822,576098,2268
42
+ 41,PoT,AQuA,deepseek-r1:1.5b,2025/2/10,54.72,0.9724,0,0.0,,254,1016647,250690,987,765957,3016
43
+ 42,ReAct-Pro*,AQuA,deepseek-r1:1.5b,2025/2/10,54.33,0.9646,0,0.0,,254,14445041,10578715,41648,3866326,15222
44
+ 43,ToT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,53.94,1.0,0,0.0,,254,8602682,8224468,32380,378214,1489
45
+ 44,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.76,0.8937,0,0.0,,254,127520,26610,105,100910,397
46
+ 45,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9882,0,0.0,,254,133106,26459,104,106647,420
47
+ 46,IO,AQuA,Internllm2_5-7B,2025/1/22,47.64,0.9094,0,0.0,,254,185041,50232,198,134809,531
48
+ 47,ToT,AQuA,Doubao-lite-32k,2025/1/7,45.28,0.7402,0,0.0881,,254,2000550,1850249,7284,150301,592
49
+ 48,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0,0.0,,254,4428801,3592039,14142,836762,3294
50
+ 49,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0,0.0,,254,110040,30477,120,79563,313
51
+ 50,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,39.37,0.9803,0,0.0,,254,2296222,1420494,5592,875728,3448
52
+ 51,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0,0.038,,254,42471,25701,101,16770,66
53
+ 52,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0,0.0,,254,290914,240613,947,50301,198
54
+ 53,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0,0.0,,254,301962,233505,919,68457,270
55
+ 54,ToT,AQuA,Internllm2_5-7B,2025/1/22,35.83,0.9961,0,0.0,,254,4734560,4263136,16784,471424,1856
56
+ 55,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9882,0,0.0,,254,117339,30477,120,86862,342
57
+ 56,ToT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,31.5,0.9882,0,0.0,,254,6250702,6058022,23850,192680,759
58
+ 57,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.71,0.9646,0,0.0,,254,298475,246560,971,51915,204
59
+ 58,ToT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,29.92,1.0,0,0.0,,254,8700281,8100085,31890,600196,2363
60
+ 59,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9764,0,0.0,,254,71047,27937,110,43110,170
61
+ 60,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.17,0.9882,0,0.0,,254,110415,27937,110,82478,325
62
+ 61,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0,0.0,,254,5072004,4555858,17936,516146,2032
63
+ 62,ToT,AQuA,deepseek-r1:1.5b,2025/2/10,24.8,0.5551,0,0.0,,254,794512,605028,2382,189484,746
64
+ 63,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0,0.0,,254,7170087,6344167,24977,825920,3252
65
+ 64,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,23.62,0.9646,0,0.0,,254,1775335,1034362,4072,740973,2917
66
+ 65,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,22.83,0.9724,0,0.0,,254,2215091,1246929,4909,968162,3812
67
+ 66,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0,0.0,,254,322281,258867,1019,63414,250
68
+ 1,CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,80.2,1.0,4,0.349,,500,619015,338549,677,280466,561
69
+ 2,SC-CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,74.2,1.0,4,3.2239,,500,5718739,3959492,7919,1759247,3518
70
+ 3,SC-CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,74.0,1.0,4,3.1556,,500,5597513,3823997,7648,1773516,3547
71
+ 4,CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,71.2,1.0,4,0.3463,,500,614221,342879,686,271342,543
72
+ 5,IO,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,70.2,1.0,4,0.2506,,500,444591,169549,339,275042,550
73
+ 6,CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,69.8,1.0,4,0.0,,500,617204,354049,708,263155,526
74
+ 7,IO,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,69.4,1.0,4,0.2386,,500,423216,155879,312,267337,535
75
+ 8,CoT,MATH-500,gpt-4o,2025/1/22,68.0,1.0,4,3.0569,,500,552688,329332,659,223356,447
76
+ 9,SC-CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,67.0,1.0,4,0.0,,500,5451484,3833751,7668,1617733,3235
77
+ 10,ReAct-Pro*,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,64.6,1.0,4,3.1806,,500,5641879,5223611,10447,418268,837
78
+ 11,ReAct-Pro*,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,62.8,1.0,4,3.4541,,500,6127117,5747268,11495,379849,760
79
+ 12,IO,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,59.4,1.0,4,0.0,,500,411362,169549,339,241813,484
80
+ 13,CoT,MATH-500,Doubao-lite-32k,2025/1/7,59.0,1.0,4,0.0255,,500,479941,336370,673,143571,287
81
+ 14,ReAct-Pro*,MATH-500,gpt-4o,2025/1/22,54.0,1.0,4,17.7735,,500,6153255,5834537,11669,318718,637
82
+ 15,CoT,MATH-500,deepseek-r1:1.5b,2025/1/23,49.4,1.0,4,0.0,,500,1199129,341549,683,857580,1715
83
+ 16,SC-CoT,MATH-500,Doubao-lite-32k,2025/1/7,49.2,1.0,4,0.1406,,500,2470810,1507651,3015,963159,1926
84
+ 17,ReAct-Pro*,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,48.8,1.0,4,0.0,,500,4990240,4646708,9293,343532,687
85
+ 18,ReAct-Pro*,MATH-500,Doubao-lite-32k,2025/1/7,47.2,1.0,4,0.186,,500,4388666,4234620,8469,154046,308
86
+ 19,PoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,47.2,0.822,4,0.233,,500,413372,242549,485,170823,342
87
+ 20,CoT,MATH-500,Internllm2_5-7B,2025/1/22,46.6,1.0,4,0.0,,500,546774,332883,666,213891,428
88
+ 21,PoT,MATH-500,gpt-4o,2025/1/22,46.2,0.864,4,1.5994,,500,340960,241357,483,99603,199
89
+ 22,IO,MATH-500,deepseek-r1:1.5b,2025/1/22,43.8,1.0,4,0.0,,500,1022548,157049,314,865499,1731
90
+ 23,PoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,42.6,0.802,4,0.2839,,500,503596,253879,508,249717,499
91
+ 24,IO,MATH-500,gpt-4o,2025/1/22,41.8,1.0,4,2.7907,,500,394447,153832,308,240615,481
92
+ 25,CoT,MATH-500,gpt-3.5-turbo,2025/1/7,39.8,1.0,4,0.3189,,500,432196,329381,659,102815,206
93
+ 26,PoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,39.6,0.744,4,0.0,,500,408812,258549,517,150263,301
94
+ 27,IO,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,38.6,1.0,4,0.0,,500,503934,155563,311,348371,697
95
+ 28,SC-CoT,MATH-500,deepseek-r1:1.5b,2025/2/10,38.0,1.0,4,0.0,,500,14742109,7080559,14161,7661550,15323
96
+ 29,IO,MATH-500,Doubao-lite-32k,2025/1/7,37.4,1.0,4,0.0187,,500,311730,166870,334,144860,290
97
+ 30,SC-CoT,MATH-500,gpt-4o,2025/1/22,34.4,1.0,4,19.6538,,500,3455323,1986584,3973,1468739,2937
98
+ 31,PoT,MATH-500,Doubao-lite-32k,2025/1/7,32.6,0.68,4,0.0144,,500,303148,254377,509,48771,98
99
+ 32,SC-CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,30.2,1.0,4,0.0,,500,5034937,3546673,7093,1488264,2977
100
+ 33,ReAct-Pro*,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,28.8,1.0,4,0.0,,500,8763629,7486706,14973,1276923,2554
101
+ 34,PoT,MATH-500,gpt-3.5-turbo,2025/1/7,28.8,0.838,4,0.168,,500,271916,239902,480,32014,64
102
+ 35,SC-CoT,MATH-500,gpt-3.5-turbo,2025/1/7,28.8,1.0,4,1.9764,,500,2238812,1381818,2764,856994,1714
103
+ 36,CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.8,1.0,4,0.0,,500,625568,342879,686,282689,565
104
+ 37,PoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.4,0.684,4,0.0,,500,462271,253879,508,208392,417
105
+ 38,ReAct-Pro*,MATH-500,deepseek-r1:1.5b,2025/2/10,24.4,1.0,4,0.0,,500,30177348,20729970,41460,9447378,18895
106
+ 39,ReAct-Pro*,MATH-500,gpt-3.5-turbo,2025/1/7,23.8,1.0,4,2.0406,,500,3832714,3708461,7417,124253,249
107
+ 40,IO,MATH-500,Internllm2_5-7B,2025/1/22,22.8,1.0,4,0.0,,500,467888,201883,404,266005,532
108
+ 41,IO,MATH-500,gpt-3.5-turbo,2025/1/7,17.2,1.0,4,0.2436,,500,265625,154881,310,110744,221
109
+ 42,CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,15.2,1.0,4,0.0,,500,536377,349049,698,187328,375
110
+ 43,PoT,MATH-500,Internllm2_5-7B,2025/1/22,15.0,0.324,4,0.0,,500,368709,247883,496,120826,242
111
+ 44,ReAct-Pro*,MATH-500,Internllm2_5-7B,2025/1/22,14.8,1.0,4,0.0,,500,14186105,11831496,23663,2354609,4709
112
+ 45,ToT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,10.8,1.0,4,9.0421,,500,16039361,15657730,31315,381631,763
113
+ 46,SC-CoT,MATH-500,Internllm2_5-7B,2025/1/22,9.8,1.0,4,0.0,,500,5838466,4193296,8387,1645170,3290
114
+ 47,ToT,MATH-500,gpt-3.5-turbo,2025/1/7,9.8,1.0,4,5.2914,,500,10001767,9711244,19422,290523,581
115
+ 48,ReAct-Pro*,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,8.2,1.0,4,0.0,,500,8987061,8430774,16862,556287,1113
116
+ 49,IO,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,7.0,1.0,4,0.0,,500,413878,158777,318,255101,510
117
+ 50,CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,6.2,1.0,4,0.0,,500,549188,349049,698,200139,400
118
+ 51,SC-CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,3.8,0.99,4,0.0,,500,5569442,3832429,7665,1737013,3474
119
+ 52,ToT,MATH-500,gpt-4o,2025/1/22,3.2,1.0,4,40.8094,,500,15242432,14881985,29764,360447,721
120
+ 53,IO,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,2.6,1.0,4,0.0,,500,429330,159049,318,270281,541
121
+ 54,ToT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,1.8,0.908,4,0.0,,500,9035000,7729000,15458,1306000,2612
122
+ 55,ToT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,1.4,0.698,4,8.2699,,500,14669500,14099500,28199,570000,1140
123
+ 56,ToT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,1.4,0.916,4,0.0,,500,10167500,9749000,19498,418500,837
124
+ 57,ToT,MATH-500,Doubao-lite-32k,2025/1/7,1.2,0.942,4,0.2371,,500,5564500,5338500,10677,226000,452
125
+ 58,PoT,MATH-500,deepseek-r1:1.5b,2025/2/10,1.0,0.016,4,0.0,,500,1031067,245549,491,785518,1571
126
+ 59,PoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.022,4,0.0,,500,786870,248509,497,538361,1077
127
+ 60,SC-CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.8,1.0,4,0.0,,500,6862056,4448663,8897,2413393,4827
128
+ 61,ToT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.972,4,0.0,,500,4535000,4408000,8816,127000,254
129
+ 62,ReAct-Pro*,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.6,1.0,4,0.0,,500,19442440,18137392,36275,1305048,2610
130
+ 63,ToT,MATH-500,deepseek-r1:1.5b,2025/2/10,0.4,0.716,4,0.0,,500,1941500,1831000,3662,110500,221
131
+ 64,ToT,MATH-500,Internllm2_5-7B,2025/1/22,0.2,0.99,4,0.0,,500,8350500,7515000,15030,835500,1671
132
+ 65,PoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,4,0.0,,500,437202,253549,507,183653,367
133
+ 66,ToT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.962,4,0.0,,500,5996500,5590500,11181,406000,812
134
+ 1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.07,1.0,8,6.2005,,1319,10998794,8413717,6379,2585077,1960
135
+ 2,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8,4.5367,,1319,1165166,948668,719,216498,164
136
+ 3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8,0.687,,1319,1218665,990168,751,228497,173
137
+ 4,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,93.86,1.0,8,5.9858,,1319,10618008,8136223,6168,2481785,1882
138
+ 5,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8,4.2166,,1319,1247912,1101672,835,146240,111
139
+ 6,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8,0.7195,,1319,1276252,1005119,762,271133,206
140
+ 7,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8,0.7054,,1319,1251210,1106682,839,144528,110
141
+ 8,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8,0.4709,,1319,835275,583916,443,251359,191
142
+ 9,ToT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,91.89,1.0,8,20.8753,,1319,37029687,35096810,26609,1932877,1465
143
+ 10,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,91.13,1.0,8,0.0,,1319,11140985,8586888,6510,2554097,1936
144
+ 11,ToT,gsm8k,gpt-4o,2025/1/22,91.13,1.0,8,86.8581,,1319,30769735,29445237,22324,1324498,1004
145
+ 12,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.3,0.9992,8,31.0542,,1319,5798173,3590336,2722,2207837,1674
146
+ 13,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8,0.0558,,1319,1201820,1042095,790,159725,121
147
+ 14,ToT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,88.88,1.0,8,23.5911,,1319,41847148,40435361,30656,1411787,1070
148
+ 15,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8,3.3463,,1319,741446,542416,411,199030,151
149
+ 16,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8,10.1124,,1319,17937864,17038928,12918,898936,682
150
+ 17,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8,10.5479,,1319,18710437,18160983,13769,549454,417
151
+ 18,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,87.26,0.9992,8,0.2083,,1319,3888813,2691714,2041,1197099,908
152
+ 19,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8,0.4899,,1319,869060,555340,421,313720,238
153
+ 20,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8,0.0,,1319,1290805,1046008,793,244797,186
154
+ 21,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8,0.2512,,1319,5998639,5862016,4444,136623,104
155
+ 22,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8,0.0,,1319,14850914,14355752,10884,495162,375
156
+ 23,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,79.91,0.9992,8,3.3938,,1319,4089612,2740652,2078,1348960,1023
157
+ 24,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8,0.0576,,1319,1288055,1170038,887,118017,89
158
+ 25,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8,0.6788,,1319,1088041,953242,723,134799,102
159
+ 26,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8,0.0,,1319,1202163,968163,734,234000,177
160
+ 27,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8,0.6902,,1319,1187080,1090418,827,96662,73
161
+ 28,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8,0.0,,1319,1248329,990168,751,258161,196
162
+ 29,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8,3.4633,,1319,6646286,6506164,4933,140122,106
163
+ 30,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,73.46,0.9955,8,0.0,,1319,11778716,8630514,6543,3148202,2387
164
+ 31,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8,0.9736,,1319,1727044,1126025,854,601019,456
165
+ 32,ToT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,72.21,0.9901,8,0.0,,1319,31657319,20196528,15312,11460791,8689
166
+ 33,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8,0.0354,,1319,740483,617377,468,123106,93
167
+ 34,CoT,gsm8k,deepseek-r1:1.5b,2025/1/23,70.66,0.9977,8,0.0,,1319,2090625,1011714,767,1078911,818
168
+ 35,ToT,gsm8k,gpt-3.5-turbo,2025/1/7,67.93,0.997,8,9.1707,,1319,16727175,15920037,12070,807138,612
169
+ 36,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8,0.0,,1319,22835767,21044978,15955,1790789,1358
170
+ 37,ToT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,65.05,0.9196,8,0.0,,1319,16432102,15554967,11793,877135,665
171
+ 38,IO,gsm8k,deepseek-r1:1.5b,2025/1/22,64.14,0.9962,8,0.0,,1319,1483051,561935,426,921116,698
172
+ 39,ReAct-Pro*,gsm8k,gpt-4o,2025/1/22,63.31,0.9955,8,39.0751,,1319,14715887,14411173,10926,304714,231
173
+ 40,PoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,58.83,0.7051,8,0.0,,1319,1362822,1145390,868,217432,165
174
+ 41,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8,0.0,,1319,887913,596229,452,291684,221
175
+ 42,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8,0.0,,1319,1745429,550941,418,1194488,906
176
+ 43,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8,0.0,,1319,1218525,1032818,783,185707,141
177
+ 44,SC-CoT,gsm8k,deepseek-r1:1.5b,2025/2/10,55.34,0.997,8,0.0,,1319,25785865,14540096,11024,11245769,8526
178
+ 45,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,48.22,0.9841,8,0.0,,1319,14526431,10678792,8096,3847639,2917
179
+ 46,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8,0.0,,1319,1391111,1147538,870,243573,185
180
+ 47,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8,0.0,,1319,1324949,1136843,862,188106,143
181
+ 48,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8,0.3328,,1319,586553,546990,415,39563,30
182
+ 49,ToT,gsm8k,Doubao-lite-32k,2025/1/7,37.83,0.8734,8,0.8739,,1319,20274349,19208597,14563,1065752,808
183
+ 50,ReAct-Pro*,gsm8k,deepseek-r1:1.5b,2025/2/10,35.94,0.9962,8,0.0,,1319,24219077,19299381,14632,4919696,3730
184
+ 51,CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,35.94,0.9992,8,0.0,,1319,1223459,1032818,783,190641,145
185
+ 52,ReAct-Pro*,gsm8k,Internllm2_5-7B,2025/1/22,33.51,0.9795,8,0.0,,1319,35669989,30120070,22836,5549919,4208
186
+ 53,ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,24.87,0.8021,8,0.0,,1319,9828001,9133603,6925,694398,526
187
+ 54,ToT,gsm8k,deepseek-r1:1.5b,2025/2/10,23.12,0.7248,8,0.0,,1319,3421486,2738244,2076,683242,518
188
+ 55,ToT,gsm8k,Internllm2_5-7B,2025/1/22,20.85,0.7013,8,0.0,,1319,13178129,11768118,8922,1410011,1069
189
+ 56,ToT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,19.64,0.7726,8,0.0,,1319,12758687,12124248,9192,634439,481
190
+ 57,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.5,0.3101,8,0.0,,1319,1327522,1151528,873,175994,133
191
+ 58,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8,0.0,,1319,736996,568530,431,168466,128
192
+ 59,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8,0.0,,1319,834897,568116,431,266781,202
193
+ 60,PoT,gsm8k,deepseek-r1:1.5b,2025/2/10,11.9,0.1744,8,0.0,,1319,1954509,1138872,863,815637,618
194
+ 61,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,11.75,0.9189,8,0.0,,1319,12411942,9066115,6873,3345827,2537
195
+ 62,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8,0.0,,1319,1113728,679302,515,434426,329
196
+ 63,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.63,0.1691,8,0.0,,1319,1389135,1151528,873,237607,180
197
+ 64,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8,0.0,,1319,55392611,52431343,39751,2961268,2245
198
+ 65,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,1.67,0.9469,8,0.0,,1319,16465720,11019864,8355,5445856,4129
199
+ 66,ToT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,8,0.0,,1319,0,0,0,0,0
src/overall_math_score.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "time": "2025-01-24 15:10:27",
3
  "results": {
4
  "IO": {
5
  "META": {
@@ -14,6 +14,10 @@
14
  "AQuA": {
15
  "Score": 38.98,
16
  "Cost($)": 0.038
 
 
 
 
17
  }
18
  },
19
  "ReAct-Pro*": {
@@ -29,6 +33,10 @@
29
  "AQuA": {
30
  "Score": 64.57,
31
  "Cost($)": 0.4928
 
 
 
 
32
  }
33
  },
34
  "PoT": {
@@ -44,6 +52,10 @@
44
  "AQuA": {
45
  "Score": 59.45,
46
  "Cost($)": 0.1748
 
 
 
 
47
  }
48
  },
49
  "CoT": {
@@ -59,6 +71,10 @@
59
  "AQuA": {
60
  "Score": 61.02,
61
  "Cost($)": 0.0957
 
 
 
 
62
  }
63
  },
64
  "SC-CoT": {
@@ -74,6 +90,29 @@
74
  "AQuA": {
75
  "Score": 66.14,
76
  "Cost($)": 0.7888
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  }
78
  },
79
  "IO-Doubao-lite-32k": {
@@ -89,6 +128,10 @@
89
  "AQuA": {
90
  "Score": 79.13,
91
  "Cost($)": 0.0058
 
 
 
 
92
  }
93
  },
94
  "ReAct-Pro*-Doubao-lite-32k": {
@@ -104,6 +147,10 @@
104
  "AQuA": {
105
  "Score": 77.56,
106
  "Cost($)": 0.0445
 
 
 
 
107
  }
108
  },
109
  "PoT-Doubao-lite-32k": {
@@ -119,6 +166,10 @@
119
  "AQuA": {
120
  "Score": 71.65,
121
  "Cost($)": 0.0147
 
 
 
 
122
  }
123
  },
124
  "CoT-Doubao-lite-32k": {
@@ -134,6 +185,10 @@
134
  "AQuA": {
135
  "Score": 82.68,
136
  "Cost($)": 0.0066
 
 
 
 
137
  }
138
  },
139
  "SC-CoT-Doubao-lite-32k": {
@@ -149,6 +204,29 @@
149
  "AQuA": {
150
  "Score": 81.1,
151
  "Cost($)": 0.0519
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  }
153
  },
154
  "IO-gpt-4o": {
@@ -164,6 +242,10 @@
164
  "AQuA": {
165
  "Score": 75.59,
166
  "Cost($)": 1.1453
 
 
 
 
167
  }
168
  },
169
  "ReAct-Pro*-gpt-4o": {
@@ -179,6 +261,10 @@
179
  "AQuA": {
180
  "Score": 57.48,
181
  "Cost($)": 2.304
 
 
 
 
182
  }
183
  },
184
  "PoT-gpt-4o": {
@@ -194,6 +280,10 @@
194
  "AQuA": {
195
  "Score": 75.2,
196
  "Cost($)": 1.6087
 
 
 
 
197
  }
198
  },
199
  "CoT-gpt-4o": {
@@ -209,6 +299,10 @@
209
  "AQuA": {
210
  "Score": 82.68,
211
  "Cost($)": 1.0417
 
 
 
 
212
  }
213
  },
214
  "SC-CoT-gpt-4o": {
@@ -224,6 +318,29 @@
224
  "AQuA": {
225
  "Score": 86.61,
226
  "Cost($)": 8.1485
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  }
228
  },
229
  "IO-Qwen2.5-72B-Instruct": {
@@ -239,6 +356,10 @@
239
  "AQuA": {
240
  "Score": 84.25,
241
  "Cost($)": 0.0742
 
 
 
 
242
  }
243
  },
244
  "ReAct-Pro*-Qwen2.5-72B-Instruct": {
@@ -254,6 +375,10 @@
254
  "AQuA": {
255
  "Score": 73.23,
256
  "Cost($)": 0.3177
 
 
 
 
257
  }
258
  },
259
  "PoT-Qwen2.5-72B-Instruct": {
@@ -269,6 +394,10 @@
269
  "AQuA": {
270
  "Score": 75.2,
271
  "Cost($)": 0.1645
 
 
 
 
272
  }
273
  },
274
  "CoT-Qwen2.5-72B-Instruct": {
@@ -284,6 +413,10 @@
284
  "AQuA": {
285
  "Score": 86.22,
286
  "Cost($)": 0.0808
 
 
 
 
287
  }
288
  },
289
  "SC-CoT-Qwen2.5-72B-Instruct": {
@@ -299,6 +432,29 @@
299
  "AQuA": {
300
  "Score": 85.04,
301
  "Cost($)": 1.0348
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  }
303
  },
304
  "IO-Llama-3.3-70B-Instruct": {
@@ -314,6 +470,10 @@
314
  "AQuA": {
315
  "Score": 82.68,
316
  "Cost($)": 0.0798
 
 
 
 
317
  }
318
  },
319
  "ReAct-Pro*-Llama-3.3-70B-Instruct": {
@@ -329,6 +489,10 @@
329
  "AQuA": {
330
  "Score": 79.13,
331
  "Cost($)": 0.768
 
 
 
 
332
  }
333
  },
334
  "PoT-Llama-3.3-70B-Instruct": {
@@ -344,6 +508,10 @@
344
  "AQuA": {
345
  "Score": 79.53,
346
  "Cost($)": 0.1746
 
 
 
 
347
  }
348
  },
349
  "CoT-Llama-3.3-70B-Instruct": {
@@ -359,6 +527,10 @@
359
  "AQuA": {
360
  "Score": 83.46,
361
  "Cost($)": 0.0927
 
 
 
 
362
  }
363
  },
364
  "SC-CoT-Llama-3.3-70B-Instruct": {
@@ -374,6 +546,29 @@
374
  "AQuA": {
375
  "Score": 82.28,
376
  "Cost($)": 1.0756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  }
378
  },
379
  "IO-Qwen2.5-7B-Instruct": {
@@ -389,6 +584,10 @@
389
  "AQuA": {
390
  "Score": 78.74,
391
  "Cost($)": 0.0
 
 
 
 
392
  }
393
  },
394
  "ReAct-Pro*-Qwen2.5-7B-Instruct": {
@@ -404,6 +603,10 @@
404
  "AQuA": {
405
  "Score": 74.41,
406
  "Cost($)": 0.0
 
 
 
 
407
  }
408
  },
409
  "PoT-Qwen2.5-7B-Instruct": {
@@ -419,6 +622,10 @@
419
  "AQuA": {
420
  "Score": 68.11,
421
  "Cost($)": 0.0
 
 
 
 
422
  }
423
  },
424
  "CoT-Qwen2.5-7B-Instruct": {
@@ -434,6 +641,10 @@
434
  "AQuA": {
435
  "Score": 80.71,
436
  "Cost($)": 0.0
 
 
 
 
437
  }
438
  },
439
  "SC-CoT-Qwen2.5-7B-Instruct": {
@@ -449,6 +660,29 @@
449
  "AQuA": {
450
  "Score": 79.92,
451
  "Cost($)": 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452
  }
453
  },
454
  "IO-Llama-3.1-8B-Instruct": {
@@ -464,6 +698,10 @@
464
  "AQuA": {
465
  "Score": 51.18,
466
  "Cost($)": 0.0
 
 
 
 
467
  }
468
  },
469
  "ReAct-Pro*-Llama-3.1-8B-Instruct": {
@@ -479,6 +717,10 @@
479
  "AQuA": {
480
  "Score": 55.51,
481
  "Cost($)": 0.0
 
 
 
 
482
  }
483
  },
484
  "PoT-Llama-3.1-8B-Instruct": {
@@ -494,6 +736,10 @@
494
  "AQuA": {
495
  "Score": 36.61,
496
  "Cost($)": 0.0
 
 
 
 
497
  }
498
  },
499
  "CoT-Llama-3.1-8B-Instruct": {
@@ -509,6 +755,10 @@
509
  "AQuA": {
510
  "Score": 60.63,
511
  "Cost($)": 0.0
 
 
 
 
512
  }
513
  },
514
  "SC-CoT-Llama-3.1-8B-Instruct": {
@@ -524,6 +774,29 @@
524
  "AQuA": {
525
  "Score": 59.45,
526
  "Cost($)": 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
  }
528
  },
529
  "IO-Internllm2_5-7B": {
@@ -539,6 +812,10 @@
539
  "AQuA": {
540
  "Score": 47.64,
541
  "Cost($)": 0.0
 
 
 
 
542
  }
543
  },
544
  "ReAct-Pro*-Internllm2_5-7B": {
@@ -554,6 +831,10 @@
554
  "AQuA": {
555
  "Score": 40.94,
556
  "Cost($)": 0.0
 
 
 
 
557
  }
558
  },
559
  "PoT-Internllm2_5-7B": {
@@ -569,6 +850,10 @@
569
  "AQuA": {
570
  "Score": 36.61,
571
  "Cost($)": 0.0
 
 
 
 
572
  }
573
  },
574
  "CoT-Internllm2_5-7B": {
@@ -584,6 +869,10 @@
584
  "AQuA": {
585
  "Score": 52.76,
586
  "Cost($)": 0.0
 
 
 
 
587
  }
588
  },
589
  "SC-CoT-Internllm2_5-7B": {
@@ -599,6 +888,29 @@
599
  "AQuA": {
600
  "Score": 39.37,
601
  "Cost($)": 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
602
  }
603
  },
604
  "IO-Qwen2-1.5B-Instruct": {
@@ -614,6 +926,10 @@
614
  "AQuA": {
615
  "Score": 29.13,
616
  "Cost($)": 0.0
 
 
 
 
617
  }
618
  },
619
  "ReAct-Pro*-Qwen2-1.5B-Instruct": {
@@ -629,6 +945,10 @@
629
  "AQuA": {
630
  "Score": 25.59,
631
  "Cost($)": 0.0
 
 
 
 
632
  }
633
  },
634
  "PoT-Qwen2-1.5B-Instruct": {
@@ -644,6 +964,10 @@
644
  "AQuA": {
645
  "Score": 30.71,
646
  "Cost($)": 0.0
 
 
 
 
647
  }
648
  },
649
  "CoT-Qwen2-1.5B-Instruct": {
@@ -659,6 +983,10 @@
659
  "AQuA": {
660
  "Score": 40.55,
661
  "Cost($)": 0.0
 
 
 
 
662
  }
663
  },
664
  "SC-CoT-Qwen2-1.5B-Instruct": {
@@ -674,6 +1002,29 @@
674
  "AQuA": {
675
  "Score": 23.62,
676
  "Cost($)": 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
677
  }
678
  },
679
  "IO-Qwen2-0.5B-Instruct": {
@@ -689,6 +1040,10 @@
689
  "AQuA": {
690
  "Score": 27.17,
691
  "Cost($)": 0.0
 
 
 
 
692
  }
693
  },
694
  "ReAct-Pro*-Qwen2-0.5B-Instruct": {
@@ -704,6 +1059,10 @@
704
  "AQuA": {
705
  "Score": 24.02,
706
  "Cost($)": 0.0
 
 
 
 
707
  }
708
  },
709
  "PoT-Qwen2-0.5B-Instruct": {
@@ -713,12 +1072,16 @@
713
  "Eval Date": "2025/1/22"
714
  },
715
  "gsm8k": {
716
- "Score": 9.62,
717
  "Cost($)": 0.0
718
  },
719
  "AQuA": {
720
  "Score": 17.32,
721
  "Cost($)": 0.0
 
 
 
 
722
  }
723
  },
724
  "CoT-Qwen2-0.5B-Instruct": {
@@ -734,6 +1097,10 @@
734
  "AQuA": {
735
  "Score": 33.07,
736
  "Cost($)": 0.0
 
 
 
 
737
  }
738
  },
739
  "SC-CoT-Qwen2-0.5B-Instruct": {
@@ -749,6 +1116,143 @@
749
  "AQuA": {
750
  "Score": 22.83,
751
  "Cost($)": 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
752
  }
753
  }
754
  }
 
1
  {
2
+ "time": "2025-02-11 13:23:00",
3
  "results": {
4
  "IO": {
5
  "META": {
 
14
  "AQuA": {
15
  "Score": 38.98,
16
  "Cost($)": 0.038
17
+ },
18
+ "MATH-500": {
19
+ "Score": 17.2,
20
+ "Cost($)": 0.2436
21
  }
22
  },
23
  "ReAct-Pro*": {
 
33
  "AQuA": {
34
  "Score": 64.57,
35
  "Cost($)": 0.4928
36
+ },
37
+ "MATH-500": {
38
+ "Score": 23.8,
39
+ "Cost($)": 2.0406
40
  }
41
  },
42
  "PoT": {
 
52
  "AQuA": {
53
  "Score": 59.45,
54
  "Cost($)": 0.1748
55
+ },
56
+ "MATH-500": {
57
+ "Score": 28.8,
58
+ "Cost($)": 0.168
59
  }
60
  },
61
  "CoT": {
 
71
  "AQuA": {
72
  "Score": 61.02,
73
  "Cost($)": 0.0957
74
+ },
75
+ "MATH-500": {
76
+ "Score": 39.8,
77
+ "Cost($)": 0.3189
78
  }
79
  },
80
  "SC-CoT": {
 
90
  "AQuA": {
91
  "Score": 66.14,
92
  "Cost($)": 0.7888
93
+ },
94
+ "MATH-500": {
95
+ "Score": 28.8,
96
+ "Cost($)": 1.9764
97
+ }
98
+ },
99
+ "ToT": {
100
+ "META": {
101
+ "Algorithm": "ToT",
102
+ "LLM": "gpt-3.5-turbo",
103
+ "Eval Date": "2025/1/7"
104
+ },
105
+ "gsm8k": {
106
+ "Score": 67.93,
107
+ "Cost($)": 9.1707
108
+ },
109
+ "AQuA": {
110
+ "Score": 57.09,
111
+ "Cost($)": 1.1513
112
+ },
113
+ "MATH-500": {
114
+ "Score": 9.8,
115
+ "Cost($)": 5.2914
116
  }
117
  },
118
  "IO-Doubao-lite-32k": {
 
128
  "AQuA": {
129
  "Score": 79.13,
130
  "Cost($)": 0.0058
131
+ },
132
+ "MATH-500": {
133
+ "Score": 37.4,
134
+ "Cost($)": 0.0187
135
  }
136
  },
137
  "ReAct-Pro*-Doubao-lite-32k": {
 
147
  "AQuA": {
148
  "Score": 77.56,
149
  "Cost($)": 0.0445
150
+ },
151
+ "MATH-500": {
152
+ "Score": 47.2,
153
+ "Cost($)": 0.186
154
  }
155
  },
156
  "PoT-Doubao-lite-32k": {
 
166
  "AQuA": {
167
  "Score": 71.65,
168
  "Cost($)": 0.0147
169
+ },
170
+ "MATH-500": {
171
+ "Score": 32.6,
172
+ "Cost($)": 0.0144
173
  }
174
  },
175
  "CoT-Doubao-lite-32k": {
 
185
  "AQuA": {
186
  "Score": 82.68,
187
  "Cost($)": 0.0066
188
+ },
189
+ "MATH-500": {
190
+ "Score": 59.0,
191
+ "Cost($)": 0.0255
192
  }
193
  },
194
  "SC-CoT-Doubao-lite-32k": {
 
204
  "AQuA": {
205
  "Score": 81.1,
206
  "Cost($)": 0.0519
207
+ },
208
+ "MATH-500": {
209
+ "Score": 49.2,
210
+ "Cost($)": 0.1406
211
+ }
212
+ },
213
+ "ToT-Doubao-lite-32k": {
214
+ "META": {
215
+ "Algorithm": "ToT",
216
+ "LLM": "Doubao-lite-32k",
217
+ "Eval Date": "2025/1/7"
218
+ },
219
+ "gsm8k": {
220
+ "Score": 37.83,
221
+ "Cost($)": 0.8739
222
+ },
223
+ "AQuA": {
224
+ "Score": 45.28,
225
+ "Cost($)": 0.0881
226
+ },
227
+ "MATH-500": {
228
+ "Score": 1.2,
229
+ "Cost($)": 0.2371
230
  }
231
  },
232
  "IO-gpt-4o": {
 
242
  "AQuA": {
243
  "Score": 75.59,
244
  "Cost($)": 1.1453
245
+ },
246
+ "MATH-500": {
247
+ "Score": 41.8,
248
+ "Cost($)": 2.7907
249
  }
250
  },
251
  "ReAct-Pro*-gpt-4o": {
 
261
  "AQuA": {
262
  "Score": 57.48,
263
  "Cost($)": 2.304
264
+ },
265
+ "MATH-500": {
266
+ "Score": 54.0,
267
+ "Cost($)": 17.7735
268
  }
269
  },
270
  "PoT-gpt-4o": {
 
280
  "AQuA": {
281
  "Score": 75.2,
282
  "Cost($)": 1.6087
283
+ },
284
+ "MATH-500": {
285
+ "Score": 46.2,
286
+ "Cost($)": 1.5994
287
  }
288
  },
289
  "CoT-gpt-4o": {
 
299
  "AQuA": {
300
  "Score": 82.68,
301
  "Cost($)": 1.0417
302
+ },
303
+ "MATH-500": {
304
+ "Score": 68.0,
305
+ "Cost($)": 3.0569
306
  }
307
  },
308
  "SC-CoT-gpt-4o": {
 
318
  "AQuA": {
319
  "Score": 86.61,
320
  "Cost($)": 8.1485
321
+ },
322
+ "MATH-500": {
323
+ "Score": 34.4,
324
+ "Cost($)": 19.6538
325
+ }
326
+ },
327
+ "ToT-gpt-4o": {
328
+ "META": {
329
+ "Algorithm": "ToT",
330
+ "LLM": "gpt-4o",
331
+ "Eval Date": "2025/1/22"
332
+ },
333
+ "gsm8k": {
334
+ "Score": 91.13,
335
+ "Cost($)": 86.8581
336
+ },
337
+ "AQuA": {
338
+ "Score": 81.5,
339
+ "Cost($)": 8.5295
340
+ },
341
+ "MATH-500": {
342
+ "Score": 3.2,
343
+ "Cost($)": 40.8094
344
  }
345
  },
346
  "IO-Qwen2.5-72B-Instruct": {
 
356
  "AQuA": {
357
  "Score": 84.25,
358
  "Cost($)": 0.0742
359
+ },
360
+ "MATH-500": {
361
+ "Score": 70.2,
362
+ "Cost($)": 0.2506
363
  }
364
  },
365
  "ReAct-Pro*-Qwen2.5-72B-Instruct": {
 
375
  "AQuA": {
376
  "Score": 73.23,
377
  "Cost($)": 0.3177
378
+ },
379
+ "MATH-500": {
380
+ "Score": 62.8,
381
+ "Cost($)": 3.4541
382
  }
383
  },
384
  "PoT-Qwen2.5-72B-Instruct": {
 
394
  "AQuA": {
395
  "Score": 75.2,
396
  "Cost($)": 0.1645
397
+ },
398
+ "MATH-500": {
399
+ "Score": 47.2,
400
+ "Cost($)": 0.233
401
  }
402
  },
403
  "CoT-Qwen2.5-72B-Instruct": {
 
413
  "AQuA": {
414
  "Score": 86.22,
415
  "Cost($)": 0.0808
416
+ },
417
+ "MATH-500": {
418
+ "Score": 80.2,
419
+ "Cost($)": 0.349
420
  }
421
  },
422
  "SC-CoT-Qwen2.5-72B-Instruct": {
 
432
  "AQuA": {
433
  "Score": 85.04,
434
  "Cost($)": 1.0348
435
+ },
436
+ "MATH-500": {
437
+ "Score": 74.0,
438
+ "Cost($)": 3.1556
439
+ }
440
+ },
441
+ "ToT-Qwen2.5-72B-Instruct": {
442
+ "META": {
443
+ "Algorithm": "ToT",
444
+ "LLM": "Qwen2.5-72B-Instruct",
445
+ "Eval Date": "2025/1/22"
446
+ },
447
+ "gsm8k": {
448
+ "Score": 88.88,
449
+ "Cost($)": 23.5911
450
+ },
451
+ "AQuA": {
452
+ "Score": 81.1,
453
+ "Cost($)": 3.7389
454
+ },
455
+ "MATH-500": {
456
+ "Score": 10.8,
457
+ "Cost($)": 9.0421
458
  }
459
  },
460
  "IO-Llama-3.3-70B-Instruct": {
 
470
  "AQuA": {
471
  "Score": 82.68,
472
  "Cost($)": 0.0798
473
+ },
474
+ "MATH-500": {
475
+ "Score": 69.4,
476
+ "Cost($)": 0.2386
477
  }
478
  },
479
  "ReAct-Pro*-Llama-3.3-70B-Instruct": {
 
489
  "AQuA": {
490
  "Score": 79.13,
491
  "Cost($)": 0.768
492
+ },
493
+ "MATH-500": {
494
+ "Score": 64.6,
495
+ "Cost($)": 3.1806
496
  }
497
  },
498
  "PoT-Llama-3.3-70B-Instruct": {
 
508
  "AQuA": {
509
  "Score": 79.53,
510
  "Cost($)": 0.1746
511
+ },
512
+ "MATH-500": {
513
+ "Score": 42.6,
514
+ "Cost($)": 0.2839
515
  }
516
  },
517
  "CoT-Llama-3.3-70B-Instruct": {
 
527
  "AQuA": {
528
  "Score": 83.46,
529
  "Cost($)": 0.0927
530
+ },
531
+ "MATH-500": {
532
+ "Score": 71.2,
533
+ "Cost($)": 0.3463
534
  }
535
  },
536
  "SC-CoT-Llama-3.3-70B-Instruct": {
 
546
  "AQuA": {
547
  "Score": 82.28,
548
  "Cost($)": 1.0756
549
+ },
550
+ "MATH-500": {
551
+ "Score": 74.2,
552
+ "Cost($)": 3.2239
553
+ }
554
+ },
555
+ "ToT-Llama-3.3-70B-Instruct": {
556
+ "META": {
557
+ "Algorithm": "ToT",
558
+ "LLM": "Llama-3.3-70B-Instruct",
559
+ "Eval Date": "2025/1/22"
560
+ },
561
+ "gsm8k": {
562
+ "Score": 91.89,
563
+ "Cost($)": 20.8753
564
+ },
565
+ "AQuA": {
566
+ "Score": 83.07,
567
+ "Cost($)": 2.9404
568
+ },
569
+ "MATH-500": {
570
+ "Score": 1.4,
571
+ "Cost($)": 8.2699
572
  }
573
  },
574
  "IO-Qwen2.5-7B-Instruct": {
 
584
  "AQuA": {
585
  "Score": 78.74,
586
  "Cost($)": 0.0
587
+ },
588
+ "MATH-500": {
589
+ "Score": 59.4,
590
+ "Cost($)": 0.0
591
  }
592
  },
593
  "ReAct-Pro*-Qwen2.5-7B-Instruct": {
 
603
  "AQuA": {
604
  "Score": 74.41,
605
  "Cost($)": 0.0
606
+ },
607
+ "MATH-500": {
608
+ "Score": 48.8,
609
+ "Cost($)": 0.0
610
  }
611
  },
612
  "PoT-Qwen2.5-7B-Instruct": {
 
622
  "AQuA": {
623
  "Score": 68.11,
624
  "Cost($)": 0.0
625
+ },
626
+ "MATH-500": {
627
+ "Score": 39.6,
628
+ "Cost($)": 0.0
629
  }
630
  },
631
  "CoT-Qwen2.5-7B-Instruct": {
 
641
  "AQuA": {
642
  "Score": 80.71,
643
  "Cost($)": 0.0
644
+ },
645
+ "MATH-500": {
646
+ "Score": 69.8,
647
+ "Cost($)": 0.0
648
  }
649
  },
650
  "SC-CoT-Qwen2.5-7B-Instruct": {
 
660
  "AQuA": {
661
  "Score": 79.92,
662
  "Cost($)": 0.0
663
+ },
664
+ "MATH-500": {
665
+ "Score": 67.0,
666
+ "Cost($)": 0.0
667
+ }
668
+ },
669
+ "ToT-Qwen2.5-7B-Instruct": {
670
+ "META": {
671
+ "Algorithm": "ToT",
672
+ "LLM": "Qwen2.5-7B-Instruct",
673
+ "Eval Date": "2025/1/22"
674
+ },
675
+ "gsm8k": {
676
+ "Score": 72.21,
677
+ "Cost($)": 0.0
678
+ },
679
+ "AQuA": {
680
+ "Score": 53.94,
681
+ "Cost($)": 0.0
682
+ },
683
+ "MATH-500": {
684
+ "Score": 1.4,
685
+ "Cost($)": 0.0
686
  }
687
  },
688
  "IO-Llama-3.1-8B-Instruct": {
 
698
  "AQuA": {
699
  "Score": 51.18,
700
  "Cost($)": 0.0
701
+ },
702
+ "MATH-500": {
703
+ "Score": 38.6,
704
+ "Cost($)": 0.0
705
  }
706
  },
707
  "ReAct-Pro*-Llama-3.1-8B-Instruct": {
 
717
  "AQuA": {
718
  "Score": 55.51,
719
  "Cost($)": 0.0
720
+ },
721
+ "MATH-500": {
722
+ "Score": 28.8,
723
+ "Cost($)": 0.0
724
  }
725
  },
726
  "PoT-Llama-3.1-8B-Instruct": {
 
736
  "AQuA": {
737
  "Score": 36.61,
738
  "Cost($)": 0.0
739
+ },
740
+ "MATH-500": {
741
+ "Score": 25.4,
742
+ "Cost($)": 0.0
743
  }
744
  },
745
  "CoT-Llama-3.1-8B-Instruct": {
 
755
  "AQuA": {
756
  "Score": 60.63,
757
  "Cost($)": 0.0
758
+ },
759
+ "MATH-500": {
760
+ "Score": 25.8,
761
+ "Cost($)": 0.0
762
  }
763
  },
764
  "SC-CoT-Llama-3.1-8B-Instruct": {
 
774
  "AQuA": {
775
  "Score": 59.45,
776
  "Cost($)": 0.0
777
+ },
778
+ "MATH-500": {
779
+ "Score": 30.2,
780
+ "Cost($)": 0.0
781
+ }
782
+ },
783
+ "ToT-Llama-3.1-8B-Instruct": {
784
+ "META": {
785
+ "Algorithm": "ToT",
786
+ "LLM": "Llama-3.1-8B-Instruct",
787
+ "Eval Date": "2025/1/22"
788
+ },
789
+ "gsm8k": {
790
+ "Score": 65.05,
791
+ "Cost($)": 0.0
792
+ },
793
+ "AQuA": {
794
+ "Score": 59.06,
795
+ "Cost($)": 0.0
796
+ },
797
+ "MATH-500": {
798
+ "Score": 1.8,
799
+ "Cost($)": 0.0
800
  }
801
  },
802
  "IO-Internllm2_5-7B": {
 
812
  "AQuA": {
813
  "Score": 47.64,
814
  "Cost($)": 0.0
815
+ },
816
+ "MATH-500": {
817
+ "Score": 22.8,
818
+ "Cost($)": 0.0
819
  }
820
  },
821
  "ReAct-Pro*-Internllm2_5-7B": {
 
831
  "AQuA": {
832
  "Score": 40.94,
833
  "Cost($)": 0.0
834
+ },
835
+ "MATH-500": {
836
+ "Score": 14.8,
837
+ "Cost($)": 0.0
838
  }
839
  },
840
  "PoT-Internllm2_5-7B": {
 
850
  "AQuA": {
851
  "Score": 36.61,
852
  "Cost($)": 0.0
853
+ },
854
+ "MATH-500": {
855
+ "Score": 15.0,
856
+ "Cost($)": 0.0
857
  }
858
  },
859
  "CoT-Internllm2_5-7B": {
 
869
  "AQuA": {
870
  "Score": 52.76,
871
  "Cost($)": 0.0
872
+ },
873
+ "MATH-500": {
874
+ "Score": 46.6,
875
+ "Cost($)": 0.0
876
  }
877
  },
878
  "SC-CoT-Internllm2_5-7B": {
 
888
  "AQuA": {
889
  "Score": 39.37,
890
  "Cost($)": 0.0
891
+ },
892
+ "MATH-500": {
893
+ "Score": 9.8,
894
+ "Cost($)": 0.0
895
+ }
896
+ },
897
+ "ToT-Internllm2_5-7B": {
898
+ "META": {
899
+ "Algorithm": "ToT",
900
+ "LLM": "Internllm2_5-7B",
901
+ "Eval Date": "2025/1/22"
902
+ },
903
+ "gsm8k": {
904
+ "Score": 20.85,
905
+ "Cost($)": 0.0
906
+ },
907
+ "AQuA": {
908
+ "Score": 35.83,
909
+ "Cost($)": 0.0
910
+ },
911
+ "MATH-500": {
912
+ "Score": 0.2,
913
+ "Cost($)": 0.0
914
  }
915
  },
916
  "IO-Qwen2-1.5B-Instruct": {
 
926
  "AQuA": {
927
  "Score": 29.13,
928
  "Cost($)": 0.0
929
+ },
930
+ "MATH-500": {
931
+ "Score": 7.0,
932
+ "Cost($)": 0.0
933
  }
934
  },
935
  "ReAct-Pro*-Qwen2-1.5B-Instruct": {
 
945
  "AQuA": {
946
  "Score": 25.59,
947
  "Cost($)": 0.0
948
+ },
949
+ "MATH-500": {
950
+ "Score": 8.2,
951
+ "Cost($)": 0.0
952
  }
953
  },
954
  "PoT-Qwen2-1.5B-Instruct": {
 
964
  "AQuA": {
965
  "Score": 30.71,
966
  "Cost($)": 0.0
967
+ },
968
+ "MATH-500": {
969
+ "Score": 0.8,
970
+ "Cost($)": 0.0
971
  }
972
  },
973
  "CoT-Qwen2-1.5B-Instruct": {
 
983
  "AQuA": {
984
  "Score": 40.55,
985
  "Cost($)": 0.0
986
+ },
987
+ "MATH-500": {
988
+ "Score": 15.2,
989
+ "Cost($)": 0.0
990
  }
991
  },
992
  "SC-CoT-Qwen2-1.5B-Instruct": {
 
1002
  "AQuA": {
1003
  "Score": 23.62,
1004
  "Cost($)": 0.0
1005
+ },
1006
+ "MATH-500": {
1007
+ "Score": 3.8,
1008
+ "Cost($)": 0.0
1009
+ }
1010
+ },
1011
+ "ToT-Qwen2-1.5B-Instruct": {
1012
+ "META": {
1013
+ "Algorithm": "ToT",
1014
+ "LLM": "Qwen2-1.5B-Instruct",
1015
+ "Eval Date": "2025/1/22"
1016
+ },
1017
+ "gsm8k": {
1018
+ "Score": 19.64,
1019
+ "Cost($)": 0.0
1020
+ },
1021
+ "AQuA": {
1022
+ "Score": 31.5,
1023
+ "Cost($)": 0.0
1024
+ },
1025
+ "MATH-500": {
1026
+ "Score": 0.8,
1027
+ "Cost($)": 0.0
1028
  }
1029
  },
1030
  "IO-Qwen2-0.5B-Instruct": {
 
1040
  "AQuA": {
1041
  "Score": 27.17,
1042
  "Cost($)": 0.0
1043
+ },
1044
+ "MATH-500": {
1045
+ "Score": 2.6,
1046
+ "Cost($)": 0.0
1047
  }
1048
  },
1049
  "ReAct-Pro*-Qwen2-0.5B-Instruct": {
 
1059
  "AQuA": {
1060
  "Score": 24.02,
1061
  "Cost($)": 0.0
1062
+ },
1063
+ "MATH-500": {
1064
+ "Score": 0.6,
1065
+ "Cost($)": 0.0
1066
  }
1067
  },
1068
  "PoT-Qwen2-0.5B-Instruct": {
 
1072
  "Eval Date": "2025/1/22"
1073
  },
1074
  "gsm8k": {
1075
+ "Score": 9.63,
1076
  "Cost($)": 0.0
1077
  },
1078
  "AQuA": {
1079
  "Score": 17.32,
1080
  "Cost($)": 0.0
1081
+ },
1082
+ "MATH-500": {
1083
+ "Score": 0.0,
1084
+ "Cost($)": 0.0
1085
  }
1086
  },
1087
  "CoT-Qwen2-0.5B-Instruct": {
 
1097
  "AQuA": {
1098
  "Score": 33.07,
1099
  "Cost($)": 0.0
1100
+ },
1101
+ "MATH-500": {
1102
+ "Score": 6.2,
1103
+ "Cost($)": 0.0
1104
  }
1105
  },
1106
  "SC-CoT-Qwen2-0.5B-Instruct": {
 
1116
  "AQuA": {
1117
  "Score": 22.83,
1118
  "Cost($)": 0.0
1119
+ },
1120
+ "MATH-500": {
1121
+ "Score": 0.8,
1122
+ "Cost($)": 0.0
1123
+ }
1124
+ },
1125
+ "ToT-Qwen2-0.5B-Instruct": {
1126
+ "META": {
1127
+ "Algorithm": "ToT",
1128
+ "LLM": "Qwen2-0.5B-Instruct",
1129
+ "Eval Date": "2025/1/22"
1130
+ },
1131
+ "gsm8k": {
1132
+ "Score": 0.0,
1133
+ "Cost($)": 0.0
1134
+ },
1135
+ "AQuA": {
1136
+ "Score": 29.92,
1137
+ "Cost($)": 0.0
1138
+ },
1139
+ "MATH-500": {
1140
+ "Score": 0.0,
1141
+ "Cost($)": 0.0
1142
+ }
1143
+ },
1144
+ "IO-deepseek-r1:1.5b": {
1145
+ "META": {
1146
+ "Algorithm": "IO",
1147
+ "LLM": "deepseek-r1:1.5b",
1148
+ "Eval Date": "2025/1/22"
1149
+ },
1150
+ "gsm8k": {
1151
+ "Score": 64.14,
1152
+ "Cost($)": 0.0
1153
+ },
1154
+ "AQuA": {
1155
+ "Score": 68.9,
1156
+ "Cost($)": 0.0
1157
+ },
1158
+ "MATH-500": {
1159
+ "Score": 43.8,
1160
+ "Cost($)": 0.0
1161
+ }
1162
+ },
1163
+ "ReAct-Pro*-deepseek-r1:1.5b": {
1164
+ "META": {
1165
+ "Algorithm": "ReAct-Pro*",
1166
+ "LLM": "deepseek-r1:1.5b",
1167
+ "Eval Date": "2025/2/10"
1168
+ },
1169
+ "gsm8k": {
1170
+ "Score": 35.94,
1171
+ "Cost($)": 0.0
1172
+ },
1173
+ "AQuA": {
1174
+ "Score": 54.33,
1175
+ "Cost($)": 0.0
1176
+ },
1177
+ "MATH-500": {
1178
+ "Score": 24.4,
1179
+ "Cost($)": 0.0
1180
+ }
1181
+ },
1182
+ "PoT-deepseek-r1:1.5b": {
1183
+ "META": {
1184
+ "Algorithm": "PoT",
1185
+ "LLM": "deepseek-r1:1.5b",
1186
+ "Eval Date": "2025/2/10"
1187
+ },
1188
+ "gsm8k": {
1189
+ "Score": 11.9,
1190
+ "Cost($)": 0.0
1191
+ },
1192
+ "AQuA": {
1193
+ "Score": 54.72,
1194
+ "Cost($)": 0.0
1195
+ },
1196
+ "MATH-500": {
1197
+ "Score": 1.0,
1198
+ "Cost($)": 0.0
1199
+ }
1200
+ },
1201
+ "CoT-deepseek-r1:1.5b": {
1202
+ "META": {
1203
+ "Algorithm": "CoT",
1204
+ "LLM": "deepseek-r1:1.5b",
1205
+ "Eval Date": "2025/1/23"
1206
+ },
1207
+ "gsm8k": {
1208
+ "Score": 70.66,
1209
+ "Cost($)": 0.0
1210
+ },
1211
+ "AQuA": {
1212
+ "Score": 71.65,
1213
+ "Cost($)": 0.0
1214
+ },
1215
+ "MATH-500": {
1216
+ "Score": 49.4,
1217
+ "Cost($)": 0.0
1218
+ }
1219
+ },
1220
+ "SC-CoT-deepseek-r1:1.5b": {
1221
+ "META": {
1222
+ "Algorithm": "SC-CoT",
1223
+ "LLM": "deepseek-r1:1.5b",
1224
+ "Eval Date": "2025/2/10"
1225
+ },
1226
+ "gsm8k": {
1227
+ "Score": 55.34,
1228
+ "Cost($)": 0.0
1229
+ },
1230
+ "AQuA": {
1231
+ "Score": 59.06,
1232
+ "Cost($)": 0.0
1233
+ },
1234
+ "MATH-500": {
1235
+ "Score": 38.0,
1236
+ "Cost($)": 0.0
1237
+ }
1238
+ },
1239
+ "ToT-deepseek-r1:1.5b": {
1240
+ "META": {
1241
+ "Algorithm": "ToT",
1242
+ "LLM": "deepseek-r1:1.5b",
1243
+ "Eval Date": "2025/2/10"
1244
+ },
1245
+ "gsm8k": {
1246
+ "Score": 23.12,
1247
+ "Cost($)": 0.0
1248
+ },
1249
+ "AQuA": {
1250
+ "Score": 24.8,
1251
+ "Cost($)": 0.0
1252
+ },
1253
+ "MATH-500": {
1254
+ "Score": 0.4,
1255
+ "Cost($)": 0.0
1256
  }
1257
  }
1258
  }
src/overall_results.csv CHANGED
@@ -1,51 +1,67 @@
1
- Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($)
2
- 1.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.55,92.87,0.7195,86.22,0.0808
3
- 2.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,89.45,93.86,5.9858,85.04,1.0348
4
- 3.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.70,93.93,0.687,83.46,0.0927
5
- 4.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,88.68,95.07,6.2005,82.28,1.0756
6
- 5.0,SC-CoT,gpt-4o,2025/1/22,88.46,90.3,31.0542,86.61,8.1485
7
- 6.0,CoT,gpt-4o,2025/1/22,88.39,94.09,4.5367,82.68,1.0417
8
- 7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.48,92.27,0.4709,82.68,0.0798
9
- 8.0,CoT,Doubao-lite-32k,2025/1/7,86.00,89.31,0.0558,82.68,0.0066
10
- 9.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,85.53,91.13,0.0,79.92,0.0
11
- 10.0,IO,Qwen2.5-72B-Instruct,2025/1/22,85.42,86.58,0.4899,84.25,0.0742
12
- 11.0,SC-CoT,Doubao-lite-32k,2025/1/7,84.18,87.26,0.2083,81.1,0.0519
13
- 12.0,PoT,gpt-4o,2025/1/22,84.15,93.1,4.2166,75.2,1.6087
14
- 13.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,83.77,92.34,0.7054,75.2,0.1645
15
- 14.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,83.39,87.64,10.1124,79.13,0.768
16
- 15.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,83.19,85.67,0.0,80.71,0.0
17
- 16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
18
- 17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.58,85.6,0.2512,77.56,0.0445
19
- 18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.25,87.26,10.5479,73.23,0.3177
20
- 19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.64,82.87,0.0,74.41,0.0
21
- 20.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,76.31,73.09,0.9736,79.53,0.1746
22
- 21.0,PoT,Doubao-lite-32k,2025/1/7,75.63,79.61,0.0576,71.65,0.0147
23
- 22.0,IO,Doubao-lite-32k,2025/1/7,75.58,72.02,0.0354,79.13,0.0058
24
- 23.0,SC-CoT,gpt-3.5-turbo,2025/1/7,73.03,79.91,3.3938,66.14,0.7888
25
- 24.0,CoT,gpt-3.5-turbo,2025/1/7,69.86,78.7,0.6788,61.02,0.0957
26
- 25.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.74,74.91,3.4633,64.57,0.4928
27
- 26.0,PoT,gpt-3.5-turbo,2025/1/7,68.17,76.88,0.6902,59.45,0.1748
28
- 27.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.04,75.44,0.0,60.63,0.0
29
- 28.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
30
- 29.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,66.46,73.46,0.0,59.45,0.0
31
- 30.0,CoT,Internllm2_5-7B,2025/1/22,65.24,77.71,0.0,52.76,0.0
32
- 31.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
33
- 32.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.65,67.78,0.0,55.51,0.0
34
- 33.0,ReAct-Pro*,gpt-4o,2025/1/22,60.40,63.31,39.0751,57.48,2.304
35
- 34.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
36
- 35.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.03,55.5,0.0,40.55,0.0
37
- 36.0,SC-CoT,Internllm2_5-7B,2025/1/22,43.80,48.22,0.0,39.37,0.0
38
- 37.0,IO,gpt-3.5-turbo,2025/1/7,38.41,37.83,0.3328,38.98,0.038
39
- 38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.67,0.0,36.61,0.0
40
- 39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
41
- 40.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,37.23,33.51,0.0,40.94,0.0
42
- 41.0,CoT,Qwen2-0.5B-Instruct,2025/1/22,34.51,35.94,0.0,33.07,0.0
43
- 42.0,IO,Internllm2_5-7B,2025/1/22,29.62,11.6,0.0,47.64,0.0
44
- 43.0,ReAct-Pro*,Qwen2-1.5B-Instruct,2025/1/22,25.23,24.87,0.0,25.59,0.0
45
- 44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.61,18.5,0.0,30.71,0.0
46
- 45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.91,16.68,0.0,29.13,0.0
47
- 46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.94,14.71,0.0,27.17,0.0
48
- 47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.69,11.75,0.0,23.62,0.0
49
- 48.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,15.84,7.66,0.0,24.02,0.0
50
- 49.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,13.47,9.62,0.0,17.32,0.0
51
- 50.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,12.25,1.67,0.0,22.83,0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($),MATH-500-Score,MATH-500-Cost($)
2
+ 1.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,86.43,92.87,0.7195,86.22,0.0808,80.2,0.349
3
+ 2.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,84.30,93.86,5.9858,85.04,1.0348,74.0,3.1556
4
+ 3.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,83.85,95.07,6.2005,82.28,1.0756,74.2,3.2239
5
+ 4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,82.86,93.93,0.687,83.46,0.0927,71.2,0.3463
6
+ 5.0,CoT,gpt-4o,2025/1/22,81.59,94.09,4.5367,82.68,1.0417,68.0,3.0569
7
+ 6.0,IO,Llama-3.3-70B-Instruct,2025/1/22,81.45,92.27,0.4709,82.68,0.0798,69.4,0.2386
8
+ 7.0,IO,Qwen2.5-72B-Instruct,2025/1/22,80.34,86.58,0.4899,84.25,0.0742,70.2,0.2506
9
+ 8.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,79.35,91.13,0.0,79.92,0.0,67.0,0.0
10
+ 9.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,78.73,85.67,0.0,80.71,0.0,69.8,0.0
11
+ 10.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,77.12,87.64,10.1124,79.13,0.768,64.6,3.1806
12
+ 11.0,CoT,Doubao-lite-32k,2025/1/7,77.00,89.31,0.0558,82.68,0.0066,59.0,0.0255
13
+ 12.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,74.43,87.26,10.5479,73.23,0.3177,62.8,3.4541
14
+ 13.0,SC-CoT,Doubao-lite-32k,2025/1/7,72.52,87.26,0.2083,81.1,0.0519,49.2,0.1406
15
+ 14.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,71.58,92.34,0.7054,75.2,0.1645,47.2,0.233
16
+ 15.0,PoT,gpt-4o,2025/1/22,71.50,93.1,4.2166,75.2,1.6087,46.2,1.5994
17
+ 16.0,SC-CoT,gpt-4o,2025/1/22,70.44,90.3,31.0542,86.61,8.1485,34.4,19.6538
18
+ 17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,70.12,85.6,0.2512,77.56,0.0445,47.2,0.186
19
+ 18.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,68.69,82.87,0.0,74.41,0.0,48.8,0.0
20
+ 19.0,IO,gpt-4o,2025/1/22,68.60,88.4,3.3463,75.59,1.1453,41.8,2.7907
21
+ 20.0,IO,Qwen2.5-7B-Instruct,2025/1/22,65.13,57.24,0.0,78.74,0.0,59.4,0.0
22
+ 21.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,65.07,73.09,0.9736,79.53,0.1746,42.6,0.2839
23
+ 22.0,CoT,deepseek-r1:1.5b,2025/1/23,63.90,70.66,0.0,71.65,0.0,49.4,0.0
24
+ 23.0,IO,Doubao-lite-32k,2025/1/7,62.85,72.02,0.0354,79.13,0.0058,37.4,0.0187
25
+ 24.0,PoT,Doubao-lite-32k,2025/1/7,61.29,79.61,0.0576,71.65,0.0147,32.6,0.0144
26
+ 25.0,ToT,Qwen2.5-72B-Instruct,2025/1/22,60.26,88.88,23.5911,81.1,3.7389,10.8,9.0421
27
+ 26.0,CoT,gpt-3.5-turbo,2025/1/7,59.84,78.7,0.6788,61.02,0.0957,39.8,0.3189
28
+ 27.0,CoT,Internllm2_5-7B,2025/1/22,59.02,77.71,0.0,52.76,0.0,46.6,0.0
29
+ 28.0,IO,deepseek-r1:1.5b,2025/1/22,58.95,64.14,0.0,68.9,0.0,43.8,0.0
30
+ 29.0,ToT,Llama-3.3-70B-Instruct,2025/1/22,58.79,91.89,20.8753,83.07,2.9404,1.4,8.2699
31
+ 30.0,ToT,gpt-4o,2025/1/22,58.61,91.13,86.8581,81.5,8.5295,3.2,40.8094
32
+ 31.0,SC-CoT,gpt-3.5-turbo,2025/1/7,58.28,79.91,3.3938,66.14,0.7888,28.8,1.9764
33
+ 32.0,ReAct-Pro*,gpt-4o,2025/1/22,58.26,63.31,39.0751,57.48,2.304,54.0,17.7735
34
+ 33.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,55.51,58.83,0.0,68.11,0.0,39.6,0.0
35
+ 34.0,PoT,gpt-3.5-turbo,2025/1/7,55.04,76.88,0.6902,59.45,0.1748,28.8,0.168
36
+ 35.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,54.43,74.91,3.4633,64.57,0.4928,23.8,2.0406
37
+ 36.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,54.37,73.46,0.0,59.45,0.0,30.2,0.0
38
+ 37.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,53.96,75.44,0.0,60.63,0.0,25.8,0.0
39
+ 38.0,SC-CoT,deepseek-r1:1.5b,2025/2/10,50.80,55.34,0.0,59.06,0.0,38.0,0.0
40
+ 39.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,50.70,67.78,0.0,55.51,0.0,28.8,0.0
41
+ 40.0,IO,Llama-3.1-8B-Instruct,2025/1/22,48.98,57.16,0.0,51.18,0.0,38.6,0.0
42
+ 41.0,ToT,gpt-3.5-turbo,2025/1/7,44.94,67.93,9.1707,57.09,1.1513,9.8,5.2914
43
+ 42.0,ToT,Qwen2.5-7B-Instruct,2025/1/22,42.52,72.21,0.0,53.94,0.0,1.4,0.0
44
+ 43.0,ToT,Llama-3.1-8B-Instruct,2025/1/22,41.97,65.05,0.0,59.06,0.0,1.8,0.0
45
+ 44.0,ReAct-Pro*,deepseek-r1:1.5b,2025/2/10,38.22,35.94,0.0,54.33,0.0,24.4,0.0
46
+ 45.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,37.08,55.5,0.0,40.55,0.0,15.2,0.0
47
+ 46.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,33.56,38.67,0.0,36.61,0.0,25.4,0.0
48
+ 47.0,SC-CoT,Internllm2_5-7B,2025/1/22,32.46,48.22,0.0,39.37,0.0,9.8,0.0
49
+ 48.0,IO,gpt-3.5-turbo,2025/1/7,31.34,37.83,0.3328,38.98,0.038,17.2,0.2436
50
+ 49.0,PoT,Internllm2_5-7B,2025/1/22,29.94,38.21,0.0,36.61,0.0,15.0,0.0
51
+ 50.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,29.75,33.51,0.0,40.94,0.0,14.8,0.0
52
+ 51.0,ToT,Doubao-lite-32k,2025/1/7,28.10,37.83,0.8739,45.28,0.0881,1.2,0.2371
53
+ 52.0,IO,Internllm2_5-7B,2025/1/22,27.35,11.6,0.0,47.64,0.0,22.8,0.0
54
+ 53.0,CoT,Qwen2-0.5B-Instruct,2025/1/22,25.07,35.94,0.0,33.07,0.0,6.2,0.0
55
+ 54.0,PoT,deepseek-r1:1.5b,2025/2/10,22.54,11.9,0.0,54.72,0.0,1.0,0.0
56
+ 55.0,ReAct-Pro*,Qwen2-1.5B-Instruct,2025/1/22,19.55,24.87,0.0,25.59,0.0,8.2,0.0
57
+ 56.0,ToT,Internllm2_5-7B,2025/1/22,18.96,20.85,0.0,35.83,0.0,0.2,0.0
58
+ 57.0,IO,Qwen2-1.5B-Instruct,2025/1/22,17.60,16.68,0.0,29.13,0.0,7.0,0.0
59
+ 58.0,ToT,Qwen2-1.5B-Instruct,2025/1/22,17.31,19.64,0.0,31.5,0.0,0.8,0.0
60
+ 59.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,16.67,18.5,0.0,30.71,0.0,0.8,0.0
61
+ 60.0,ToT,deepseek-r1:1.5b,2025/2/10,16.11,23.12,0.0,24.8,0.0,0.4,0.0
62
+ 61.0,IO,Qwen2-0.5B-Instruct,2025/1/22,14.83,14.71,0.0,27.17,0.0,2.6,0.0
63
+ 62.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,13.06,11.75,0.0,23.62,0.0,3.8,0.0
64
+ 63.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,10.76,7.66,0.0,24.02,0.0,0.6,0.0
65
+ 64.0,ToT,Qwen2-0.5B-Instruct,2025/1/22,9.97,0.0,0.0,29.92,0.0,0.0,0.0
66
+ 65.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,8.98,9.63,0.0,17.32,0.0,0.0,0.0
67
+ 66.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,8.43,1.67,0.0,22.83,0.0,0.8,0.0
src/record.csv CHANGED
@@ -1,148 +1,199 @@
1
- Algorithm,Dataset,Eval Date,LLM,Score,Pass rate,X-shot,Parameters,Samples,Total input tokens,Average input tokens,Total output tokens,Average output tokens,All tokens,Cost($),Note,,,,,,,,,,,,,,,,,,,
2
- IO,gsm8k,2025/1/7,gpt-3.5-turbo,37.83,99.92,8,,1319,"546,990",415,"39,563",30,"586,553",0.3328,,,,,,,,,,,,,,,,,,,,
3
- IO,gsm8k,2025/1/7,Doubao-lite-32k,72.02,99.92,8,,1319,"617,377",468,"123,106",93,"740,483",0.0354,0.2590 (元),,,,,,,,,,,,,,,,,,,
4
- IO,gsm8k,2025/1/22,gpt-4o,88.4,100,8,,1319,"542,416",411,"199,030",151,"741,446",3.3463,,,,,,,,,,,,,,,,,,,,
5
- IO,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,86.58,100,8,,1319,"555,340",421,"313,720",238,"869,060",0.4899,,,,,,,,,,,,,,,,,,,,
6
- IO,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,92.27,100,8,,1319,"583,916",443,"251,359",191,"835,275",0.4709,,,,,,,,,,,,,,,,,,,,
7
- IO,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,57.24,100,8,,1319,"596,229",452,"291,684",221,"887,913",0.0000,,,,,,,,,,,,,,,,,,,,
8
- IO,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,57.16,99.55,8,,1319,"550,941",418,"1,194,488",906,"1,745,429",0.0000,,,,,,,,,,,,,,,,,,,,
9
- IO,gsm8k,2025/1/22,Internllm2_5-7B,11.6,97.95,8,,1319,"679,302",515,"434,426",329,"1,113,728",0.0000,,,,,,,,,,,,,,,,,,,,
10
- IO,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,16.68,100,8,,1319,"568,530",431,"168,466",128,"736,996",0.0000,,,,,,,,,,,,,,,,,,,,
11
- IO,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,14.71,100,8,,1319,"568,116",431,"266,781",202,"834,897",0.0000,,,,,,,,,,,,,,,,,,,,
12
- ReAct-Pro*,gsm8k,2025/1/7,gpt-3.5-turbo,74.91,99.39,8,max_steps=10,1319,"6,506,164","4,933","140,122",106,"6,646,286",3.4633,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
13
- ReAct-Pro*,gsm8k,2025/1/7,Doubao-lite-32k,85.6,99.62,8,max_steps=10,1319,"5,862,016","4,444","136,623",104,"5,998,639",0.2512,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
14
- ReAct-Pro*,gsm8k,2025/1/22,gpt-4o,63.31,99.55,8,max_steps=10,1319,"14,411,173","10,926","304,714",231,"14,715,887",39.0751,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
15
- ReAct-Pro*,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,87.26,100,8,max_steps=10,1319,"18,160,983","13,769","549,454",417,"18,710,437",10.5479,,,,,,,,,,,,,,,,,,,,
16
- ReAct-Pro*,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,87.64,99.92,8,max_steps=10,1319,"17,038,928","12,918","898,936",682,"17,937,864",10.1124,,,,,,,,,,,,,,,,,,,,
17
- ReAct-Pro*,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,82.87,100,8,max_steps=10,1319,"14,355,752","10,884","495,162",375,"14,850,914",0.0000,,,,,,,,,,,,,,,,,,,,
18
- ReAct-Pro*,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,67.78,98.56,8,max_steps=10,1319,"21,044,978","15,955","1,790,789","1,358","22,835,767",0.0000,,,,,,,,,,,,,,,,,,,,
19
- ReAct-Pro*,gsm8k,2025/1/22,Internllm2_5-7B,33.51,97.95,8,max_steps=10,1319,"30,120,070","22,836","5,549,919","4,208","35,669,989",0.0000,,,,,,,,,,,,,,,,,,,,
20
- ReAct-Pro*,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,24.87,80.21,8,max_steps=10,1319,"9,133,603","6,925","694,398",526,"9,828,001",0.0000,,,,,,,,,,,,,,,,,,,,
21
- ReAct-Pro*,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,7.66,95.22,8,max_steps=10,1319,"52,431,343","39,751","2,961,268","2,245","55,392,611",0.0000,,,,,,,,,,,,,,,,,,,,
22
- PoT,gsm8k,2025/1/7,gpt-3.5-turbo,76.88,99.24,8,,1319,"1,090,418",827,"96,662",73,"1,187,080",0.6902,,,,,,,,,,,,,,,,,,,,
23
- PoT,gsm8k,2025/1/7,Doubao-lite-32k,79.61,92.57,8,,1319,"1,170,038",887,"118,017",89,"1,288,055",0.0576,,,,,,,,,,,,,,,,,,,,
24
- PoT,gsm8k,2025/1/22,gpt-4o,93.1,99.77,8,,1319,"1,101,672",835,"146,240",111,"1,247,912",4.2166,,,,,,,,,,,,,,,,,,,,
25
- PoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,92.34,99.39,8,,1319,"1,106,682",839,"144,528",110,"1,251,210",0.7054,,,,,,,,,,,,,,,,,,,,
26
- PoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,73.09,79.61,8,,1319,"1,126,025",854,"601,019",456,"1,727,044",0.9736,,,,,,,,,,,,,,,,,,,,
27
- PoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,58.83,70.51,8,,1319,"1,145,390",868,"217,432",165,"1,362,822",0.0000,,,,,,,,,,,,,,,,,,,,
28
- PoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,38.67,55.42,8,,1319,"1,147,538",870,"243,573",185,"1,391,111",0.0000,,,,,,,,,,,,,,,,,,,,
29
- PoT,gsm8k,2025/1/22,Internllm2_5-7B,38.21,48.9,8,,1319,"1,136,843",862,"188,106",143,"1,324,949",0.0000,,,,,,,,,,,,,,,,,,,,
30
- PoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,18.5,31.01,8,,1319,"1,151,528",873,"175,994",133,"1,327,522",0.0000,,,,,,,,,,,,,,,,,,,,
31
- PoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,9.62,16.91,8,,1319,"1,151,528",873,"237,607",180,"1,389,135",0.0000,,,,,,,,,,,,,,,,,,,,
32
- CoT,gsm8k,2025/1/7,gpt-3.5-turbo,78.7,100,8,,1319,"953,242",723,"134,799",102,"1,088,041",0.6788,,,,,,,,,,,,,,,,,,,,
33
- CoT,gsm8k,2025/1/7,Doubao-lite-32k,89.31,100,8,,1319,"1,042,095",790,"159,725",121,"1,201,820",0.0558,0.4084635 (元),,,,,,,,,,,,,,,,,,,
34
- CoT,gsm8k,2025/1/22,gpt-4o,94.09,100,8,,1319,"948,668",719,"216,498",164,"1,165,166",4.5367,,,,,,,,,,,,,,,,,,,,
35
- CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,92.87,100,8,,1319,"1,005,119",762,"271,133",206,"1,276,252",0.7195,,,,,,,,,,,,,,,,,,,,
36
- CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,93.93,100,8,,1319,"990,168",751,"228,497",173,"1,218,665",0.6870,,,,,,,,,,,,,,,,,,,,
37
- CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,85.67,100,8,,1319,"1,046,008",793,"244,797",186,"1,290,805",0.0000,,,,,,,,,,,,,,,,,,,,
38
- CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,75.44,99.92,8,,1319,"990,168",751,"258,161",196,"1,248,329",0.0000,,,,,,,,,,,,,,,,,,,,
39
- CoT,gsm8k,2025/1/22,Internllm2_5-7B,77.71,99.7,8,,1319,"968,163",734,"234,000",177,"1,202,163",0.0000,,,,,,,,,,,,,,,,,,,,
40
- CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.5,100,8,,1319,"1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,
41
- CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,1319,"1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,
42
- SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,79.91,99.92,8,"temperature=1, path_num=5",1319,"2,740,652","2,078","1,348,960","1,023","4,089,612",3.3938,,,,,,,,,,,,,,,,,,,,
43
- SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,87.26,99.92,8,"temperature=1, path_num=5",1319,"2,691,714","2,041","1,197,099",908,"3,888,813",0.2083,,,,,,,,,,,,,,,,,,,,
44
- SC-CoT,gsm8k,2025/1/22,gpt-4o,90.3,99.92,8,"temperature=1, path_num=5",1319,"3,590,336","2,722","2,207,837","1,674","5,798,173",31.0542,,,,,,,,,,,,,,,,,,,,
45
- SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,93.86,100,8,"temperature=1, path_num=5",1319,"8,136,223","6,168","2,481,785","1,882","10,618,008",5.9858,,,,,,,,,,,,,,,,,,,,
46
- SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.07,100,8,"temperature=1, path_num=5",1319,"8,413,717","6,379","2,585,077","1,960","10,998,794",6.2005,,,,,,,,,,,,,,,,,,,,
47
- SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,91.13,100,8,"temperature=1, path_num=5",1319,"8,586,888","6,510","2,554,097","1,936","11,140,985",0.0000,,,,,,,,,,,,,,,,,,,,
48
- SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,73.46,99.55,8,"temperature=1, path_num=5",1319,"8,630,514","6,543","3,148,202","2,387","11,778,716",0.0000,,,,,,,,,,,,,,,,,,,,
49
- SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,48.22,98.41,8,"temperature=1, path_num=5",1319,"10,678,792","8,096","3,847,639","2,917","14,526,431",0.0000,,,,,,,,,,,,,,,,,,,,
50
- SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,11.75,91.89,8,"temperature=1, path_num=5",1319,"9,066,115","6,873","3,345,827","2,537","12,411,942",0.0000,,,,,,,,,,,,,,,,,,,,
51
- SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,1.67,94.69,8,"temperature=1, path_num=5",1319,"11,019,864","8,355","5,445,856","4,129","16,465,720",0.0000,,,,,,,,,,,,,,,,,,,,
52
- IO,AQuA,2025/1/7,gpt-3.5-turbo,38.98,100,0,,254,"25,701",101,"16,770",66,"42,471",0.0380,,,,,,,,,,,,,,,,,,,,
53
- IO,AQuA,2025/1/7,Doubao-lite-32k,79.13,100,0,,254,"33,058",130,"54,684",215,"87,742",0.0058,0.0427(元),,,,,,,,,,,,,,,,,,,
54
- IO,AQuA,2025/1/22,gpt-4o,75.59,97.24,0,,254,"25,631",101,"108,121",426,"133,752",1.1453,,,,,,,,,,,,,,,,,,,,
55
- IO,AQuA,2025/1/22,Qwen2.5-72B-Instruct,84.25,99.61,0,,254,"25,397",100,"106,207",418,"131,604",0.0742,,,,,,,,,,,,,,,,,,,,
56
- IO,AQuA,2025/1/22,Llama-3.3-70B-Instruct,82.68,99.21,0,,254,"32,809",129,"108,758",428,"141,567",0.0798,,,,,,,,,,,,,,,,,,,,
57
- IO,AQuA,2025/1/22,Qwen2.5-7B-Instruct,78.74,98.43,0,,254,"33,271",131,"104,500",411,"137,771",0.0000,,,,,,,,,,,,,,,,,,,,
58
- IO,AQuA,2025/1/22,Llama-3.1-8B-Instruct,51.18,98.82,0,,254,"26,459",104,"106,647",420,"133,106",0.0000,,,,,,,,,,,,,,,,,,,,
59
- IO,AQuA,2025/1/22,Internllm2_5-7B,47.64,90.94,0,,254,"50,232",198,"134,809",531,"185,041",0.0000,,,,,,,,,,,,,,,,,,,,
60
- IO,AQuA,2025/1/22,Qwen2-1.5B-Instruct,29.13,97.64,0,,254,"27,937",110,"43,110",170,"71,047",0.0000,,,,,,,,,,,,,,,,,,,,
61
- IO,AQuA,2025/1/22,Qwen2-0.5B-Instruct,27.17,98.82,0,,254,"27,937",110,"82,478",325,"110,415",0.0000,,,,,,,,,,,,,,,,,,,,
62
- CoT,AQuA,2025/1/7,gpt-3.5-turbo,61.02,93.7,0,,254,"25,447",100,"55,346",218,"80,793",0.0957,,,,,,,,,,,,,,,,,,,,
63
- CoT,AQuA,2025/1/7,Doubao-lite-32k,82.68,97.24,0,,254,"27,978",110,"66,599",262,"94,577",0.0066,0.0483 (元),,,,,,,,,,,,,,,,,,,
64
- CoT,AQuA,2025/1/22,gpt-4o,82.68,98.03,0,,254,"25,123",99,"97,894",385,"123,017",1.0417,,,,,,,,,,,,,,,,,,,,
65
- CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,86.22,99.21,0,,254,"25,143",99,"118,146",465,"143,289",0.0808,,,,,,,,,,,,,,,,,,,,
66
- CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,83.46,98.43,0,,254,"32,555",128,"131,834",519,"164,389",0.0927,,,,,,,,,,,,,,,,,,,,
67
- CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,80.71,99.61,0,,254,"33,017",130,"116,719",460,"149,736",0.0000,,,,,,,,,,,,,,,,,,,,
68
- CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,60.63,100,0,,254,"32,555",128,"111,880",440,"144,435",0.0000,,,,,,,,,,,,,,,,,,,,
69
- CoT,AQuA,2025/1/22,Internllm2_5-7B,52.76,89.37,0,,254,"26,610",105,"100,910",397,"127,520",0.0000,,,,,,,,,,,,,,,,,,,,
70
- CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,40.55,98.82,0,,254,"30,477",120,"79,563",313,"110,040",0.0000,,,,,,,,,,,,,,,,,,,,
71
- CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,33.07,98.82,0,,254,"30,477",120,"86,862",342,"117,339",0.0000,,,,,,,,,,,,,,,,,,,,
72
- PoT,AQuA,2025/1/7,gpt-3.5-turbo,59.45,100,0,,254,"225,162",886,"41,492",163,"266,654",0.1748,,,,,,,,,,,,,,,,,,,,
73
- PoT,AQuA,2025/1/7,Doubao-lite-32k,71.65,96.85,0,,254,"259,863","1,023","49,573",195,"309,436",0.0147,,,,,,,,,,,,,,,,,,,,
74
- PoT,AQuA,2025/1/22,gpt-4o,75.2,100,0,,254,"222,717",877,"105,191",414,"327,908",1.6087,,,,,,,,,,,,,,,,,,,,
75
- PoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,75.2,100,0,,254,"249,215",981,"42,549",168,"291,764",0.1645,,,,,,,,,,,,,,,,,,,,
76
- PoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,79.53,99.21,0,,254,"240,735",948,"69,064",272,"309,799",0.1746,,,,,,,,,,,,,,,,,,,,
77
- PoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,68.11,100,0,,254,"264,517","1,041","49,211",194,"313,728",0.0000,,,,,,,,,,,,,,,,,,,,
78
- PoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,36.61,96.85,0,,254,"240,613",947,"50,301",198,"290,914",0.0000,,,,,,,,,,,,,,,,,,,,
79
- PoT,AQuA,2025/1/22,Internllm2_5-7B,36.61,98.82,0,,254,"233,505",919,"68,457",270,"301,962",0.0000,,,,,,,,,,,,,,,,,,,,
80
- PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,
81
- PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,
82
- SC-CoT,AQuA,2025/1/7,gpt-3.5-turbo,66.14,99.21,0,"temperature=1, path_num=5",254,"482,192","1,898","365,143","1,438","847,335",0.7888,,,,,,,,,,,,,,,,,,,,
83
- SC-CoT,AQuA,2025/1/7,Doubao-lite-32k,81.1,97.24,0,"temperature=1, path_num=5",254,"503,751","1,983","382,235","1,505","885,986",0.0519,,,,,,,,,,,,,,,,,,,,
84
- SC-CoT,AQuA,2025/1/22,gpt-4o,86.61,98.82,0,"temperature=1, path_num=5",254,"744,478","2,931","628,728","2,475","1,373,206",8.1485,,,,,,,,,,,,,,,,,,,,
85
- SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.04,99.21,0,"temperature=1, path_num=5",254,"1,051,218","4,139","784,451","3,088","1,835,669",1.0348,,,,,,,,,,,,,,,,,,,,
86
- SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,82.28,99.21,0,"temperature=1, path_num=5",254,"1,135,251","4,469","772,673","3,042","1,907,924",1.0756,,,,,,,,,,,,,,,,,,,,
87
- SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,79.92,100,0,"temperature=1, path_num=5",254,"1,098,280","4,324","747,052","2,941","1,845,332",0.0000,,,,,,,,,,,,,,,,,,,,
88
- SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,59.45,97.24,0,"temperature=1, path_num=5",254,"971,003","3,823","680,330","2,678","1,651,333",0.0000,,,,,,,,,,,,,,,,,,,,
89
- SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,39.37,98.03,0,"temperature=1, path_num=5",254,"1,420,494","5,592","875,728","3,448","2,296,222",0.0000,,,,,,,,,,,,,,,,,,,,
90
- SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,23.62,96.46,0,"temperature=1, path_num=5",254,"1,034,362","4,072","740,973","2,917","1,775,335",0.0000,,,,,,,,,,,,,,,,,,,,
91
- SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,22.83,97.24,0,"temperature=1, path_num=5",254,"1,246,929","4,909","968,162","3,812","2,215,091",0.0000,,,,,,,,,,,,,,,,,,,,
92
- ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
93
- ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
94
- ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
95
- ReAct-Pro*,AQuA,2025/1/22,Qwen2.5-72B-Instruct,73.23,100,0,max_steps=10,254,"441,765","1,739","121,838",480,"563,603",0.3177,,,,,,,,,,,,,,,,,,,,
96
- ReAct-Pro*,AQuA,2025/1/22,Llama-3.3-70B-Instruct,79.13,99.61,0,max_steps=10,254,"1,119,143","4,406","243,236",958,"1,362,379",0.7680,,,,,,,,,,,,,,,,,,,,
97
- ReAct-Pro*,AQuA,2025/1/22,Qwen2.5-7B-Instruct,74.41,99.21,0,max_steps=10,254,"564,165","2,221","131,679",518,"695,844",0.0000,,,,,,,,,,,,,,,,,,,,
98
- ReAct-Pro*,AQuA,2025/1/22,Llama-3.1-8B-Instruct,55.51,96.85,0,max_steps=10,254,"3,764,723","14,822","576,098","2,268","4,340,821",0.0000,,,,,,,,,,,,,,,,,,,,
99
- ReAct-Pro*,AQuA,2025/1/22,Internllm2_5-7B,40.94,96.85,0,max_steps=10,254,"3,592,039","14,142","836,762","3,294","4,428,801",0.0000,,,,,,,,,,,,,,,,,,,,
100
- ReAct-Pro*,AQuA,2025/1/22,Qwen2-1.5B-Instruct,25.59,96.06,0,max_steps=10,254,"4,555,858","17,936","516,146","2,032","5,072,004",0.0000,,,,,,,,,,,,,,,,,,,,
101
- ReAct-Pro*,AQuA,2025/1/22,Qwen2-0.5B-Instruct,24.02,96.85,0,max_steps=10,254,"6,344,167","24,977","825,920","3,252","7,170,087",0.0000,,,,,,,,,,,,,,,,,,,,
102
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
103
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
104
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
105
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
106
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
107
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
108
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
109
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
110
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
111
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
112
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
113
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
114
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
115
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
116
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
117
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
118
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
120
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
121
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
122
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
123
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
124
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
125
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
126
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
127
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
128
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
129
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
130
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
131
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
132
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
133
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
134
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
135
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
136
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
137
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
138
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
139
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
140
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
141
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
142
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
143
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
144
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
145
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
146
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
147
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
148
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Algorithm,Dataset,Eval Date,LLM,Score,Pass rate,X-shot,Parameters,Samples,Total input tokens,Average input tokens,Total output tokens,Average output tokens,All tokens,Cost($),Note,,,,,,,,,,,,,,,,,,,,,,,,,,,
2
+ IO,gsm8k,2025/1/7,gpt-3.5-turbo,37.83,99.92,8,,"1,319","546,990",415,"39,563",30,"586,553",0.3328,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3
+ IO,gsm8k,2025/1/7,Doubao-lite-32k,72.02,99.92,8,,"1,319","617,377",468,"123,106",93,"740,483",0.0354,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4
+ IO,gsm8k,2025/1/22,gpt-4o,88.40,100.00,8,,"1,319","542,416",411,"199,030",151,"741,446",3.3463,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5
+ IO,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,86.58,100.00,8,,"1,319","555,340",421,"313,720",238,"869,060",0.4899,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6
+ IO,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,92.27,100.00,8,,"1,319","583,916",443,"251,359",191,"835,275",0.4709,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7
+ IO,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,57.24,100.00,8,,"1,319","596,229",452,"291,684",221,"887,913",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8
+ IO,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,57.16,99.55,8,,"1,319","550,941",418,"1,194,488",906,"1,745,429",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9
+ IO,gsm8k,2025/1/22,Internllm2_5-7B,11.60,97.95,8,,"1,319","679,302",515,"434,426",329,"1,113,728",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10
+ IO,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,16.68,100.00,8,,"1,319","568,530",431,"168,466",128,"736,996",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11
+ IO,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,14.71,100.00,8,,"1,319","568,116",431,"266,781",202,"834,897",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
12
+ IO,gsm8k,2025/1/22,deepseek-r1:1.5b,64.14,99.62,8,,"1,319","561,935",426,"921,116",698,"1,483,051",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
13
+ ReAct-Pro*,gsm8k,2025/1/7,gpt-3.5-turbo,74.91,99.39,8,max_steps=10,"1,319","6,506,164","4,933","140,122",106,"6,646,286",3.4633,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
14
+ ReAct-Pro*,gsm8k,2025/1/7,Doubao-lite-32k,85.60,99.62,8,max_steps=10,"1,319","5,862,016","4,444","136,623",104,"5,998,639",0.2512,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
15
+ ReAct-Pro*,gsm8k,2025/1/22,gpt-4o,63.31,99.55,8,max_steps=10,"1,319","14,411,173","10,926","304,714",231,"14,715,887",39.0751,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
16
+ ReAct-Pro*,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,87.26,100.00,8,max_steps=10,"1,319","18,160,983","13,769","549,454",417,"18,710,437",10.5479,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
17
+ ReAct-Pro*,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,87.64,99.92,8,max_steps=10,"1,319","17,038,928","12,918","898,936",682,"17,937,864",10.1124,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
18
+ ReAct-Pro*,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,82.87,100.00,8,max_steps=10,"1,319","14,355,752","10,884","495,162",375,"14,850,914",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
19
+ ReAct-Pro*,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,67.78,98.56,8,max_steps=10,"1,319","21,044,978","15,955","1,790,789","1,358","22,835,767",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
20
+ ReAct-Pro*,gsm8k,2025/1/22,Internllm2_5-7B,33.51,97.95,8,max_steps=10,"1,319","30,120,070","22,836","5,549,919","4,208","35,669,989",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
21
+ ReAct-Pro*,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,24.87,80.21,8,max_steps=10,"1,319","9,133,603","6,925","694,398",526,"9,828,001",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
22
+ ReAct-Pro*,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,7.66,95.22,8,max_steps=10,"1,319","52,431,343","39,751","2,961,268","2,245","55,392,611",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
23
+ ReAct-Pro*,gsm8k,2025/2/10,deepseek-r1:1.5b,35.94,99.62,8,max_steps=10,"1,319","19,299,381","14,632","4,919,696","3,730","24,219,077",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
24
+ PoT,gsm8k,2025/1/7,gpt-3.5-turbo,76.88,99.24,8,,"1,319","1,090,418",827,"96,662",73,"1,187,080",0.6902,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25
+ PoT,gsm8k,2025/1/7,Doubao-lite-32k,79.61,92.57,8,,"1,319","1,170,038",887,"118,017",89,"1,288,055",0.0576,,,,,,,,,,,,,,,,,,,,,,,,,,,,
26
+ PoT,gsm8k,2025/1/22,gpt-4o,93.10,99.77,8,,"1,319","1,101,672",835,"146,240",111,"1,247,912",4.2166,,,,,,,,,,,,,,,,,,,,,,,,,,,,
27
+ PoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,92.34,99.39,8,,"1,319","1,106,682",839,"144,528",110,"1,251,210",0.7054,,,,,,,,,,,,,,,,,,,,,,,,,,,,
28
+ PoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,73.09,79.61,8,,"1,319","1,126,025",854,"601,019",456,"1,727,044",0.9736,,,,,,,,,,,,,,,,,,,,,,,,,,,,
29
+ PoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,58.83,70.51,8,,"1,319","1,145,390",868,"217,432",165,"1,362,822",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
30
+ PoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,38.67,55.42,8,,"1,319","1,147,538",870,"243,573",185,"1,391,111",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
31
+ PoT,gsm8k,2025/1/22,Internllm2_5-7B,38.21,48.90,8,,"1,319","1,136,843",862,"188,106",143,"1,324,949",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32
+ PoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,18.50,31.01,8,,"1,319","1,151,528",873,"175,994",133,"1,327,522",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
33
+ PoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,9.63,16.91,8,,"1,319","1,151,528",873,"237,607",180,"1,389,135",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
34
+ PoT,gsm8k,2025/2/10,deepseek-r1:1.5b,11.90,17.44,8,,"1,319","1,138,872",863,"815,637",618,"1,954,509",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
35
+ CoT,gsm8k,2025/1/7,gpt-3.5-turbo,78.70,100.00,8,,"1,319","953,242",723,"134,799",102,"1,088,041",0.6788,,,,,,,,,,,,,,,,,,,,,,,,,,,,
36
+ CoT,gsm8k,2025/1/7,Doubao-lite-32k,89.31,100.00,8,,"1,319","1,042,095",790,"159,725",121,"1,201,820",0.0558,,,,,,,,,,,,,,,,,,,,,,,,,,,,
37
+ CoT,gsm8k,2025/1/22,gpt-4o,94.09,100.00,8,,"1,319","948,668",719,"216,498",164,"1,165,166",4.5367,,,,,,,,,,,,,,,,,,,,,,,,,,,,
38
+ CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,92.87,100.00,8,,"1,319","1,005,119",762,"271,133",206,"1,276,252",0.7195,,,,,,,,,,,,,,,,,,,,,,,,,,,,
39
+ CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,93.93,100.00,8,,"1,319","990,168",751,"228,497",173,"1,218,665",0.6870,,,,,,,,,,,,,,,,,,,,,,,,,,,,
40
+ CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,85.67,100.00,8,,"1,319","1,046,008",793,"244,797",186,"1,290,805",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
41
+ CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,75.44,99.92,8,,"1,319","990,168",751,"258,161",196,"1,248,329",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
42
+ CoT,gsm8k,2025/1/22,Internllm2_5-7B,77.71,99.70,8,,"1,319","968,163",734,"234,000",177,"1,202,163",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
43
+ CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.50,100.00,8,,"1,319","1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
44
+ CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,"1,319","1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
45
+ CoT,gsm8k,2025/1/23,deepseek-r1:1.5b,70.66,99.77,8,,"1,319","1,011,714",767,"1,078,911",818,"2,090,625",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
46
+ SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,79.91,99.92,8,"temperature=1, path_num=5","1,319","2,740,652","2,078","1,348,960","1,023","4,089,612",3.3938,,,,,,,,,,,,,,,,,,,,,,,,,,,,
47
+ SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,87.26,99.92,8,"temperature=1, path_num=5","1,319","2,691,714","2,041","1,197,099",908,"3,888,813",0.2083,,,,,,,,,,,,,,,,,,,,,,,,,,,,
48
+ SC-CoT,gsm8k,2025/1/22,gpt-4o,90.30,99.92,8,"temperature=1, path_num=5","1,319","3,590,336","2,722","2,207,837","1,674","5,798,173",31.0542,,,,,,,,,,,,,,,,,,,,,,,,,,,,
49
+ SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,93.86,100.00,8,"temperature=1, path_num=5","1,319","8,136,223","6,168","2,481,785","1,882","10,618,008",5.9858,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50
+ SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.07,100.00,8,"temperature=1, path_num=5","1,319","8,413,717","6,379","2,585,077","1,960","10,998,794",6.2005,,,,,,,,,,,,,,,,,,,,,,,,,,,,
51
+ SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,91.13,100.00,8,"temperature=1, path_num=5","1,319","8,586,888","6,510","2,554,097","1,936","11,140,985",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
52
+ SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,73.46,99.55,8,"temperature=1, path_num=5","1,319","8,630,514","6,543","3,148,202","2,387","11,778,716",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
53
+ SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,48.22,98.41,8,"temperature=1, path_num=5","1,319","10,678,792","8,096","3,847,639","2,917","14,526,431",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
54
+ SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,11.75,91.89,8,"temperature=1, path_num=5","1,319","9,066,115","6,873","3,345,827","2,537","12,411,942",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
55
+ SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,1.67,94.69,8,"temperature=1, path_num=5","1,319","11,019,864","8,355","5,445,856","4,129","16,465,720",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
56
+ SC-CoT,gsm8k,2025/2/10,deepseek-r1:1.5b,55.34,99.70,8,"temperature=1, path_num=5","1,319","14,540,096","11,024","11,245,769","8,526","25,785,865",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
57
+ ToT,gsm8k,2025/1/7,gpt-3.5-turbo,67.93,99.70,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","15,920,037","12,070","807,138",612,"16,727,175",9.1707,,,,,,,,,,,,,,,,,,,,,,,,,,,,
58
+ ToT,gsm8k,2025/1/7,Doubao-lite-32k,37.83,87.34,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","19,208,597","14,563","1,065,752",808,"20,274,349",0.8739,,,,,,,,,,,,,,,,,,,,,,,,,,,,
59
+ ToT,gsm8k,2025/1/22,gpt-4o,91.13,100.00,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","29,445,237","22,324","1,324,498","1,004","30,769,735",86.8581,,,,,,,,,,,,,,,,,,,,,,,,,,,,
60
+ ToT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,88.88,100.00,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","40,435,361","30,656","1,411,787","1,070","41,847,148",23.5911,,,,,,,,,,,,,,,,,,,,,,,,,,,,
61
+ ToT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,91.89,100.00,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","35,096,810","26,609","1,932,877","1,465","37,029,687",20.8753,,,,,,,,,,,,,,,,,,,,,,,,,,,,
62
+ ToT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,72.21,99.01,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","20,196,528","15,312","11,460,791","8,689","31,657,319",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
63
+ ToT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,65.05,91.96,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","15,554,967","11,793","877,135",665,"16,432,102",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
64
+ ToT,gsm8k,2025/1/22,Internllm2_5-7B,20.85,70.13,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","11,768,118","8,922","1,410,011","1,069","13,178,129",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
65
+ ToT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,19.64,77.26,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","12,124,248","9,192","634,439",481,"12,758,687",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
66
+ ToT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,-,-,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319",-,-,-,-,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,
67
+ ToT,gsm8k,2025/2/10,deepseek-r1:1.5b,23.12,72.48,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","2,738,244","2,076","683,242",518,"3,421,486",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
68
+ IO,AQuA,2025/1/7,gpt-3.5-turbo,38.98,100.00,0,,254,"25,701",101,"16,770",66,"42,471",0.0380,,,,,,,,,,,,,,,,,,,,,,,,,,,,
69
+ IO,AQuA,2025/1/7,Doubao-lite-32k,79.13,100.00,0,,254,"33,058",130,"54,684",215,"87,742",0.0058,,,,,,,,,,,,,,,,,,,,,,,,,,,,
70
+ IO,AQuA,2025/1/22,gpt-4o,75.59,97.24,0,,254,"25,631",101,"108,121",426,"133,752",1.1453,,,,,,,,,,,,,,,,,,,,,,,,,,,,
71
+ IO,AQuA,2025/1/22,Qwen2.5-72B-Instruct,84.25,99.61,0,,254,"25,397",100,"106,207",418,"131,604",0.0742,,,,,,,,,,,,,,,,,,,,,,,,,,,,
72
+ IO,AQuA,2025/1/22,Llama-3.3-70B-Instruct,82.68,99.21,0,,254,"32,809",129,"108,758",428,"141,567",0.0798,,,,,,,,,,,,,,,,,,,,,,,,,,,,
73
+ IO,AQuA,2025/1/22,Qwen2.5-7B-Instruct,78.74,98.43,0,,254,"33,271",131,"104,500",411,"137,771",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
74
+ IO,AQuA,2025/1/22,Llama-3.1-8B-Instruct,51.18,98.82,0,,254,"26,459",104,"106,647",420,"133,106",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
75
+ IO,AQuA,2025/1/22,Internllm2_5-7B,47.64,90.94,0,,254,"50,232",198,"134,809",531,"185,041",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
76
+ IO,AQuA,2025/1/22,Qwen2-1.5B-Instruct,29.13,97.64,0,,254,"27,937",110,"43,110",170,"71,047",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
77
+ IO,AQuA,2025/1/22,Qwen2-0.5B-Instruct,27.17,98.82,0,,254,"27,937",110,"82,478",325,"110,415",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
78
+ IO,AQuA,2025/1/22,deepseek-r1:1.5b,68.90,94.88,0,,254,"26,667",105,"325,100","1,280","351,767",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
79
+ CoT,AQuA,2025/1/7,gpt-3.5-turbo,61.02,93.70,0,,254,"25,447",100,"55,346",218,"80,793",0.0957,,,,,,,,,,,,,,,,,,,,,,,,,,,,
80
+ CoT,AQuA,2025/1/7,Doubao-lite-32k,82.68,97.24,0,,254,"27,978",110,"66,599",262,"94,577",0.0066,,,,,,,,,,,,,,,,,,,,,,,,,,,,
81
+ CoT,AQuA,2025/1/22,gpt-4o,82.68,98.03,0,,254,"25,123",99,"97,894",385,"123,017",1.0417,,,,,,,,,,,,,,,,,,,,,,,,,,,,
82
+ CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,86.22,99.21,0,,254,"25,143",99,"118,146",465,"143,289",0.0808,,,,,,,,,,,,,,,,,,,,,,,,,,,,
83
+ CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,83.46,98.43,0,,254,"32,555",128,"131,834",519,"164,389",0.0927,,,,,,,,,,,,,,,,,,,,,,,,,,,,
84
+ CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,80.71,99.61,0,,254,"33,017",130,"116,719",460,"149,736",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85
+ CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,60.63,100.00,0,,254,"32,555",128,"111,880",440,"144,435",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
86
+ CoT,AQuA,2025/1/22,Internllm2_5-7B,52.76,89.37,0,,254,"26,610",105,"100,910",397,"127,520",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
87
+ CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,40.55,98.82,0,,254,"30,477",120,"79,563",313,"110,040",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
88
+ CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,33.07,98.82,0,,254,"30,477",120,"86,862",342,"117,339",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
89
+ CoT,AQuA,2025/1/23,deepseek-r1:1.5b,71.65,96.85,0,,254,"26,413",104,"306,659","1,207","333,072",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
90
+ PoT,AQuA,2025/1/7,gpt-3.5-turbo,59.45,100.00,0,,254,"225,162",886,"41,492",163,"266,654",0.1748,,,,,,,,,,,,,,,,,,,,,,,,,,,,
91
+ PoT,AQuA,2025/1/7,Doubao-lite-32k,71.65,96.85,0,,254,"259,863","1,023","49,573",195,"309,436",0.0147,,,,,,,,,,,,,,,,,,,,,,,,,,,,
92
+ PoT,AQuA,2025/1/22,gpt-4o,75.20,100.00,0,,254,"222,717",877,"105,191",414,"327,908",1.6087,,,,,,,,,,,,,,,,,,,,,,,,,,,,
93
+ PoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,75.20,100.00,0,,254,"249,215",981,"42,549",168,"291,764",0.1645,,,,,,,,,,,,,,,,,,,,,,,,,,,,
94
+ PoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,79.53,99.21,0,,254,"240,735",948,"69,064",272,"309,799",0.1746,,,,,,,,,,,,,,,,,,,,,,,,,,,,
95
+ PoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,68.11,100.00,0,,254,"264,517","1,041","49,211",194,"313,728",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
96
+ PoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,36.61,96.85,0,,254,"240,613",947,"50,301",198,"290,914",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
97
+ PoT,AQuA,2025/1/22,Internllm2_5-7B,36.61,98.82,0,,254,"233,505",919,"68,457",270,"301,962",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
98
+ PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
99
+ PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100
+ PoT,AQuA,2025/2/10,deepseek-r1:1.5b,54.72,97.24,0,,254,"250,690",987,"765,957","3,016","1,016,647",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
101
+ SC-CoT,AQuA,2025/1/22,gpt-3.5-turbo,66.14,99.21,0,"temperature=1, path_num=5",254,"482,192","1,898","365,143","1,438","847,335",0.7888,,,,,,,,,,,,,,,,,,,,,,,,,,,,
102
+ SC-CoT,AQuA,2025/1/22,Doubao-lite-32k,81.10,97.24,0,"temperature=1, path_num=5",254,"503,751","1,983","382,235","1,505","885,986",0.0519,,,,,,,,,,,,,,,,,,,,,,,,,,,,
103
+ SC-CoT,AQuA,2025/1/22,gpt-4o,86.61,98.82,0,"temperature=1, path_num=5",254,"744,478","2,931","628,728","2,475","1,373,206",8.1485,,,,,,,,,,,,,,,,,,,,,,,,,,,,
104
+ SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.04,99.21,0,"temperature=1, path_num=5",254,"1,051,218","4,139","784,451","3,088","1,835,669",1.0348,,,,,,,,,,,,,,,,,,,,,,,,,,,,
105
+ SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,82.28,99.21,0,"temperature=1, path_num=5",254,"1,135,251","4,469","772,673","3,042","1,907,924",1.0756,,,,,,,,,,,,,,,,,,,,,,,,,,,,
106
+ SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,79.92,100.00,0,"temperature=1, path_num=5",254,"1,098,280","4,324","747,052","2,941","1,845,332",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
107
+ SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,59.45,97.24,0,"temperature=1, path_num=5",254,"971,003","3,823","680,330","2,678","1,651,333",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
108
+ SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,39.37,98.03,0,"temperature=1, path_num=5",254,"1,420,494","5,592","875,728","3,448","2,296,222",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
109
+ SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,23.62,96.46,0,"temperature=1, path_num=5",254,"1,034,362","4,072","740,973","2,917","1,775,335",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
110
+ SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,22.83,97.24,0,"temperature=1, path_num=5",254,"1,246,929","4,909","968,162","3,812","2,215,091",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
111
+ SC-CoT,AQuA,2025/2/10,deepseek-r1:1.5b,59.06,96.85,0,"temperature=1, path_num=5",254,"2,547,772","10,031","3,254,939","12,815","5,802,711",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
112
+ ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
113
+ ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
114
+ ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
115
+ ReAct-Pro*,AQuA,2025/1/22,Qwen2.5-72B-Instruct,73.23,100.00,0,max_steps=10,254,"441,765","1,739","121,838",480,"563,603",0.3177,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
116
+ ReAct-Pro*,AQuA,2025/1/22,Llama-3.3-70B-Instruct,79.13,99.61,0,max_steps=10,254,"1,119,143","4,406","243,236",958,"1,362,379",0.7680,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
117
+ ReAct-Pro*,AQuA,2025/1/22,Qwen2.5-7B-Instruct,74.41,99.21,0,max_steps=10,254,"564,165","2,221","131,679",518,"695,844",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
118
+ ReAct-Pro*,AQuA,2025/1/22,Llama-3.1-8B-Instruct,55.51,96.85,0,max_steps=10,254,"3,764,723","14,822","576,098","2,268","4,340,821",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
119
+ ReAct-Pro*,AQuA,2025/1/22,Internllm2_5-7B,40.94,96.85,0,max_steps=10,254,"3,592,039","14,142","836,762","3,294","4,428,801",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
120
+ ReAct-Pro*,AQuA,2025/1/22,Qwen2-1.5B-Instruct,25.59,96.06,0,max_steps=10,254,"4,555,858","17,936","516,146","2,032","5,072,004",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
121
+ ReAct-Pro*,AQuA,2025/1/22,Qwen2-0.5B-Instruct,24.02,96.85,0,max_steps=10,254,"6,344,167","24,977","825,920","3,252","7,170,087",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
122
+ ReAct-Pro*,AQuA,2025/2/10,deepseek-r1:1.5b,54.33,96.46,0,max_steps=10,254,"10,578,715","41,648","3,866,326","15,222","14,445,041",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
123
+ ToT,AQuA,2025/1/7,gpt-3.5-turbo,57.09,99.61,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"1,850,767","7,286","150,629",593,"2,001,396",1.1513,,,,,,,,,,,,,,,,,,,,,,,,,,,,
124
+ ToT,AQuA,2025/1/7,Doubao-lite-32k,45.28,74.02,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"1,850,249","7,284","150,301",592,"2,000,550",0.0881,,,,,,,,,,,,,,,,,,,,,,,,,,,,
125
+ ToT,AQuA,2025/1/22,gpt-4o,81.50,99.21,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"2,347,538","9,242","266,069","1,048","2,613,607",8.5295,,,,,,,,,,,,,,,,,,,,,,,,,,,,
126
+ ToT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,81.10,99.21,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"6,371,642","25,085","260,613","1,026","6,632,255",3.7389,,,,,,,,,,,,,,,,,,,,,,,,,,,,
127
+ ToT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,83.07,100.00,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"4,735,188","18,642","480,660","1,892","5,215,848",2.9404,,,,,,,,,,,,,,,,,,,,,,,,,,,,
128
+ ToT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,53.94,100.00,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"8,224,468","32,380","378,214","1,489","8,602,682",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
129
+ ToT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,59.06,100.00,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"4,896,222","19,276","843,462","3,321","5,739,684",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
130
+ ToT,AQuA,2025/1/22,Internllm2_5-7B,35.83,99.61,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"4,263,136","16,784","471,424","1,856","4,734,560",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
131
+ ToT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,31.50,98.82,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"6,058,022","23,850","192,680",759,"6,250,702",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
132
+ ToT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,29.92,100.00,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"8,100,085","31,890","600,196","2,363","8,700,281",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
133
+ ToT,AQuA,2025/2/10,deepseek-r1:1.5b,24.80,55.51,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"605,028","2,382","189,484",746,"794,512",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
134
+ IO,MATH-500,2025/1/24,gpt-3.5-turbo,17.20,100.00,4,,500,"154,881",310,"110,744",221,"265,625",0.2436,,,,,,,,,,,,,,,,,,,,,,,,,,,,
135
+ IO,MATH-500,2025/1/24,Doubao-lite-32k,37.40,100.00,4,,500,"166,870",334,"144,860",290,"311,730",0.0187,,,,,,,,,,,,,,,,,,,,,,,,,,,,
136
+ IO,MATH-500,2025/1/22,gpt-4o,41.80,100.00,4,,500,"153,832",308,"240,615",481,"394,447",2.7907,,,,,,,,,,,,,,,,,,,,,,,,,,,,
137
+ IO,MATH-500,2025/1/24,Qwen2.5-72B-Instruct,70.20,100.00,4,,500,"169,549",339,"275,042",550,"444,591",0.2506,,,,,,,,,,,,,,,,,,,,,,,,,,,,
138
+ IO,MATH-500,2025/1/24,Llama-3.3-70B-Instruct,69.40,100.00,4,,500,"155,879",312,"267,337",535,"423,216",0.2386,,,,,,,,,,,,,,,,,,,,,,,,,,,,
139
+ IO,MATH-500,2025/1/24,Qwen2.5-7B-Instruct,59.40,100.00,4,,500,"169,549",339,"241,813",484,"411,362",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
140
+ IO,MATH-500,2025/1/24,Llama-3.1-8B-Instruct,38.60,100.00,4,,500,"155,563",311,"348,371",697,"503,934",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
141
+ IO,MATH-500,2025/1/24,Internllm2_5-7B,22.80,100.00,4,,500,"201,883",404,"266,005",532,"467,888",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
142
+ IO,MATH-500,2025/1/24,Qwen2-1.5B-Instruct,7.00,100.00,4,,500,"158,777",318,"255,101",510,"413,878",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
143
+ IO,MATH-500,2025/1/24,Qwen2-0.5B-Instruct,2.60,100.00,4,,500,"159,049",318,"270,281",541,"429,330",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
144
+ IO,MATH-500,2025/1/24,deepseek-r1:1.5b,43.80,100.00,4,,500,"157,049",314,"865,499","1,731","1,022,548",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
145
+ CoT,MATH-500,2025/1/24,gpt-3.5-turbo,39.80,100.00,4,,500,"329,381",659,"102,815",206,"432,196",0.3189,,,,,,,,,,,,,,,,,,,,,,,,,,,,
146
+ CoT,MATH-500,2025/1/22,Doubao-lite-32k,59.00,100.00,4,,500,"336,370",673,"143,571",287,"479,941",0.0255,,,,,,,,,,,,,,,,,,,,,,,,,,,,
147
+ CoT,MATH-500,2025/1/24,gpt-4o,68.00,100.00,4,,500,"329,332",659,"223,356",447,"552,688",3.0569,,,,,,,,,,,,,,,,,,,,,,,,,,,,
148
+ CoT,MATH-500,2025/1/22,Qwen2.5-72B-Instruct,80.20,100.00,4,,500,"338,549",677,"280,466",561,"619,015",0.3490,,,,,,,,,,,,,,,,,,,,,,,,,,,,
149
+ CoT,MATH-500,2025/1/24,Llama-3.3-70B-Instruct,71.20,100.00,4,,500,"342,879",686,"271,342",543,"614,221",0.3463,,,,,,,,,,,,,,,,,,,,,,,,,,,,
150
+ CoT,MATH-500,2025/1/24,Qwen2.5-7B-Instruct,69.80,100.00,4,,500,"354,049",708,"263,155",526,"617,204",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
151
+ CoT,MATH-500,2025/1/24,Llama-3.1-8B-Instruct,25.80,100.00,4,,500,"342,879",686,"282,689",565,"625,568",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
152
+ CoT,MATH-500,2025/1/24,Internllm2_5-7B,46.60,100.00,4,,500,"332,883",666,"213,891",428,"546,774",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
153
+ CoT,MATH-500,2025/1/24,Qwen2-1.5B-Instruct,15.20,100.00,4,,500,"349,049",698,"187,328",375,"536,377",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
154
+ CoT,MATH-500,2025/1/24,Qwen2-0.5B-Instruct,6.20,100.00,4,,500,"349,049",698,"200,139",400,"549,188",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
155
+ CoT,MATH-500,2025/1/24,deepseek-r1:1.5b,49.40,100.00,4,,500,"341,549",683,"857,580","1,715","1,199,129",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
156
+ PoT,MATH-500,2025/2/10,gpt-3.5-turbo,28.80,83.80,4,,500,"239,902",480,"32,014",64,"271,916",0.1680,,,,,,,,,,,,,,,,,,,,,,,,,,,,
157
+ PoT,MATH-500,2025/2/10,Doubao-lite-32k,32.60,68.00,4,,500,"254,377",509,"48,771",98,"303,148",0.0144,,,,,,,,,,,,,,,,,,,,,,,,,,,,
158
+ PoT,MATH-500,2025/2/10,gpt-4o,46.20,86.40,4,,500,"241,357",483,"99,603",199,"340,960",1.5994,,,,,,,,,,,,,,,,,,,,,,,,,,,,
159
+ PoT,MATH-500,2025/2/10,Qwen2.5-72B-Instruct,47.20,82.20,4,,500,"242,549",485,"170,823",342,"413,372",0.2330,,,,,,,,,,,,,,,,,,,,,,,,,,,,
160
+ PoT,MATH-500,2025/2/10,Llama-3.3-70B-Instruct,42.60,80.20,4,,500,"253,879",508,"249,717",499,"503,596",0.2839,,,,,,,,,,,,,,,,,,,,,,,,,,,,
161
+ PoT,MATH-500,2025/2/10,Qwen2.5-7B-Instruct,39.60,74.40,4,,500,"258,549",517,"150,263",301,"408,812",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
162
+ PoT,MATH-500,2025/2/10,Llama-3.1-8B-Instruct,25.40,68.40,4,,500,"253,879",508,"208,392",417,"462,271",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
163
+ PoT,MATH-500,2025/2/10,Internllm2_5-7B,15.00,32.40,4,,500,"247,883",496,"120,826",242,"368,709",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
164
+ PoT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,0.80,2.20,4,,500,"248,509",497,"538,361","1,077","786,870",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
165
+ PoT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.00,0.00,4,,500,"253,549",507,"183,653",367,"437,202",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
166
+ PoT,MATH-500,2025/2/10,deepseek-r1:1.5b,1.00,1.60,4,,500,"245,549",491,"785,518","1,571","1,031,067",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
167
+ SC-CoT,MATH-500,2025/2/10,gpt-3.5-turbo,28.80,100.00,4,"temperature=1, path_num=5",500,"1,381,818","2,764","856,994","1,714","2,238,812",1.9764,,,,,,,,,,,,,,,,,,,,,,,,,,,,
168
+ SC-CoT,MATH-500,2025/2/10,Doubao-lite-32k,49.20,100.00,4,"temperature=1, path_num=5",500,"1,507,651","3,015","963,159","1,926","2,470,810",0.1406,,,,,,,,,,,,,,,,,,,,,,,,,,,,
169
+ SC-CoT,MATH-500,2025/2/10,gpt-4o,34.40,100.00,4,"temperature=1, path_num=5",500,"1,986,584","3,973","1,468,739","2,937","3,455,323",19.6538,,,,,,,,,,,,,,,,,,,,,,,,,,,,
170
+ SC-CoT,MATH-500,2025/2/10,Qwen2.5-72B-Instruct,74.00,100.00,4,"temperature=1, path_num=5",500,"3,823,997","7,648","1,773,516","3,547","5,597,513",3.1556,,,,,,,,,,,,,,,,,,,,,,,,,,,,
171
+ SC-CoT,MATH-500,2025/2/10,Llama-3.3-70B-Instruct,74.20,100.00,4,"temperature=1, path_num=5",500,"3,959,492","7,919","1,759,247","3,518","5,718,739",3.2239,,,,,,,,,,,,,,,,,,,,,,,,,,,,
172
+ SC-CoT,MATH-500,2025/2/10,Qwen2.5-7B-Instruct,67.00,100.00,4,"temperature=1, path_num=5",500,"3,833,751","7,668","1,617,733","3,235","5,451,484",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
173
+ SC-CoT,MATH-500,2025/2/10,Llama-3.1-8B-Instruct,30.20,100.00,4,"temperature=1, path_num=5",500,"3,546,673","7,093","1,488,264","2,977","5,034,937",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
174
+ SC-CoT,MATH-500,2025/2/10,Internllm2_5-7B,9.80,100.00,4,"temperature=1, path_num=5",500,"4,193,296","8,387","1,645,170","3,290","5,838,466",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
175
+ SC-CoT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,3.80,99.00,4,"temperature=1, path_num=5",500,"3,832,429","7,665","1,737,013","3,474","5,569,442",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
176
+ SC-CoT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.80,100.00,4,"temperature=1, path_num=5",500,"4,448,663","8,897","2,413,393","4,827","6,862,056",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
177
+ SC-CoT,MATH-500,2025/2/10,deepseek-r1:1.5b,38.00,100.00,4,"temperature=1, path_num=5",500,"7,080,559","14,161","7,661,550","15,323","14,742,109",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
178
+ ReAct-Pro*,MATH-500,2025/2/10,gpt-3.5-turbo,23.80,100.00,4,max_steps=10,500,"3,708,461","7,417","124,253",249,"3,832,714",2.0406,,,,,,,,,,,,,,,,,,,,,,,,,,,,
179
+ ReAct-Pro*,MATH-500,2025/2/10,Doubao-lite-32k,47.20,100.00,4,max_steps=10,500,"4,234,620","8,469","154,046",308,"4,388,666",0.1860,,,,,,,,,,,,,,,,,,,,,,,,,,,,
180
+ ReAct-Pro*,MATH-500,2025/2/10,gpt-4o,54.00,100.00,4,max_steps=10,500,"5,834,537","11,669","318,718",637,"6,153,255",17.7735,,,,,,,,,,,,,,,,,,,,,,,,,,,,
181
+ ReAct-Pro*,MATH-500,2025/2/10,Qwen2.5-72B-Instruct,62.80,100.00,4,max_steps=10,500,"5,747,268","11,495","379,849",760,"6,127,117",3.4541,,,,,,,,,,,,,,,,,,,,,,,,,,,,
182
+ ReAct-Pro*,MATH-500,2025/2/10,Llama-3.3-70B-Instruct,64.60,100.00,4,max_steps=10,500,"5,223,611","10,447","418,268",837,"5,641,879",3.1806,,,,,,,,,,,,,,,,,,,,,,,,,,,,
183
+ ReAct-Pro*,MATH-500,2025/2/10,Qwen2.5-7B-Instruct,48.80,100.00,4,max_steps=10,500,"4,646,708","9,293","343,532",687,"4,990,240",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
184
+ ReAct-Pro*,MATH-500,2025/2/10,Llama-3.1-8B-Instruct,28.80,100.00,4,max_steps=10,500,"7,486,706","14,973","1,276,923","2,554","8,763,629",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
185
+ ReAct-Pro*,MATH-500,2025/2/10,Internllm2_5-7B,14.80,100.00,4,max_steps=10,500,"11,831,496","23,663","2,354,609","4,709","14,186,105",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
186
+ ReAct-Pro*,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,8.20,100.00,4,max_steps=10,500,"8,430,774","16,862","556,287","1,113","8,987,061",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
187
+ ReAct-Pro*,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.60,100.00,4,max_steps=10,500,"18,137,392","36,275","1,305,048","2,610","19,442,440",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
188
+ ReAct-Pro*,MATH-500,2025/2/10,deepseek-r1:1.5b,24.40,100.00,4,max_steps=10,500,"20,729,970","41,460","9,447,378","18,895","30,177,348",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
189
+ ToT,MATH-500,2025/2/10,gpt-3.5-turbo,9.80,100.00,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"9,711,244","19,422","290,523",581,"10,001,767",5.2914,,,,,,,,,,,,,,,,,,,,,,,,,,,,
190
+ ToT,MATH-500,2025/2/10,Doubao-lite-32k,1.20,94.20,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"5,338,500","10,677","226,000",452,"5,564,500",0.2371,,,,,,,,,,,,,,,,,,,,,,,,,,,,
191
+ ToT,MATH-500,2025/2/10,gpt-4o,3.20,100.00,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"14,881,985","29,764","360,447",721,"15,242,432",40.8094,,,,,,,,,,,,,,,,,,,,,,,,,,,,
192
+ ToT,MATH-500,2025/2/10,Qwen2.5-72B-Instruct,10.80,100.00,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"15,657,730","31,315","381,631",763,"16,039,361",9.0421,,,,,,,,,,,,,,,,,,,,,,,,,,,,
193
+ ToT,MATH-500,2025/2/10,Llama-3.3-70B-Instruct,1.40,69.80,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"14,099,500","28,199","570,000","1,140","14,669,500",8.2699,,,,,,,,,,,,,,,,,,,,,,,,,,,,
194
+ ToT,MATH-500,2025/2/10,Qwen2.5-7B-Instruct,1.40,91.60,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"9,749,000","19,498","418,500",837,"10,167,500",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
195
+ ToT,MATH-500,2025/2/10,Llama-3.1-8B-Instruct,1.80,90.80,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"7,729,000","15,458","1,306,000","2,612","9,035,000",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
196
+ ToT,MATH-500,2025/2/10,Internllm2_5-7B,0.20,99.00,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"7,515,000","15,030","835,500","1,671","8,350,500",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
197
+ ToT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,0.80,97.20,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"4,408,000","8,816","127,000",254,"4,535,000",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
198
+ ToT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.00,96.20,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"5,590,500","11,181","406,000",812,"5,996,500",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
199
+ ToT,MATH-500,2025/2/10,deepseek-r1:1.5b,0.40,71.60,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"1,831,000","3,662","110,500",221,"1,941,500",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,