liaojiajia committed
Commit cd01d35 · Parent(s): be9cdf5

add tot and math500 scores

Changed files:
- app.py +10 -10
- meta_data.py +8 -6
- preprocess.py +33 -33
- src/detail_math_score.json +0 -0
- src/detail_results.csv +198 -100
- src/overall_math_score.json +506 -2
- src/overall_results.csv +67 -51
- src/record.csv +199 -148
app.py
CHANGED
@@ -18,23 +18,23 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
     DATASETS.remove('META')
     print(DATASETS)
 
-    #
+    # Ensure overall_table is generated before defining llm_options
     check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
     overall_table = generate_table(results, DEFAULT_MATH_BENCH)
 
-    #
+    # Save the complete overall_table as a CSV file
     csv_path_overall = os.path.join(os.getcwd(), 'src/overall_results.csv')
     overall_table.to_csv(csv_path_overall, index=False)
     print(f"Overall results saved to {csv_path_overall}")
 
-    #
+    # Extract all possible LLM options from overall_table
     llm_options = list(set(row.LLM for row in overall_table.itertuples() if hasattr(row, 'LLM')))
 
     gr.Markdown(LEADERBORAD_INTRODUCTION.format(EVAL_TIME))
     with gr.Tabs(elem_classes='tab-buttons') as tabs:
         with gr.Tab(label='🏅 Open Agent Overall Math Leaderboard'):
             gr.Markdown(LEADERBOARD_MD['MATH_MAIN'])
-            #
+            # Move the definition of check_box and overall_table here
             # check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
             # overall_table = generate_table(results, DEFAULT_MATH_BENCH)
 
@@ -48,7 +48,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
                 interactive=True,
             )
 
-            #
+            # New CheckboxGroup component for selecting Algorithm and LLM
             algo_name = gr.CheckboxGroup(
                 choices=ALGORITHMS,
                 value=ALGORITHMS,
@@ -57,7 +57,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
             )
 
             llm_name = gr.CheckboxGroup(
-                choices=llm_options,  #
+                choices=llm_options,  # Use the extracted llm_options
                 value=llm_options,
                 label='LLM',
                 interactive=True
@@ -78,7 +78,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
                 headers = ['Rank'] + check_box['essential'] + fields
                 df = overall_table.copy()
 
-                #
+                # Add filtering logic
                 df['flag'] = df.apply(lambda row: (
                     row['Algorithm'] in algos and
                     row['LLM'] in llms
@@ -107,7 +107,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
 
                 return comp
 
-            #
+            # Update change events to include new filtering conditions
             checkbox_group.change(
                 fn=filter_df,
                 inputs=[checkbox_group, algo_name, llm_name],
@@ -135,7 +135,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
 
             table, check_box = BUILD_L2_DF(results_detail, DEFAULT_MATH_BENCH)
 
-            #
+            # Save the complete table as a CSV file
             csv_path_detail = os.path.join(os.getcwd(), 'src/detail_results.csv')
             table.to_csv(csv_path_detail, index=False)
             print(f"Detail results saved to {csv_path_detail}")
@@ -217,7 +217,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
 
                 return comp
 
-            #
+            # Add change events for all checkbox groups
             checkbox_group.change(
                 fn=filter_df2,
                 inputs=[checkbox_group, algo_name, dataset_name, llm_name],
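The comments added above describe a flag-based filtering pattern: the full leaderboard table is kept in memory, and each checkbox change recomputes a boolean flag per row to decide what to display. Below is a minimal standalone sketch of that pattern, assuming toy data; the column names mirror the diff, while the sample values and the filter_rows helper are illustrative only and not part of the repository.

import pandas as pd

# Toy stand-in for overall_table; the real columns come from generate_table().
overall_table = pd.DataFrame({
    'Algorithm': ['CoT', 'PoT', 'ToT'],
    'LLM': ['gpt-4o', 'gpt-4o', 'Qwen2.5-72B-Instruct'],
    'Score': [1.0, 2.0, 3.0],  # placeholder values
})

def filter_rows(df, algos, llms):
    # Mark rows whose Algorithm and LLM are both selected, then keep only those.
    flag = df.apply(lambda row: row['Algorithm'] in algos and row['LLM'] in llms, axis=1)
    return df[flag]

print(filter_rows(overall_table, algos=['CoT', 'ToT'], llms=['gpt-4o']))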
meta_data.py
CHANGED
@@ -1,9 +1,10 @@
 # CONSTANTS-URL
 OVERALL_MATH_SCORE_FILE = "src/overall_math_score.json"
 DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
+
 # CONSTANTS-TEXT
 LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
-### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: CoT, SC-CoT, PoT, ReAct, etc. The agents are
+### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: CoT, SC-CoT, PoT, ReAct, ToT, etc. The agents are implemented by the OpenSource Framework: [*OmAgent*](https://github.com/om-ai-lab/OmAgent)
 
 This leaderboard was last updated: {}.
 
@@ -11,8 +12,9 @@ To add your own agent to the leaderboard, please create a PR in [*OmAgent*](http
 """
 
 DEFAULT_MATH_BENCH = [
-    'gsm8k', 'AQuA'
+    'gsm8k', 'AQuA', 'MATH-500',
 ]
+
 # The README file for each benchmark
 LEADERBOARD_MD = {}
 
@@ -42,7 +44,7 @@ LEADERBOARD_MD['MATH_DETAIL'] = f"""
 - Cost: The cost on each math Benchmarks (the lower the better).
 - Rank: The rank on each math Benchmarks (the lower the better).
 
-- default parameters: temperature=0.0
+- default parameters: temperature=0.0 (except for SC-CoT)
 - LLM prices:
 - gpt-3.5-turbo:
 - 0.5$/1M tokens (input)
@@ -53,7 +55,7 @@ LEADERBOARD_MD['MATH_DETAIL'] = f"""
 - gpt-4o-2024-08-06:
 - 2.50$ /1M input tokens (input)
 - 10$ /1M output tokens (output)
-- Qwen2.5-
+- Qwen2.5-72B-Instruct and Llama-3.3-70B-Instruct:
 - Prices can be found https://cloud.siliconflow.cn/.
 - Other open source LLMs:
 - Deployed locally, please check the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository for more information.
@@ -69,7 +71,7 @@ META_FIELDS = [
 ]
 
 DATASETS = [
-    'gsm8k', 'AQuA'
+    'gsm8k', 'AQuA', 'MATH-500'
 ]
 
 LLM = [
@@ -77,7 +79,7 @@ LLM = [
 ]
 
 ALGORITHMS = [
-    'IO', 'CoT', 'SC-CoT', 'PoT', 'ReAct-Pro*'
+    'IO', 'CoT', 'SC-CoT', 'PoT', 'ReAct-Pro*', 'ToT'
 ]
 
 CITATION_BUTTON_TEXT = r"""@misc{open-agent-leaderboard,
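The price list in this hunk is what the Cost($) columns elsewhere in the commit are based on. As a quick sanity check, a minimal sketch (estimate_cost is a hypothetical helper, not code from the repository) reproduces the CoT / gsm8k / gpt-4o cost from its token counts in src/detail_results.csv:

def estimate_cost(input_tokens, output_tokens, input_price_per_m, output_price_per_m):
    # Price is quoted per 1M tokens, so scale the raw token counts accordingly.
    return (input_tokens / 1_000_000) * input_price_per_m + (output_tokens / 1_000_000) * output_price_per_m

# gpt-4o-2024-08-06: 2.50$/1M input tokens, 10$/1M output tokens (from the list above).
# Token counts are the CoT / gsm8k / gpt-4o row of src/detail_results.csv.
print(round(estimate_cost(948_668, 216_498, 2.50, 10.0), 4))  # 4.5367, matching the CSV's Cost($)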
preprocess.py
CHANGED
@@ -3,10 +3,10 @@ import json
 from datetime import datetime
 
 def process_csv_to_json():
-    #
+    # Read the CSV file
     df = pd.read_csv('src/record.csv')
 
-    #
+    # Clean the data: remove empty rows, rename columns
     df = df.dropna(how='all')
     df = df.rename(columns={
         'dataset': 'Dataset',
@@ -25,36 +25,36 @@ def process_csv_to_json():
         'Average output tokens': 'Average output tokens'
     })
 
-    #
+    # Helper function: handle number strings with commas
    def parse_number(value):
-        if pd.isna(value):
+        if pd.isna(value) or value == '-':
            return 0
-        #
+        # Remove commas, convert to float, then to int
        return int(float(str(value).replace(',', '')))
 
-    #
+    # Initialize result dictionary
    result = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "results": {}
    }
 
-    #
+    # Get all unique LLMs
    llms = df['LLM'].dropna().unique()
 
-    #
+    # Iterate through each algorithm
    for algorithm in df['Algorithm'].dropna().unique():
        if not isinstance(algorithm, str):
            continue
 
        result['results'][algorithm] = {}
 
-        #
+        # Process each LLM
        for llm in llms:
            llm_data = df[(df['Algorithm'] == algorithm) & (df['LLM'] == llm)]
            if llm_data.empty:
                continue
 
-            #
+            # Create dictionary for each LLM
            result['results'][algorithm][llm] = {
                'META': {
                    'Algorithm': str(algorithm),
@@ -63,7 +63,7 @@ def process_csv_to_json():
                }
            }
 
-            #
+            # Process each dataset
            for dataset in df['Dataset'].dropna().unique():
                if not isinstance(dataset, str):
                    continue
@@ -73,9 +73,9 @@ def process_csv_to_json():
                if not dataset_data.empty:
                    data_row = dataset_data.iloc[0]
                    result['results'][algorithm][llm][dataset] = {
-                        'Score': round(float(data_row['Score']), 2),  #
-                        'Pass rate': round(float(data_row['Pass rate']) / 100, 4),  #
-                        'Cost($)': float(data_row['Cost($)']) if pd.notnull(data_row['Cost($)']) else 0.0,
+                        'Score': round(float(data_row['Score']) if data_row['Score'] != '-' else 0, 2),  # Keep two decimal places
+                        'Pass rate': round(float(data_row['Pass rate']) / 100, 4) if data_row['Pass rate'] != '-' else 0.0,  # Convert to decimal and keep two decimal places
+                        'Cost($)': float(data_row['Cost($)']) if pd.notnull(data_row['Cost($)']) and data_row['Cost($)'] != '-' else 0.0,
                        'Framework': str(data_row['Framework']) if 'Framework' in data_row and pd.notnull(data_row['Framework']) else '',
                        'X-shot': str(data_row['X-shot']) if pd.notnull(data_row['X-shot']) else '',
                        'Samples': parse_number(data_row['Samples']),
@@ -86,12 +86,12 @@ def process_csv_to_json():
                        'Average output tokens': parse_number(data_row['Average output tokens'])
                    }
 
-    #
+    # Check if each field exists
    required_fields = ['Score', 'Pass rate', 'Cost($)', 'Framework', 'X-shot', 'Samples', 'All tokens', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens']
 
    for key, value in result['results'].items():
        for llm, datasets in value.items():
-            #
+            # Check META information
            meta = datasets.get('META', {})
            if 'LLM' not in meta or 'Eval Date' not in meta:
                print(f"Missing META fields in algorithm '{key}' for LLM '{llm}'")
@@ -103,15 +103,15 @@ def process_csv_to_json():
            if missing_fields:
                print(f"Missing fields {missing_fields} in dataset '{dataset}' for LLM '{llm}' in algorithm '{key}'")
 
-    #
+    # Save as JSON file
    with open('src/detail_math_score.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
 
 def process_csv_to_overall_json():
-    #
+    # Read the CSV file
    df = pd.read_csv('src/record.csv')
 
-    #
+    # Clean the data: remove empty rows, rename columns
    df = df.dropna(how='all')
    df = df.rename(columns={
        'dataset': 'Dataset',
@@ -121,24 +121,24 @@ def process_csv_to_overall_json():
        'Eval Date': 'Eval Date'
    })
 
-    #
+    # Initialize result dictionary
    result = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "results": {}
    }
 
-    #
+    # Get all unique LLMs
    llms = df['LLM'].dropna().unique()
    for llm in llms:
-        #
+        # Process base algorithms
        for algorithm in df['Algorithm'].dropna().unique():
            if not isinstance(algorithm, str):
                continue
 
-            #
-            #
+            # Add suffix for non-gpt-3.5-turbo models
+            # Modification: add more information for llama models to ensure uniqueness
            algo_key = algorithm if llm == 'gpt-3.5-turbo' else f"{algorithm}-{llm}"
-            #
+            # Check if the algorithm-LLM combination exists
            algo_data = df[(df['Algorithm'] == algorithm) & (df['LLM'] == llm)]
            if algo_data.empty:
                print(f"No data found for algorithm '{algorithm}' and LLM '{llm}'")
@@ -152,29 +152,29 @@ def process_csv_to_overall_json():
                }
            }
 
-            #
-            for dataset in ['gsm8k', 'AQuA']:
+            # Process each dataset
+            for dataset in ['gsm8k', 'AQuA', 'MATH-500']:
                dataset_data = df[(df['Algorithm'] == algorithm) &
                                  (df['Dataset'] == dataset) &
                                  (df['LLM'] == llm)]
                if not dataset_data.empty:
                    result['results'][algo_key][dataset] = {
-                        "Score": float(dataset_data['Score'].iloc[0]) if pd.notnull(dataset_data['Score'].iloc[0]) else 0.0,
-                        "Cost($)": float(dataset_data['Cost($)'].iloc[0]) if pd.notnull(dataset_data['Cost($)'].iloc[0]) else 0.0
+                        "Score": float(dataset_data['Score'].iloc[0]) if pd.notnull(dataset_data['Score'].iloc[0]) and dataset_data['Score'].iloc[0] != '-' else 0.0,
+                        "Cost($)": float(dataset_data['Cost($)'].iloc[0]) if pd.notnull(dataset_data['Cost($)'].iloc[0]) and dataset_data['Cost($)'].iloc[0] != '-' else 0.0
                    }
                else:
-                    #
+                    # If the dataset is empty, ensure the key exists and set default values
                    result['results'][algo_key][dataset] = {
                        "Score": 0.0,
                        "Cost($)": 0.0
                    }
 
 
-    #
+    # Save as JSON file
    with open('src/overall_math_score.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
 
 if __name__ == "__main__":
-    #
+    # Generate JSON files in two formats
    process_csv_to_json()
-    process_csv_to_overall_json()
+    process_csv_to_overall_json()
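The recurring change in this file is the handling of the '-' placeholders that the new MATH-500 and ToT rows introduce into src/record.csv. A small standalone check of that behaviour, mirroring parse_number and the Score conversion from the diff (the sample values below are illustrative):

import pandas as pd

def parse_number(value):
    # Same logic as the diff: treat NaN and '-' as 0, strip thousands separators otherwise.
    if pd.isna(value) or value == '-':
        return 0
    return int(float(str(value).replace(',', '')))

assert parse_number('1,373,206') == 1373206
assert parse_number('-') == 0
assert parse_number(float('nan')) == 0

# Score conversion with the same '-' guard used in process_csv_to_json.
for raw in ('86.61', '-'):
    print(round(float(raw) if raw != '-' else 0, 2))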
src/detail_math_score.json
CHANGED
The diff for this file is too large to render.
See raw diff
src/detail_results.csv
CHANGED
@@ -1,101 +1,199 @@
1 |
Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
|
2 |
+
1,SC-CoT,AQuA,gpt-4o,2025/1/22,86.61,0.9882,0,8.1485,,254,1373206,744478,2931,628728,2475
|
3 |
+
2,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0,0.0808,,254,143289,25143,99,118146,465
|
4 |
+
3,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.04,0.9921,0,1.0348,,254,1835669,1051218,4139,784451,3088
|
5 |
+
4,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0,0.0742,,254,131604,25397,100,106207,418
|
6 |
+
5,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0,0.0927,,254,164389,32555,128,131834,519
|
7 |
+
6,ToT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.07,1.0,0,2.9404,,254,5215848,4735188,18642,480660,1892
|
8 |
+
7,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0,0.0798,,254,141567,32809,129,108758,428
|
9 |
+
8,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0,0.0066,,254,94577,27978,110,66599,262
|
10 |
+
9,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0,1.0417,,254,123017,25123,99,97894,385
|
11 |
+
10,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.28,0.9921,0,1.0756,,254,1907924,1135251,4469,772673,3042
|
12 |
+
11,ToT,AQuA,gpt-4o,2025/1/22,81.5,0.9921,0,8.5295,,254,2613607,2347538,9242,266069,1048
|
13 |
+
12,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.1,0.9724,0,0.0519,,254,885986,503751,1983,382235,1505
|
14 |
+
13,ToT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,81.1,0.9921,0,3.7389,,254,6632255,6371642,25085,260613,1026
|
15 |
+
14,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0,0.0,,254,149736,33017,130,116719,460
|
16 |
+
15,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,79.92,1.0,0,0.0,,254,1845332,1098280,4324,747052,2941
|
17 |
+
16,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0,0.1746,,254,309799,240735,948,69064,272
|
18 |
+
17,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0,0.0058,,254,87742,33058,130,54684,215
|
19 |
+
18,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0,0.768,,254,1362379,1119143,4406,243236,958
|
20 |
+
19,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9843,0,0.0,,254,137771,33271,131,104500,411
|
21 |
+
20,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.56,0.9606,0,0.0445,,254,1032841,977890,3850,54951,216
|
22 |
+
21,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0,1.1453,,254,133752,25631,101,108121,426
|
23 |
+
22,PoT,AQuA,gpt-4o,2025/1/22,75.2,1.0,0,1.6087,,254,327908,222717,877,105191,414
|
24 |
+
23,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.2,1.0,0,0.1645,,254,291764,249215,981,42549,168
|
25 |
+
24,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.41,0.9921,0,0.0,,254,695844,564165,2221,131679,518
|
26 |
+
25,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.23,1.0,0,0.3177,,254,563603,441765,1739,121838,480
|
27 |
+
26,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0,0.0147,,254,309436,259863,1023,49573,195
|
28 |
+
27,CoT,AQuA,deepseek-r1:1.5b,2025/1/23,71.65,0.9685,0,0.0,,254,333072,26413,104,306659,1207
|
29 |
+
28,IO,AQuA,deepseek-r1:1.5b,2025/1/22,68.9,0.9488,0,0.0,,254,351767,26667,105,325100,1280
|
30 |
+
29,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0,0.0,,254,313728,264517,1041,49211,194
|
31 |
+
30,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,66.14,0.9921,0,0.7888,,254,847335,482192,1898,365143,1438
|
32 |
+
31,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0,0.4928,,254,903587,862614,3396,40973,161
|
33 |
+
32,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0,0.0957,,254,80793,25447,100,55346,218
|
34 |
+
33,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0,0.0,,254,144435,32555,128,111880,440
|
35 |
+
34,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0,0.1748,,254,266654,225162,886,41492,163
|
36 |
+
35,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.45,0.9724,0,0.0,,254,1651333,971003,3823,680330,2678
|
37 |
+
36,SC-CoT,AQuA,deepseek-r1:1.5b,2025/2/10,59.06,0.9685,0,0.0,,254,5802711,2547772,10031,3254939,12815
|
38 |
+
37,ToT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.06,1.0,0,0.0,,254,5739684,4896222,19276,843462,3321
|
39 |
+
38,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0,2.304,,254,692096,615589,2424,76507,301
|
40 |
+
39,ToT,AQuA,gpt-3.5-turbo,2025/1/7,57.09,0.9961,0,1.1513,,254,2001396,1850767,7286,150629,593
|
41 |
+
40,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0,0.0,,254,4340821,3764723,14822,576098,2268
|
42 |
+
41,PoT,AQuA,deepseek-r1:1.5b,2025/2/10,54.72,0.9724,0,0.0,,254,1016647,250690,987,765957,3016
|
43 |
+
42,ReAct-Pro*,AQuA,deepseek-r1:1.5b,2025/2/10,54.33,0.9646,0,0.0,,254,14445041,10578715,41648,3866326,15222
|
44 |
+
43,ToT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,53.94,1.0,0,0.0,,254,8602682,8224468,32380,378214,1489
|
45 |
+
44,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.76,0.8937,0,0.0,,254,127520,26610,105,100910,397
|
46 |
+
45,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9882,0,0.0,,254,133106,26459,104,106647,420
|
47 |
+
46,IO,AQuA,Internllm2_5-7B,2025/1/22,47.64,0.9094,0,0.0,,254,185041,50232,198,134809,531
|
48 |
+
47,ToT,AQuA,Doubao-lite-32k,2025/1/7,45.28,0.7402,0,0.0881,,254,2000550,1850249,7284,150301,592
|
49 |
+
48,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0,0.0,,254,4428801,3592039,14142,836762,3294
|
50 |
+
49,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0,0.0,,254,110040,30477,120,79563,313
|
51 |
+
50,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,39.37,0.9803,0,0.0,,254,2296222,1420494,5592,875728,3448
|
52 |
+
51,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0,0.038,,254,42471,25701,101,16770,66
|
53 |
+
52,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0,0.0,,254,290914,240613,947,50301,198
|
54 |
+
53,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0,0.0,,254,301962,233505,919,68457,270
|
55 |
+
54,ToT,AQuA,Internllm2_5-7B,2025/1/22,35.83,0.9961,0,0.0,,254,4734560,4263136,16784,471424,1856
|
56 |
+
55,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9882,0,0.0,,254,117339,30477,120,86862,342
|
57 |
+
56,ToT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,31.5,0.9882,0,0.0,,254,6250702,6058022,23850,192680,759
|
58 |
+
57,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.71,0.9646,0,0.0,,254,298475,246560,971,51915,204
|
59 |
+
58,ToT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,29.92,1.0,0,0.0,,254,8700281,8100085,31890,600196,2363
|
60 |
+
59,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9764,0,0.0,,254,71047,27937,110,43110,170
|
61 |
+
60,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.17,0.9882,0,0.0,,254,110415,27937,110,82478,325
|
62 |
+
61,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0,0.0,,254,5072004,4555858,17936,516146,2032
|
63 |
+
62,ToT,AQuA,deepseek-r1:1.5b,2025/2/10,24.8,0.5551,0,0.0,,254,794512,605028,2382,189484,746
|
64 |
+
63,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0,0.0,,254,7170087,6344167,24977,825920,3252
|
65 |
+
64,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,23.62,0.9646,0,0.0,,254,1775335,1034362,4072,740973,2917
|
66 |
+
65,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,22.83,0.9724,0,0.0,,254,2215091,1246929,4909,968162,3812
|
67 |
+
66,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0,0.0,,254,322281,258867,1019,63414,250
|
68 |
+
1,CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,80.2,1.0,4,0.349,,500,619015,338549,677,280466,561
|
69 |
+
2,SC-CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,74.2,1.0,4,3.2239,,500,5718739,3959492,7919,1759247,3518
|
70 |
+
3,SC-CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,74.0,1.0,4,3.1556,,500,5597513,3823997,7648,1773516,3547
|
71 |
+
4,CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,71.2,1.0,4,0.3463,,500,614221,342879,686,271342,543
|
72 |
+
5,IO,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,70.2,1.0,4,0.2506,,500,444591,169549,339,275042,550
|
73 |
+
6,CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,69.8,1.0,4,0.0,,500,617204,354049,708,263155,526
|
74 |
+
7,IO,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,69.4,1.0,4,0.2386,,500,423216,155879,312,267337,535
|
75 |
+
8,CoT,MATH-500,gpt-4o,2025/1/22,68.0,1.0,4,3.0569,,500,552688,329332,659,223356,447
|
76 |
+
9,SC-CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,67.0,1.0,4,0.0,,500,5451484,3833751,7668,1617733,3235
|
77 |
+
10,ReAct-Pro*,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,64.6,1.0,4,3.1806,,500,5641879,5223611,10447,418268,837
|
78 |
+
11,ReAct-Pro*,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,62.8,1.0,4,3.4541,,500,6127117,5747268,11495,379849,760
|
79 |
+
12,IO,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,59.4,1.0,4,0.0,,500,411362,169549,339,241813,484
|
80 |
+
13,CoT,MATH-500,Doubao-lite-32k,2025/1/7,59.0,1.0,4,0.0255,,500,479941,336370,673,143571,287
|
81 |
+
14,ReAct-Pro*,MATH-500,gpt-4o,2025/1/22,54.0,1.0,4,17.7735,,500,6153255,5834537,11669,318718,637
|
82 |
+
15,CoT,MATH-500,deepseek-r1:1.5b,2025/1/23,49.4,1.0,4,0.0,,500,1199129,341549,683,857580,1715
|
83 |
+
16,SC-CoT,MATH-500,Doubao-lite-32k,2025/1/7,49.2,1.0,4,0.1406,,500,2470810,1507651,3015,963159,1926
|
84 |
+
17,ReAct-Pro*,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,48.8,1.0,4,0.0,,500,4990240,4646708,9293,343532,687
|
85 |
+
18,ReAct-Pro*,MATH-500,Doubao-lite-32k,2025/1/7,47.2,1.0,4,0.186,,500,4388666,4234620,8469,154046,308
|
86 |
+
19,PoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,47.2,0.822,4,0.233,,500,413372,242549,485,170823,342
|
87 |
+
20,CoT,MATH-500,Internllm2_5-7B,2025/1/22,46.6,1.0,4,0.0,,500,546774,332883,666,213891,428
|
88 |
+
21,PoT,MATH-500,gpt-4o,2025/1/22,46.2,0.864,4,1.5994,,500,340960,241357,483,99603,199
|
89 |
+
22,IO,MATH-500,deepseek-r1:1.5b,2025/1/22,43.8,1.0,4,0.0,,500,1022548,157049,314,865499,1731
|
90 |
+
23,PoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,42.6,0.802,4,0.2839,,500,503596,253879,508,249717,499
|
91 |
+
24,IO,MATH-500,gpt-4o,2025/1/22,41.8,1.0,4,2.7907,,500,394447,153832,308,240615,481
|
92 |
+
25,CoT,MATH-500,gpt-3.5-turbo,2025/1/7,39.8,1.0,4,0.3189,,500,432196,329381,659,102815,206
|
93 |
+
26,PoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,39.6,0.744,4,0.0,,500,408812,258549,517,150263,301
|
94 |
+
27,IO,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,38.6,1.0,4,0.0,,500,503934,155563,311,348371,697
|
95 |
+
28,SC-CoT,MATH-500,deepseek-r1:1.5b,2025/2/10,38.0,1.0,4,0.0,,500,14742109,7080559,14161,7661550,15323
|
96 |
+
29,IO,MATH-500,Doubao-lite-32k,2025/1/7,37.4,1.0,4,0.0187,,500,311730,166870,334,144860,290
|
97 |
+
30,SC-CoT,MATH-500,gpt-4o,2025/1/22,34.4,1.0,4,19.6538,,500,3455323,1986584,3973,1468739,2937
|
98 |
+
31,PoT,MATH-500,Doubao-lite-32k,2025/1/7,32.6,0.68,4,0.0144,,500,303148,254377,509,48771,98
|
99 |
+
32,SC-CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,30.2,1.0,4,0.0,,500,5034937,3546673,7093,1488264,2977
|
100 |
+
33,ReAct-Pro*,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,28.8,1.0,4,0.0,,500,8763629,7486706,14973,1276923,2554
|
101 |
+
34,PoT,MATH-500,gpt-3.5-turbo,2025/1/7,28.8,0.838,4,0.168,,500,271916,239902,480,32014,64
|
102 |
+
35,SC-CoT,MATH-500,gpt-3.5-turbo,2025/1/7,28.8,1.0,4,1.9764,,500,2238812,1381818,2764,856994,1714
|
103 |
+
36,CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.8,1.0,4,0.0,,500,625568,342879,686,282689,565
|
104 |
+
37,PoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.4,0.684,4,0.0,,500,462271,253879,508,208392,417
|
105 |
+
38,ReAct-Pro*,MATH-500,deepseek-r1:1.5b,2025/2/10,24.4,1.0,4,0.0,,500,30177348,20729970,41460,9447378,18895
|
106 |
+
39,ReAct-Pro*,MATH-500,gpt-3.5-turbo,2025/1/7,23.8,1.0,4,2.0406,,500,3832714,3708461,7417,124253,249
|
107 |
+
40,IO,MATH-500,Internllm2_5-7B,2025/1/22,22.8,1.0,4,0.0,,500,467888,201883,404,266005,532
|
108 |
+
41,IO,MATH-500,gpt-3.5-turbo,2025/1/7,17.2,1.0,4,0.2436,,500,265625,154881,310,110744,221
|
109 |
+
42,CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,15.2,1.0,4,0.0,,500,536377,349049,698,187328,375
|
110 |
+
43,PoT,MATH-500,Internllm2_5-7B,2025/1/22,15.0,0.324,4,0.0,,500,368709,247883,496,120826,242
|
111 |
+
44,ReAct-Pro*,MATH-500,Internllm2_5-7B,2025/1/22,14.8,1.0,4,0.0,,500,14186105,11831496,23663,2354609,4709
|
112 |
+
45,ToT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,10.8,1.0,4,9.0421,,500,16039361,15657730,31315,381631,763
|
113 |
+
46,SC-CoT,MATH-500,Internllm2_5-7B,2025/1/22,9.8,1.0,4,0.0,,500,5838466,4193296,8387,1645170,3290
|
114 |
+
47,ToT,MATH-500,gpt-3.5-turbo,2025/1/7,9.8,1.0,4,5.2914,,500,10001767,9711244,19422,290523,581
|
115 |
+
48,ReAct-Pro*,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,8.2,1.0,4,0.0,,500,8987061,8430774,16862,556287,1113
|
116 |
+
49,IO,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,7.0,1.0,4,0.0,,500,413878,158777,318,255101,510
|
117 |
+
50,CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,6.2,1.0,4,0.0,,500,549188,349049,698,200139,400
|
118 |
+
51,SC-CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,3.8,0.99,4,0.0,,500,5569442,3832429,7665,1737013,3474
|
119 |
+
52,ToT,MATH-500,gpt-4o,2025/1/22,3.2,1.0,4,40.8094,,500,15242432,14881985,29764,360447,721
|
120 |
+
53,IO,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,2.6,1.0,4,0.0,,500,429330,159049,318,270281,541
|
121 |
+
54,ToT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,1.8,0.908,4,0.0,,500,9035000,7729000,15458,1306000,2612
|
122 |
+
55,ToT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,1.4,0.698,4,8.2699,,500,14669500,14099500,28199,570000,1140
|
123 |
+
56,ToT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,1.4,0.916,4,0.0,,500,10167500,9749000,19498,418500,837
|
124 |
+
57,ToT,MATH-500,Doubao-lite-32k,2025/1/7,1.2,0.942,4,0.2371,,500,5564500,5338500,10677,226000,452
|
125 |
+
58,PoT,MATH-500,deepseek-r1:1.5b,2025/2/10,1.0,0.016,4,0.0,,500,1031067,245549,491,785518,1571
|
126 |
+
59,PoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.022,4,0.0,,500,786870,248509,497,538361,1077
|
127 |
+
60,SC-CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.8,1.0,4,0.0,,500,6862056,4448663,8897,2413393,4827
|
128 |
+
61,ToT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.972,4,0.0,,500,4535000,4408000,8816,127000,254
|
129 |
+
62,ReAct-Pro*,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.6,1.0,4,0.0,,500,19442440,18137392,36275,1305048,2610
|
130 |
+
63,ToT,MATH-500,deepseek-r1:1.5b,2025/2/10,0.4,0.716,4,0.0,,500,1941500,1831000,3662,110500,221
|
131 |
+
64,ToT,MATH-500,Internllm2_5-7B,2025/1/22,0.2,0.99,4,0.0,,500,8350500,7515000,15030,835500,1671
|
132 |
+
65,PoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,4,0.0,,500,437202,253549,507,183653,367
|
133 |
+
66,ToT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.962,4,0.0,,500,5996500,5590500,11181,406000,812
|
134 |
+
1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.07,1.0,8,6.2005,,1319,10998794,8413717,6379,2585077,1960
|
135 |
+
2,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8,4.5367,,1319,1165166,948668,719,216498,164
|
136 |
+
3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8,0.687,,1319,1218665,990168,751,228497,173
|
137 |
+
4,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,93.86,1.0,8,5.9858,,1319,10618008,8136223,6168,2481785,1882
|
138 |
+
5,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8,4.2166,,1319,1247912,1101672,835,146240,111
|
139 |
+
6,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8,0.7195,,1319,1276252,1005119,762,271133,206
|
140 |
+
7,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8,0.7054,,1319,1251210,1106682,839,144528,110
|
141 |
+
8,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8,0.4709,,1319,835275,583916,443,251359,191
|
142 |
+
9,ToT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,91.89,1.0,8,20.8753,,1319,37029687,35096810,26609,1932877,1465
|
143 |
+
10,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,91.13,1.0,8,0.0,,1319,11140985,8586888,6510,2554097,1936
|
144 |
+
11,ToT,gsm8k,gpt-4o,2025/1/22,91.13,1.0,8,86.8581,,1319,30769735,29445237,22324,1324498,1004
|
145 |
+
12,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.3,0.9992,8,31.0542,,1319,5798173,3590336,2722,2207837,1674
|
146 |
+
13,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8,0.0558,,1319,1201820,1042095,790,159725,121
|
147 |
+
14,ToT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,88.88,1.0,8,23.5911,,1319,41847148,40435361,30656,1411787,1070
|
148 |
+
15,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8,3.3463,,1319,741446,542416,411,199030,151
|
149 |
+
16,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8,10.1124,,1319,17937864,17038928,12918,898936,682
|
150 |
+
17,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8,10.5479,,1319,18710437,18160983,13769,549454,417
|
151 |
+
18,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,87.26,0.9992,8,0.2083,,1319,3888813,2691714,2041,1197099,908
|
152 |
+
19,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8,0.4899,,1319,869060,555340,421,313720,238
|
153 |
+
20,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8,0.0,,1319,1290805,1046008,793,244797,186
|
154 |
+
21,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8,0.2512,,1319,5998639,5862016,4444,136623,104
|
155 |
+
22,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8,0.0,,1319,14850914,14355752,10884,495162,375
|
156 |
+
23,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,79.91,0.9992,8,3.3938,,1319,4089612,2740652,2078,1348960,1023
|
157 |
+
24,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8,0.0576,,1319,1288055,1170038,887,118017,89
|
158 |
+
25,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8,0.6788,,1319,1088041,953242,723,134799,102
|
159 |
+
26,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8,0.0,,1319,1202163,968163,734,234000,177
|
160 |
+
27,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8,0.6902,,1319,1187080,1090418,827,96662,73
|
161 |
+
28,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8,0.0,,1319,1248329,990168,751,258161,196
|
162 |
+
29,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8,3.4633,,1319,6646286,6506164,4933,140122,106
|
163 |
+
30,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,73.46,0.9955,8,0.0,,1319,11778716,8630514,6543,3148202,2387
|
164 |
+
31,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8,0.9736,,1319,1727044,1126025,854,601019,456
|
165 |
+
32,ToT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,72.21,0.9901,8,0.0,,1319,31657319,20196528,15312,11460791,8689
|
166 |
+
33,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8,0.0354,,1319,740483,617377,468,123106,93
|
167 |
+
34,CoT,gsm8k,deepseek-r1:1.5b,2025/1/23,70.66,0.9977,8,0.0,,1319,2090625,1011714,767,1078911,818
|
168 |
+
35,ToT,gsm8k,gpt-3.5-turbo,2025/1/7,67.93,0.997,8,9.1707,,1319,16727175,15920037,12070,807138,612
|
169 |
+
36,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8,0.0,,1319,22835767,21044978,15955,1790789,1358
|
170 |
+
37,ToT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,65.05,0.9196,8,0.0,,1319,16432102,15554967,11793,877135,665
|
171 |
+
38,IO,gsm8k,deepseek-r1:1.5b,2025/1/22,64.14,0.9962,8,0.0,,1319,1483051,561935,426,921116,698
|
172 |
+
39,ReAct-Pro*,gsm8k,gpt-4o,2025/1/22,63.31,0.9955,8,39.0751,,1319,14715887,14411173,10926,304714,231
|
173 |
+
40,PoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,58.83,0.7051,8,0.0,,1319,1362822,1145390,868,217432,165
|
174 |
+
41,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8,0.0,,1319,887913,596229,452,291684,221
|
175 |
+
42,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8,0.0,,1319,1745429,550941,418,1194488,906
|
176 |
+
43,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8,0.0,,1319,1218525,1032818,783,185707,141
|
177 |
+
44,SC-CoT,gsm8k,deepseek-r1:1.5b,2025/2/10,55.34,0.997,8,0.0,,1319,25785865,14540096,11024,11245769,8526
|
178 |
+
45,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,48.22,0.9841,8,0.0,,1319,14526431,10678792,8096,3847639,2917
|
179 |
+
46,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8,0.0,,1319,1391111,1147538,870,243573,185
|
180 |
+
47,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8,0.0,,1319,1324949,1136843,862,188106,143
|
181 |
+
48,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8,0.3328,,1319,586553,546990,415,39563,30
|
182 |
+
49,ToT,gsm8k,Doubao-lite-32k,2025/1/7,37.83,0.8734,8,0.8739,,1319,20274349,19208597,14563,1065752,808
|
183 |
+
50,ReAct-Pro*,gsm8k,deepseek-r1:1.5b,2025/2/10,35.94,0.9962,8,0.0,,1319,24219077,19299381,14632,4919696,3730
|
184 |
+
51,CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,35.94,0.9992,8,0.0,,1319,1223459,1032818,783,190641,145
|
185 |
+
52,ReAct-Pro*,gsm8k,Internllm2_5-7B,2025/1/22,33.51,0.9795,8,0.0,,1319,35669989,30120070,22836,5549919,4208
|
186 |
+
53,ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,24.87,0.8021,8,0.0,,1319,9828001,9133603,6925,694398,526
|
187 |
+
54,ToT,gsm8k,deepseek-r1:1.5b,2025/2/10,23.12,0.7248,8,0.0,,1319,3421486,2738244,2076,683242,518
|
188 |
+
55,ToT,gsm8k,Internllm2_5-7B,2025/1/22,20.85,0.7013,8,0.0,,1319,13178129,11768118,8922,1410011,1069
|
189 |
+
56,ToT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,19.64,0.7726,8,0.0,,1319,12758687,12124248,9192,634439,481
|
190 |
+
57,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.5,0.3101,8,0.0,,1319,1327522,1151528,873,175994,133
|
191 |
+
58,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8,0.0,,1319,736996,568530,431,168466,128
|
192 |
+
59,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8,0.0,,1319,834897,568116,431,266781,202
|
193 |
+
60,PoT,gsm8k,deepseek-r1:1.5b,2025/2/10,11.9,0.1744,8,0.0,,1319,1954509,1138872,863,815637,618
|
194 |
+
61,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,11.75,0.9189,8,0.0,,1319,12411942,9066115,6873,3345827,2537
|
195 |
+
62,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8,0.0,,1319,1113728,679302,515,434426,329
|
196 |
+
63,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.63,0.1691,8,0.0,,1319,1389135,1151528,873,237607,180
|
197 |
+
64,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8,0.0,,1319,55392611,52431343,39751,2961268,2245
|
198 |
+
65,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,1.67,0.9469,8,0.0,,1319,16465720,11019864,8355,5445856,4129
|
199 |
+
66,ToT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,8,0.0,,1319,0,0,0,0,0
|
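The rows above are what the detail leaderboard tab reads back in. A minimal sketch of inspecting the new MATH-500 results from this file, assuming the working directory is the Space's repository root:

import pandas as pd

df = pd.read_csv('src/detail_results.csv')

# Top-3 MATH-500 rows by Score, mirroring how the detail leaderboard ranks entries.
math500 = df[df['Dataset'] == 'MATH-500']
print(math500.sort_values('Score', ascending=False).head(3)[['Algorithm', 'LLM', 'Score', 'Cost($)']])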
src/overall_math_score.json
CHANGED
1 |
{
|
2 |
+
"time": "2025-02-11 13:23:00",
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"META": {
|
|
|
14 |
"AQuA": {
|
15 |
"Score": 38.98,
|
16 |
"Cost($)": 0.038
|
17 |
+
},
|
18 |
+
"MATH-500": {
|
19 |
+
"Score": 17.2,
|
20 |
+
"Cost($)": 0.2436
|
21 |
}
  },
  "ReAct-Pro*": { ...
      "AQuA": { "Score": 64.57, "Cost($)": 0.4928 },
+     "MATH-500": { "Score": 23.8, "Cost($)": 2.0406 } },
  "PoT": { ...
      "AQuA": { "Score": 59.45, "Cost($)": 0.1748 },
+     "MATH-500": { "Score": 28.8, "Cost($)": 0.168 } },
  "CoT": { ...
      "AQuA": { "Score": 61.02, "Cost($)": 0.0957 },
+     "MATH-500": { "Score": 39.8, "Cost($)": 0.3189 } },
  "SC-CoT": { ...
      "AQuA": { "Score": 66.14, "Cost($)": 0.7888 },
+     "MATH-500": { "Score": 28.8, "Cost($)": 1.9764 } },
+ "ToT": { "META": { "Algorithm": "ToT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" },
+     "gsm8k": { "Score": 67.93, "Cost($)": 9.1707 }, "AQuA": { "Score": 57.09, "Cost($)": 1.1513 },
+     "MATH-500": { "Score": 9.8, "Cost($)": 5.2914 } },
  "IO-Doubao-lite-32k": { ...
      "AQuA": { "Score": 79.13, "Cost($)": 0.0058 },
+     "MATH-500": { "Score": 37.4, "Cost($)": 0.0187 } },
  "ReAct-Pro*-Doubao-lite-32k": { ...
      "AQuA": { "Score": 77.56, "Cost($)": 0.0445 },
+     "MATH-500": { "Score": 47.2, "Cost($)": 0.186 } },
  "PoT-Doubao-lite-32k": { ...
      "AQuA": { "Score": 71.65, "Cost($)": 0.0147 },
+     "MATH-500": { "Score": 32.6, "Cost($)": 0.0144 } },
  "CoT-Doubao-lite-32k": { ...
      "AQuA": { "Score": 82.68, "Cost($)": 0.0066 },
+     "MATH-500": { "Score": 59.0, "Cost($)": 0.0255 } },
  "SC-CoT-Doubao-lite-32k": { ...
      "AQuA": { "Score": 81.1, "Cost($)": 0.0519 },
+     "MATH-500": { "Score": 49.2, "Cost($)": 0.1406 } },
+ "ToT-Doubao-lite-32k": { "META": { "Algorithm": "ToT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" },
+     "gsm8k": { "Score": 37.83, "Cost($)": 0.8739 }, "AQuA": { "Score": 45.28, "Cost($)": 0.0881 },
+     "MATH-500": { "Score": 1.2, "Cost($)": 0.2371 } },
  "IO-gpt-4o": { ...
      "AQuA": { "Score": 75.59, "Cost($)": 1.1453 },
+     "MATH-500": { "Score": 41.8, "Cost($)": 2.7907 } },
  "ReAct-Pro*-gpt-4o": { ...
      "AQuA": { "Score": 57.48, "Cost($)": 2.304 },
+     "MATH-500": { "Score": 54.0, "Cost($)": 17.7735 } },
  "PoT-gpt-4o": { ...
      "AQuA": { "Score": 75.2, "Cost($)": 1.6087 },
+     "MATH-500": { "Score": 46.2, "Cost($)": 1.5994 } },
  "CoT-gpt-4o": { ...
      "AQuA": { "Score": 82.68, "Cost($)": 1.0417 },
+     "MATH-500": { "Score": 68.0, "Cost($)": 3.0569 } },
  "SC-CoT-gpt-4o": { ...
      "AQuA": { "Score": 86.61, "Cost($)": 8.1485 },
+     "MATH-500": { "Score": 34.4, "Cost($)": 19.6538 } },
+ "ToT-gpt-4o": { "META": { "Algorithm": "ToT", "LLM": "gpt-4o", "Eval Date": "2025/1/22" },
+     "gsm8k": { "Score": 91.13, "Cost($)": 86.8581 }, "AQuA": { "Score": 81.5, "Cost($)": 8.5295 },
+     "MATH-500": { "Score": 3.2, "Cost($)": 40.8094 } },
  "IO-Qwen2.5-72B-Instruct": { ...
      "AQuA": { "Score": 84.25, "Cost($)": 0.0742 },
+     "MATH-500": { "Score": 70.2, "Cost($)": 0.2506 } },
  "ReAct-Pro*-Qwen2.5-72B-Instruct": { ...
      "AQuA": { "Score": 73.23, "Cost($)": 0.3177 },
+     "MATH-500": { "Score": 62.8, "Cost($)": 3.4541 } },
  "PoT-Qwen2.5-72B-Instruct": { ...
      "AQuA": { "Score": 75.2, "Cost($)": 0.1645 },
+     "MATH-500": { "Score": 47.2, "Cost($)": 0.233 } },
  "CoT-Qwen2.5-72B-Instruct": { ...
      "AQuA": { "Score": 86.22, "Cost($)": 0.0808 },
+     "MATH-500": { "Score": 80.2, "Cost($)": 0.349 } },
  "SC-CoT-Qwen2.5-72B-Instruct": { ...
      "AQuA": { "Score": 85.04, "Cost($)": 1.0348 },
+     "MATH-500": { "Score": 74.0, "Cost($)": 3.1556 } },
+ "ToT-Qwen2.5-72B-Instruct": { "META": { "Algorithm": "ToT", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" },
+     "gsm8k": { "Score": 88.88, "Cost($)": 23.5911 }, "AQuA": { "Score": 81.1, "Cost($)": 3.7389 },
+     "MATH-500": { "Score": 10.8, "Cost($)": 9.0421 } },
  "IO-Llama-3.3-70B-Instruct": { ...
      "AQuA": { "Score": 82.68, "Cost($)": 0.0798 },
+     "MATH-500": { "Score": 69.4, "Cost($)": 0.2386 } },
  "ReAct-Pro*-Llama-3.3-70B-Instruct": { ...
      "AQuA": { "Score": 79.13, "Cost($)": 0.768 },
+     "MATH-500": { "Score": 64.6, "Cost($)": 3.1806 } },
  "PoT-Llama-3.3-70B-Instruct": { ...
      "AQuA": { "Score": 79.53, "Cost($)": 0.1746 },
+     "MATH-500": { "Score": 42.6, "Cost($)": 0.2839 } },
  "CoT-Llama-3.3-70B-Instruct": { ...
      "AQuA": { "Score": 83.46, "Cost($)": 0.0927 },
+     "MATH-500": { "Score": 71.2, "Cost($)": 0.3463 } },
  "SC-CoT-Llama-3.3-70B-Instruct": { ...
      "AQuA": { "Score": 82.28, "Cost($)": 1.0756 },
+     "MATH-500": { "Score": 74.2, "Cost($)": 3.2239 } },
+ "ToT-Llama-3.3-70B-Instruct": { "META": { "Algorithm": "ToT", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" },
+     "gsm8k": { "Score": 91.89, "Cost($)": 20.8753 }, "AQuA": { "Score": 83.07, "Cost($)": 2.9404 },
+     "MATH-500": { "Score": 1.4, "Cost($)": 8.2699 } },
  "IO-Qwen2.5-7B-Instruct": { ...
      "AQuA": { "Score": 78.74, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 59.4, "Cost($)": 0.0 } },
  "ReAct-Pro*-Qwen2.5-7B-Instruct": { ...
      "AQuA": { "Score": 74.41, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 48.8, "Cost($)": 0.0 } },
  "PoT-Qwen2.5-7B-Instruct": { ...
      "AQuA": { "Score": 68.11, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 39.6, "Cost($)": 0.0 } },
  "CoT-Qwen2.5-7B-Instruct": { ...
      "AQuA": { "Score": 80.71, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 69.8, "Cost($)": 0.0 } },
  "SC-CoT-Qwen2.5-7B-Instruct": { ...
      "AQuA": { "Score": 79.92, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 67.0, "Cost($)": 0.0 } },
+ "ToT-Qwen2.5-7B-Instruct": { "META": { "Algorithm": "ToT", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" },
+     "gsm8k": { "Score": 72.21, "Cost($)": 0.0 }, "AQuA": { "Score": 53.94, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 1.4, "Cost($)": 0.0 } },
  "IO-Llama-3.1-8B-Instruct": { ...
      "AQuA": { "Score": 51.18, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 38.6, "Cost($)": 0.0 } },
  "ReAct-Pro*-Llama-3.1-8B-Instruct": { ...
      "AQuA": { "Score": 55.51, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 28.8, "Cost($)": 0.0 } },
  "PoT-Llama-3.1-8B-Instruct": { ...
      "AQuA": { "Score": 36.61, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 25.4, "Cost($)": 0.0 } },
  "CoT-Llama-3.1-8B-Instruct": { ...
      "AQuA": { "Score": 60.63, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 25.8, "Cost($)": 0.0 } },
  "SC-CoT-Llama-3.1-8B-Instruct": { ...
      "AQuA": { "Score": 59.45, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 30.2, "Cost($)": 0.0 } },
+ "ToT-Llama-3.1-8B-Instruct": { "META": { "Algorithm": "ToT", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" },
+     "gsm8k": { "Score": 65.05, "Cost($)": 0.0 }, "AQuA": { "Score": 59.06, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 1.8, "Cost($)": 0.0 } },
  "IO-Internllm2_5-7B": { ...
      "AQuA": { "Score": 47.64, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 22.8, "Cost($)": 0.0 } },
  "ReAct-Pro*-Internllm2_5-7B": { ...
      "AQuA": { "Score": 40.94, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 14.8, "Cost($)": 0.0 } },
  "PoT-Internllm2_5-7B": { ...
      "AQuA": { "Score": 36.61, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 15.0, "Cost($)": 0.0 } },
  "CoT-Internllm2_5-7B": { ...
      "AQuA": { "Score": 52.76, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 46.6, "Cost($)": 0.0 } },
  "SC-CoT-Internllm2_5-7B": { ...
      "AQuA": { "Score": 39.37, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 9.8, "Cost($)": 0.0 } },
+ "ToT-Internllm2_5-7B": { "META": { "Algorithm": "ToT", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" },
+     "gsm8k": { "Score": 20.85, "Cost($)": 0.0 }, "AQuA": { "Score": 35.83, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 0.2, "Cost($)": 0.0 } },
  "IO-Qwen2-1.5B-Instruct": { ...
      "AQuA": { "Score": 29.13, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 7.0, "Cost($)": 0.0 } },
  "ReAct-Pro*-Qwen2-1.5B-Instruct": { ...
      "AQuA": { "Score": 25.59, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 8.2, "Cost($)": 0.0 } },
  "PoT-Qwen2-1.5B-Instruct": { ...
      "AQuA": { "Score": 30.71, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 0.8, "Cost($)": 0.0 } },
  "CoT-Qwen2-1.5B-Instruct": { ...
      "AQuA": { "Score": 40.55, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 15.2, "Cost($)": 0.0 } },
  "SC-CoT-Qwen2-1.5B-Instruct": { ...
      "AQuA": { "Score": 23.62, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 3.8, "Cost($)": 0.0 } },
+ "ToT-Qwen2-1.5B-Instruct": { "META": { "Algorithm": "ToT", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" },
+     "gsm8k": { "Score": 19.64, "Cost($)": 0.0 }, "AQuA": { "Score": 31.5, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 0.8, "Cost($)": 0.0 } },
  "IO-Qwen2-0.5B-Instruct": { ...
      "AQuA": { "Score": 27.17, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 2.6, "Cost($)": 0.0 } },
  "ReAct-Pro*-Qwen2-0.5B-Instruct": { ...
      "AQuA": { "Score": 24.02, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 0.6, "Cost($)": 0.0 } },
  "PoT-Qwen2-0.5B-Instruct": { "META": { ... "Eval Date": "2025/1/22" },
      "gsm8k": {
+         "Score": 9.63,
          "Cost($)": 0.0 },
      "AQuA": { "Score": 17.32, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 0.0, "Cost($)": 0.0 } },
  "CoT-Qwen2-0.5B-Instruct": { ...
      "AQuA": { "Score": 33.07, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 6.2, "Cost($)": 0.0 } },
  "SC-CoT-Qwen2-0.5B-Instruct": { ...
      "AQuA": { "Score": 22.83, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 0.8, "Cost($)": 0.0 } },
+ "ToT-Qwen2-0.5B-Instruct": { "META": { "Algorithm": "ToT", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" },
+     "gsm8k": { "Score": 0.0, "Cost($)": 0.0 }, "AQuA": { "Score": 29.92, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 0.0, "Cost($)": 0.0 } },
+ "IO-deepseek-r1:1.5b": { "META": { "Algorithm": "IO", "LLM": "deepseek-r1:1.5b", "Eval Date": "2025/1/22" },
+     "gsm8k": { "Score": 64.14, "Cost($)": 0.0 }, "AQuA": { "Score": 68.9, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 43.8, "Cost($)": 0.0 } },
+ "ReAct-Pro*-deepseek-r1:1.5b": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "deepseek-r1:1.5b", "Eval Date": "2025/2/10" },
+     "gsm8k": { "Score": 35.94, "Cost($)": 0.0 }, "AQuA": { "Score": 54.33, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 24.4, "Cost($)": 0.0 } },
+ "PoT-deepseek-r1:1.5b": { "META": { "Algorithm": "PoT", "LLM": "deepseek-r1:1.5b", "Eval Date": "2025/2/10" },
+     "gsm8k": { "Score": 11.9, "Cost($)": 0.0 }, "AQuA": { "Score": 54.72, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 1.0, "Cost($)": 0.0 } },
+ "CoT-deepseek-r1:1.5b": { "META": { "Algorithm": "CoT", "LLM": "deepseek-r1:1.5b", "Eval Date": "2025/1/23" },
+     "gsm8k": { "Score": 70.66, "Cost($)": 0.0 }, "AQuA": { "Score": 71.65, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 49.4, "Cost($)": 0.0 } },
+ "SC-CoT-deepseek-r1:1.5b": { "META": { "Algorithm": "SC-CoT", "LLM": "deepseek-r1:1.5b", "Eval Date": "2025/2/10" },
+     "gsm8k": { "Score": 55.34, "Cost($)": 0.0 }, "AQuA": { "Score": 59.06, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 38.0, "Cost($)": 0.0 } },
+ "ToT-deepseek-r1:1.5b": { "META": { "Algorithm": "ToT", "LLM": "deepseek-r1:1.5b", "Eval Date": "2025/2/10" },
+     "gsm8k": { "Score": 23.12, "Cost($)": 0.0 }, "AQuA": { "Score": 24.8, "Cost($)": 0.0 },
+     "MATH-500": { "Score": 0.4, "Cost($)": 0.0 } }
  }
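With this change, every algorithm-LLM block in src/overall_math_score.json carries a MATH-500 entry alongside gsm8k and AQuA, plus a new ToT block. As a quick orientation, the per-benchmark scores can be reloaded and averaged as in the sketch below; this helper is not part of the repository, the key name is taken from the diff, and the equal-weight mean is an inference that matches the "Avg Score" column in src/overall_results.csv further down (for example (92.87 + 86.22 + 80.2) / 3 = 86.43 for rank 1).

```python
import json

# Illustrative sketch (not in the repo): recompute the equal-weight average of the
# three benchmark scores stored in src/overall_math_score.json.
BENCHMARKS = ["gsm8k", "AQuA", "MATH-500"]

with open("src/overall_math_score.json") as f:
    records = json.load(f)

entry = records["CoT-Qwen2.5-72B-Instruct"]   # key naming as it appears in the diff
avg = sum(entry[b]["Score"] for b in BENCHMARKS) / len(BENCHMARKS)
print(round(avg, 2))                          # 86.43, matching the Avg Score column
```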
src/overall_results.csv
CHANGED
@@ -1,51 +1,67 @@
- Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($)
- 1.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,
- 2.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,
- 3.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,
- 4.0,
- 5.0,
- 6.0,
- 7.0,IO,
- 8.0,CoT,
- 9.0,
- 10.0,
- 11.0,
- 12.0,
- 13.0,
- 14.0,
- 15.0,
- 16.0,
- 17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,
- 18.0,ReAct-Pro*,Qwen2.5-
- 19.0,
- 20.0,
- 21.0,PoT,
- 22.0,
- 23.0,
- 24.0,
- 25.0,
- 26.0,
- 27.0,CoT,
- 28.0,IO,
- 29.0,
- 30.0,
- 31.0,
- 32.0,ReAct-Pro*,
- 33.0,
- 34.0,
- 35.0,
- 36.0,SC-CoT,
- 37.0,
- 38.0,
- 39.0,
- 40.0,
- 41.0,
- 42.0,
- 43.0,
- 44.0,
- 45.0,
- 46.0,
- 47.0,SC-CoT,
- 48.0,
- 49.0,PoT,
- 50.0,
+ Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($),MATH-500-Score,MATH-500-Cost($)
+ 1.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,86.43,92.87,0.7195,86.22,0.0808,80.2,0.349
+ 2.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,84.30,93.86,5.9858,85.04,1.0348,74.0,3.1556
+ 3.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,83.85,95.07,6.2005,82.28,1.0756,74.2,3.2239
+ 4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,82.86,93.93,0.687,83.46,0.0927,71.2,0.3463
+ 5.0,CoT,gpt-4o,2025/1/22,81.59,94.09,4.5367,82.68,1.0417,68.0,3.0569
+ 6.0,IO,Llama-3.3-70B-Instruct,2025/1/22,81.45,92.27,0.4709,82.68,0.0798,69.4,0.2386
+ 7.0,IO,Qwen2.5-72B-Instruct,2025/1/22,80.34,86.58,0.4899,84.25,0.0742,70.2,0.2506
+ 8.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,79.35,91.13,0.0,79.92,0.0,67.0,0.0
+ 9.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,78.73,85.67,0.0,80.71,0.0,69.8,0.0
+ 10.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,77.12,87.64,10.1124,79.13,0.768,64.6,3.1806
+ 11.0,CoT,Doubao-lite-32k,2025/1/7,77.00,89.31,0.0558,82.68,0.0066,59.0,0.0255
+ 12.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,74.43,87.26,10.5479,73.23,0.3177,62.8,3.4541
+ 13.0,SC-CoT,Doubao-lite-32k,2025/1/7,72.52,87.26,0.2083,81.1,0.0519,49.2,0.1406
+ 14.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,71.58,92.34,0.7054,75.2,0.1645,47.2,0.233
+ 15.0,PoT,gpt-4o,2025/1/22,71.50,93.1,4.2166,75.2,1.6087,46.2,1.5994
+ 16.0,SC-CoT,gpt-4o,2025/1/22,70.44,90.3,31.0542,86.61,8.1485,34.4,19.6538
+ 17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,70.12,85.6,0.2512,77.56,0.0445,47.2,0.186
+ 18.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,68.69,82.87,0.0,74.41,0.0,48.8,0.0
+ 19.0,IO,gpt-4o,2025/1/22,68.60,88.4,3.3463,75.59,1.1453,41.8,2.7907
+ 20.0,IO,Qwen2.5-7B-Instruct,2025/1/22,65.13,57.24,0.0,78.74,0.0,59.4,0.0
+ 21.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,65.07,73.09,0.9736,79.53,0.1746,42.6,0.2839
+ 22.0,CoT,deepseek-r1:1.5b,2025/1/23,63.90,70.66,0.0,71.65,0.0,49.4,0.0
+ 23.0,IO,Doubao-lite-32k,2025/1/7,62.85,72.02,0.0354,79.13,0.0058,37.4,0.0187
+ 24.0,PoT,Doubao-lite-32k,2025/1/7,61.29,79.61,0.0576,71.65,0.0147,32.6,0.0144
+ 25.0,ToT,Qwen2.5-72B-Instruct,2025/1/22,60.26,88.88,23.5911,81.1,3.7389,10.8,9.0421
+ 26.0,CoT,gpt-3.5-turbo,2025/1/7,59.84,78.7,0.6788,61.02,0.0957,39.8,0.3189
+ 27.0,CoT,Internllm2_5-7B,2025/1/22,59.02,77.71,0.0,52.76,0.0,46.6,0.0
+ 28.0,IO,deepseek-r1:1.5b,2025/1/22,58.95,64.14,0.0,68.9,0.0,43.8,0.0
+ 29.0,ToT,Llama-3.3-70B-Instruct,2025/1/22,58.79,91.89,20.8753,83.07,2.9404,1.4,8.2699
+ 30.0,ToT,gpt-4o,2025/1/22,58.61,91.13,86.8581,81.5,8.5295,3.2,40.8094
+ 31.0,SC-CoT,gpt-3.5-turbo,2025/1/7,58.28,79.91,3.3938,66.14,0.7888,28.8,1.9764
+ 32.0,ReAct-Pro*,gpt-4o,2025/1/22,58.26,63.31,39.0751,57.48,2.304,54.0,17.7735
+ 33.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,55.51,58.83,0.0,68.11,0.0,39.6,0.0
+ 34.0,PoT,gpt-3.5-turbo,2025/1/7,55.04,76.88,0.6902,59.45,0.1748,28.8,0.168
+ 35.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,54.43,74.91,3.4633,64.57,0.4928,23.8,2.0406
+ 36.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,54.37,73.46,0.0,59.45,0.0,30.2,0.0
+ 37.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,53.96,75.44,0.0,60.63,0.0,25.8,0.0
+ 38.0,SC-CoT,deepseek-r1:1.5b,2025/2/10,50.80,55.34,0.0,59.06,0.0,38.0,0.0
+ 39.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,50.70,67.78,0.0,55.51,0.0,28.8,0.0
+ 40.0,IO,Llama-3.1-8B-Instruct,2025/1/22,48.98,57.16,0.0,51.18,0.0,38.6,0.0
+ 41.0,ToT,gpt-3.5-turbo,2025/1/7,44.94,67.93,9.1707,57.09,1.1513,9.8,5.2914
+ 42.0,ToT,Qwen2.5-7B-Instruct,2025/1/22,42.52,72.21,0.0,53.94,0.0,1.4,0.0
+ 43.0,ToT,Llama-3.1-8B-Instruct,2025/1/22,41.97,65.05,0.0,59.06,0.0,1.8,0.0
+ 44.0,ReAct-Pro*,deepseek-r1:1.5b,2025/2/10,38.22,35.94,0.0,54.33,0.0,24.4,0.0
+ 45.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,37.08,55.5,0.0,40.55,0.0,15.2,0.0
+ 46.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,33.56,38.67,0.0,36.61,0.0,25.4,0.0
+ 47.0,SC-CoT,Internllm2_5-7B,2025/1/22,32.46,48.22,0.0,39.37,0.0,9.8,0.0
+ 48.0,IO,gpt-3.5-turbo,2025/1/7,31.34,37.83,0.3328,38.98,0.038,17.2,0.2436
+ 49.0,PoT,Internllm2_5-7B,2025/1/22,29.94,38.21,0.0,36.61,0.0,15.0,0.0
+ 50.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,29.75,33.51,0.0,40.94,0.0,14.8,0.0
+ 51.0,ToT,Doubao-lite-32k,2025/1/7,28.10,37.83,0.8739,45.28,0.0881,1.2,0.2371
+ 52.0,IO,Internllm2_5-7B,2025/1/22,27.35,11.6,0.0,47.64,0.0,22.8,0.0
+ 53.0,CoT,Qwen2-0.5B-Instruct,2025/1/22,25.07,35.94,0.0,33.07,0.0,6.2,0.0
+ 54.0,PoT,deepseek-r1:1.5b,2025/2/10,22.54,11.9,0.0,54.72,0.0,1.0,0.0
+ 55.0,ReAct-Pro*,Qwen2-1.5B-Instruct,2025/1/22,19.55,24.87,0.0,25.59,0.0,8.2,0.0
+ 56.0,ToT,Internllm2_5-7B,2025/1/22,18.96,20.85,0.0,35.83,0.0,0.2,0.0
+ 57.0,IO,Qwen2-1.5B-Instruct,2025/1/22,17.60,16.68,0.0,29.13,0.0,7.0,0.0
+ 58.0,ToT,Qwen2-1.5B-Instruct,2025/1/22,17.31,19.64,0.0,31.5,0.0,0.8,0.0
+ 59.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,16.67,18.5,0.0,30.71,0.0,0.8,0.0
+ 60.0,ToT,deepseek-r1:1.5b,2025/2/10,16.11,23.12,0.0,24.8,0.0,0.4,0.0
+ 61.0,IO,Qwen2-0.5B-Instruct,2025/1/22,14.83,14.71,0.0,27.17,0.0,2.6,0.0
+ 62.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,13.06,11.75,0.0,23.62,0.0,3.8,0.0
+ 63.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,10.76,7.66,0.0,24.02,0.0,0.6,0.0
+ 64.0,ToT,Qwen2-0.5B-Instruct,2025/1/22,9.97,0.0,0.0,29.92,0.0,0.0,0.0
+ 65.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,8.98,9.63,0.0,17.32,0.0,0.0,0.0
+ 66.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,8.43,1.67,0.0,22.83,0.0,0.8,0.0
|
src/record.csv
CHANGED
@@ -1,148 +1,199 @@
- Algorithm,Dataset,Eval Date,LLM,Score,Pass rate,X-shot,Parameters,Samples,Total input tokens,Average input tokens,Total output tokens,Average output tokens,All tokens,Cost($),Note
- IO,gsm8k,2025/1/7,gpt-3.5-turbo,37.83,99.92,8,,
- IO,gsm8k,2025/1/7,Doubao-lite-32k,72.02,99.92,8,,
- IO,gsm8k,2025/1/22,gpt-4o,88.
- IO,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,86.58,100,8,,
- IO,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,92.27,100,8,,
- IO,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,57.24,100,8,,
- IO,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,57.16,99.55,8,,
- IO,gsm8k,2025/1/22,Internllm2_5-7B,11.
- IO,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,16.68,100,8,,
- IO,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,14.71,100,8,,
- ReAct-Pro*,gsm8k,2025/1/7,
- ReAct-Pro*,gsm8k,2025/1/
- ReAct-Pro*,gsm8k,2025/1/22,
- ReAct-Pro*,gsm8k,2025/1/22,
- ReAct-Pro*,gsm8k,2025/1/22,
- ReAct-Pro*,gsm8k,2025/1/22,
- ReAct-Pro*,gsm8k,2025/1/22,
- ReAct-Pro*,gsm8k,2025/1/22,
- ReAct-Pro*,gsm8k,2025/1/22,Qwen2-
- PoT,gsm8k,2025/1/
- PoT,gsm8k,2025/1/
- PoT,gsm8k,2025/1/22,
- PoT,gsm8k,2025/1/22,Qwen2.5-
- PoT,gsm8k,2025/1/22,Llama-3.
- PoT,gsm8k,2025/1/22,
- PoT,gsm8k,2025/1/22,
- PoT,gsm8k,2025/1/22,
- CoT,gsm8k,2025/1/
- CoT,gsm8k,2025/1/
- CoT,gsm8k,2025/1/22,
- CoT,gsm8k,2025/1/22,
- CoT,gsm8k,2025/1/22,
- CoT,gsm8k,2025/1/22,Qwen2
- CoT,gsm8k,2025/1/22,
- SC-CoT,gsm8k,2025/1/
- SC-CoT,gsm8k,2025/1/
- SC-CoT,gsm8k,2025/1/22,
- SC-CoT,gsm8k,2025/1/22,
- SC-CoT,gsm8k,2025/1/22,
- SC-CoT,gsm8k,2025/1/22,Qwen2
Algorithm,Dataset,Eval Date,LLM,Score,Pass rate,X-shot,Parameters,Samples,Total input tokens,Average input tokens,Total output tokens,Average output tokens,All tokens,Cost($),Note,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
2 |
+
IO,gsm8k,2025/1/7,gpt-3.5-turbo,37.83,99.92,8,,"1,319","546,990",415,"39,563",30,"586,553",0.3328,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
3 |
+
IO,gsm8k,2025/1/7,Doubao-lite-32k,72.02,99.92,8,,"1,319","617,377",468,"123,106",93,"740,483",0.0354,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
4 |
+
IO,gsm8k,2025/1/22,gpt-4o,88.40,100.00,8,,"1,319","542,416",411,"199,030",151,"741,446",3.3463,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
5 |
+
IO,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,86.58,100.00,8,,"1,319","555,340",421,"313,720",238,"869,060",0.4899,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
6 |
+
IO,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,92.27,100.00,8,,"1,319","583,916",443,"251,359",191,"835,275",0.4709,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
7 |
+
IO,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,57.24,100.00,8,,"1,319","596,229",452,"291,684",221,"887,913",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
8 |
+
IO,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,57.16,99.55,8,,"1,319","550,941",418,"1,194,488",906,"1,745,429",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
9 |
+
IO,gsm8k,2025/1/22,Internllm2_5-7B,11.60,97.95,8,,"1,319","679,302",515,"434,426",329,"1,113,728",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
10 |
+
IO,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,16.68,100.00,8,,"1,319","568,530",431,"168,466",128,"736,996",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
11 |
+
IO,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,14.71,100.00,8,,"1,319","568,116",431,"266,781",202,"834,897",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
12 |
+
IO,gsm8k,2025/1/22,deepseek-r1:1.5b,64.14,99.62,8,,"1,319","561,935",426,"921,116",698,"1,483,051",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
13 |
+
ReAct-Pro*,gsm8k,2025/1/7,gpt-3.5-turbo,74.91,99.39,8,max_steps=10,"1,319","6,506,164","4,933","140,122",106,"6,646,286",3.4633,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
14 |
+
ReAct-Pro*,gsm8k,2025/1/7,Doubao-lite-32k,85.60,99.62,8,max_steps=10,"1,319","5,862,016","4,444","136,623",104,"5,998,639",0.2512,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
15 |
+
ReAct-Pro*,gsm8k,2025/1/22,gpt-4o,63.31,99.55,8,max_steps=10,"1,319","14,411,173","10,926","304,714",231,"14,715,887",39.0751,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
16 |
+
ReAct-Pro*,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,87.26,100.00,8,max_steps=10,"1,319","18,160,983","13,769","549,454",417,"18,710,437",10.5479,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
17 |
+
ReAct-Pro*,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,87.64,99.92,8,max_steps=10,"1,319","17,038,928","12,918","898,936",682,"17,937,864",10.1124,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
18 |
+
ReAct-Pro*,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,82.87,100.00,8,max_steps=10,"1,319","14,355,752","10,884","495,162",375,"14,850,914",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
19 |
+
ReAct-Pro*,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,67.78,98.56,8,max_steps=10,"1,319","21,044,978","15,955","1,790,789","1,358","22,835,767",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
20 |
+
ReAct-Pro*,gsm8k,2025/1/22,Internllm2_5-7B,33.51,97.95,8,max_steps=10,"1,319","30,120,070","22,836","5,549,919","4,208","35,669,989",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
21 |
+
ReAct-Pro*,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,24.87,80.21,8,max_steps=10,"1,319","9,133,603","6,925","694,398",526,"9,828,001",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
22 |
+
ReAct-Pro*,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,7.66,95.22,8,max_steps=10,"1,319","52,431,343","39,751","2,961,268","2,245","55,392,611",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
23 |
+
ReAct-Pro*,gsm8k,2025/2/10,deepseek-r1:1.5b,35.94,99.62,8,max_steps=10,"1,319","19,299,381","14,632","4,919,696","3,730","24,219,077",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
24 |
+
PoT,gsm8k,2025/1/7,gpt-3.5-turbo,76.88,99.24,8,,"1,319","1,090,418",827,"96,662",73,"1,187,080",0.6902,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
25 |
+
PoT,gsm8k,2025/1/7,Doubao-lite-32k,79.61,92.57,8,,"1,319","1,170,038",887,"118,017",89,"1,288,055",0.0576,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
26 |
+
PoT,gsm8k,2025/1/22,gpt-4o,93.10,99.77,8,,"1,319","1,101,672",835,"146,240",111,"1,247,912",4.2166,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
27 |
+
PoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,92.34,99.39,8,,"1,319","1,106,682",839,"144,528",110,"1,251,210",0.7054,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
28 |
+
PoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,73.09,79.61,8,,"1,319","1,126,025",854,"601,019",456,"1,727,044",0.9736,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
29 |
+
PoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,58.83,70.51,8,,"1,319","1,145,390",868,"217,432",165,"1,362,822",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
30 |
+
PoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,38.67,55.42,8,,"1,319","1,147,538",870,"243,573",185,"1,391,111",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
31 |
+
PoT,gsm8k,2025/1/22,Internllm2_5-7B,38.21,48.90,8,,"1,319","1,136,843",862,"188,106",143,"1,324,949",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
32 |
+
PoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,18.50,31.01,8,,"1,319","1,151,528",873,"175,994",133,"1,327,522",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
33 |
+
PoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,9.63,16.91,8,,"1,319","1,151,528",873,"237,607",180,"1,389,135",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
34 |
+
PoT,gsm8k,2025/2/10,deepseek-r1:1.5b,11.90,17.44,8,,"1,319","1,138,872",863,"815,637",618,"1,954,509",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
35 |
+
CoT,gsm8k,2025/1/7,gpt-3.5-turbo,78.70,100.00,8,,"1,319","953,242",723,"134,799",102,"1,088,041",0.6788,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
36 |
+
CoT,gsm8k,2025/1/7,Doubao-lite-32k,89.31,100.00,8,,"1,319","1,042,095",790,"159,725",121,"1,201,820",0.0558,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
37 |
+
CoT,gsm8k,2025/1/22,gpt-4o,94.09,100.00,8,,"1,319","948,668",719,"216,498",164,"1,165,166",4.5367,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
38 |
+
CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,92.87,100.00,8,,"1,319","1,005,119",762,"271,133",206,"1,276,252",0.7195,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
39 |
+
CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,93.93,100.00,8,,"1,319","990,168",751,"228,497",173,"1,218,665",0.6870,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
40 |
+
CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,85.67,100.00,8,,"1,319","1,046,008",793,"244,797",186,"1,290,805",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
41 |
+
CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,75.44,99.92,8,,"1,319","990,168",751,"258,161",196,"1,248,329",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
42 |
+
CoT,gsm8k,2025/1/22,Internllm2_5-7B,77.71,99.70,8,,"1,319","968,163",734,"234,000",177,"1,202,163",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
43 |
+
CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.50,100.00,8,,"1,319","1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
44 |
+
CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,"1,319","1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
45 |
+
CoT,gsm8k,2025/1/23,deepseek-r1:1.5b,70.66,99.77,8,,"1,319","1,011,714",767,"1,078,911",818,"2,090,625",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
46 |
+
SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,79.91,99.92,8,"temperature=1, path_num=5","1,319","2,740,652","2,078","1,348,960","1,023","4,089,612",3.3938,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
47 |
+
SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,87.26,99.92,8,"temperature=1, path_num=5","1,319","2,691,714","2,041","1,197,099",908,"3,888,813",0.2083,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
48 |
+
SC-CoT,gsm8k,2025/1/22,gpt-4o,90.30,99.92,8,"temperature=1, path_num=5","1,319","3,590,336","2,722","2,207,837","1,674","5,798,173",31.0542,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
49 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,93.86,100.00,8,"temperature=1, path_num=5","1,319","8,136,223","6,168","2,481,785","1,882","10,618,008",5.9858,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
50 |
+
SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.07,100.00,8,"temperature=1, path_num=5","1,319","8,413,717","6,379","2,585,077","1,960","10,998,794",6.2005,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
51 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,91.13,100.00,8,"temperature=1, path_num=5","1,319","8,586,888","6,510","2,554,097","1,936","11,140,985",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
52 |
+
SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,73.46,99.55,8,"temperature=1, path_num=5","1,319","8,630,514","6,543","3,148,202","2,387","11,778,716",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
53 |
+
SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,48.22,98.41,8,"temperature=1, path_num=5","1,319","10,678,792","8,096","3,847,639","2,917","14,526,431",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
54 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,11.75,91.89,8,"temperature=1, path_num=5","1,319","9,066,115","6,873","3,345,827","2,537","12,411,942",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
55 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,1.67,94.69,8,"temperature=1, path_num=5","1,319","11,019,864","8,355","5,445,856","4,129","16,465,720",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
56 |
+
SC-CoT,gsm8k,2025/2/10,deepseek-r1:1.5b,55.34,99.70,8,"temperature=1, path_num=5","1,319","14,540,096","11,024","11,245,769","8,526","25,785,865",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
57 |
+
ToT,gsm8k,2025/1/7,gpt-3.5-turbo,67.93,99.70,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","15,920,037","12,070","807,138",612,"16,727,175",9.1707,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
58 |
+
ToT,gsm8k,2025/1/7,Doubao-lite-32k,37.83,87.34,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","19,208,597","14,563","1,065,752",808,"20,274,349",0.8739,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
59 |
+
ToT,gsm8k,2025/1/22,gpt-4o,91.13,100.00,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","29,445,237","22,324","1,324,498","1,004","30,769,735",86.8581,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
60 |
+
ToT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,88.88,100.00,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","40,435,361","30,656","1,411,787","1,070","41,847,148",23.5911,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
61 |
+
ToT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,91.89,100.00,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","35,096,810","26,609","1,932,877","1,465","37,029,687",20.8753,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
62 |
+
ToT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,72.21,99.01,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","20,196,528","15,312","11,460,791","8,689","31,657,319",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
63 |
+
ToT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,65.05,91.96,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","15,554,967","11,793","877,135",665,"16,432,102",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
64 |
+
ToT,gsm8k,2025/1/22,Internllm2_5-7B,20.85,70.13,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","11,768,118","8,922","1,410,011","1,069","13,178,129",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
65 |
+
ToT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,19.64,77.26,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","12,124,248","9,192","634,439",481,"12,758,687",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
66 |
+
ToT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,-,-,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319",-,-,-,-,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
67 |
+
ToT,gsm8k,2025/2/10,deepseek-r1:1.5b,23.12,72.48,8,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true","1,319","2,738,244","2,076","683,242",518,"3,421,486",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
68 |
+
IO,AQuA,2025/1/7,gpt-3.5-turbo,38.98,100.00,0,,254,"25,701",101,"16,770",66,"42,471",0.0380,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
69 |
+
IO,AQuA,2025/1/7,Doubao-lite-32k,79.13,100.00,0,,254,"33,058",130,"54,684",215,"87,742",0.0058,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
70 |
+
IO,AQuA,2025/1/22,gpt-4o,75.59,97.24,0,,254,"25,631",101,"108,121",426,"133,752",1.1453,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
71 |
+
IO,AQuA,2025/1/22,Qwen2.5-72B-Instruct,84.25,99.61,0,,254,"25,397",100,"106,207",418,"131,604",0.0742,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
72 |
+
IO,AQuA,2025/1/22,Llama-3.3-70B-Instruct,82.68,99.21,0,,254,"32,809",129,"108,758",428,"141,567",0.0798,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
73 |
+
IO,AQuA,2025/1/22,Qwen2.5-7B-Instruct,78.74,98.43,0,,254,"33,271",131,"104,500",411,"137,771",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
74 |
+
IO,AQuA,2025/1/22,Llama-3.1-8B-Instruct,51.18,98.82,0,,254,"26,459",104,"106,647",420,"133,106",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
75 |
+
IO,AQuA,2025/1/22,Internllm2_5-7B,47.64,90.94,0,,254,"50,232",198,"134,809",531,"185,041",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
76 |
+
IO,AQuA,2025/1/22,Qwen2-1.5B-Instruct,29.13,97.64,0,,254,"27,937",110,"43,110",170,"71,047",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
77 |
+
IO,AQuA,2025/1/22,Qwen2-0.5B-Instruct,27.17,98.82,0,,254,"27,937",110,"82,478",325,"110,415",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
78 |
+
IO,AQuA,2025/1/22,deepseek-r1:1.5b,68.90,94.88,0,,254,"26,667",105,"325,100","1,280","351,767",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
79 |
+
CoT,AQuA,2025/1/7,gpt-3.5-turbo,61.02,93.70,0,,254,"25,447",100,"55,346",218,"80,793",0.0957,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
80 |
+
CoT,AQuA,2025/1/7,Doubao-lite-32k,82.68,97.24,0,,254,"27,978",110,"66,599",262,"94,577",0.0066,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
81 |
+
CoT,AQuA,2025/1/22,gpt-4o,82.68,98.03,0,,254,"25,123",99,"97,894",385,"123,017",1.0417,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
82 |
+
CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,86.22,99.21,0,,254,"25,143",99,"118,146",465,"143,289",0.0808,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
83 |
+
CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,83.46,98.43,0,,254,"32,555",128,"131,834",519,"164,389",0.0927,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
84 |
+
CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,80.71,99.61,0,,254,"33,017",130,"116,719",460,"149,736",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
85 |
+
CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,60.63,100.00,0,,254,"32,555",128,"111,880",440,"144,435",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
86 |
+
CoT,AQuA,2025/1/22,Internllm2_5-7B,52.76,89.37,0,,254,"26,610",105,"100,910",397,"127,520",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
87 |
+
CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,40.55,98.82,0,,254,"30,477",120,"79,563",313,"110,040",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
88 |
+
CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,33.07,98.82,0,,254,"30,477",120,"86,862",342,"117,339",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
89 |
+
CoT,AQuA,2025/1/23,deepseek-r1:1.5b,71.65,96.85,0,,254,"26,413",104,"306,659","1,207","333,072",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
90 |
+
PoT,AQuA,2025/1/7,gpt-3.5-turbo,59.45,100.00,0,,254,"225,162",886,"41,492",163,"266,654",0.1748,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
91 |
+
PoT,AQuA,2025/1/7,Doubao-lite-32k,71.65,96.85,0,,254,"259,863","1,023","49,573",195,"309,436",0.0147,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
92 |
+
PoT,AQuA,2025/1/22,gpt-4o,75.20,100.00,0,,254,"222,717",877,"105,191",414,"327,908",1.6087,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
93 |
+
PoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,75.20,100.00,0,,254,"249,215",981,"42,549",168,"291,764",0.1645,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
94 |
+
PoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,79.53,99.21,0,,254,"240,735",948,"69,064",272,"309,799",0.1746,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
95 |
+
PoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,68.11,100.00,0,,254,"264,517","1,041","49,211",194,"313,728",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
96 |
+
PoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,36.61,96.85,0,,254,"240,613",947,"50,301",198,"290,914",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
97 |
+
PoT,AQuA,2025/1/22,Internllm2_5-7B,36.61,98.82,0,,254,"233,505",919,"68,457",270,"301,962",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
98 |
+
PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
99 |
+
PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
100 |
+
PoT,AQuA,2025/2/10,deepseek-r1:1.5b,54.72,97.24,0,,254,"250,690",987,"765,957","3,016","1,016,647",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
101 |
+
SC-CoT,AQuA,2025/1/22,gpt-3.5-turbo,66.14,99.21,0,"temperature=1, path_num=5",254,"482,192","1,898","365,143","1,438","847,335",0.7888,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
102 |
+
SC-CoT,AQuA,2025/1/22,Doubao-lite-32k,81.10,97.24,0,"temperature=1, path_num=5",254,"503,751","1,983","382,235","1,505","885,986",0.0519,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
103 |
+
SC-CoT,AQuA,2025/1/22,gpt-4o,86.61,98.82,0,"temperature=1, path_num=5",254,"744,478","2,931","628,728","2,475","1,373,206",8.1485,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
104 |
+
SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.04,99.21,0,"temperature=1, path_num=5",254,"1,051,218","4,139","784,451","3,088","1,835,669",1.0348,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
105 |
+
SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,82.28,99.21,0,"temperature=1, path_num=5",254,"1,135,251","4,469","772,673","3,042","1,907,924",1.0756,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
106 |
+
SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,79.92,100.00,0,"temperature=1, path_num=5",254,"1,098,280","4,324","747,052","2,941","1,845,332",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
107 |
+
SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,59.45,97.24,0,"temperature=1, path_num=5",254,"971,003","3,823","680,330","2,678","1,651,333",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
108 |
+
SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,39.37,98.03,0,"temperature=1, path_num=5",254,"1,420,494","5,592","875,728","3,448","2,296,222",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
109 |
+
SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,23.62,96.46,0,"temperature=1, path_num=5",254,"1,034,362","4,072","740,973","2,917","1,775,335",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
110 |
+
SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,22.83,97.24,0,"temperature=1, path_num=5",254,"1,246,929","4,909","968,162","3,812","2,215,091",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
111 |
+
SC-CoT,AQuA,2025/2/10,deepseek-r1:1.5b,59.06,96.85,0,"temperature=1, path_num=5",254,"2,547,772","10,031","3,254,939","12,815","5,802,711",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
112 |
+
ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
113 |
+
ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
114 |
+
ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
115 |
+
ReAct-Pro*,AQuA,2025/1/22,Qwen2.5-72B-Instruct,73.23,100.00,0,max_steps=10,254,"441,765","1,739","121,838",480,"563,603",0.3177,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
116 |
+
ReAct-Pro*,AQuA,2025/1/22,Llama-3.3-70B-Instruct,79.13,99.61,0,max_steps=10,254,"1,119,143","4,406","243,236",958,"1,362,379",0.7680,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
117 |
+
ReAct-Pro*,AQuA,2025/1/22,Qwen2.5-7B-Instruct,74.41,99.21,0,max_steps=10,254,"564,165","2,221","131,679",518,"695,844",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
118 |
+
ReAct-Pro*,AQuA,2025/1/22,Llama-3.1-8B-Instruct,55.51,96.85,0,max_steps=10,254,"3,764,723","14,822","576,098","2,268","4,340,821",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
119 |
+
ReAct-Pro*,AQuA,2025/1/22,Internllm2_5-7B,40.94,96.85,0,max_steps=10,254,"3,592,039","14,142","836,762","3,294","4,428,801",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
120 |
+
ReAct-Pro*,AQuA,2025/1/22,Qwen2-1.5B-Instruct,25.59,96.06,0,max_steps=10,254,"4,555,858","17,936","516,146","2,032","5,072,004",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
121 |
+
ReAct-Pro*,AQuA,2025/1/22,Qwen2-0.5B-Instruct,24.02,96.85,0,max_steps=10,254,"6,344,167","24,977","825,920","3,252","7,170,087",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
122 |
+
ReAct-Pro*,AQuA,2025/2/10,deepseek-r1:1.5b,54.33,96.46,0,max_steps=10,254,"10,578,715","41,648","3,866,326","15,222","14,445,041",0.0000,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,,,,,,,,,
|
123 |
+
ToT,AQuA,2025/1/7,gpt-3.5-turbo,57.09,99.61,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"1,850,767","7,286","150,629",593,"2,001,396",1.1513,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
124 |
+
ToT,AQuA,2025/1/7,Doubao-lite-32k,45.28,74.02,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"1,850,249","7,284","150,301",592,"2,000,550",0.0881,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
125 |
+
ToT,AQuA,2025/1/22,gpt-4o,81.50,99.21,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"2,347,538","9,242","266,069","1,048","2,613,607",8.5295,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
126 |
+
ToT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,81.10,99.21,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"6,371,642","25,085","260,613","1,026","6,632,255",3.7389,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
127 |
+
ToT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,83.07,100.00,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"4,735,188","18,642","480,660","1,892","5,215,848",2.9404,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
128 |
+
ToT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,53.94,100.00,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"8,224,468","32,380","378,214","1,489","8,602,682",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
129 |
+
ToT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,59.06,100.00,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"4,896,222","19,276","843,462","3,321","5,739,684",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
130 |
+
ToT,AQuA,2025/1/22,Internllm2_5-7B,35.83,99.61,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"4,263,136","16,784","471,424","1,856","4,734,560",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
131 |
+
ToT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,31.50,98.82,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"6,058,022","23,850","192,680",759,"6,250,702",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
132 |
+
ToT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,29.92,100.00,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"8,100,085","31,890","600,196","2,363","8,700,281",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
133 |
+
ToT,AQuA,2025/2/10,deepseek-r1:1.5b,24.80,55.51,0,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",254,"605,028","2,382","189,484",746,"794,512",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
134 |
+
IO,MATH-500,2025/1/24,gpt-3.5-turbo,17.20,100.00,4,,500,"154,881",310,"110,744",221,"265,625",0.2436,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
135 |
+
IO,MATH-500,2025/1/24,Doubao-lite-32k,37.40,100.00,4,,500,"166,870",334,"144,860",290,"311,730",0.0187,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
136 |
+
IO,MATH-500,2025/1/22,gpt-4o,41.80,100.00,4,,500,"153,832",308,"240,615",481,"394,447",2.7907,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
137 |
+
IO,MATH-500,2025/1/24,Qwen2.5-72B-Instruct,70.20,100.00,4,,500,"169,549",339,"275,042",550,"444,591",0.2506,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
138 |
+
IO,MATH-500,2025/1/24,Llama-3.3-70B-Instruct,69.40,100.00,4,,500,"155,879",312,"267,337",535,"423,216",0.2386,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
139 |
+
IO,MATH-500,2025/1/24,Qwen2.5-7B-Instruct,59.40,100.00,4,,500,"169,549",339,"241,813",484,"411,362",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
140 |
+
IO,MATH-500,2025/1/24,Llama-3.1-8B-Instruct,38.60,100.00,4,,500,"155,563",311,"348,371",697,"503,934",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
141 |
+
IO,MATH-500,2025/1/24,Internllm2_5-7B,22.80,100.00,4,,500,"201,883",404,"266,005",532,"467,888",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
142 |
+
IO,MATH-500,2025/1/24,Qwen2-1.5B-Instruct,7.00,100.00,4,,500,"158,777",318,"255,101",510,"413,878",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
143 |
+
IO,MATH-500,2025/1/24,Qwen2-0.5B-Instruct,2.60,100.00,4,,500,"159,049",318,"270,281",541,"429,330",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
144 |
+
IO,MATH-500,2025/1/24,deepseek-r1:1.5b,43.80,100.00,4,,500,"157,049",314,"865,499","1,731","1,022,548",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
145 |
+
CoT,MATH-500,2025/1/24,gpt-3.5-turbo,39.80,100.00,4,,500,"329,381",659,"102,815",206,"432,196",0.3189,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
146 |
+
CoT,MATH-500,2025/1/22,Doubao-lite-32k,59.00,100.00,4,,500,"336,370",673,"143,571",287,"479,941",0.0255,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
147 |
+
CoT,MATH-500,2025/1/24,gpt-4o,68.00,100.00,4,,500,"329,332",659,"223,356",447,"552,688",3.0569,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
148 |
+
CoT,MATH-500,2025/1/22,Qwen2.5-72B-Instruct,80.20,100.00,4,,500,"338,549",677,"280,466",561,"619,015",0.3490,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
149 |
+
CoT,MATH-500,2025/1/24,Llama-3.3-70B-Instruct,71.20,100.00,4,,500,"342,879",686,"271,342",543,"614,221",0.3463,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
150 |
+
CoT,MATH-500,2025/1/24,Qwen2.5-7B-Instruct,69.80,100.00,4,,500,"354,049",708,"263,155",526,"617,204",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
151 |
+
CoT,MATH-500,2025/1/24,Llama-3.1-8B-Instruct,25.80,100.00,4,,500,"342,879",686,"282,689",565,"625,568",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
152 |
+
CoT,MATH-500,2025/1/24,Internllm2_5-7B,46.60,100.00,4,,500,"332,883",666,"213,891",428,"546,774",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
153 |
+
CoT,MATH-500,2025/1/24,Qwen2-1.5B-Instruct,15.20,100.00,4,,500,"349,049",698,"187,328",375,"536,377",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
154 |
+
CoT,MATH-500,2025/1/24,Qwen2-0.5B-Instruct,6.20,100.00,4,,500,"349,049",698,"200,139",400,"549,188",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
155 |
+
CoT,MATH-500,2025/1/24,deepseek-r1:1.5b,49.40,100.00,4,,500,"341,549",683,"857,580","1,715","1,199,129",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
156 |
+
PoT,MATH-500,2025/2/10,gpt-3.5-turbo,28.80,83.80,4,,500,"239,902",480,"32,014",64,"271,916",0.1680,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
157 |
+
PoT,MATH-500,2025/2/10,Doubao-lite-32k,32.60,68.00,4,,500,"254,377",509,"48,771",98,"303,148",0.0144,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
158 |
+
PoT,MATH-500,2025/2/10,gpt-4o,46.20,86.40,4,,500,"241,357",483,"99,603",199,"340,960",1.5994,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
159 |
+
PoT,MATH-500,2025/2/10,Qwen2.5-72B-Instruct,47.20,82.20,4,,500,"242,549",485,"170,823",342,"413,372",0.2330,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
160 |
+
PoT,MATH-500,2025/2/10,Llama-3.3-70B-Instruct,42.60,80.20,4,,500,"253,879",508,"249,717",499,"503,596",0.2839,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
161 |
+
PoT,MATH-500,2025/2/10,Qwen2.5-7B-Instruct,39.60,74.40,4,,500,"258,549",517,"150,263",301,"408,812",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
162 |
+
PoT,MATH-500,2025/2/10,Llama-3.1-8B-Instruct,25.40,68.40,4,,500,"253,879",508,"208,392",417,"462,271",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
163 |
+
PoT,MATH-500,2025/2/10,Internllm2_5-7B,15.00,32.40,4,,500,"247,883",496,"120,826",242,"368,709",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
164 |
+
PoT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,0.80,2.20,4,,500,"248,509",497,"538,361","1,077","786,870",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
165 |
+
PoT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.00,0.00,4,,500,"253,549",507,"183,653",367,"437,202",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
166 |
+
PoT,MATH-500,2025/2/10,deepseek-r1:1.5b,1.00,1.60,4,,500,"245,549",491,"785,518","1,571","1,031,067",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
167 |
+
SC-CoT,MATH-500,2025/2/10,gpt-3.5-turbo,28.80,100.00,4,"temperature=1, path_num=5",500,"1,381,818","2,764","856,994","1,714","2,238,812",1.9764,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
168 |
+
SC-CoT,MATH-500,2025/2/10,Doubao-lite-32k,49.20,100.00,4,"temperature=1, path_num=5",500,"1,507,651","3,015","963,159","1,926","2,470,810",0.1406,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
169 |
+
SC-CoT,MATH-500,2025/2/10,gpt-4o,34.40,100.00,4,"temperature=1, path_num=5",500,"1,986,584","3,973","1,468,739","2,937","3,455,323",19.6538,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
170 |
+
SC-CoT,MATH-500,2025/2/10,Qwen2.5-72B-Instruct,74.00,100.00,4,"temperature=1, path_num=5",500,"3,823,997","7,648","1,773,516","3,547","5,597,513",3.1556,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
171 |
+
SC-CoT,MATH-500,2025/2/10,Llama-3.3-70B-Instruct,74.20,100.00,4,"temperature=1, path_num=5",500,"3,959,492","7,919","1,759,247","3,518","5,718,739",3.2239,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
172 |
+
SC-CoT,MATH-500,2025/2/10,Qwen2.5-7B-Instruct,67.00,100.00,4,"temperature=1, path_num=5",500,"3,833,751","7,668","1,617,733","3,235","5,451,484",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
173 |
+
SC-CoT,MATH-500,2025/2/10,Llama-3.1-8B-Instruct,30.20,100.00,4,"temperature=1, path_num=5",500,"3,546,673","7,093","1,488,264","2,977","5,034,937",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
174 |
+
SC-CoT,MATH-500,2025/2/10,Internllm2_5-7B,9.80,100.00,4,"temperature=1, path_num=5",500,"4,193,296","8,387","1,645,170","3,290","5,838,466",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
175 |
+
SC-CoT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,3.80,99.00,4,"temperature=1, path_num=5",500,"3,832,429","7,665","1,737,013","3,474","5,569,442",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
176 |
+
SC-CoT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.80,100.00,4,"temperature=1, path_num=5",500,"4,448,663","8,897","2,413,393","4,827","6,862,056",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
177 |
+
SC-CoT,MATH-500,2025/2/10,deepseek-r1:1.5b,38.00,100.00,4,"temperature=1, path_num=5",500,"7,080,559","14,161","7,661,550","15,323","14,742,109",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
178 |
+
ReAct-Pro*,MATH-500,2025/2/10,gpt-3.5-turbo,23.80,100.00,4,max_steps=10,500,"3,708,461","7,417","124,253",249,"3,832,714",2.0406,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
179 |
+
ReAct-Pro*,MATH-500,2025/2/10,Doubao-lite-32k,47.20,100.00,4,max_steps=10,500,"4,234,620","8,469","154,046",308,"4,388,666",0.1860,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
180 |
+
ReAct-Pro*,MATH-500,2025/2/10,gpt-4o,54.00,100.00,4,max_steps=10,500,"5,834,537","11,669","318,718",637,"6,153,255",17.7735,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
181 |
+
ReAct-Pro*,MATH-500,2025/2/10,Qwen2.5-72B-Instruct,62.80,100.00,4,max_steps=10,500,"5,747,268","11,495","379,849",760,"6,127,117",3.4541,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
182 |
+
ReAct-Pro*,MATH-500,2025/2/10,Llama-3.3-70B-Instruct,64.60,100.00,4,max_steps=10,500,"5,223,611","10,447","418,268",837,"5,641,879",3.1806,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
183 |
+
ReAct-Pro*,MATH-500,2025/2/10,Qwen2.5-7B-Instruct,48.80,100.00,4,max_steps=10,500,"4,646,708","9,293","343,532",687,"4,990,240",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
184 |
+
ReAct-Pro*,MATH-500,2025/2/10,Llama-3.1-8B-Instruct,28.80,100.00,4,max_steps=10,500,"7,486,706","14,973","1,276,923","2,554","8,763,629",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
185 |
+
ReAct-Pro*,MATH-500,2025/2/10,Internllm2_5-7B,14.80,100.00,4,max_steps=10,500,"11,831,496","23,663","2,354,609","4,709","14,186,105",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
186 |
+
ReAct-Pro*,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,8.20,100.00,4,max_steps=10,500,"8,430,774","16,862","556,287","1,113","8,987,061",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
187 |
+
ReAct-Pro*,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.60,100.00,4,max_steps=10,500,"18,137,392","36,275","1,305,048","2,610","19,442,440",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
188 |
+
ReAct-Pro*,MATH-500,2025/2/10,deepseek-r1:1.5b,24.40,100.00,4,max_steps=10,500,"20,729,970","41,460","9,447,378","18,895","30,177,348",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
189 |
+
ToT,MATH-500,2025/2/10,gpt-3.5-turbo,9.80,100.00,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"9,711,244","19,422","290,523",581,"10,001,767",5.2914,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
190 |
+
ToT,MATH-500,2025/2/10,Doubao-lite-32k,1.20,94.20,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"5,338,500","10,677","226,000",452,"5,564,500",0.2371,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
191 |
+
ToT,MATH-500,2025/2/10,gpt-4o,3.20,100.00,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"14,881,985","29,764","360,447",721,"15,242,432",40.8094,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
192 |
+
ToT,MATH-500,2025/2/10,Qwen2.5-72B-Instruct,10.80,100.00,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"15,657,730","31,315","381,631",763,"16,039,361",9.0421,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
193 |
+
ToT,MATH-500,2025/2/10,Llama-3.3-70B-Instruct,1.40,69.80,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"14,099,500","28,199","570,000","1,140","14,669,500",8.2699,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
194 |
+
ToT,MATH-500,2025/2/10,Qwen2.5-7B-Instruct,1.40,91.60,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"9,749,000","19,498","418,500",837,"10,167,500",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
195 |
+
ToT,MATH-500,2025/2/10,Llama-3.1-8B-Instruct,1.80,90.80,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"7,729,000","15,458","1,306,000","2,612","9,035,000",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
196 |
+
ToT,MATH-500,2025/2/10,Internllm2_5-7B,0.20,99.00,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"7,515,000","15,030","835,500","1,671","8,350,500",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
197 |
+
ToT,MATH-500,2025/2/10,Qwen2-1.5B-Instruct,0.80,97.20,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"4,408,000","8,816","127,000",254,"4,535,000",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
198 |
+
ToT,MATH-500,2025/2/10,Qwen2-0.5B-Instruct,0.00,96.20,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"5,590,500","11,181","406,000",812,"5,996,500",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
199 |
+
ToT,MATH-500,2025/2/10,deepseek-r1:1.5b,0.40,71.60,4,"search_type=bfs, b=1, max_depth=6, max_steps=6, generation_n=1, evaluation_n=3, evaluation_type=vote, use_llm_completion=true",500,"1,831,000","3,662","110,500",221,"1,941,500",0.0000,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
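The new record.csv rows also carry the raw token bookkeeping behind each Cost($) figure. The sketch below is not part of the repository; it only cross-checks two relationships that appear to hold in the data, namely that "Average input tokens" is roughly "Total input tokens" divided by "Samples" (546,990 / 1,319 ≈ 415 for the first gsm8k row) and that "All tokens" is the sum of the input and output totals, with the exact rounding rule assumed.

```python
import csv

def to_int(s: str) -> int:
    # Numbers in record.csv are written with thousands separators, e.g. "1,319".
    return int(s.replace(",", ""))

with open("src/record.csv", newline="") as f:
    for row in csv.DictReader(f):
        # Skip rows without usable token counts (e.g. the ToT / Qwen2-0.5B gsm8k entry uses "-").
        if row["Total input tokens"] in ("", "-"):
            continue
        samples = to_int(row["Samples"])
        total_in = to_int(row["Total input tokens"])
        total_out = to_int(row["Total output tokens"])
        if round(total_in / samples) != to_int(row["Average input tokens"]) \
                or total_in + total_out != to_int(row["All tokens"]):
            print("inconsistent row:", row["Algorithm"], row["Dataset"], row["LLM"])
```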