Update app.py
app.py (changed):
@@ -125,7 +125,7 @@ def load_dataset_by_name(dataset_name, split="train"):
     retry=retry_if_exception_type(Exception)
 )
 
-def get_model_response(question, options, prompt_template, model_name):
+def get_model_response(question, options, prompt_template, model_name, clients):
     with semaphore:
         try:
             model_config = MODELS[model_name]
@@ -135,23 +135,21 @@ def get_model_response(question, options, prompt_template, model_name):
             provider = model_config["provider"]
 
             if provider == "togetherai":
-                response =
+                response = clients["togetherai"].chat.completions.create(
                     model=model_config["model_id"],
                     messages=[{"role": "user", "content": prompt}]
                 )
                 response_text = response.choices[0].message.content.strip()
 
             elif provider == "openai":
-                response =
+                response = clients["openai"].chat.completions.create(
                     model=model_config["model_id"],
-                    messages=[{
-                        "role": "user",
-                        "content": prompt}]
+                    messages=[{"role": "user", "content": prompt}]
                 )
                 response_text = response.choices[0].message.content.strip()
 
             elif provider == "anthropic":
-                response =
+                response = clients["anthropic"].messages.create(
                     model=model_config["model_id"],
                     messages=[{"role": "user", "content": prompt}],
                     max_tokens=4096
@@ -189,12 +187,13 @@ def evaluate_response(model_response, correct_answer):
     is_correct = model_response.lower().strip() == correct_answer.lower().strip()
     return is_correct
 
-def process_single_evaluation(question, prompt_template, model_name):
+def process_single_evaluation(question, prompt_template, model_name, clients):
     answer, response_text = get_model_response(
         question['question'],
         question['options'],
         prompt_template,
-        model_name
+        model_name,
+        clients
     )
     is_correct = evaluate_response(answer, question['correct_answer'])
     return {
@@ -210,7 +209,7 @@ def process_single_evaluation(question, prompt_template, model_name):
         'model_name': model_name
     }
 
-def process_evaluations_concurrently(questions, prompt_template, models_to_evaluate, progress_callback):
+def process_evaluations_concurrently(questions, prompt_template, models_to_evaluate, progress_callback, clients):
     results = []
     total_iterations = len(models_to_evaluate) * len(questions)
     current_iteration = 0
@@ -219,7 +218,7 @@ def process_evaluations_concurrently(questions, prompt_template, models_to_evalu
         future_to_params = {}
         for model_name in models_to_evaluate:
             for question in questions:
-                future = executor.submit(process_single_evaluation, question, prompt_template, model_name)
+                future = executor.submit(process_single_evaluation, question, prompt_template, model_name, clients)
                 future_to_params[future] = (model_name, question)
 
         for future in as_completed(future_to_params):
@@ -318,6 +317,13 @@ Important:
     with st.spinner("Starting evaluation..."):
         selected_questions = questions[:num_questions]
 
+        # Create a clients dictionary
+        clients = {
+            "togetherai": st.session_state["togetherai_client"],
+            "openai": st.session_state["openai_client"],
+            "anthropic": st.session_state["anthropic_client"]
+        }
+
        progress_container = st.container()
        progress_bar = progress_container.progress(0)
        status_text = progress_container.empty()
@@ -326,12 +332,13 @@ Important:
             progress = current / total
             progress_bar.progress(progress)
             status_text.text(f"Progress: {current}/{total} evaluations completed")
-
+
         results = process_evaluations_concurrently(
             selected_questions,
             prompt_template,
             models_to_evaluate,
-            update_progress
+            update_progress,
+            clients
         )
 
         all_results = {}
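
The updated code reads per-provider clients out of a clients dict built from st.session_state, but the creation of st.session_state["togetherai_client"], st.session_state["openai_client"], and st.session_state["anthropic_client"] is outside this diff. Below is a minimal sketch of how those entries might be populated once per session, assuming the official together, openai, and anthropic Python SDKs and API keys kept in Streamlit secrets; the secret names used here are placeholders, not taken from app.py.

import streamlit as st
from together import Together
from openai import OpenAI
from anthropic import Anthropic

def init_clients():
    # Create each provider client once and cache it in the session, so the
    # clients dict assembled later in app.py can hand them to worker threads.
    if "togetherai_client" not in st.session_state:
        st.session_state["togetherai_client"] = Together(api_key=st.secrets["TOGETHER_API_KEY"])
    if "openai_client" not in st.session_state:
        st.session_state["openai_client"] = OpenAI(api_key=st.secrets["OPENAI_API_KEY"])
    if "anthropic_client" not in st.session_state:
        st.session_state["anthropic_client"] = Anthropic(api_key=st.secrets["ANTHROPIC_API_KEY"])

init_clients()

Building the clients up front and passing the dict through process_evaluations_concurrently and process_single_evaluation means the ThreadPoolExecutor workers never touch Streamlit session state directly; each provider branch in get_model_response just picks its client out of the dict it was given.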
|