ehagey committed
Commit f5c4a1c · verified · 1 parent: 0dd6bfb

Update app.py

Files changed (1):
  1. app.py (+20 −13)
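This commit threads a `clients` dict through the evaluation pipeline: from the main script body into `process_evaluations_concurrently`, then `process_single_evaluation`, then `get_model_response`, replacing direct reads of `st.session_state.*_client` inside the worker functions. The likely motivation is that `st.session_state` is bound to Streamlit's main script thread and is not reliably available inside `ThreadPoolExecutor` workers, so the clients are resolved once up front and passed down as a plain argument (a runnable sketch of the pattern follows the diff).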
app.py CHANGED
@@ -125,7 +125,7 @@ def load_dataset_by_name(dataset_name, split="train"):
     retry=retry_if_exception_type(Exception)
 )
 
-def get_model_response(question, options, prompt_template, model_name):
+def get_model_response(question, options, prompt_template, model_name, clients):
     with semaphore:
         try:
             model_config = MODELS[model_name]
@@ -135,23 +135,21 @@ def get_model_response(question, options, prompt_template, model_name):
             provider = model_config["provider"]
 
             if provider == "togetherai":
-                response = st.session_state.togetherai_client.chat.completions.create(
+                response = clients["togetherai"].chat.completions.create(
                     model=model_config["model_id"],
                     messages=[{"role": "user", "content": prompt}]
                 )
                 response_text = response.choices[0].message.content.strip()
 
             elif provider == "openai":
-                response = st.session_state.openai_client.chat.completions.create(
+                response = clients["openai"].chat.completions.create(
                     model=model_config["model_id"],
-                    messages=[{
-                        "role": "user",
-                        "content": prompt}]
+                    messages=[{"role": "user", "content": prompt}]
                 )
                 response_text = response.choices[0].message.content.strip()
 
             elif provider == "anthropic":
-                response = st.session_state.anthropic_client.messages.create(
+                response = clients["anthropic"].messages.create(
                     model=model_config["model_id"],
                     messages=[{"role": "user", "content": prompt}],
                     max_tokens=4096
@@ -189,12 +187,13 @@ def evaluate_response(model_response, correct_answer):
     is_correct = model_response.lower().strip() == correct_answer.lower().strip()
     return is_correct
 
-def process_single_evaluation(question, prompt_template, model_name):
+def process_single_evaluation(question, prompt_template, model_name, clients):
     answer, response_text = get_model_response(
         question['question'],
         question['options'],
         prompt_template,
-        model_name
+        model_name,
+        clients
     )
     is_correct = evaluate_response(answer, question['correct_answer'])
     return {
@@ -210,7 +209,7 @@ def process_single_evaluation(question, prompt_template, model_name):
         'model_name': model_name
     }
 
-def process_evaluations_concurrently(questions, prompt_template, models_to_evaluate, progress_callback):
+def process_evaluations_concurrently(questions, prompt_template, models_to_evaluate, progress_callback, clients):
     results = []
     total_iterations = len(models_to_evaluate) * len(questions)
     current_iteration = 0
@@ -219,7 +218,7 @@ def process_evaluations_concurrently(questions, prompt_template, models_to_evalu
         future_to_params = {}
         for model_name in models_to_evaluate:
             for question in questions:
-                future = executor.submit(process_single_evaluation, question, prompt_template, model_name)
+                future = executor.submit(process_single_evaluation, question, prompt_template, model_name, clients)
                 future_to_params[future] = (model_name, question)
 
         for future in as_completed(future_to_params):
@@ -318,6 +317,13 @@ Important:
     with st.spinner("Starting evaluation..."):
         selected_questions = questions[:num_questions]
 
+        # Create a clients dictionary
+        clients = {
+            "togetherai": st.session_state["togetherai_client"],
+            "openai": st.session_state["openai_client"],
+            "anthropic": st.session_state["anthropic_client"]
+        }
+
         progress_container = st.container()
         progress_bar = progress_container.progress(0)
         status_text = progress_container.empty()
@@ -326,12 +332,13 @@ Important:
             progress = current / total
             progress_bar.progress(progress)
             status_text.text(f"Progress: {current}/{total} evaluations completed")
-
+
         results = process_evaluations_concurrently(
             selected_questions,
             prompt_template,
             models_to_evaluate,
-            update_progress
+            update_progress,
+            clients
         )
 
         all_results = {}
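A minimal, self-contained sketch of the pattern this commit adopts: build the provider clients once on the main thread, then hand them to every worker as an ordinary argument alongside a progress callback. FakeClient, the two-model list, and the print-based callback below are illustrative stand-ins, not the app's real MODELS table, provider SDKs, or Streamlit widgets.

from concurrent.futures import ThreadPoolExecutor, as_completed

# Hypothetical stand-in for a provider SDK client; the real app keeps
# Together/OpenAI/Anthropic client objects in st.session_state.
class FakeClient:
    def __init__(self, name):
        self.name = name

    def complete(self, prompt):
        return f"{self.name} answer to: {prompt}"

def process_single_evaluation(question, model_name, clients):
    # The worker never touches st.session_state; it uses only the dict
    # it was handed, so it is safe to run on any thread.
    return clients[model_name].complete(question)

def process_evaluations_concurrently(questions, models, progress_callback, clients):
    results = []
    total = len(models) * len(questions)
    done = 0
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = {
            executor.submit(process_single_evaluation, q, m, clients): (m, q)
            for m in models
            for q in questions
        }
        for future in as_completed(futures):
            results.append(future.result())
            done += 1
            progress_callback(done, total)  # drives the st.progress bar in the app

    return results

if __name__ == "__main__":
    # Built once on the main thread, the only place session state is safe
    # to read; plain objects stand in for the session-state clients here.
    clients = {"togetherai": FakeClient("togetherai"), "openai": FakeClient("openai")}
    out = process_evaluations_concurrently(
        ["Q1", "Q2"],
        ["togetherai", "openai"],
        lambda done, total: print(f"Progress: {done}/{total}"),
        clients,
    )
    print(out)

A side effect of passing the dict explicitly is that the evaluation functions no longer depend on Streamlit's per-session machinery at all, which also makes them straightforward to exercise outside the app, as the __main__ block above does.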